Fixed bug in RolloutBuffer when using parallel envs

2022-08-27 16:02:40 +02:00 · 2022-08-27 16:02:40 +02:00 · afec4e709c
commit afec4e709c
parent 02e4ed1510
1 changed files with 40 additions and 4 deletions
--- a/metastable_baselines/misc/rollout_buffer.py
+++ b/metastable_baselines/misc/rollout_buffer.py
@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional, Type, Union, NamedTuple
+from typing import Any, Dict, Optional, Type, Union, NamedTuple, Generator
 import numpy as np
 import torch as th
@ -13,6 +13,9 @@ from stable_baselines3.common.utils import obs_as_tensor
 from ..misc.distTools import get_mean_and_chol
 from ..distributions.distributions import Strength, UniversalGaussianDistribution
 from stable_baselines3.common.vec_env import VecNormalize
 # TRL requires the original mean and covariance from the policy when the datapoint was created.
 # GaussianRolloutBuffer extends the RolloutBuffer by these two fields
@ -54,7 +57,7 @@ class GaussianRolloutBuffer(RolloutBuffer):
    def reset(self) -> None:
        self.means = np.zeros(
-            (self.buffer_size, self.n_envs) + self.action_space.shape, dtype=np.float32)
+            (self.buffer_size, self.n_envs, self.action_dim), dtype=np.float32)
        self.chols = np.zeros(
            (self.buffer_size, self.n_envs) + self.cov_shape, dtype=np.float32)
        super().reset()
@ -98,12 +101,43 @@ class GaussianRolloutBuffer(RolloutBuffer):
        self.episode_starts[self.pos] = np.array(episode_start).copy()
        self.values[self.pos] = value.clone().cpu().numpy().flatten()
        self.log_probs[self.pos] = log_prob.clone().cpu().numpy()
-        self.means[self.pos] = mean.clone().cpu().numpy()
+        self.means[self.pos] = np.array(mean).copy()
-        self.chols[self.pos] = chol.clone().cpu().numpy()
+        self.chols[self.pos] = np.array(chol).copy()
        self.pos += 1
        if self.pos == self.buffer_size:
            self.full = True
    def get(self, batch_size: Optional[int] = None) -> Generator[GaussianRolloutBufferSamples, None, None]:
        """Yield the buffer contents as randomly permuted minibatches.

        On the first call after the buffer has been filled, the stored arrays
        (including ``means`` and ``chols``) are passed through
        ``swap_and_flatten`` and ``generator_ready`` is set, so this
        preparation is not repeated on later calls.

        :param batch_size: Number of samples per yielded batch. ``None``
            yields a single batch holding all ``buffer_size * n_envs``
            samples. The last batch may be smaller when the total is not a
            multiple of ``batch_size``.
        :return: Generator of sample batches built by ``_get_samples``.
        :raises AssertionError: If the buffer is not full yet.
        :raises ValueError: If an explicit ``batch_size`` is not positive.
        """
        assert self.full, ""
        # A zero or negative step would never advance past the end of the
        # buffer, so the generator would yield forever; fail loudly instead.
        if batch_size is not None and batch_size <= 0:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size}")

        n_samples = self.buffer_size * self.n_envs
        indices = np.random.permutation(n_samples)

        # Prepare the data (only once per filled buffer)
        if not self.generator_ready:
            _tensor_names = [
                "observations",
                "actions",
                "values",
                "log_probs",
                "advantages",
                "returns",
                "means",
                "chols",
            ]
            for tensor in _tensor_names:
                self.__dict__[tensor] = self.swap_and_flatten(
                    self.__dict__[tensor])
            self.generator_ready = True

        # Return everything, don't create minibatches
        if batch_size is None:
            batch_size = n_samples

        start_idx = 0
        while start_idx < n_samples:
            yield self._get_samples(indices[start_idx: start_idx + batch_size])
            start_idx += batch_size
    def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None) -> GaussianRolloutBufferSamples:
        data = (
            self.observations[batch_inds],
@ -183,6 +217,8 @@ class GaussianRolloutCollectorAuxclass():
                dist = self.policy.get_distribution(obs_tensor).distribution
                mean, chol = get_mean_and_chol(dist)
            actions = actions.cpu().numpy()
            mean = mean.cpu().numpy()
            chol = chol.cpu().numpy()
            # Rescale and perform action
            clipped_actions = actions