Fix: Buffer supplying weird shapes for mean and cov
commit e788e9f998
parent 53505a80ad
@@ -29,7 +29,7 @@ class BetterRolloutBufferSamples(NamedTuple):
     old_values: th.Tensor
     old_log_prob: th.Tensor
     mean: th.Tensor
-    cov_Decomp: th.Tensor
+    cov_decomp: th.Tensor
     advantages: th.Tensor
     returns: th.Tensor

@@ -40,7 +40,7 @@ class BetterDictRolloutBufferSamples(NamedTuple):
     old_values: th.Tensor
     old_log_prob: th.Tensor
     mean: th.Tensor
-    cov_Decomp: th.Tensor
+    cov_decomp: th.Tensor
     advantages: th.Tensor
     returns: th.Tensor

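For orientation, here is how the sample tuple reads after the rename; a sketch reconstructed from the fields visible in the two hunks above. The observations and actions fields are assumptions carried over from stable-baselines3's RolloutBufferSamples, which this class appears to extend.

```python
from typing import NamedTuple
import torch as th

class BetterRolloutBufferSamples(NamedTuple):
    observations: th.Tensor  # assumed, as in sb3's RolloutBufferSamples
    actions: th.Tensor       # assumed; matches self.actions[batch_inds] in the next hunk
    old_values: th.Tensor
    old_log_prob: th.Tensor
    mean: th.Tensor          # per-sample mean of the Gaussian policy
    cov_decomp: th.Tensor    # per-sample covariance decomposition (renamed from cov_Decomp)
    advantages: th.Tensor
    returns: th.Tensor
```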
@@ -227,8 +227,8 @@ class BetterRolloutBuffer(RolloutBuffer):
             self.actions[batch_inds],
             self.values[batch_inds].flatten(),
             self.log_probs[batch_inds].flatten(),
-            self.means[batch_inds].flatten(),
-            self.cov_decomps[batch_inds].flatten(),
+            np.squeeze(self.means[batch_inds], axis=1),
+            np.squeeze(self.cov_decomps[batch_inds], axis=1),
             self.advantages[batch_inds].flatten(),
             self.returns[batch_inds].flatten(),
         )
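Why np.squeeze rather than .flatten(): for a Gaussian policy the mean is a vector and the covariance decomposition a matrix per sample, so flattening collapses the structure downstream code needs. A minimal sketch, assuming the buffer stores arrays with a singleton n_envs axis at position 1 (hypothetical shapes for illustration):

```python
import numpy as np

n_steps, n_envs, act_dim = 4, 1, 2
means = np.zeros((n_steps, n_envs, act_dim))                 # per-step policy means
cov_decomps = np.zeros((n_steps, n_envs, act_dim, act_dim))  # per-step decomposition matrices
batch_inds = np.array([0, 2])

# .flatten() collapses everything into one axis: the "weird shapes" from the commit title.
print(means[batch_inds].flatten().shape)                     # (4,)
print(cov_decomps[batch_inds].flatten().shape)               # (8,)

# np.squeeze only drops the singleton env axis, keeping one vector/matrix per sample.
print(np.squeeze(means[batch_inds], axis=1).shape)           # (2, 2)
print(np.squeeze(cov_decomps[batch_inds], axis=1).shape)     # (2, 2, 2)
```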
@@ -39,28 +39,31 @@ class TRPL(BetterOnPolicyAlgorithm):
     :param n_epochs: Number of epoch when optimizing the surrogate loss
     :param gamma: Discount factor
     :param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator
-    :param clip_range: Clipping parameter, it can be a function of the current progress
-        remaining (from 1 to 0).
-    :param clip_range_vf: Clipping parameter for the value function,
-        it can be a function of the current progress remaining (from 1 to 0).
+    :param clip_range: Should not be used for normal TRPL usage. Is only here to bridge the gap to PPO.
+        Clipping parameter, it can be a function of the current progress remaining (from 1 to 0).
+        Setting it to None will result in no clipping (default)
+    :param clip_range_vf: Should not be used for normal TRPL usage. Is only here to bridge the gap to PPO.
+        Clipping parameter for the value function, it can be a function of the current progress remaining (from 1 to 0).
         This is a parameter specific to the OpenAI implementation. If None is passed (default),
         no clipping will be done on the value function.
         IMPORTANT: this clipping depends on the reward scaling.
-    :param normalize_advantage: Whether to normalize or not the advantage
+    :param normalize_advantage: Normally a good idea; but TRPL actually often works better without normalization of the advantage.
+        Whether to normalize or not the advantage. (Default: False)
     :param ent_coef: Entropy coefficient for the loss calculation
     :param vf_coef: Value function coefficient for the loss calculation
-    :param max_grad_norm: The maximum value for the gradient clipping
+    :param max_grad_norm: Should not be used for normal TRPL usage. Is only here to bridge the gap to PPO..
+        The maximum value for the gradient clipping. Setting it to None will result in no gradient clipping (default)
     :param use_sde: Whether to use generalized State Dependent Exploration (gSDE)
         instead of action noise exploration (default: False)
     :param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE
         Default: -1 (only sample at the beginning of the rollout)
-    :param use_pca: Wether to use Prior Conditioned Annealing
+    :param use_pca: Wether to use Prior Conditioned Annealing.
     :param rollout_buffer_class: Rollout buffer class to use. If ``None``, it will be automatically selected.
     :param rollout_buffer_kwargs: Keyword arguments to pass to the rollout buffer on creation
-    :param target_kl: Limit the KL divergence between updates,
-        because the clipping is not enough to prevent large update
+    :param target_kl: Not part of reference implementation of TRPL, but we still ported it over from sb3's PPO.
+        Default to None: No limit.
+        Limit the KL divergence between updates, because the clipping is not enough to prevent large update
         see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
-        By default, there is no limit on the kl div.
     :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
         the reported success rate, mean episode length, and mean reward over
     :param tensorboard_log: the log location for tensorboard (if None, no logging)
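For reference, the target_kl mechanism the docstring says was ported over stops the surrogate-loss epochs once the approximate KL between old and new policy grows too large. A sketch of that check in the style of sb3's PPO (the 1.5 safety factor and the low-variance KL estimator follow that implementation; the function name is ours):

```python
from typing import Optional
import torch as th

def should_stop(log_prob: th.Tensor, old_log_prob: th.Tensor,
                target_kl: Optional[float]) -> bool:
    """Early-stopping check on the approximate KL divergence, as in sb3's PPO."""
    with th.no_grad():
        log_ratio = log_prob - old_log_prob
        # Low-variance estimator: E[(r - 1) - log r] with r = exp(log_ratio).
        approx_kl = th.mean((th.exp(log_ratio) - 1) - log_ratio).item()
    return target_kl is not None and approx_kl > 1.5 * target_kl
```

In sb3's PPO a True result breaks out of the n_epochs loop; with the default target_kl=None the check never triggers.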
@@ -89,7 +92,7 @@ class TRPL(BetterOnPolicyAlgorithm):
         gae_lambda: float = 0.95,
         clip_range: Union[float, Schedule, None] = None,
         clip_range_vf: Union[None, float, Schedule] = None,
-        normalize_advantage: bool = True,
+        normalize_advantage: bool = False,
         ent_coef: float = 0.0,
         vf_coef: float = 0.5,
         max_grad_norm: Union[float, None] = None,
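The normalize_advantage default flips from True to False here, matching the updated docstring ("TRPL actually often works better without normalization of the advantage"). For context, a sketch of the per-minibatch standardization this flag toggles in sb3-style training loops:

```python
import torch as th

def maybe_normalize(advantages: th.Tensor, normalize_advantage: bool) -> th.Tensor:
    # Standardize to zero mean / unit std; the epsilon guards against a zero
    # std when the minibatch has (nearly) constant advantages.
    if normalize_advantage and len(advantages) > 1:
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages
```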
@@ -97,7 +100,8 @@ class TRPL(BetterOnPolicyAlgorithm):
         sde_sample_freq: int = -1,
         use_pca: bool = False,
         pca_is: bool = False,
-        projection: Union[BaseProjectionLayer, str] = BaseProjectionLayer(),
+        projection_class: Union[BaseProjectionLayer, str] = BaseProjectionLayer,
+        projection_kwargs: Optional[Dict[str, Any]] = None,
         rollout_buffer_class: Optional[Type[RolloutBuffer]] = None,
         rollout_buffer_kwargs: Optional[Dict[str, Any]] = None,
         target_kl: Optional[float] = None,
@@ -109,6 +113,11 @@ class TRPL(BetterOnPolicyAlgorithm):
         device: Union[th.device, str] = "auto",
         _init_setup_model: bool = True,
     ):
+        self.projection_class = castProjection(projection_class)
+        self.projection_kwargs = projection_kwargs
+        self.projection = self.projection_class(**self.projection_kwargs)
+        policy_kwargs['policy_projection'] = self.projection
+
         super().__init__(
             policy,
             env,
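The constructor now takes a class (or a string naming one) plus a kwargs dict instead of a ready-made instance, and builds the projection itself before handing it to the policy. A minimal sketch of that pattern; castProjection's real lookup table is assumed, and the or-{} guard for a None projection_kwargs is our addition for the sketch (the hunk above passes self.projection_kwargs through unguarded):

```python
from typing import Any, Dict, Optional, Type, Union

class BaseProjectionLayer:
    """Stand-in so the sketch is self-contained; the real layer lives in the repo."""
    def __init__(self, mean_bound: float = 0.03, cov_bound: float = 1e-3):
        self.mean_bound = mean_bound
        self.cov_bound = cov_bound

# Hypothetical string-to-class table mirroring what castProjection presumably does.
_PROJECTIONS: Dict[str, Type[BaseProjectionLayer]] = {"base": BaseProjectionLayer}

def cast_projection(proj: Union[Type[BaseProjectionLayer], str]) -> Type[BaseProjectionLayer]:
    """Resolve a string name to a projection class; pass classes through unchanged."""
    return _PROJECTIONS[proj.lower()] if isinstance(proj, str) else proj

def build_projection(
    projection_class: Union[Type[BaseProjectionLayer], str] = BaseProjectionLayer,
    projection_kwargs: Optional[Dict[str, Any]] = None,
) -> BaseProjectionLayer:
    cls = cast_projection(projection_class)
    return cls(**(projection_kwargs or {}))

projection = build_projection("base", {"mean_bound": 0.05})
```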
@@ -176,7 +185,6 @@ class TRPL(BetterOnPolicyAlgorithm):
             clip_range_vf = None
         self.clip_range_vf = clip_range_vf
         self.normalize_advantage = normalize_advantage
-        self.projection = castProjection(projection)
         self.target_kl = target_kl

         if _init_setup_model: