Fixed all the bugs in TRPL

Dominik Moritz Roth 2022-08-15 16:55:17 +02:00
parent 28d0c609bc
commit d35c3d8520
4 changed files with 31 additions and 10 deletions

@@ -221,13 +221,19 @@ class UniversalGaussianDistribution(SB3_Distribution):
     def _sqrt_to_chol(self, cov_sqrt):
         vec = False
-        if len(cov_sqrt.shape) == 2:
+        nobatch = False
+        if len(cov_sqrt.shape) <= 2:
             vec = True
+        if len(cov_sqrt.shape) == 1:
+            nobatch = True
         if vec:
             cov_sqrt = th.diag_embed(cov_sqrt)
-        cov = th.bmm(cov_sqrt.mT, cov_sqrt)
+        if nobatch:
+            cov = th.mm(cov_sqrt.mT, cov_sqrt)
+        else:
+            cov = th.bmm(cov_sqrt.mT, cov_sqrt)
         chol = th.linalg.cholesky(cov)
         if vec:

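For context, a minimal standalone sketch of the shape dispatch this hunk introduces (the function name and test shapes below are illustrative, not from the repo; the trailing "if vec:" above is cut off in this view, so the sketch stops at the Cholesky factor):

import torch as th

def sqrt_to_chol(cov_sqrt):
    # 1-D: diagonal of a single square root (no batch dim);
    # 2-D: batch of diagonals; 3-D: batch of full square-root matrices.
    vec = cov_sqrt.dim() <= 2
    nobatch = cov_sqrt.dim() == 1
    if vec:
        cov_sqrt = th.diag_embed(cov_sqrt)  # lift diagonals to full matrices
    if nobatch:
        cov = th.mm(cov_sqrt.mT, cov_sqrt)  # th.bmm rejects 2-D tensors
    else:
        cov = th.bmm(cov_sqrt.mT, cov_sqrt)
    return th.linalg.cholesky(cov)

print(sqrt_to_chol(th.ones(3)).shape)     # torch.Size([3, 3])
print(sqrt_to_chol(th.ones(5, 3)).shape)  # torch.Size([5, 3, 3])

th.bmm requires 3-D inputs, which is presumably what the old unconditional th.bmm call tripped over for unbatched input; a 1-D parameter vector, once diag-embedded to 2-D, needs th.mm instead.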

@@ -26,7 +26,12 @@ def get_mean_and_sqrt(p: UniversalGaussianDistribution, expand=False):
     else:
         mean, chol = get_mean_and_chol(p, expand=False)
         sqrt_cov = p.cov_sqrt
-        if expand and len(sqrt_cov.shape) == 2:
+        if mean.shape[0] != sqrt_cov.shape[0]:
+            shape = list(sqrt_cov.shape)
+            shape[0] = mean.shape[0]
+            shape = tuple(shape)
+            sqrt_cov = sqrt_cov.expand(shape)
+        if expand and len(sqrt_cov.shape) <= 2:
             sqrt_cov = th.diag_embed(sqrt_cov)
     return mean, sqrt_cov

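The first new branch above pads sqrt_cov out to the batch size of mean before the optional diag_embed. A minimal sketch of that broadcast with hypothetical shapes (note Tensor.expand can only widen singleton dims, so this assumes the stored sqrt_cov has a batch dim of 1):

import torch as th

mean = th.zeros(32, 4)    # hypothetical: batch of 32 means
sqrt_cov = th.ones(1, 4)  # hypothetical: one shared sqrt-cov vector

if mean.shape[0] != sqrt_cov.shape[0]:
    shape = list(sqrt_cov.shape)
    shape[0] = mean.shape[0]
    sqrt_cov = sqrt_cov.expand(tuple(shape))  # a broadcast view, no copy

assert sqrt_cov.shape == (32, 4)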

@@ -315,9 +315,19 @@ class ActorCriticPolicy(BasePolicy):
         elif isinstance(self.action_dist, UniversalGaussianDistribution):
             if self.sqrt_induced_gaussian:
                 chol_sqrt_cov = self.chol_net(latent_pi)
-                if len(chol_sqrt_cov.shape) == 2:
+                unembed = False
+                squeeze = False
+                if len(chol_sqrt_cov.shape) <= 2:
+                    unembed = True
                     chol_sqrt_cov = th.diag_embed(chol_sqrt_cov)
+                if len(chol_sqrt_cov.shape) <= 2:
+                    squeeze = True
+                    chol_sqrt_cov = chol_sqrt_cov.unsqueeze(0)
                 cov_sqrt = th.bmm(chol_sqrt_cov.mT, chol_sqrt_cov)
+                if squeeze and False:
+                    cov_sqrt = cov_sqrt.squeeze()
+                if unembed:
+                    cov_sqrt = th.diagonal(cov_sqrt, dim1=-2, dim2=-1)
                 dist = self.action_dist.proba_distribution_from_sqrt(
                     mean_actions, cov_sqrt, latent_pi)
                 mean, chol = get_mean_and_chol(dist, expand=False)

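As a standalone sketch with a hypothetical 1-D chol_net output: the unembed/squeeze bookkeeping lifts diagonal parameters to matrices so th.bmm can form S.mT @ S, then strips the embedding again (the committed code keeps the batch dim, deliberately disabling the squeeze branch with "and False"):

import torch as th

chol_sqrt_cov = th.rand(4) + 0.1  # hypothetical 1-D diagonal parameters

unembed = chol_sqrt_cov.dim() <= 2
if unembed:
    chol_sqrt_cov = th.diag_embed(chol_sqrt_cov)  # (4,) -> (4, 4)
if chol_sqrt_cov.dim() <= 2:
    chol_sqrt_cov = chol_sqrt_cov.unsqueeze(0)    # th.bmm needs (B, n, n)
cov_sqrt = th.bmm(chol_sqrt_cov.mT, chol_sqrt_cov)
if unembed:
    # back to diagonal entries, shape (1, 4), batch dim intact
    cov_sqrt = th.diagonal(cov_sqrt, dim1=-2, dim2=-1)
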
test.py

@@ -20,21 +20,21 @@ root_path = '.'
 def main(env_name='ColumbusCandyland_Aux10-v0', timesteps=1_000_000, showRes=True, saveModel=True, n_eval_episodes=0):
     env = gym.make(env_name)
-    use_sde = True
+    use_sde = False
     ppo = PPO(
         MlpPolicyPPO,
         env,
-        projection=BaseProjectionLayer(),
-        policy_kwargs={'dist_kwargs': {'neural_strength': Strength.FULL, 'cov_strength': Strength.FULL, 'parameterization_type':
-                       ParametrizationType.CHOL, 'enforce_positive_type': EnforcePositiveType.ABS, 'prob_squashing_type': ProbSquashingType.NONE}},
+        projection=KLProjectionLayer(trust_region_coeff=0.01),
+        policy_kwargs={'dist_kwargs': {'neural_strength': Strength.SCALAR, 'cov_strength': Strength.DIAG, 'parameterization_type':
+                       ParametrizationType.NONE, 'enforce_positive_type': EnforcePositiveType.ABS, 'prob_squashing_type': ProbSquashingType.NONE}},
         verbose=0,
         tensorboard_log=root_path+"/logs_tb/" +
         env_name+"/ppo"+(['', '_sde'][use_sde])+"/",
-        learning_rate=3e-4,
+        learning_rate=3e-4,  # 3e-4,
         gamma=0.99,
         gae_lambda=0.95,
         normalize_advantage=True,
-        ent_coef=0.02,  # 0.1
+        ent_coef=0.1,  # 0.1
         vf_coef=0.5,
         use_sde=use_sde,  # False
         clip_range=1  # 0.2,
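
One idiom in this file worth a note: the tensorboard path selects its suffix by indexing a list with a bool (bool is an int subclass in Python). A minimal illustration, reusing the default env name from above:

use_sde = False
env_name = 'ColumbusCandyland_Aux10-v0'
log_dir = './logs_tb/' + env_name + '/ppo' + ['', '_sde'][use_sde] + '/'
assert log_dir.endswith('/ppo/')  # '_sde' is appended only when use_sde is True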