ProbSquashing implemented (tanh)

2022-07-20 10:32:19 +02:00 · 2022-07-20 10:32:19 +02:00 · 199ce0c8cb
commit 199ce0c8cb
parent 05dad44b6e
2 changed files with 77 additions and 7 deletions
--- a/metastable_baselines/distributions/distributions.py
+++ b/metastable_baselines/distributions/distributions.py
@ -20,6 +20,7 @@ from stable_baselines3.common.distributions import (
 from stable_baselines3.common.distributions import DiagGaussianDistribution

 from ..misc.tensor_ops import fill_triangular
+from ..misc.tanhBijector import TanhBijector

 # TODO: Integrate and Test what I currently have before adding more complexity
 # TODO: Support Squashed Dists (tanh)
@ -67,6 +68,9 @@ class ProbSquashingType(Enum):
    def apply(self, x):
        return [nn.Identity(), th.tanh][self.value](x)

+    def apply_inv(self, x):
+        return [nn.Identity(), TanhBijector.inverse][self.value](x)
+

 def get_legal_setups(allowedEPTs=None, allowedParStrength=None, allowedCovStrength=None, allowedPTs=None, allowedPSTs=None):
    allowedEPTs = allowedEPTs or EnforcePositiveType
@ -130,7 +134,7 @@ class UniversalGaussianDistribution(SB3_Distribution):
    :param action_dim:  Dimension of the action space.
    """

-    def __init__(self, action_dim: int, use_sde: bool = False, neural_strength: Strength = Strength.DIAG, cov_strength: Strength = Strength.DIAG, parameterization_type: ParametrizationType = ParametrizationType.NONE, enforce_positive_type: EnforcePositiveType = EnforcePositiveType.ABS, prob_squashing_type: ProbSquashingType = ProbSquashingType.NONE):
+    def __init__(self, action_dim: int, use_sde: bool = False, neural_strength: Strength = Strength.DIAG, cov_strength: Strength = Strength.DIAG, parameterization_type: ParametrizationType = ParametrizationType.NONE, enforce_positive_type: EnforcePositiveType = EnforcePositiveType.ABS, prob_squashing_type: ProbSquashingType = ProbSquashingType.NONE, epsilon=1e-6):
        super(UniversalGaussianDistribution, self).__init__()
        self.action_dim = action_dim
        self.par_strength = neural_strength
@ -139,7 +143,10 @@ class UniversalGaussianDistribution(SB3_Distribution):
        self.enforce_positive_type = enforce_positive_type
        self.prob_squashing_type = prob_squashing_type

+        self.epsilon = epsilon
+
        self.distribution = None
+        self.gaussian_actions = None

        if self.prob_squashing_type != ProbSquashingType.NONE:
            raise Exception('ProbSquasing is not yet implmenented!')
@ -209,7 +216,7 @@ class UniversalGaussianDistribution(SB3_Distribution):
            raise Exception('Unable to create torch distribution')
        return self

-    def log_prob(self, actions: th.Tensor) -> th.Tensor:
+    def log_prob(self, actions: th.Tensor, gaussian_actions: Optional[th.Tensor] = None) -> th.Tensor:
        """
        Get the log probabilities of actions according to the distribution.
        Note that you must first call the ``proba_distribution()`` method.
@ -217,18 +224,37 @@ class UniversalGaussianDistribution(SB3_Distribution):
        :param actions:
        :return:
        """
-        log_prob = self.distribution.log_prob(actions)
-        return log_prob
+        if self.prob_squashing_type == ProbSquashingType.NONE:
+            log_prob = self.distribution.log_prob(actions)
+            return log_prob
+
+        if gaussian_actions is None:
+            # Recover the pre-squash actions; the tanh inverse clamps its input to avoid NaN
+            gaussian_actions = self.prob_squashing_type.apply_inv(actions)
+
+        log_prob = self.distribution.log_prob(gaussian_actions)
+
+        if self.prob_squashing_type == ProbSquashingType.TANH:
+            log_prob -= th.sum(th.log(1 - actions **
+                               2 + self.epsilon), dim=1)
+            return log_prob
+
+        raise Exception()

    def entropy(self) -> th.Tensor:
+        # TODO: This will return incorrect results when using prob-squashing
        return self.distribution.entropy()

    def sample(self) -> th.Tensor:
        # Reparametrization trick to pass gradients
-        return self.distribution.rsample()
+        sample = self.distribution.rsample()
+        self.gaussian_actions = sample
+        return self.prob_squashing_type.apply(sample)

    def mode(self) -> th.Tensor:
-        return self.distribution.mean
+        mode = self.distribution.mean
+        self.gaussian_actions = mode
+        return self.prob_squashing_type.apply(mode)

    def actions_from_params(self, mean_actions: th.Tensor, log_std: th.Tensor, deterministic: bool = False, latent_pi=None) -> th.Tensor:
        # Update the proba distribution
@ -245,7 +271,7 @@ class UniversalGaussianDistribution(SB3_Distribution):
        :return:
        """
        actions = self.actions_from_params(mean_actions, log_std)
-        log_prob = self.log_prob(actions)
+        log_prob = self.log_prob(actions, self.gaussian_actions)
        return actions, log_prob


--- a/metastable_baselines/misc/tanhBijector.py
+++ b/metastable_baselines/misc/tanhBijector.py
@ -0,0 +1,44 @@
+import torch as th
+
+
+class TanhBijector:
+    """
+    Adapted from Stable-Baselines3 (SB3).
+
+    Bijective transformation of a probability distribution
+    using a squashing function (tanh).
+    TODO: use Pyro instead (https://pyro.ai/)
+    :param epsilon: small constant added inside the log of ``log_prob_correction`` to avoid NaN due to numerical imprecision.
+    """
+
+    def __init__(self, epsilon: float = 1e-6):
+        super().__init__()
+        self.epsilon = epsilon
+
+    @staticmethod
+    def forward(x: th.Tensor) -> th.Tensor:
+        return th.tanh(x)  # element-wise squashing; ``inverse`` is the (clamped) reverse map
+
+    @staticmethod
+    def atanh(x: th.Tensor) -> th.Tensor:
+        """
+        Inverse of tanh, without any clamping of the input (see ``inverse``).
+        Taken from Pyro: https://github.com/pyro-ppl/pyro
+        Computes 0.5 * torch.log((1 + x) / (1 - x)) via log1p.
+        """
+        return 0.5 * (x.log1p() - (-x).log1p())
+
+    @staticmethod
+    def inverse(y: th.Tensor) -> th.Tensor:
+        """
+        Inverse tanh of a clamped input.
+        :param y: tanh-squashed values; clamped to [-1 + eps, 1 - eps] before inversion
+        :return: ``atanh`` of the clamped input
+        """
+        eps = th.finfo(y.dtype).eps  # machine epsilon of the input dtype (not ``self.epsilon``)
+        # Clamp strictly inside (-1, 1): atanh is infinite at +/-1 and NaN beyond
+        return TanhBijector.atanh(y.clamp(min=-1.0 + eps, max=1.0 - eps))
+
+    def log_prob_correction(self, x: th.Tensor) -> th.Tensor:
+        # Squash correction (from original SAC implementation): log(1 - tanh(x)^2 + epsilon), element-wise
+        return th.log(1.0 - th.tanh(x) ** 2 + self.epsilon)