metastable-baselines/projections_orig/papi_projection.py

#   Copyright (c) 2021 Robert Bosch GmbH
#   Author: Fabian Otto
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published
#   by the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import logging

import copy
import numpy as np
import torch as ch
from typing import Tuple, Union

from trust_region_projections.utils.projection_utils import gaussian_kl
from trust_region_projections.models.policy.abstract_gaussian_policy import AbstractGaussianPolicy
from trust_region_projections.projections.base_projection_layer import BaseProjectionLayer
from trust_region_projections.utils.torch_utils import torch_batched_trace

logger = logging.getLogger("papi_projection")


class PAPIProjection(BaseProjectionLayer):

    def __init__(self,
                 proj_type: str = "",
                 mean_bound: float = 0.015,
                 cov_bound: float = 0.0,

                 entropy_eq: bool = False,
                 entropy_first: bool = True,

                 cpu: bool = True,
                 dtype: ch.dtype = ch.float32,
                 **kwargs
                 ):

        """
        PAPI projection, which can be used after each training epoch to satisfy the trust regions.
        Args:
           proj_type: Which type of projection to use. None specifies no projection and uses the TRPO objective.
           mean_bound: projection bound for the step size,
                       PAPI only has a joint KL constraint, mean and cov bound are summed for this bound.
           cov_bound: projection bound for the step size,
                      PAPI only has a joint KL constraint, mean and cov bound are summed for this bound.
           entropy_eq: Use entropy equality constraints.
           entropy_first: Project entropy before trust region.
           cpu: Compute on CPU only.
           dtype: Data type to use, either of float32 or float64. The later might be necessary for higher
                   dimensions in order to learn the full covariance.
        """

        assert entropy_first
        super().__init__(proj_type, mean_bound, cov_bound, 0.0, False, None, None, None, 0.0, 0.0, entropy_eq,
                         entropy_first, cpu, dtype)

        self.last_policies = []

    def __call__(self, policy, p, q, step=0, *args, **kwargs):
        if kwargs.get("obs"):
            self._papi_steps(policy, q, **kwargs)
        else:
            return p

    def _trust_region_projection(self, policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                                 q: Tuple[ch.Tensor, ch.Tensor], eps: Union[ch.Tensor, float],
                                 eps_cov: Union[ch.Tensor, float], **kwargs):
        """
        runs papi projection layer and constructs sqrt of covariance
        Args:
            policy: policy instance
            p: current distribution
            q: old distribution
            eps: (modified) kl bound/ kl bound for mean part
            eps_cov: (modified) kl bound for cov part
            **kwargs:

        Returns:
            mean, cov sqrt
        """

        mean, chol = p
        old_mean, old_chol = q
        intermed_mean = kwargs.get('intermed_mean')

        dtype = mean.dtype
        device = mean.device

        dim = mean.shape[-1]

        ################################################################################################################
        # Precompute basic matrices

        # Joint bound
        eps += eps_cov

        I = ch.eye(dim, dtype=dtype, device=device)
        old_precision = ch.cholesky_solve(I, old_chol)[0]
        logdet_old = policy.log_determinant(old_chol)
        cov = policy.covariance(chol)

        ################################################################################################################
        # compute expected KL
        maha_part, cov_part = gaussian_kl(policy, p, q)
        maha_part = maha_part.mean()
        cov_part = cov_part.mean()

        if intermed_mean is not None:
            maha_intermediate = 0.5 * policy.maha(intermed_mean, old_mean, old_chol).mean()
            mm = ch.min(maha_part, maha_intermediate)

        ################################################################################################################
        # matrix rotation/rescaling projection
        if maha_part + cov_part > eps + 1e-6:
            old_cov = policy.covariance(old_chol)

            maha_delta = eps if intermed_mean is None else (eps - mm)
            eta_rot = maha_delta / ch.max(maha_part + cov_part, ch.tensor(1e-16, dtype=dtype, device=device))
            new_cov = (1 - eta_rot) * old_cov + eta_rot * cov
            proj_chol = ch.cholesky(new_cov)

            # recompute covariance part of KL for new chol
            trace_term = 0.5 * (torch_batched_trace(old_precision @ new_cov) - dim).mean()  # rotation difference
            entropy_diff = 0.5 * (logdet_old - policy.log_determinant(proj_chol)).mean()

            cov_part = trace_term + entropy_diff

        else:
            proj_chol = chol

        ################################################################################################################
        # mean interpolation projection
        if maha_part + cov_part > eps + 1e-6:

            if intermed_mean is not None:
                a = 0.5 * policy.maha(mean, intermed_mean, old_chol).mean()
                b = 0.5 * ((mean - intermed_mean) @ old_precision @ (intermed_mean - old_mean).T).mean()
                c = maha_intermediate - ch.max(eps - cov_part, ch.tensor(0., dtype=dtype, device=device))
                eta_mean = (-b + ch.sqrt(ch.max(b * b - a * c, ch.tensor(1e-16, dtype=dtype, device=device)))) / \
                           ch.max(a, ch.tensor(1e-16, dtype=dtype, device=device))
            else:
                eta_mean = ch.sqrt(
                    ch.max(eps - cov_part, ch.tensor(1e-16, dtype=dtype, device=device)) /
                    ch.max(maha_part, ch.tensor(1e-16, dtype=dtype, device=device)))
        else:
            eta_mean = ch.tensor(1., dtype=dtype, device=device)

        return eta_mean, proj_chol

    def _papi_steps(self, policy: AbstractGaussianPolicy, q: Tuple[ch.Tensor, ch.Tensor], obs: ch.Tensor, lr_schedule,
                    lr_schedule_vf=None):
        """
        Take PAPI steps after PPO finished its steps. Policy parameters are updated in-place.
        Args:
            policy: policy instance
            q: old distribution
            obs: collected observations from trajectories
            lr_schedule: lr schedule for policy
            lr_schedule_vf: lr schedule for vf

        Returns:

        """
        assert not policy.contextual_std

        # save latest policy in history
        self.last_policies.append(copy.deepcopy(policy))

        ################################################################################################################
        # policy backtracking: out of last n policies and current one find one that satisfies the kl constraint

        intermed_policy = None
        n_backtracks = 0

        for i, pi in enumerate(reversed(self.last_policies)):
            p_prime = pi(obs)
            mean_part, cov_part = pi.kl_divergence(p_prime, q)
            if (mean_part + cov_part).mean() <= self.mean_bound + self.cov_bound:
                intermed_policy = pi
                n_backtracks = i
                break

        ################################################################################################################
        # LR update

        # reduce learning rate when appropriate policy not within the last 4 epochs
        if n_backtracks >= 4 or intermed_policy is None:
            # Linear learning rate annealing
            lr_schedule.step()
            if lr_schedule_vf:
                lr_schedule_vf.step()

        if intermed_policy is None:
            # pop last policy and make it current one, as the updated one was poor
            # do not keep last policy in history, otherwise we could stack the same policy multiple times.
            if len(self.last_policies) >= 1:
                policy.load_state_dict(self.last_policies.pop().state_dict())
                logger.warning(f"No suitable policy found in backtracking of {len(self.last_policies)} policies.")
            return

        ################################################################################################################
        # PAPI iterations

        # We assume only non contextual covariances here, therefore we only need to project for one
        q = (q[0], q[1][:1])  # (means, covs[:1])

        # This is A from Alg. 2 [Akrour et al., 2019]
        intermed_weight = intermed_policy.get_last_layer().detach().clone()
        # This is A @ phi(s)
        intermed_mean = p_prime[0].detach().clone()

        entropy = policy.entropy(q)
        entropy_bound = obs.new_tensor([-np.inf]) if entropy / self.initial_entropy > 0.5 \
            else entropy - (self.mean_bound + self.cov_bound)

        for _ in range(20):
            eta, proj_chol = self._projection(intermed_policy, (p_prime[0], p_prime[1][:1]), q,
                                              self.mean_bound, self.cov_bound, entropy_bound,
                                              intermed_mean=intermed_mean)
            intermed_policy.papi_weight_update(eta, intermed_weight)
            intermed_policy.set_std(proj_chol[0])
            p_prime = intermed_policy(obs)

        policy.load_state_dict(intermed_policy.state_dict())