# Copyright (c) 2021 Robert Bosch GmbH # Author: Fabian Otto # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published # by the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import logging import copy import numpy as np import torch as ch from typing import Tuple, Union from trust_region_projections.utils.projection_utils import gaussian_kl from trust_region_projections.models.policy.abstract_gaussian_policy import AbstractGaussianPolicy from trust_region_projections.projections.base_projection_layer import BaseProjectionLayer from trust_region_projections.utils.torch_utils import torch_batched_trace logger = logging.getLogger("papi_projection") class PAPIProjection(BaseProjectionLayer): def __init__(self, proj_type: str = "", mean_bound: float = 0.015, cov_bound: float = 0.0, entropy_eq: bool = False, entropy_first: bool = True, cpu: bool = True, dtype: ch.dtype = ch.float32, **kwargs ): """ PAPI projection, which can be used after each training epoch to satisfy the trust regions. Args: proj_type: Which type of projection to use. None specifies no projection and uses the TRPO objective. mean_bound: projection bound for the step size, PAPI only has a joint KL constraint, mean and cov bound are summed for this bound. cov_bound: projection bound for the step size, PAPI only has a joint KL constraint, mean and cov bound are summed for this bound. entropy_eq: Use entropy equality constraints. entropy_first: Project entropy before trust region. cpu: Compute on CPU only. dtype: Data type to use, either of float32 or float64. The later might be necessary for higher dimensions in order to learn the full covariance. """ assert entropy_first super().__init__(proj_type, mean_bound, cov_bound, 0.0, False, None, None, None, 0.0, 0.0, entropy_eq, entropy_first, cpu, dtype) self.last_policies = [] def __call__(self, policy, p, q, step=0, *args, **kwargs): if kwargs.get("obs"): self._papi_steps(policy, q, **kwargs) else: return p def _trust_region_projection(self, policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor], q: Tuple[ch.Tensor, ch.Tensor], eps: Union[ch.Tensor, float], eps_cov: Union[ch.Tensor, float], **kwargs): """ runs papi projection layer and constructs sqrt of covariance Args: policy: policy instance p: current distribution q: old distribution eps: (modified) kl bound/ kl bound for mean part eps_cov: (modified) kl bound for cov part **kwargs: Returns: mean, cov sqrt """ mean, chol = p old_mean, old_chol = q intermed_mean = kwargs.get('intermed_mean') dtype = mean.dtype device = mean.device dim = mean.shape[-1] ################################################################################################################ # Precompute basic matrices # Joint bound eps += eps_cov I = ch.eye(dim, dtype=dtype, device=device) old_precision = ch.cholesky_solve(I, old_chol)[0] logdet_old = policy.log_determinant(old_chol) cov = policy.covariance(chol) ################################################################################################################ # compute expected KL maha_part, cov_part = gaussian_kl(policy, p, q) maha_part = maha_part.mean() cov_part = cov_part.mean() if intermed_mean is not None: maha_intermediate = 0.5 * policy.maha(intermed_mean, old_mean, old_chol).mean() mm = ch.min(maha_part, maha_intermediate) ################################################################################################################ # matrix rotation/rescaling projection if maha_part + cov_part > eps + 1e-6: old_cov = policy.covariance(old_chol) maha_delta = eps if intermed_mean is None else (eps - mm) eta_rot = maha_delta / ch.max(maha_part + cov_part, ch.tensor(1e-16, dtype=dtype, device=device)) new_cov = (1 - eta_rot) * old_cov + eta_rot * cov proj_chol = ch.cholesky(new_cov) # recompute covariance part of KL for new chol trace_term = 0.5 * (torch_batched_trace(old_precision @ new_cov) - dim).mean() # rotation difference entropy_diff = 0.5 * (logdet_old - policy.log_determinant(proj_chol)).mean() cov_part = trace_term + entropy_diff else: proj_chol = chol ################################################################################################################ # mean interpolation projection if maha_part + cov_part > eps + 1e-6: if intermed_mean is not None: a = 0.5 * policy.maha(mean, intermed_mean, old_chol).mean() b = 0.5 * ((mean - intermed_mean) @ old_precision @ (intermed_mean - old_mean).T).mean() c = maha_intermediate - ch.max(eps - cov_part, ch.tensor(0., dtype=dtype, device=device)) eta_mean = (-b + ch.sqrt(ch.max(b * b - a * c, ch.tensor(1e-16, dtype=dtype, device=device)))) / \ ch.max(a, ch.tensor(1e-16, dtype=dtype, device=device)) else: eta_mean = ch.sqrt( ch.max(eps - cov_part, ch.tensor(1e-16, dtype=dtype, device=device)) / ch.max(maha_part, ch.tensor(1e-16, dtype=dtype, device=device))) else: eta_mean = ch.tensor(1., dtype=dtype, device=device) return eta_mean, proj_chol def _papi_steps(self, policy: AbstractGaussianPolicy, q: Tuple[ch.Tensor, ch.Tensor], obs: ch.Tensor, lr_schedule, lr_schedule_vf=None): """ Take PAPI steps after PPO finished its steps. Policy parameters are updated in-place. Args: policy: policy instance q: old distribution obs: collected observations from trajectories lr_schedule: lr schedule for policy lr_schedule_vf: lr schedule for vf Returns: """ assert not policy.contextual_std # save latest policy in history self.last_policies.append(copy.deepcopy(policy)) ################################################################################################################ # policy backtracking: out of last n policies and current one find one that satisfies the kl constraint intermed_policy = None n_backtracks = 0 for i, pi in enumerate(reversed(self.last_policies)): p_prime = pi(obs) mean_part, cov_part = pi.kl_divergence(p_prime, q) if (mean_part + cov_part).mean() <= self.mean_bound + self.cov_bound: intermed_policy = pi n_backtracks = i break ################################################################################################################ # LR update # reduce learning rate when appropriate policy not within the last 4 epochs if n_backtracks >= 4 or intermed_policy is None: # Linear learning rate annealing lr_schedule.step() if lr_schedule_vf: lr_schedule_vf.step() if intermed_policy is None: # pop last policy and make it current one, as the updated one was poor # do not keep last policy in history, otherwise we could stack the same policy multiple times. if len(self.last_policies) >= 1: policy.load_state_dict(self.last_policies.pop().state_dict()) logger.warning(f"No suitable policy found in backtracking of {len(self.last_policies)} policies.") return ################################################################################################################ # PAPI iterations # We assume only non contextual covariances here, therefore we only need to project for one q = (q[0], q[1][:1]) # (means, covs[:1]) # This is A from Alg. 2 [Akrour et al., 2019] intermed_weight = intermed_policy.get_last_layer().detach().clone() # This is A @ phi(s) intermed_mean = p_prime[0].detach().clone() entropy = policy.entropy(q) entropy_bound = obs.new_tensor([-np.inf]) if entropy / self.initial_entropy > 0.5 \ else entropy - (self.mean_bound + self.cov_bound) for _ in range(20): eta, proj_chol = self._projection(intermed_policy, (p_prime[0], p_prime[1][:1]), q, self.mean_bound, self.cov_bound, entropy_bound, intermed_mean=intermed_mean) intermed_policy.papi_weight_update(eta, intermed_weight) intermed_policy.set_std(proj_chol[0]) p_prime = intermed_policy(obs) policy.load_state_dict(intermed_policy.state_dict())