dppo/agent/finetune/train_ppo_agent.py
2024-09-03 21:03:27 -04:00

113 lines
4.2 KiB
Python

"""
Parent PPO fine-tuning agent class.
"""
from typing import Optional
import torch
import logging
from util.scheduler import CosineAnnealingWarmupRestarts
log = logging.getLogger(__name__)
from agent.finetune.train_agent import TrainAgent
from util.reward_scaling import RunningRewardScaler
class TrainPPOAgent(TrainAgent):
    """Parent PPO fine-tuning agent.

    Sets up actor/critic optimizers, cosine LR schedulers with linear
    warmup, and the PPO hyperparameters shared by concrete PPO trainers.
    """

    def __init__(self, cfg):
        """Initialize PPO training state from the config.

        Args:
            cfg: config object (OmegaConf-style) whose ``train`` section
                provides optimizer, scheduler, and PPO hyperparameters.
        """
        super().__init__(cfg)

        # Batch size for logprobs calculations after an iteration ---
        # prevent out of memory if using a single batch
        self.logprob_batch_size = cfg.train.get("logprob_batch_size", 10000)
        assert (
            self.logprob_batch_size % self.n_envs == 0
        ), "logprob_batch_size must be divisible by n_envs"

        # note the discount factor gamma here is applied to reward every
        # act_steps, instead of every env step
        self.gamma = cfg.train.gamma

        # Warm up period for critic before actor updates
        self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr

        # Actor optimizer
        self.actor_optimizer = torch.optim.AdamW(
            self.model.actor_ft.parameters(),
            lr=cfg.train.actor_lr,
            weight_decay=cfg.train.actor_weight_decay,
        )
        # use cosine scheduler with linear warmup
        self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
            self.actor_optimizer,
            first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=cfg.train.actor_lr,
            min_lr=cfg.train.actor_lr_scheduler.min_lr,
            warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
            gamma=1.0,
        )

        # Critic optimizer and scheduler, mirroring the actor setup
        self.critic_optimizer = torch.optim.AdamW(
            self.model.critic.parameters(),
            lr=cfg.train.critic_lr,
            weight_decay=cfg.train.critic_weight_decay,
        )
        self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
            self.critic_optimizer,
            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=cfg.train.critic_lr,
            min_lr=cfg.train.critic_lr_scheduler.min_lr,
            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
            gamma=1.0,
        )

        # Generalized advantage estimation
        self.gae_lambda: float = cfg.train.get("gae_lambda", 0.95)

        # If specified, stop gradient update once KL difference reaches it
        self.target_kl: Optional[float] = cfg.train.target_kl

        # Number of times the collected data is used in gradient update
        self.update_epochs: int = cfg.train.update_epochs

        # Entropy loss coefficient
        self.ent_coef: float = cfg.train.get("ent_coef", 0)

        # Value loss coefficient
        self.vf_coef: float = cfg.train.get("vf_coef", 0)

        # Whether to use running reward scaling
        self.reward_scale_running: bool = cfg.train.reward_scale_running
        if self.reward_scale_running:
            self.running_reward_scaler = RunningRewardScaler(self.n_envs)

        # Scaling reward with constant
        self.reward_scale_const: float = cfg.train.get("reward_scale_const", 1)

        # Use base policy (behavior-cloning regularization toward it)
        self.use_bc_loss: bool = cfg.train.get("use_bc_loss", False)
        self.bc_loss_coeff: float = cfg.train.get("bc_loss_coeff", 0)

    def reset_actor_optimizer(self):
        """Rebuild the actor optimizer and LR scheduler from config.

        The fresh optimizer/scheduler are loaded with the state of the
        current ones, so training statistics (momentum buffers, step
        counts) carry over. Not used anywhere currently.
        """
        new_optimizer = torch.optim.AdamW(
            self.model.actor_ft.parameters(),
            lr=self.cfg.train.actor_lr,
            weight_decay=self.cfg.train.actor_weight_decay,
        )
        # carry over optimizer state (e.g. AdamW moment estimates)
        new_optimizer.load_state_dict(self.actor_optimizer.state_dict())
        self.actor_optimizer = new_optimizer

        new_scheduler = CosineAnnealingWarmupRestarts(
            self.actor_optimizer,
            first_cycle_steps=self.cfg.train.actor_lr_scheduler.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=self.cfg.train.actor_lr,
            min_lr=self.cfg.train.actor_lr_scheduler.min_lr,
            warmup_steps=self.cfg.train.actor_lr_scheduler.warmup_steps,
            gamma=1.0,
        )
        # carry over scheduler progress so the LR curve continues smoothly
        new_scheduler.load_state_dict(self.actor_lr_scheduler.state_dict())
        self.actor_lr_scheduler = new_scheduler
        log.info("Reset actor optimizer")