113 lines
4.2 KiB
Python
113 lines
4.2 KiB
Python
"""
|
|
Parent PPO fine-tuning agent class.
|
|
|
|
"""
|
|
|
|
from typing import Optional
|
|
import torch
|
|
import logging
|
|
from util.scheduler import CosineAnnealingWarmupRestarts
|
|
|
|
log = logging.getLogger(__name__)
|
|
from agent.finetune.train_agent import TrainAgent
|
|
from util.reward_scaling import RunningRewardScaler
|
|
|
|
|
|
class TrainPPOAgent(TrainAgent):
    """Parent PPO fine-tuning agent.

    Sets up actor/critic optimizers with cosine-annealing LR schedules
    (linear warmup) and the PPO hyperparameters (GAE, KL early-stop,
    entropy/value coefficients, reward scaling) shared by concrete
    PPO trainers.
    """

    def __init__(self, cfg):
        super().__init__(cfg)

        # Batch size for logprob calculations after an iteration ---
        # prevent out of memory if using a single batch
        self.logprob_batch_size = cfg.train.get("logprob_batch_size", 10000)
        assert (
            self.logprob_batch_size % self.n_envs == 0
        ), "logprob_batch_size must be divisible by n_envs"

        # note the discount factor gamma here is applied to reward every
        # act_steps, instead of every env step
        self.gamma = cfg.train.gamma

        # Warm-up period for critic before actor updates
        self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr

        # Optimizers --- cosine scheduler with linear warmup for both,
        # built through the shared helper so construction stays consistent
        self.actor_optimizer = torch.optim.AdamW(
            self.model.actor_ft.parameters(),
            lr=cfg.train.actor_lr,
            weight_decay=cfg.train.actor_weight_decay,
        )
        self.actor_lr_scheduler = self._build_lr_scheduler(
            self.actor_optimizer,
            cfg.train.actor_lr_scheduler,
            cfg.train.actor_lr,
        )
        self.critic_optimizer = torch.optim.AdamW(
            self.model.critic.parameters(),
            lr=cfg.train.critic_lr,
            weight_decay=cfg.train.critic_weight_decay,
        )
        self.critic_lr_scheduler = self._build_lr_scheduler(
            self.critic_optimizer,
            cfg.train.critic_lr_scheduler,
            cfg.train.critic_lr,
        )

        # Generalized advantage estimation
        self.gae_lambda: float = cfg.train.get("gae_lambda", 0.95)

        # If specified, stop gradient update once KL difference reaches it
        self.target_kl: Optional[float] = cfg.train.target_kl

        # Number of times the collected data is used in gradient update
        self.update_epochs: int = cfg.train.update_epochs

        # Entropy loss coefficient
        self.ent_coef: float = cfg.train.get("ent_coef", 0)

        # Value loss coefficient
        self.vf_coef: float = cfg.train.get("vf_coef", 0)

        # Whether to use running reward scaling
        self.reward_scale_running: bool = cfg.train.reward_scale_running
        if self.reward_scale_running:
            self.running_reward_scaler = RunningRewardScaler(self.n_envs)

        # Scaling reward with constant
        self.reward_scale_const: float = cfg.train.get("reward_scale_const", 1)

        # Use base policy (behavior-cloning regularization)
        self.use_bc_loss: bool = cfg.train.get("use_bc_loss", False)
        self.bc_loss_coeff: float = cfg.train.get("bc_loss_coeff", 0)

    @staticmethod
    def _build_lr_scheduler(optimizer, scheduler_cfg, max_lr):
        """Build a cosine-annealing scheduler with linear warmup.

        Shared by the actor/critic schedulers and by optimizer resets so
        the construction arguments are defined in exactly one place.

        Args:
            optimizer: the torch optimizer to schedule.
            scheduler_cfg: config node with `first_cycle_steps`, `min_lr`,
                and `warmup_steps`.
            max_lr: peak learning rate reached after warmup.
        """
        return CosineAnnealingWarmupRestarts(
            optimizer,
            first_cycle_steps=scheduler_cfg.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=max_lr,
            min_lr=scheduler_cfg.min_lr,
            warmup_steps=scheduler_cfg.warmup_steps,
            gamma=1.0,
        )

    def reset_actor_optimizer(self):
        """Re-create the actor optimizer and scheduler, carrying over their
        state dicts. Not used anywhere currently."""
        new_optimizer = torch.optim.AdamW(
            self.model.actor_ft.parameters(),
            lr=self.cfg.train.actor_lr,
            weight_decay=self.cfg.train.actor_weight_decay,
        )
        new_optimizer.load_state_dict(self.actor_optimizer.state_dict())
        self.actor_optimizer = new_optimizer

        new_scheduler = self._build_lr_scheduler(
            self.actor_optimizer,
            self.cfg.train.actor_lr_scheduler,
            self.cfg.train.actor_lr,
        )
        new_scheduler.load_state_dict(self.actor_lr_scheduler.state_dict())
        self.actor_lr_scheduler = new_scheduler
        log.info("Reset actor optimizer")
|