v0.5 to main (#10)

* v0.5 (#9)

* update idql configs

* update awr configs

* update dipo configs

* update qsm configs

* update dqm configs

* update project version to 0.5.0
This commit is contained in:
Allen Z. Ren 2024-10-07 16:35:13 -04:00 committed by GitHub
parent dd14c5887c
commit e0842e71dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
267 changed files with 6769 additions and 1645 deletions

1
.gitignore vendored
View File

@ -10,6 +10,7 @@ checkpoints/
out/
err/
*.pkl
*.sh
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@ -11,10 +11,15 @@ import torch
import logging
import pickle
import random
from tqdm import tqdm
log = logging.getLogger(__name__)
Batch = namedtuple("Batch", "actions conditions")
Transition = namedtuple("Transition", "actions conditions rewards dones")
TransitionWithReturn = namedtuple(
"Transition", "actions conditions rewards dones reward_to_gos"
)
class StitchedSequenceDataset(torch.utils.data.Dataset):
@ -49,6 +54,8 @@ class StitchedSequenceDataset(torch.utils.data.Dataset):
self.img_cond_steps = img_cond_steps
self.device = device
self.use_img = use_img
self.max_n_episodes = max_n_episodes
self.dataset_path = dataset_path
# Load dataset to device specified
if dataset_path.endswith(".npz"):
@ -87,7 +94,7 @@ class StitchedSequenceDataset(torch.utils.data.Dataset):
"""
start, num_before_start = self.indices[idx]
end = start + self.horizon_steps
states = self.states[(start - num_before_start) : end]
states = self.states[(start - num_before_start) : (start + 1)]
actions = self.actions[start:end]
states = torch.stack(
[
@ -116,9 +123,9 @@ class StitchedSequenceDataset(torch.utils.data.Dataset):
indices = []
cur_traj_index = 0
for traj_length in traj_lengths:
max_start = cur_traj_index + traj_length - horizon_steps + 1
max_start = cur_traj_index + traj_length - horizon_steps
indices += [
(i, i - cur_traj_index) for i in range(cur_traj_index, max_start)
(i, i - cur_traj_index) for i in range(cur_traj_index, max_start + 1)
]
cur_traj_index += traj_length
return indices
@ -135,3 +142,151 @@ class StitchedSequenceDataset(torch.utils.data.Dataset):
def __len__(self):
return len(self.indices)
class StitchedSequenceQLearningDataset(StitchedSequenceDataset):
    """
    Extends StitchedSequenceDataset to include rewards and dones for Q learning

    Do not load the last step of **truncated** episodes since we do not have the correct next state for the final step of each episode. Truncation can be determined by terminal=False but end of episode.
    """

    def __init__(
        self,
        dataset_path,
        max_n_episodes=10000,
        discount_factor=1.0,
        device="cuda:0",
        get_mc_return=False,
        **kwargs,
    ):
        """Load rewards/terminals, then defer the rest to the parent class.

        Args:
            dataset_path: path to a ``.npz`` or ``.pkl`` file; must contain
                "traj_lengths", "rewards", and "terminals" entries.
            max_n_episodes: cap on the number of episodes read from the file.
            discount_factor: per-step discount used for reward-to-go.
            device: torch device the tensors are moved to.
            get_mc_return: if True, precompute discounted Monte-Carlo
                reward-to-go for every step of every trajectory.
            **kwargs: forwarded to StitchedSequenceDataset.__init__.

        Raises:
            ValueError: if the file extension is neither .npz nor .pkl.
        """
        # NOTE: rewards/dones are loaded BEFORE super().__init__ because the
        # parent constructor calls make_indices(), which reads self.dones.
        if dataset_path.endswith(".npz"):
            dataset = np.load(dataset_path, allow_pickle=False)
        elif dataset_path.endswith(".pkl"):
            with open(dataset_path, "rb") as f:
                dataset = pickle.load(f)
        else:
            raise ValueError(f"Unsupported file format: {dataset_path}")
        traj_lengths = dataset["traj_lengths"][:max_n_episodes]
        total_num_steps = np.sum(traj_lengths)

        # discount factor
        self.discount_factor = discount_factor

        # rewards and dones(terminals)
        self.rewards = (
            torch.from_numpy(dataset["rewards"][:total_num_steps]).float().to(device)
        )
        log.info(f"Rewards shape/type: {self.rewards.shape, self.rewards.dtype}")
        self.dones = (
            torch.from_numpy(dataset["terminals"][:total_num_steps]).to(device).float()
        )
        log.info(f"Dones shape/type: {self.dones.shape, self.dones.dtype}")

        super().__init__(
            dataset_path=dataset_path,
            max_n_episodes=max_n_episodes,
            device=device,
            **kwargs,
        )
        log.info(f"Total number of transitions using: {len(self)}")

        # compute discounted reward-to-go for each trajectory
        self.get_mc_return = get_mc_return
        if get_mc_return:
            self.reward_to_go = torch.zeros_like(self.rewards)
            cumulative_traj_length = np.cumsum(traj_lengths)
            prev_traj_length = 0
            for i, traj_length in tqdm(
                enumerate(cumulative_traj_length), desc="Computing reward-to-go"
            ):
                traj_rewards = self.rewards[prev_traj_length:traj_length]
                returns = torch.zeros_like(traj_rewards)
                prev_return = 0
                # accumulate the discounted return backwards over the trajectory
                for t in range(len(traj_rewards)):
                    returns[-t - 1] = (
                        traj_rewards[-t - 1] + self.discount_factor * prev_return
                    )
                    prev_return = returns[-t - 1]
                self.reward_to_go[prev_traj_length:traj_length] = returns
                prev_traj_length = traj_length
            log.info(f"Computed reward-to-go for each trajectory.")

    def make_indices(self, traj_lengths, horizon_steps):
        """
        skip last step of truncated episodes

        Returns a list of (start_index, num_steps_before_start_within_episode)
        pairs; the final step of an episode whose terminal flag is False
        (i.e. a truncated episode) is excluded, since its next state is
        unavailable for bootstrapping.
        """
        num_skip = 0
        indices = []
        cur_traj_index = 0
        for traj_length in traj_lengths:
            # last valid start index so that start + horizon stays in-episode
            max_start = cur_traj_index + traj_length - horizon_steps
            if not self.dones[cur_traj_index + traj_length - 1]:  # truncation
                max_start -= 1
                num_skip += 1
            indices += [
                (i, i - cur_traj_index) for i in range(cur_traj_index, max_start + 1)
            ]
            cur_traj_index += traj_length
        log.info(f"Number of transitions skipped due to truncation: {num_skip}")
        return indices

    def __getitem__(self, idx):
        """Return one transition for Q-learning.

        Builds a Transition (or TransitionWithReturn when get_mc_return is
        set) whose conditions dict holds both the stacked observation history
        ("state") and the observation history advanced by horizon_steps
        ("next_state"). Rewards/dones are single-step slices at ``start``.
        """
        start, num_before_start = self.indices[idx]
        end = start + self.horizon_steps
        states = self.states[(start - num_before_start) : (start + 1)]
        actions = self.actions[start:end]
        rewards = self.rewards[start : (start + 1)]
        dones = self.dones[start : (start + 1)]

        # Account for action horizon
        if idx < len(self.indices) - self.horizon_steps:
            next_states = self.states[
                (start - num_before_start + self.horizon_steps) : start
                + 1
                + self.horizon_steps
            ]  # even if this uses the first state(s) of the next episode, done=True will prevent bootstrapping. We have already filtered out cases where done=False but end of episode (truncation).
        else:
            # prevents indexing error, but ignored since done=True
            next_states = torch.zeros_like(states)

        # stack obs history
        states = torch.stack(
            [
                states[max(num_before_start - t, 0)]
                for t in reversed(range(self.cond_steps))
            ]
        )  # more recent is at the end
        next_states = torch.stack(
            [
                next_states[max(num_before_start - t, 0)]
                for t in reversed(range(self.cond_steps))
            ]
        )  # more recent is at the end
        conditions = {"state": states, "next_state": next_states}
        if self.use_img:
            # NOTE(review): assumes self.images spans the same step indexing
            # as self.states (set by the parent class) — confirm upstream.
            images = self.images[(start - num_before_start) : end]
            images = torch.stack(
                [
                    images[max(num_before_start - t, 0)]
                    for t in reversed(range(self.img_cond_steps))
                ]
            )
            conditions["rgb"] = images
        if self.get_mc_return:
            reward_to_gos = self.reward_to_go[start : (start + 1)]
            batch = TransitionWithReturn(
                actions,
                conditions,
                rewards,
                dones,
                reward_to_gos,
            )
        else:
            batch = Transition(
                actions,
                conditions,
                rewards,
                dones,
            )
        return batch

View File

@ -36,7 +36,7 @@ class EvalDiffusionAgent(EvalAgent):
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
reward_trajs = np.empty((0, self.n_envs))
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -57,9 +57,13 @@ class EvalDiffusionAgent(EvalAgent):
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
firsts_trajs[step + 1] = done_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = terminated_venv | truncated_venv
# update for next step
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.

View File

@ -40,7 +40,7 @@ class EvalImgDiffusionAgent(EvalAgent):
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
reward_trajs = np.empty((0, self.n_envs))
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -60,9 +60,13 @@ class EvalImgDiffusionAgent(EvalAgent):
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
firsts_trajs[step + 1] = done_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = terminated_venv | truncated_venv
# update for next step
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.

View File

@ -36,7 +36,7 @@ class EvalGaussianAgent(EvalAgent):
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
reward_trajs = np.empty((0, self.n_envs))
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -55,9 +55,13 @@ class EvalGaussianAgent(EvalAgent):
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
firsts_trajs[step + 1] = done_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = terminated_venv | truncated_venv
# update for next step
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.

View File

@ -40,7 +40,7 @@ class EvalImgGaussianAgent(EvalAgent):
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
reward_trajs = np.empty((0, self.n_envs))
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -58,9 +58,13 @@ class EvalImgGaussianAgent(EvalAgent):
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
firsts_trajs[step + 1] = done_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = terminated_venv | truncated_venv
# update for next step
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.

View File

@ -26,7 +26,7 @@ from util.scheduler import CosineAnnealingWarmupRestarts
def td_values(
states,
rewards,
dones,
terminateds,
state_values,
gamma=0.99,
alpha=0.95,
@ -43,21 +43,20 @@ def td_values(
"""
sample_count = len(states)
tds = np.zeros_like(state_values, dtype=np.float32)
dones[-1] = 1
next_value = 1 - dones[-1]
next_value = state_values[-1].copy()
next_value[terminateds[-1]] = 0.0
val = 0.0
for i in range(sample_count - 1, -1, -1):
# next_value = 0.0 if dones[i] else state_values[i + 1]
# get next_value for vectorized
if i < sample_count - 1:
next_value = state_values[i + 1]
next_value = next_value * (1 - dones[i])
next_value = next_value * (1 - terminateds[i])
state_value = state_values[i]
error = rewards[i] + gamma * next_value - state_value
val = alpha * error + gamma * lam * (1 - dones[i]) * val
val = alpha * error + gamma * lam * (1 - terminateds[i]) * val
tds[i] = val + state_value
return tds
@ -127,12 +126,12 @@ class TrainAWRDiffusionAgent(TrainAgent):
obs_buffer = deque(maxlen=self.buffer_size)
action_buffer = deque(maxlen=self.buffer_size)
reward_buffer = deque(maxlen=self.buffer_size)
done_buffer = deque(maxlen=self.buffer_size)
first_buffer = deque(maxlen=self.buffer_size)
terminated_buffer = deque(maxlen=self.buffer_size)
# Start training loop
timer = Timer()
run_results = []
cnt_train_step = 0
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
@ -156,10 +155,9 @@ class TrainAWRDiffusionAgent(TrainAgent):
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
reward_trajs = np.empty((0, self.n_envs))
# if done at the end of last iteration, the envs are just reset
firsts_trajs[0] = done_venv
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -184,21 +182,26 @@ class TrainAWRDiffusionAgent(TrainAgent):
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
done_venv = terminated_venv | truncated_venv
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = done_venv
# add to buffer
obs_buffer.append(prev_obs_venv["state"])
action_buffer.append(action_venv)
reward_buffer.append(reward_venv * self.scale_reward_factor)
done_buffer.append(done_venv)
first_buffer.append(firsts_trajs[step])
if not eval_mode:
obs_buffer.append(prev_obs_venv["state"])
action_buffer.append(action_venv)
reward_buffer.append(reward_venv * self.scale_reward_factor)
terminated_buffer.append(terminated_venv)
firsts_trajs[step + 1] = done_venv
# update for next step
prev_obs_venv = obs_venv
# count steps --- not accounting for done within action chunk
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
@ -240,7 +243,7 @@ class TrainAWRDiffusionAgent(TrainAgent):
if not eval_mode:
obs_trajs = np.array(deepcopy(obs_buffer)) # assume only state
reward_trajs = np.array(deepcopy(reward_buffer))
dones_trajs = np.array(deepcopy(done_buffer))
terminated_trajs = np.array(deepcopy(terminated_buffer))
obs_t = einops.rearrange(
torch.from_numpy(obs_trajs).float().to(self.device),
"s e h d -> (s e) h d",
@ -248,7 +251,9 @@ class TrainAWRDiffusionAgent(TrainAgent):
values_trajs = np.array(
self.model.critic({"state": obs_t}).detach().cpu().numpy()
).reshape(-1, self.n_envs)
td_trajs = td_values(obs_trajs, reward_trajs, dones_trajs, values_trajs)
td_trajs = td_values(
obs_trajs, reward_trajs, terminated_trajs, values_trajs
)
td_t = torch.from_numpy(td_trajs.flatten()).float().to(self.device)
# Update critic
@ -268,7 +273,7 @@ class TrainAWRDiffusionAgent(TrainAgent):
obs_trajs = np.array(deepcopy(obs_buffer))
samples_trajs = np.array(deepcopy(action_buffer))
reward_trajs = np.array(deepcopy(reward_buffer))
dones_trajs = np.array(deepcopy(done_buffer))
terminated_trajs = np.array(deepcopy(terminated_buffer))
obs_t = einops.rearrange(
torch.from_numpy(obs_trajs).float().to(self.device),
"s e h d -> (s e) h d",
@ -276,7 +281,9 @@ class TrainAWRDiffusionAgent(TrainAgent):
values_trajs = np.array(
self.model.critic({"state": obs_t}).detach().cpu().numpy()
).reshape(-1, self.n_envs)
td_trajs = td_values(obs_trajs, reward_trajs, dones_trajs, values_trajs)
td_trajs = td_values(
obs_trajs, reward_trajs, terminated_trajs, values_trajs
)
advantages_trajs = td_trajs - values_trajs
# flatten
@ -315,13 +322,13 @@ class TrainAWRDiffusionAgent(TrainAgent):
advantages_b_scaled.clamp_(max=self.max_adv_weight)
# Update policy with collected trajectories
loss = self.model.loss(
loss_actor = self.model.loss(
actions_b,
obs_b,
advantages_b_scaled.detach(),
)
self.actor_optimizer.zero_grad()
loss.backward()
loss_actor.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
@ -341,10 +348,12 @@ class TrainAWRDiffusionAgent(TrainAgent):
run_results.append(
{
"itr": self.itr,
"step": cnt_train_step,
}
)
if self.itr % self.log_freq == 0:
time = timer()
run_results[-1]["time"] = time
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
@ -365,12 +374,13 @@ class TrainAWRDiffusionAgent(TrainAgent):
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"total env step": cnt_train_step,
"loss - actor": loss_actor,
"loss - critic": loss_critic,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
@ -378,10 +388,7 @@ class TrainAWRDiffusionAgent(TrainAgent):
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["loss_critic"] = loss_critic
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,501 @@
"""
Reinforcement Learning with Prior Data (RLPD) agent training script.
Does not support image observations right now.
"""
import os
import pickle
import numpy as np
import torch
import logging
import wandb
import hydra
from collections import deque
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.finetune.train_agent import TrainAgent
from util.scheduler import CosineAnnealingWarmupRestarts
class TrainCalQLAgent(TrainAgent):
    """Calibrated Q-Learning (Cal-QL) training agent.

    Supports offline-only training and offline-to-online fine-tuning
    (selected by ``cfg.train.train_online``). Uses an entropy-regularized
    actor with an (optionally) learned temperature, a target critic updated
    by EMA, and a replay scheme that samples 50/50 from the offline dataset
    and the online buffer when training online.
    """

    def __init__(self, cfg):
        super().__init__(cfg)
        assert self.n_envs == 1, "Cal-QL only supports single env for now"

        # Train mode (offline or online)
        self.train_online = cfg.train.train_online

        # Build dataset
        self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset)

        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
        self.gamma = cfg.train.gamma

        # Optimizer
        self.actor_optimizer = torch.optim.AdamW(
            self.model.network.parameters(),
            lr=cfg.train.actor_lr,
            weight_decay=cfg.train.actor_weight_decay,
        )
        self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
            self.actor_optimizer,
            first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=cfg.train.actor_lr,
            min_lr=cfg.train.actor_lr_scheduler.min_lr,
            warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
            gamma=1.0,
        )
        self.critic_optimizer = torch.optim.AdamW(
            self.model.critic.parameters(),
            lr=cfg.train.critic_lr,
            weight_decay=cfg.train.critic_weight_decay,
        )
        self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
            self.critic_optimizer,
            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=cfg.train.critic_lr,
            min_lr=cfg.train.critic_lr_scheduler.min_lr,
            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
            gamma=1.0,
        )

        # Target critic EMA update rate
        self.target_ema_rate = cfg.train.target_ema_rate

        # Number of random actions to sample for Cal-QL
        self.n_random_actions = cfg.train.n_random_actions

        # Reward scale
        self.scale_reward_factor = cfg.train.scale_reward_factor

        # Number of critic updates
        self.num_update = cfg.train.num_update

        # Buffer size
        self.buffer_size = cfg.train.buffer_size

        # Online only configs
        if self.train_online:
            # number of episodes to collect per epoch for training
            self.n_episode_per_epoch = cfg.train.n_episode_per_epoch
            # UTD ratio
            self.online_utd_ratio = cfg.train.online_utd_ratio

        # Eval episodes
        self.n_eval_episode = cfg.train.n_eval_episode

        # Exploration steps at the beginning - using randomly sampled action
        self.n_explore_steps = cfg.train.n_explore_steps

        # Initialize temperature parameter for entropy
        init_temperature = cfg.train.init_temperature
        self.log_alpha = torch.tensor(np.log(init_temperature)).to(self.device)
        self.log_alpha.requires_grad = True
        self.automatic_entropy_tuning = cfg.train.automatic_entropy_tuning
        self.target_entropy = cfg.train.target_entropy
        self.log_alpha_optimizer = torch.optim.Adam(
            [self.log_alpha],
            lr=cfg.train.critic_lr,
        )

    def run(self):
        """Run the Cal-QL training loop.

        Alternates environment rollout (online mode only), critic/actor/
        temperature updates, periodic evaluation, model checkpointing, and
        metric logging until ``self.n_train_itr`` iterations complete.
        """
        # make a FIFO replay buffer for obs, action, and reward
        obs_buffer = deque(maxlen=self.buffer_size)
        next_obs_buffer = deque(maxlen=self.buffer_size)
        action_buffer = deque(maxlen=self.buffer_size)
        reward_buffer = deque(maxlen=self.buffer_size)
        reward_to_go_buffer = deque(maxlen=self.buffer_size)
        terminated_buffer = deque(maxlen=self.buffer_size)
        if not self.train_online:
            # offline-only: snapshot the (empty) online buffers once so the
            # names exist; they are never sampled in this mode
            obs_array = np.array(obs_buffer)
            next_obs_array = np.array(next_obs_buffer)
            actions_array = np.array(action_buffer)
            rewards_array = np.array(reward_buffer)
            reward_to_go_array = np.array(reward_to_go_buffer)
            terminated_array = np.array(terminated_buffer)

        # load offline dataset into replay buffer
        dataloader_offline = torch.utils.data.DataLoader(
            self.dataset_offline,
            batch_size=len(self.dataset_offline),
            drop_last=False,
        )
        # batch_size == len(dataset), so this loop runs exactly once
        for batch in dataloader_offline:
            actions, states_and_next, rewards, terminated, reward_to_go = batch
            states = states_and_next["state"]
            next_states = states_and_next["next_state"]
            obs_buffer_off = states.cpu().numpy()
            next_obs_buffer_off = next_states.cpu().numpy()
            action_buffer_off = actions.cpu().numpy()
            reward_buffer_off = rewards.cpu().numpy().flatten()
            reward_to_go_buffer_off = reward_to_go.cpu().numpy().flatten()
            terminated_buffer_off = terminated.cpu().numpy().flatten()

        # Start training loop
        timer = Timer()
        run_results = []
        cnt_train_step = 0
        done_venv = np.zeros((1, self.n_envs))
        while self.itr < self.n_train_itr:
            if self.itr % 1000 == 0:
                print(f"Finished training iteration {self.itr} of {self.n_train_itr}")

            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
            options_venv = [{} for _ in range(self.n_envs)]
            if self.itr % self.render_freq == 0 and self.render_video:
                for env_ind in range(self.n_render):
                    options_venv[env_ind]["video_path"] = os.path.join(
                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
                    )

            # Define train or eval - all envs restart
            eval_mode = (
                self.itr % self.val_freq == 0
                and self.itr >= self.n_explore_steps
                and not self.force_train
            )
            # during eval, we collect a fixed number of episodes, so we set n_steps to a large value
            if eval_mode:
                n_steps = int(1e5)
            elif not self.train_online:
                n_steps = 0
            else:
                n_steps = int(1e5)  # use episodes
            self.model.eval() if eval_mode else self.model.train()

            # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning
            firsts_trajs = np.zeros((n_steps + 1, self.n_envs))
            if self.reset_at_iteration or eval_mode or self.itr == 0:
                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
                firsts_trajs[0] = 1
            else:
                # if done at the end of last iteration, the envs are just reset
                firsts_trajs[0] = done_venv
            reward_trajs = np.zeros((n_steps, self.n_envs))

            # Collect a set of trajectories from env
            cnt_episode = 0
            for step in range(n_steps):
                if step % 100 == 0:
                    print(f"Completed environment step {step}")

                # Select action
                if self.itr < self.n_explore_steps:
                    # pure exploration phase: act uniformly at random
                    action_venv = self.venv.action_space.sample()
                else:
                    with torch.no_grad():
                        cond = {
                            "state": torch.from_numpy(prev_obs_venv["state"])
                            .float()
                            .to(self.device)
                        }
                        samples = (
                            self.model(
                                cond=cond,
                                deterministic=eval_mode,
                            )
                            .cpu()
                            .numpy()
                        )  # n_env x horizon x act
                    action_venv = samples[:, : self.act_steps]

                # Apply multi-step action
                obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
                    self.venv.step(action_venv)
                )
                done_venv = terminated_venv | truncated_venv
                reward_trajs[step] = reward_venv
                firsts_trajs[step + 1] = done_venv

                # add to buffer in train mode
                if not eval_mode:
                    for i in range(self.n_envs):
                        obs_buffer.append(prev_obs_venv["state"][i])
                        if truncated_venv[i]:
                            # truncated: obs_venv is already the reset obs, so
                            # take the true final obs from the step info
                            next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
                        else:  # first obs in new episode
                            next_obs_buffer.append(obs_venv["state"][i])
                        action_buffer.append(action_venv[i])
                    reward_buffer.extend(
                        (reward_venv * self.scale_reward_factor).tolist()
                    )
                    terminated_buffer.extend(terminated_venv.tolist())

                # update for next step
                prev_obs_venv = obs_venv

                # count steps --- not accounting for done within action chunk
                cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0

                # check if enough eval episodes are done
                cnt_episode += np.sum(done_venv)
                if eval_mode and cnt_episode >= self.n_eval_episode:
                    break
                if not eval_mode and cnt_episode >= self.n_episode_per_epoch:
                    break

            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
            episodes_start_end = []
            for env_ind in range(self.n_envs):
                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
                for i in range(len(env_steps) - 1):
                    start = env_steps[i]
                    end = env_steps[i + 1]
                    if end - start > 1:
                        episodes_start_end.append((env_ind, start, end - 1))
            if len(episodes_start_end) > 0:
                reward_trajs_split = [
                    reward_trajs[start : end + 1, env_ind]
                    for env_ind, start, end in episodes_start_end
                ]

                # compute episode returns
                returns_trajs_split = [
                    np.zeros_like(reward_trajs) for reward_trajs in reward_trajs_split
                ]
                for traj_rewards, traj_returns in zip(
                    reward_trajs_split, returns_trajs_split
                ):
                    prev_return = 0
                    for t in range(len(traj_rewards)):
                        traj_returns[-t - 1] = (
                            traj_rewards[-t - 1] + self.gamma * prev_return
                        )
                        prev_return = traj_returns[-t - 1]

                # flatten (note: only works for single env!)
                returns_trajs_split = np.concatenate(returns_trajs_split)

                # extend buffer
                reward_to_go_buffer.extend(returns_trajs_split)

                num_episode_finished = len(reward_trajs_split)
                episode_reward = np.array(
                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
                )
                episode_best_reward = np.array(
                    [
                        np.max(reward_traj) / self.act_steps
                        for reward_traj in reward_trajs_split
                    ]
                )
                avg_episode_reward = np.mean(episode_reward)
                avg_best_reward = np.mean(episode_best_reward)
                success_rate = np.mean(
                    episode_best_reward >= self.best_reward_threshold_for_success
                )
            else:
                episode_reward = np.array([])
                num_episode_finished = 0
                avg_episode_reward = 0
                avg_best_reward = 0
                success_rate = 0

            # Update models
            if not eval_mode and self.itr >= self.n_explore_steps:
                # TODO: is this slow in online?
                if self.train_online:
                    obs_array = np.array(obs_buffer)
                    next_obs_array = np.array(next_obs_buffer)
                    actions_array = np.array(action_buffer)
                    rewards_array = np.array(reward_buffer)
                    reward_to_go_array = np.array(reward_to_go_buffer)
                    terminated_array = np.array(terminated_buffer)

                # override num_update
                if self.train_online:
                    # NOTE(review): reward_trajs is pre-allocated with n_steps
                    # rows regardless of the early break above, so len() is
                    # n_steps, not steps collected — confirm this is intended
                    # (e.g. vs. using online_utd_ratio, which is unused here).
                    num_update = len(reward_trajs)  # assume one env!
                else:
                    num_update = self.num_update
                for _ in range(num_update):
                    # Sample from OFFLINE buffer
                    inds = np.random.choice(
                        len(obs_buffer_off),
                        self.batch_size // 2 if self.train_online else self.batch_size,
                    )
                    obs_b = (
                        torch.from_numpy(obs_buffer_off[inds]).float().to(self.device)
                    )
                    next_obs_b = (
                        torch.from_numpy(next_obs_buffer_off[inds])
                        .float()
                        .to(self.device)
                    )
                    actions_b = (
                        torch.from_numpy(action_buffer_off[inds])
                        .float()
                        .to(self.device)
                    )
                    rewards_b = (
                        torch.from_numpy(reward_buffer_off[inds])
                        .float()
                        .to(self.device)
                    )
                    terminated_b = (
                        torch.from_numpy(terminated_buffer_off[inds])
                        .float()
                        .to(self.device)
                    )
                    reward_to_go_b = (
                        torch.from_numpy(reward_to_go_buffer_off[inds])
                        .float()
                        .to(self.device)
                    )

                    # Sample from ONLINE buffer
                    if self.train_online:
                        inds = np.random.choice(len(obs_buffer), self.batch_size // 2)
                        obs_b_on = (
                            torch.from_numpy(obs_array[inds]).float().to(self.device)
                        )
                        next_obs_b_on = (
                            torch.from_numpy(next_obs_array[inds])
                            .float()
                            .to(self.device)
                        )
                        actions_b_on = (
                            torch.from_numpy(actions_array[inds])
                            .float()
                            .to(self.device)
                        )
                        rewards_b_on = (
                            torch.from_numpy(rewards_array[inds])
                            .float()
                            .to(self.device)
                        )
                        terminated_b_on = (
                            torch.from_numpy(terminated_array[inds])
                            .float()
                            .to(self.device)
                        )
                        reward_to_go_b_on = (
                            torch.from_numpy(reward_to_go_array[inds])
                            .float()
                            .to(self.device)
                        )

                        # merge offline and online data
                        obs_b = torch.cat([obs_b, obs_b_on], dim=0)
                        next_obs_b = torch.cat([next_obs_b, next_obs_b_on], dim=0)
                        actions_b = torch.cat([actions_b, actions_b_on], dim=0)
                        rewards_b = torch.cat([rewards_b, rewards_b_on], dim=0)
                        terminated_b = torch.cat([terminated_b, terminated_b_on], dim=0)
                        reward_to_go_b = torch.cat(
                            [reward_to_go_b, reward_to_go_b_on], dim=0
                        )

                    # Get a random action for Cal-QL
                    random_actions = (
                        torch.rand(
                            (
                                self.batch_size,
                                self.n_random_actions,
                                self.horizon_steps,
                                self.action_dim,
                            )
                        ).to(self.device)
                        * 2
                        - 1
                    )  # scale to [-1, 1]

                    # Update critic
                    alpha = self.log_alpha.exp().item()
                    loss_critic = self.model.loss_critic(
                        {"state": obs_b},
                        {"state": next_obs_b},
                        actions_b,
                        random_actions,
                        rewards_b,
                        reward_to_go_b,
                        terminated_b,
                        self.gamma,
                        alpha,
                    )
                    self.critic_optimizer.zero_grad()
                    loss_critic.backward()
                    self.critic_optimizer.step()

                    # Update target critic
                    self.model.update_target_critic(self.target_ema_rate)

                    # Update actor
                    loss_actor = self.model.loss_actor(
                        {"state": obs_b},
                        alpha,
                    )
                    self.actor_optimizer.zero_grad()
                    loss_actor.backward()
                    self.actor_optimizer.step()

                    # Update temperature parameter
                    if self.automatic_entropy_tuning:
                        self.log_alpha_optimizer.zero_grad()
                        loss_alpha = self.model.loss_temperature(
                            {"state": obs_b},
                            self.log_alpha.exp(),  # with grad
                            self.target_entropy,
                        )
                        loss_alpha.backward()
                        self.log_alpha_optimizer.step()

                # Update lr
                self.actor_lr_scheduler.step()
                self.critic_lr_scheduler.step()

            # Save model
            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
                self.save_model()

            # Log loss and save metrics
            run_results.append(
                {
                    "itr": self.itr,
                    "step": cnt_train_step,
                }
            )
            if self.itr % self.log_freq == 0 and self.itr >= self.n_explore_steps:
                time = timer()
                run_results[-1]["time"] = time
                if eval_mode:
                    log.info(
                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
                    )
                    if self.use_wandb:
                        wandb.log(
                            {
                                "success rate - eval": success_rate,
                                "avg episode reward - eval": avg_episode_reward,
                                "avg best reward - eval": avg_best_reward,
                                "num episode - eval": num_episode_finished,
                            },
                            step=self.itr,
                            commit=False,
                        )
                    run_results[-1]["eval_success_rate"] = success_rate
                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
                    run_results[-1]["eval_best_reward"] = avg_best_reward
                else:
                    log.info(
                        f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | alpha {alpha:8.4f} | t:{time:8.4f}"
                    )
                    if self.use_wandb:
                        wandb.log(
                            {
                                "total env step": cnt_train_step,
                                "loss - actor": loss_actor,
                                "loss - critic": loss_critic,
                                "entropy coeff": alpha,
                                "avg episode reward - train": avg_episode_reward,
                                "num episode - train": num_episode_finished,
                            },
                            step=self.itr,
                            commit=True,
                        )
                    run_results[-1]["train_episode_reward"] = avg_episode_reward
                with open(self.result_path, "wb") as f:
                    pickle.dump(run_results, f)
            self.itr += 1

View File

@ -65,11 +65,14 @@ class TrainDIPODiffusionAgent(TrainAgent):
gamma=1.0,
)
# target update rate
self.target_ema_rate = cfg.train.target_ema_rate
# Buffer size
self.buffer_size = cfg.train.buffer_size
# Perturbation scale
self.eta = cfg.train.eta
# Action gradient scaling
self.action_lr = cfg.train.action_lr
# Updates
self.replay_ratio = cfg.train.replay_ratio
@ -80,6 +83,9 @@ class TrainDIPODiffusionAgent(TrainAgent):
# Apply action gradient many steps
self.action_gradient_steps = cfg.train.action_gradient_steps
# Max grad norm for action
self.action_grad_norm = self.action_dim * self.act_steps * 0.1
def run(self):
# make a FIFO replay buffer for obs, action, and reward
@ -87,12 +93,12 @@ class TrainDIPODiffusionAgent(TrainAgent):
next_obs_buffer = deque(maxlen=self.buffer_size)
action_buffer = deque(maxlen=self.buffer_size)
reward_buffer = deque(maxlen=self.buffer_size)
done_buffer = deque(maxlen=self.buffer_size)
first_buffer = deque(maxlen=self.buffer_size)
terminated_buffer = deque(maxlen=self.buffer_size)
# Start training loop
timer = Timer()
run_results = []
cnt_train_step = 0
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
@ -116,10 +122,9 @@ class TrainDIPODiffusionAgent(TrainAgent):
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
reward_trajs = np.empty((0, self.n_envs))
# if done at the end of last iteration, the envs are just reset
firsts_trajs[0] = done_venv
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -144,23 +149,33 @@ class TrainDIPODiffusionAgent(TrainAgent):
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
done_venv = terminated_venv | truncated_venv
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = done_venv
# add to buffer
for i in range(self.n_envs):
obs_buffer.append(prev_obs_venv["state"][i])
next_obs_buffer.append(obs_venv["state"][i])
action_buffer.append(action_venv[i])
reward_buffer.append(reward_venv[i] * self.scale_reward_factor)
done_buffer.append(done_venv[i])
first_buffer.append(firsts_trajs[step])
if not eval_mode:
for i in range(self.n_envs):
obs_buffer.append(prev_obs_venv["state"][i])
if truncated_venv[i]: # truncated
next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
else:
next_obs_buffer.append(obs_venv["state"][i])
action_buffer.append(action_venv[i])
reward_buffer.extend(
(reward_venv * self.scale_reward_factor).tolist()
)
terminated_buffer.extend(terminated_venv.tolist())
firsts_trajs[step + 1] = done_venv
# update for next step
prev_obs_venv = obs_venv
# count steps --- not acounting for done within action chunk
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
@ -200,40 +215,31 @@ class TrainDIPODiffusionAgent(TrainAgent):
# Update models
if not eval_mode:
num_batch = self.replay_ratio
num_batch = int(
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
)
# only worth converting first with parallel envs - large number of updates below
obs_array = np.array(obs_buffer)
next_obs_array = np.array(next_obs_buffer)
action_array = np.array(action_buffer)
reward_array = np.array(reward_buffer)
terminated_array = np.array(terminated_buffer)
# Critic learning
for _ in range(num_batch):
# Sample batch
inds = np.random.choice(len(obs_buffer), self.batch_size)
obs_b = (
torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
.float()
.to(self.device)
)
obs_b = torch.from_numpy(obs_array[inds]).float().to(self.device)
next_obs_b = (
torch.from_numpy(
np.vstack([next_obs_buffer[i][None] for i in inds])
)
.float()
.to(self.device)
torch.from_numpy(next_obs_array[inds]).float().to(self.device)
)
actions_b = (
torch.from_numpy(
np.vstack([action_buffer[i][None] for i in inds])
)
.float()
.to(self.device)
torch.from_numpy(action_array[inds]).float().to(self.device)
)
rewards_b = (
torch.from_numpy(np.vstack([reward_buffer[i] for i in inds]))
.float()
.to(self.device)
torch.from_numpy(reward_array[inds]).float().to(self.device)
)
dones_b = (
torch.from_numpy(np.vstack([done_buffer[i] for i in inds]))
.float()
.to(self.device)
terminated_b = (
torch.from_numpy(terminated_array[inds]).float().to(self.device)
)
# Update critic
@ -242,78 +248,77 @@ class TrainDIPODiffusionAgent(TrainAgent):
{"state": next_obs_b},
actions_b,
rewards_b,
dones_b,
terminated_b,
self.gamma,
)
self.critic_optimizer.zero_grad()
loss_critic.backward()
self.critic_optimizer.step()
# Actor learning
for _ in range(num_batch):
# Sample batch
inds = np.random.choice(len(obs_buffer), self.batch_size)
obs_b = (
torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
.float()
.to(self.device)
)
actions_b = (
torch.from_numpy(
np.vstack([action_buffer[i][None] for i in inds])
)
.float()
.to(self.device)
)
# Replace actions in buffer with guided actions
guided_action_list = []
# get Q-perturbed actions by optimizing
actions_flat = actions_b.reshape(actions_b.shape[0], -1)
actions_optim = torch.optim.Adam(
[actions_flat], lr=self.eta, eps=1e-5
)
for _ in range(self.action_gradient_steps):
actions_flat.requires_grad_(True)
q_values_1, q_values_2 = self.model.critic(
{"state": obs_b}, actions_flat
)
q_values = torch.min(q_values_1, q_values_2)
action_opt_loss = -q_values.sum()
actions_optim.zero_grad()
action_opt_loss.backward(torch.ones_like(action_opt_loss))
# get the perturbed action
actions_optim.step()
actions_flat.requires_grad_(False)
actions_flat.clamp_(-1.0, 1.0)
guided_action = actions_flat.detach()
guided_action = guided_action.reshape(
guided_action.shape[0], -1, self.action_dim
)
guided_action_list.append(guided_action)
guided_action_stacked = torch.cat(guided_action_list, 0)
# Add to buffer (need separate indices since we're working with a limited subset)
for i, i_buf in enumerate(inds):
action_buffer[i_buf] = (
guided_action_stacked[i].detach().cpu().numpy()
)
# Update policy with collected trajectories
loss = self.model.loss(guided_action.detach(), {"state": obs_b})
self.actor_optimizer.zero_grad()
loss.backward()
# Actor learning
loss_actor = 0.0
if self.itr >= self.n_critic_warmup_itr:
inds = np.random.choice(len(obs_buffer), self.batch_size)
obs_b = (
torch.from_numpy(obs_array[inds]).float().to(self.device)
)
actions_b = (
torch.from_numpy(action_array[inds]).float().to(self.device)
)
# get Q-perturbed actions by optimizing
actions_flat = actions_b.reshape(len(actions_b), -1)
actions_optim = torch.optim.Adam(
[actions_flat], lr=self.action_lr, eps=1e-5
)
for _ in range(self.action_gradient_steps):
actions_flat.requires_grad_(True)
q_values_1, q_values_2 = self.model.critic(
{"state": obs_b}, actions_flat
)
q_values = torch.min(q_values_1, q_values_2)
action_opt_loss = -q_values.sum()
actions_optim.zero_grad()
action_opt_loss.backward(torch.ones_like(action_opt_loss))
torch.nn.utils.clip_grad_norm_(
[actions_flat],
max_norm=self.action_grad_norm,
norm_type=2,
)
actions_optim.step()
actions_flat.requires_grad_(False)
actions_flat.clamp_(-1.0, 1.0)
guided_action = actions_flat.reshape(
len(actions_flat), self.horizon_steps, self.action_dim
)
guided_action_np = guided_action.detach().cpu().numpy()
# Add back to buffer
action_array[inds] = guided_action_np
# Update policy with collected trajectories
loss_actor = self.model.loss(
guided_action.detach(), {"state": obs_b}
)
self.actor_optimizer.zero_grad()
loss_actor.backward()
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor.parameters(), self.max_grad_norm
)
self.actor_optimizer.step()
# Update target critic and actor
self.model.update_target_critic(self.target_ema_rate)
self.model.update_target_actor(self.target_ema_rate)
# convert back to buffer
action_buffer = deque(
[action for action in action_array], maxlen=self.buffer_size
)
# Update lr
self.actor_lr_scheduler.step()
self.critic_lr_scheduler.step()
@ -326,10 +331,12 @@ class TrainDIPODiffusionAgent(TrainAgent):
run_results.append(
{
"itr": self.itr,
"step": cnt_train_step,
}
)
if self.itr % self.log_freq == 0:
time = timer()
run_results[-1]["time"] = time
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
@ -350,23 +357,19 @@ class TrainDIPODiffusionAgent(TrainAgent):
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss - critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"loss - critic": loss_critic,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["loss_critic"] = loss_critic
wandb_log = {
"total env step": cnt_train_step,
"loss - critic": loss_critic,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
}
if type(loss_actor) == torch.Tensor:
wandb_log["loss - actor"] = loss_actor
wandb.log(wandb_log, step=self.itr, commit=True)
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -77,6 +77,9 @@ class TrainDQLDiffusionAgent(TrainAgent):
# Updates
self.replay_ratio = cfg.train.replay_ratio
# critic target update rate
self.target_ema_rate = cfg.train.target_ema_rate
def run(self):
# make a FIFO replay buffer for obs, action, and reward
@ -84,12 +87,12 @@ class TrainDQLDiffusionAgent(TrainAgent):
next_obs_buffer = deque(maxlen=self.buffer_size)
action_buffer = deque(maxlen=self.buffer_size)
reward_buffer = deque(maxlen=self.buffer_size)
done_buffer = deque(maxlen=self.buffer_size)
first_buffer = deque(maxlen=self.buffer_size)
terminated_buffer = deque(maxlen=self.buffer_size)
# Start training loop
timer = Timer()
run_results = []
cnt_train_step = 0
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
@ -113,10 +116,9 @@ class TrainDQLDiffusionAgent(TrainAgent):
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
reward_trajs = np.empty((0, self.n_envs))
# if done at the end of last iteration, the envs are just reset
firsts_trajs[0] = done_venv
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -141,23 +143,33 @@ class TrainDQLDiffusionAgent(TrainAgent):
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
done_venv = terminated_venv | truncated_venv
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = done_venv
# add to buffer
for i in range(self.n_envs):
obs_buffer.append(prev_obs_venv["state"][i])
next_obs_buffer.append(obs_venv["state"][i])
action_buffer.append(action_venv[i])
reward_buffer.append(reward_venv[i] * self.scale_reward_factor)
done_buffer.append(done_venv[i])
first_buffer.append(firsts_trajs[step])
if not eval_mode:
for i in range(self.n_envs):
obs_buffer.append(prev_obs_venv["state"][i])
if truncated_venv[i]: # truncated
next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
else:
next_obs_buffer.append(obs_venv["state"][i])
action_buffer.append(action_venv[i])
reward_buffer.extend(
(reward_venv * self.scale_reward_factor).tolist()
)
terminated_buffer.extend(terminated_venv.tolist())
firsts_trajs[step + 1] = done_venv
# update for next step
prev_obs_venv = obs_venv
# count steps --- not acounting for done within action chunk
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
@ -197,41 +209,24 @@ class TrainDQLDiffusionAgent(TrainAgent):
# Update models
if not eval_mode:
num_batch = self.replay_ratio
num_batch = int(
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
)
# only worth converting first with parallel envs - large number of updates below
obs_array = np.array(obs_buffer)
next_obs_array = np.array(next_obs_buffer)
action_array = np.array(action_buffer)
reward_array = np.array(reward_buffer)
terminated_array = np.array(terminated_buffer)
# Critic learning
for _ in range(num_batch):
# Sample batch
inds = np.random.choice(len(obs_buffer), self.batch_size)
obs_b = (
torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
.float()
.to(self.device)
)
next_obs_b = (
torch.from_numpy(
np.vstack([next_obs_buffer[i][None] for i in inds])
)
.float()
.to(self.device)
)
actions_b = (
torch.from_numpy(
np.vstack([action_buffer[i][None] for i in inds])
)
.float()
.to(self.device)
)
rewards_b = (
torch.from_numpy(np.vstack([reward_buffer[i] for i in inds]))
.float()
.to(self.device)
)
dones_b = (
torch.from_numpy(np.vstack([done_buffer[i] for i in inds]))
.float()
.to(self.device)
)
obs_b = torch.from_numpy(obs_array[inds]).float().to(self.device)
next_obs_b = torch.from_numpy(next_obs_array[inds]).float().to(self.device)
actions_b = torch.from_numpy(action_array[inds]).float().to(self.device)
rewards_b = torch.from_numpy(reward_array[inds]).float().to(self.device)
terminated_b = torch.from_numpy(terminated_array[inds]).float().to(self.device)
# Update critic
loss_critic = self.model.loss_critic(
@ -239,39 +234,30 @@ class TrainDQLDiffusionAgent(TrainAgent):
{"state": next_obs_b},
actions_b,
rewards_b,
dones_b,
terminated_b,
self.gamma,
)
self.critic_optimizer.zero_grad()
loss_critic.backward()
self.critic_optimizer.step()
# get the new action and q values
samples = self.model.forward_train(
cond={"state": obs_b},
deterministic=eval_mode,
)
action_venv = samples[:, : self.act_steps] # n_env x horizon x act
q_values_b = self.model.critic({"state": obs_b}, action_venv)
q1_new_action, q2_new_action = q_values_b
# Update policy with collected trajectories
self.actor_optimizer.zero_grad()
actor_loss = self.model.loss_actor(
loss_actor = self.model.loss_actor(
{"state": obs_b},
actions_b,
q1_new_action,
q2_new_action,
self.eta,
self.act_steps,
)
actor_loss.backward()
loss_actor.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor.parameters(), self.max_grad_norm
)
self.actor_optimizer.step()
loss = actor_loss
# update target
self.model.update_target_critic(self.target_ema_rate)
# Update lr
self.actor_lr_scheduler.step()
@ -285,10 +271,12 @@ class TrainDQLDiffusionAgent(TrainAgent):
run_results.append(
{
"itr": self.itr,
"step": cnt_train_step,
}
)
if self.itr % self.log_freq == 0:
time = timer()
run_results[-1]["time"] = time
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
@ -309,12 +297,13 @@ class TrainDQLDiffusionAgent(TrainAgent):
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"total env step": cnt_train_step,
"loss - actor": loss_actor,
"loss - critic": loss_critic,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
@ -322,10 +311,7 @@ class TrainDQLDiffusionAgent(TrainAgent):
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["loss_critic"] = loss_critic
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,352 @@
"""
Imitation Bootstrapped Reinforcement Learning (IBRL) agent training script.
Does not support image observations right now.
"""
import os
import pickle
import numpy as np
import torch
import logging
import wandb
import hydra
from collections import deque
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.finetune.train_agent import TrainAgent
from util.scheduler import CosineAnnealingWarmupRestarts
class TrainIBRLAgent(TrainAgent):
    """Imitation Bootstrapped Reinforcement Learning (IBRL) trainer.

    Seeds a FIFO replay buffer with offline demonstration transitions, then
    alternates environment rollouts with off-policy updates: several critic
    updates per update iteration followed by one actor update, each tracked
    by EMA target networks.

    Does not support image observations right now.
    """

    def __init__(self, cfg):
        super().__init__(cfg)

        # Offline demonstration dataset used to seed the replay buffer
        self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset)

        # note the discount factor gamma here is applied to reward every
        # act_steps, instead of every env step
        self.gamma = cfg.train.gamma

        # Actor optimizer with cosine-annealing warmup schedule
        self.actor_optimizer = torch.optim.AdamW(
            self.model.network.parameters(),
            lr=cfg.train.actor_lr,
            weight_decay=cfg.train.actor_weight_decay,
        )
        self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
            self.actor_optimizer,
            first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=cfg.train.actor_lr,
            min_lr=cfg.train.actor_lr_scheduler.min_lr,
            warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
            gamma=1.0,
        )

        # Critic ensemble optimizer --- optimizing the parameter-dict values
        # directly works around https://github.com/pytorch/pytorch/issues/120581
        self.critic_optimizer = torch.optim.AdamW(
            self.model.ensemble_params.values(),
            lr=cfg.train.critic_lr,
            weight_decay=cfg.train.critic_weight_decay,
        )
        self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
            self.critic_optimizer,
            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=cfg.train.critic_lr,
            min_lr=cfg.train.critic_lr_scheduler.min_lr,
            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
            gamma=1.0,
        )

        # Target network EMA update rate
        # (original comment said "Perturbation scale", which mislabeled it)
        self.target_ema_rate = cfg.train.target_ema_rate

        # Reward scale
        self.scale_reward_factor = cfg.train.scale_reward_factor

        # Number of critic updates per update iteration
        self.critic_num_update = cfg.train.critic_num_update

        # Update models every update_freq training iterations
        self.update_freq = cfg.train.update_freq

        # Replay buffer capacity (in transitions)
        self.buffer_size = cfg.train.buffer_size

        # Number of episodes to collect in eval mode
        self.n_eval_episode = cfg.train.n_eval_episode

        # Exploration steps at the beginning - using randomly sampled action
        self.n_explore_steps = cfg.train.n_explore_steps

    def run(self):
        """Run the IBRL train/eval loop until ``n_train_itr`` iterations."""
        # make a FIFO replay buffer for obs, action, and reward
        obs_buffer = deque(maxlen=self.buffer_size)
        next_obs_buffer = deque(maxlen=self.buffer_size)
        action_buffer = deque(maxlen=self.buffer_size)
        reward_buffer = deque(maxlen=self.buffer_size)
        terminated_buffer = deque(maxlen=self.buffer_size)

        # load offline dataset into replay buffer --- single batch with all data
        dataloader_offline = torch.utils.data.DataLoader(
            self.dataset_offline,
            batch_size=len(self.dataset_offline),
            drop_last=False,
        )
        for batch in dataloader_offline:
            actions, states_and_next, rewards, terminated = batch
            states = states_and_next["state"]
            next_states = states_and_next["next_state"]
            obs_buffer.extend(states.cpu().numpy())
            next_obs_buffer.extend(next_states.cpu().numpy())
            action_buffer.extend(actions.cpu().numpy())
            reward_buffer.extend(rewards.cpu().numpy().flatten())
            terminated_buffer.extend(terminated.cpu().numpy().flatten())

        # Start training loop
        timer = Timer()
        run_results = []
        cnt_train_step = 0
        done_venv = np.zeros((1, self.n_envs))
        # Initialize losses so the train-mode log below cannot hit a NameError
        # when the first logged iteration precedes the first model update
        # (updates only happen every update_freq iterations).
        loss_actor = 0.0
        loss_critic = 0.0
        while self.itr < self.n_train_itr:
            if self.itr % 1000 == 0:
                print(f"Finished training iteration {self.itr} of {self.n_train_itr}")

            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
            options_venv = [{} for _ in range(self.n_envs)]
            if self.itr % self.render_freq == 0 and self.render_video:
                for env_ind in range(self.n_render):
                    options_venv[env_ind]["video_path"] = os.path.join(
                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
                    )

            # Define train or eval - all envs restart
            eval_mode = (
                self.itr % self.val_freq == 0
                and self.itr > self.n_explore_steps
                and not self.force_train
            )
            n_steps = (
                self.n_steps if not eval_mode else int(1e5)
            )  # large number for eval mode
            self.model.eval() if eval_mode else self.model.train()

            # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning
            firsts_trajs = np.zeros((n_steps + 1, self.n_envs))
            if self.reset_at_iteration or eval_mode or self.itr == 0:
                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
                firsts_trajs[0] = 1
            else:
                # if done at the end of last iteration, the envs are just reset
                firsts_trajs[0] = done_venv
            reward_trajs = np.zeros((n_steps, self.n_envs))

            # Collect a set of trajectories from env
            cnt_episode = 0
            for step in range(n_steps):
                # Select action
                with torch.no_grad():
                    cond = {
                        "state": torch.from_numpy(prev_obs_venv["state"])
                        .float()
                        .to(self.device)
                    }
                    samples = (
                        self.model(
                            cond=cond,
                            deterministic=eval_mode,
                        )
                        .cpu()
                        .numpy()
                    )  # n_env x horizon x act
                action_venv = samples[:, : self.act_steps]

                # Apply multi-step action
                obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
                    self.venv.step(action_venv)
                )
                done_venv = terminated_venv | truncated_venv
                reward_trajs[step] = reward_venv
                firsts_trajs[step + 1] = done_venv

                # add to buffer in train mode
                if not eval_mode:
                    for i in range(self.n_envs):
                        obs_buffer.append(prev_obs_venv["state"][i])
                        if "final_obs" in info_venv[i]:  # truncated
                            next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
                            # time-limit truncation is not a true termination,
                            # so keep bootstrapping from the final observation
                            terminated_venv[i] = False
                        else:  # first obs in new episode
                            next_obs_buffer.append(obs_venv["state"][i])
                        action_buffer.append(action_venv[i])
                    reward_buffer.extend(
                        (reward_venv * self.scale_reward_factor).tolist()
                    )
                    # BUG FIX: was terminated_buffer.append(terminated_venv.tolist()),
                    # which pushed one list-of-all-envs per step and broke the
                    # one-entry-per-transition alignment with the other buffers.
                    terminated_buffer.extend(terminated_venv.tolist())

                # update for next step
                prev_obs_venv = obs_venv

                # count steps --- not acounting for done within action chunk
                cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0

                # check if enough eval episodes are done
                cnt_episode += np.sum(done_venv)
                if eval_mode and cnt_episode >= self.n_eval_episode:
                    break

            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
            episodes_start_end = []
            for env_ind in range(self.n_envs):
                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
                for i in range(len(env_steps) - 1):
                    start = env_steps[i]
                    end = env_steps[i + 1]
                    if end - start > 1:
                        episodes_start_end.append((env_ind, start, end - 1))
            if len(episodes_start_end) > 0:
                reward_trajs_split = [
                    reward_trajs[start : end + 1, env_ind]
                    for env_ind, start, end in episodes_start_end
                ]
                num_episode_finished = len(reward_trajs_split)
                episode_reward = np.array(
                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
                )
                episode_best_reward = np.array(
                    [
                        np.max(reward_traj) / self.act_steps
                        for reward_traj in reward_trajs_split
                    ]
                )
                avg_episode_reward = np.mean(episode_reward)
                avg_best_reward = np.mean(episode_best_reward)
                success_rate = np.mean(
                    episode_best_reward >= self.best_reward_threshold_for_success
                )
            else:
                episode_reward = np.array([])
                num_episode_finished = 0
                avg_episode_reward = 0
                avg_best_reward = 0
                success_rate = 0

            # Update models
            if (
                not eval_mode
                and self.itr > self.n_explore_steps
                and self.itr % self.update_freq == 0
            ):
                # Update critic more frequently
                for _ in range(self.critic_num_update):
                    # Sample from online buffer
                    inds = np.random.choice(len(obs_buffer), self.batch_size)
                    obs_b = (
                        torch.from_numpy(np.array([obs_buffer[i] for i in inds]))
                        .float()
                        .to(self.device)
                    )
                    next_obs_b = (
                        torch.from_numpy(np.array([next_obs_buffer[i] for i in inds]))
                        .float()
                        .to(self.device)
                    )
                    actions_b = (
                        torch.from_numpy(np.array([action_buffer[i] for i in inds]))
                        .float()
                        .to(self.device)
                    )
                    rewards_b = (
                        torch.from_numpy(np.array([reward_buffer[i] for i in inds]))
                        .float()
                        .to(self.device)
                    )
                    terminated_b = (
                        torch.from_numpy(np.array([terminated_buffer[i] for i in inds]))
                        .float()
                        .to(self.device)
                    )
                    loss_critic = self.model.loss_critic(
                        {"state": obs_b},
                        {"state": next_obs_b},
                        actions_b,
                        rewards_b,
                        terminated_b,
                        self.gamma,
                    )
                    self.critic_optimizer.zero_grad()
                    loss_critic.backward()
                    self.critic_optimizer.step()

                    # Update target critic every critic update
                    self.model.update_target_critic(self.target_ema_rate)

                # Update actor once with the final batch
                loss_actor = self.model.loss_actor(
                    {"state": obs_b},
                )
                self.actor_optimizer.zero_grad()
                loss_actor.backward()
                self.actor_optimizer.step()

                # Update target actor
                self.model.update_target_actor(self.target_ema_rate)

            # Update lr
            self.actor_lr_scheduler.step()
            self.critic_lr_scheduler.step()

            # Save model
            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
                self.save_model()

            # Log loss and save metrics
            run_results.append(
                {
                    "itr": self.itr,
                    "step": cnt_train_step,
                }
            )
            if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps:
                time = timer()
                run_results[-1]["time"] = time
                if eval_mode:
                    log.info(
                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
                    )
                    if self.use_wandb:
                        wandb.log(
                            {
                                "success rate - eval": success_rate,
                                "avg episode reward - eval": avg_episode_reward,
                                "avg best reward - eval": avg_best_reward,
                                "num episode - eval": num_episode_finished,
                            },
                            step=self.itr,
                            commit=False,
                        )
                    run_results[-1]["eval_success_rate"] = success_rate
                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
                    run_results[-1]["eval_best_reward"] = avg_best_reward
                else:
                    log.info(
                        f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
                    )
                    if self.use_wandb:
                        wandb.log(
                            {
                                "total env step": cnt_train_step,
                                "loss - actor": loss_actor,
                                "loss - critic": loss_critic,
                                "avg episode reward - train": avg_episode_reward,
                                "num episode - train": num_episode_finished,
                            },
                            step=self.itr,
                            commit=True,
                        )
                    run_results[-1]["train_episode_reward"] = avg_episode_reward
                with open(self.result_path, "wb") as f:
                    pickle.dump(run_results, f)
            self.itr += 1

View File

@ -102,12 +102,12 @@ class TrainIDQLDiffusionAgent(TrainAgent):
next_obs_buffer = deque(maxlen=self.buffer_size)
action_buffer = deque(maxlen=self.buffer_size)
reward_buffer = deque(maxlen=self.buffer_size)
done_buffer = deque(maxlen=self.buffer_size)
first_buffer = deque(maxlen=self.buffer_size)
terminated_buffer = deque(maxlen=self.buffer_size)
# Start training loop
timer = Timer()
run_results = []
cnt_train_step = 0
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
@ -131,10 +131,9 @@ class TrainIDQLDiffusionAgent(TrainAgent):
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
reward_trajs = np.empty((0, self.n_envs))
# if done at the end of last iteration, the envs are just reset
firsts_trajs[0] = done_venv
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -161,22 +160,33 @@ class TrainIDQLDiffusionAgent(TrainAgent):
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
done_venv = terminated_venv | truncated_venv
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = done_venv
# add to buffer
obs_buffer.append(prev_obs_venv["state"])
next_obs_buffer.append(obs_venv["state"])
action_buffer.append(action_venv)
reward_buffer.append(reward_venv * self.scale_reward_factor)
done_buffer.append(done_venv)
first_buffer.append(firsts_trajs[step])
if not eval_mode:
obs_venv_copy = obs_venv.copy()
for i in range(self.n_envs):
if truncated_venv[i]:
obs_venv_copy["state"][i] = info_venv[i]["final_obs"][
"state"
]
obs_buffer.append(prev_obs_venv["state"])
next_obs_buffer.append(obs_venv_copy["state"])
action_buffer.append(action_venv)
reward_buffer.append(reward_venv * self.scale_reward_factor)
terminated_buffer.append(terminated_venv)
firsts_trajs[step + 1] = done_venv
# update for next step
prev_obs_venv = obs_venv
# count steps --- not acounting for done within action chunk
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
@ -216,13 +226,15 @@ class TrainIDQLDiffusionAgent(TrainAgent):
# Update models
if not eval_mode:
num_batch = int(
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
)
obs_trajs = np.array(deepcopy(obs_buffer))
action_trajs = np.array(deepcopy(action_buffer))
next_obs_trajs = np.array(deepcopy(next_obs_buffer))
reward_trajs = np.array(deepcopy(reward_buffer))
done_trajs = np.array(deepcopy(done_buffer))
first_trajs = np.array(deepcopy(first_buffer))
terminated_trajs = np.array(deepcopy(terminated_buffer))
# flatten
obs_trajs = einops.rearrange(
@ -238,13 +250,7 @@ class TrainIDQLDiffusionAgent(TrainAgent):
"s e h d -> (s e) h d",
)
reward_trajs = reward_trajs.reshape(-1)
done_trajs = done_trajs.reshape(-1)
first_trajs = first_trajs.reshape(-1)
num_batch = int(
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
)
terminated_trajs = terminated_trajs.reshape(-1)
for _ in range(num_batch):
# Sample batch
@ -259,7 +265,9 @@ class TrainIDQLDiffusionAgent(TrainAgent):
reward_b = (
torch.from_numpy(reward_trajs[inds]).float().to(self.device)
)
done_b = torch.from_numpy(done_trajs[inds]).float().to(self.device)
terminated_b = (
torch.from_numpy(terminated_trajs[inds]).float().to(self.device)
)
# update critic value function
critic_loss_v = self.model.loss_critic_v(
@ -275,7 +283,7 @@ class TrainIDQLDiffusionAgent(TrainAgent):
{"state": next_obs_b},
actions_b,
reward_b,
done_b,
terminated_b,
self.gamma,
)
self.critic_q_optimizer.zero_grad()
@ -284,16 +292,15 @@ class TrainIDQLDiffusionAgent(TrainAgent):
# update target q function
self.model.update_target_critic(self.critic_tau)
loss_critic = critic_loss_q.detach() + critic_loss_v.detach()
# Update policy with collected trajectories - no weighting
loss = self.model.loss(
loss_actor = self.model.loss(
actions_b,
{"state": obs_b},
)
self.actor_optimizer.zero_grad()
loss.backward()
loss_actor.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
@ -314,10 +321,12 @@ class TrainIDQLDiffusionAgent(TrainAgent):
run_results.append(
{
"itr": self.itr,
"step": cnt_train_step,
}
)
if self.itr % self.log_freq == 0:
time = timer()
run_results[-1]["time"] = time
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
@ -338,12 +347,13 @@ class TrainIDQLDiffusionAgent(TrainAgent):
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"total env step": cnt_train_step,
"loss - actor": loss_actor,
"loss - critic": loss_critic,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
@ -351,10 +361,7 @@ class TrainIDQLDiffusionAgent(TrainAgent):
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["loss_critic"] = loss_critic
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -50,6 +50,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
# Start training loop
timer = Timer()
run_results = []
cnt_train_step = 0
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
@ -68,34 +69,36 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
last_itr_eval = eval_mode
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
dones_trajs = np.zeros((self.n_steps, self.n_envs))
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
# if done at the end of last iteration, the envs are just reset
firsts_trajs[0] = done_venv
# Holder
obs_trajs = {
"state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
"state": np.zeros(
(self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim)
)
}
chains_trajs = np.empty(
chains_trajs = np.zeros(
(
0,
self.n_steps,
self.n_envs,
self.model.ft_denoising_steps + 1,
self.horizon_steps,
self.action_dim,
)
)
reward_trajs = np.empty((0, self.n_envs))
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
obs_full_trajs = np.vstack(
(obs_full_trajs, prev_obs_venv["state"][:, -1][None])
) # save current obs
terminated_trajs = np.zeros((self.n_steps, self.n_envs))
reward_trajs = np.zeros((self.n_steps, self.n_envs))
if self.save_full_observations: # state-only
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
obs_full_trajs = np.vstack(
(obs_full_trajs, prev_obs_venv["state"][:, -1][None])
)
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -123,9 +126,10 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
done_venv = terminated_venv | truncated_venv
if self.save_full_observations: # state-only
obs_full_venv = np.array(
[info["full_obs"]["state"] for info in info_venv]
@ -133,15 +137,18 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
obs_full_trajs = np.vstack(
(obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
)
obs_trajs["state"] = np.vstack(
(obs_trajs["state"], prev_obs_venv["state"][None])
)
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
dones_trajs[step] = done_venv
obs_trajs["state"][step] = prev_obs_venv["state"]
chains_trajs[step] = chains_venv
reward_trajs[step] = reward_venv
terminated_trajs[step] = terminated_venv
firsts_trajs[step + 1] = done_venv
# update for next step
prev_obs_venv = obs_venv
# count steps --- not acounting for done within action chunk
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
@ -238,7 +245,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
)
reward_trajs = reward_trajs_transpose.T
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
# bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
obs_venv_ts = {
"state": torch.from_numpy(obs_venv["state"])
.float()
@ -256,7 +263,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
)
else:
nextvalues = values_trajs[t + 1]
nonterminal = 1.0 - dones_trajs[t]
nonterminal = 1.0 - terminated_trajs[t]
# delta = r + gamma*V(st+1) - V(st)
delta = (
reward_trajs[t] * self.reward_scale_const
@ -405,6 +412,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
run_results.append(
{
"itr": self.itr,
"step": cnt_train_step,
}
)
if self.save_trajs:
@ -414,6 +422,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
run_results[-1]["reward_trajs"] = reward_trajs
if self.itr % self.log_freq == 0:
time = timer()
run_results[-1]["time"] = time
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
@ -434,11 +443,12 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}"
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"total env step": cnt_train_step,
"loss": loss,
"pg loss": pg_loss,
"value loss": v_loss,
@ -459,17 +469,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["pg_loss"] = pg_loss
run_results[-1]["value_loss"] = v_loss
run_results[-1]["bc_loss"] = bc_loss
run_results[-1]["eta"] = eta
run_results[-1]["approx_kl"] = approx_kl
run_results[-1]["ratio"] = ratio
run_results[-1]["clip_frac"] = np.mean(clipfracs)
run_results[-1]["explained_variance"] = explained_var
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -40,6 +40,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
# Start training loop
timer = Timer()
run_results = []
cnt_train_step = 0
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
@ -58,31 +59,32 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
last_itr_eval = eval_mode
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
dones_trajs = np.zeros((self.n_steps, self.n_envs))
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
# if done at the end of last iteration, the envs are just reset
firsts_trajs[0] = done_venv
# Holder
obs_trajs = {
k: np.empty((0, self.n_envs, self.n_cond_step, *self.obs_dims[k]))
k: np.zeros(
(self.n_steps, self.n_envs, self.n_cond_step, *self.obs_dims[k])
)
for k in self.obs_dims
}
chains_trajs = np.empty(
chains_trajs = np.zeros(
(
0,
self.n_steps,
self.n_envs,
self.model.ft_denoising_steps + 1,
self.horizon_steps,
self.action_dim,
)
)
reward_trajs = np.empty((0, self.n_envs))
terminated_trajs = np.zeros((self.n_steps, self.n_envs))
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -111,17 +113,23 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
done_venv = terminated_venv | truncated_venv
for k in obs_trajs:
obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
dones_trajs[step] = done_venv
obs_trajs[k][step] = prev_obs_venv[k]
chains_trajs[step] = chains_venv
reward_trajs[step] = reward_venv
terminated_trajs[step] = terminated_venv
firsts_trajs[step + 1] = done_venv
# update for next step
prev_obs_venv = obs_venv
# count steps --- not acounting for done within action chunk
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
@ -235,7 +243,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
)
reward_trajs = reward_trajs_transpose.T
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
# bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
obs_venv_ts = {
key: torch.from_numpy(obs_venv[key]).float().to(self.device)
for key in self.obs_dims
@ -252,7 +260,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
)
else:
nextvalues = values_trajs[t + 1]
nonterminal = 1.0 - dones_trajs[t]
nonterminal = 1.0 - terminated_trajs[t]
# delta = r + gamma*V(st+1) - V(st)
delta = (
reward_trajs[t] * self.reward_scale_const
@ -398,10 +406,12 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
run_results.append(
{
"itr": self.itr,
"step": cnt_train_step,
}
)
if self.itr % self.log_freq == 0:
time = timer()
run_results[-1]["time"] = time
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
@ -422,11 +432,12 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}"
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"total env step": cnt_train_step,
"loss": loss,
"pg loss": pg_loss,
"value loss": v_loss,
@ -447,17 +458,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["pg_loss"] = pg_loss
run_results[-1]["value_loss"] = v_loss
run_results[-1]["bc_loss"] = bc_loss
run_results[-1]["eta"] = eta
run_results[-1]["approx_kl"] = approx_kl
run_results[-1]["ratio"] = ratio
run_results[-1]["clip_frac"] = np.mean(clipfracs)
run_results[-1]["explained_variance"] = explained_var
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -32,6 +32,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
# Start training loop
timer = Timer()
run_results = []
cnt_train_step = 0
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
@ -50,42 +51,39 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
last_itr_eval = eval_mode
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
dones_trajs = np.zeros((self.n_steps, self.n_envs))
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
# if done at the end of last iteration, the envs are just reset
firsts_trajs[0] = done_venv
# Holder
obs_trajs = {
"state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
"state": np.zeros(
(self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim)
)
}
samples_trajs = np.empty(
samples_trajs = np.zeros(
(
0,
self.n_steps,
self.n_envs,
self.horizon_steps,
self.action_dim,
)
)
chains_trajs = np.empty(
chains_trajs = np.zeros(
(
0,
self.n_steps,
self.n_envs,
self.model.ft_denoising_steps + 1,
self.horizon_steps,
self.action_dim,
)
)
reward_trajs = np.empty((0, self.n_envs))
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
obs_full_trajs = np.vstack(
(obs_full_trajs, prev_obs_venv["state"][:, -1][None])
) # save current obs
terminated_trajs = np.zeros((self.n_steps, self.n_envs))
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -111,28 +109,25 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
samples.chains.cpu().numpy()
) # n_env x denoising x horizon x act
action_venv = output_venv[:, : self.act_steps]
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
samples_trajs[step] = output_venv
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
if self.save_full_observations: # state-only
obs_full_venv = np.array(
[info["full_obs"]["state"] for info in info_venv]
) # n_envs x act_steps x obs_dim
obs_full_trajs = np.vstack(
(obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
)
obs_trajs["state"] = np.vstack(
(obs_trajs["state"], prev_obs_venv["state"][None])
)
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
dones_trajs[step] = done_venv
done_venv = terminated_venv | truncated_venv
obs_trajs["state"][step] = prev_obs_venv["state"]
chains_trajs[step] = chains_venv
reward_trajs[step] = reward_venv
terminated_trajs[step] = terminated_venv
firsts_trajs[step + 1] = done_venv
# update for next step
prev_obs_venv = obs_venv
# count steps --- not acounting for done within action chunk
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
@ -214,7 +209,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
)
reward_trajs = reward_trajs_transpose.T
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
# bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
obs_venv_ts = {
"state": torch.from_numpy(obs_venv["state"])
.float()
@ -232,7 +227,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
)
else:
nextvalues = values_trajs[t + 1]
nonterminal = 1.0 - dones_trajs[t]
nonterminal = 1.0 - terminated_trajs[t]
# delta = r + gamma*V(st+1) - V(st)
delta = (
reward_trajs[t] * self.reward_scale_const
@ -343,20 +338,6 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
)
# Plot state trajectories (only in D3IL)
if (
self.itr % self.render_freq == 0
and self.n_render > 0
and self.traj_plotter is not None
):
self.traj_plotter(
obs_full_trajs=obs_full_trajs,
n_render=self.n_render,
max_episode_steps=self.max_episode_steps,
render_dir=self.render_dir,
itr=self.itr,
)
# Update lr
if self.itr >= self.n_critic_warmup_itr:
self.actor_lr_scheduler.step()
@ -370,16 +351,17 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
run_results.append(
{
"itr": self.itr,
"step": cnt_train_step,
}
)
if self.save_trajs:
run_results[-1]["obs_full_trajs"] = obs_full_trajs
run_results[-1]["obs_trajs"] = obs_trajs
run_results[-1]["action_trajs"] = samples_trajs
run_results[-1]["chains_trajs"] = chains_trajs
run_results[-1]["reward_trajs"] = reward_trajs
if self.itr % self.log_freq == 0:
time = timer()
run_results[-1]["time"] = time
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
@ -400,11 +382,12 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"total env step": cnt_train_step,
"loss": loss,
"pg loss": pg_loss,
"value loss": v_loss,
@ -417,15 +400,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["pg_loss"] = pg_loss
run_results[-1]["value_loss"] = v_loss
run_results[-1]["approx_kl"] = approx_kl
run_results[-1]["ratio"] = ratio
run_results[-1]["clip_frac"] = np.mean(clipfracs)
run_results[-1]["explained_variance"] = explained_var
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -27,6 +27,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
# Start training loop
timer = Timer()
run_results = []
cnt_train_step = 0
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
@ -45,33 +46,35 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
last_itr_eval = eval_mode
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
dones_trajs = np.zeros((self.n_steps, self.n_envs))
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
# if done at the end of last iteration, the envs are just reset
firsts_trajs[0] = done_venv
# Holder
obs_trajs = {
"state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
"state": np.zeros(
(self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim)
)
}
samples_trajs = np.empty(
samples_trajs = np.zeros(
(
0,
self.n_steps,
self.n_envs,
self.horizon_steps,
self.action_dim,
)
)
reward_trajs = np.empty((0, self.n_envs))
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
obs_full_trajs = np.vstack(
(obs_full_trajs, prev_obs_venv["state"][:, -1][None])
) # save current obs
reward_trajs = np.zeros((self.n_steps, self.n_envs))
terminated_trajs = np.zeros((self.n_steps, self.n_envs))
if self.save_full_observations:
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
obs_full_trajs = np.vstack(
(obs_full_trajs, prev_obs_venv["state"][:, -1][None])
)
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -93,9 +96,10 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
done_venv = terminated_venv | truncated_venv
if self.save_full_observations: # state-only
obs_full_venv = np.array(
[info["full_obs"]["state"] for info in info_venv]
@ -103,15 +107,18 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
obs_full_trajs = np.vstack(
(obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
)
obs_trajs["state"] = np.vstack(
(obs_trajs["state"], prev_obs_venv["state"][None])
)
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
dones_trajs[step] = done_venv
obs_trajs["state"][step] = prev_obs_venv["state"]
samples_trajs[step] = output_venv
reward_trajs[step] = reward_venv
terminated_trajs[step] = terminated_venv
firsts_trajs[step + 1] = done_venv
# update for next step
prev_obs_venv = obs_venv
# count steps --- not acounting for done within action chunk
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
@ -221,7 +228,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
)
else:
nextvalues = values_trajs[t + 1]
nonterminal = 1.0 - dones_trajs[t]
nonterminal = 1.0 - terminated_trajs[t]
# delta = r + gamma*V(st+1) - V(st)
delta = (
reward_trajs[t] * self.reward_scale_const
@ -363,6 +370,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
run_results.append(
{
"itr": self.itr,
"step": cnt_train_step,
}
)
if self.save_trajs:
@ -372,6 +380,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
run_results[-1]["reward_trajs"] = reward_trajs
if self.itr % self.log_freq == 0:
time = timer()
run_results[-1]["time"] = time
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
@ -392,11 +401,12 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | ent {-entropy_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | ent {-entropy_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"total env step": cnt_train_step,
"loss": loss,
"pg loss": pg_loss,
"value loss": v_loss,
@ -412,16 +422,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["pg_loss"] = pg_loss
run_results[-1]["value_loss"] = v_loss
run_results[-1]["entropy_loss"] = entropy_loss
run_results[-1]["approx_kl"] = approx_kl
run_results[-1]["ratio"] = ratio
run_results[-1]["clip_frac"] = np.mean(clipfracs)
run_results[-1]["explained_variance"] = explained_var
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -40,6 +40,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
# Start training loop
timer = Timer()
run_results = []
cnt_train_step = 0
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
@ -58,30 +59,31 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
last_itr_eval = eval_mode
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
dones_trajs = np.zeros((self.n_steps, self.n_envs))
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
# if done at the end of last iteration, the envs are just reset
firsts_trajs[0] = done_venv
# Holder
obs_trajs = {
k: np.empty((0, self.n_envs, self.n_cond_step, *self.obs_dims[k]))
k: np.zeros(
(self.n_steps, self.n_envs, self.n_cond_step, *self.obs_dims[k])
)
for k in self.obs_dims
}
samples_trajs = np.empty(
samples_trajs = np.zeros(
(
0,
self.n_steps,
self.n_envs,
self.horizon_steps,
self.action_dim,
)
)
reward_trajs = np.empty((0, self.n_envs))
terminated_trajs = np.zeros((self.n_steps, self.n_envs))
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -104,17 +106,23 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
done_venv = terminated_venv | truncated_venv
for k in obs_trajs:
obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
dones_trajs[step] = done_venv
obs_trajs[k][step] = prev_obs_venv[k]
samples_trajs[step] = output_venv
reward_trajs[step] = reward_venv
terminated_trajs[step] = terminated_venv
firsts_trajs[step + 1] = done_venv
# update for next step
prev_obs_venv = obs_venv
# count steps --- not acounting for done within action chunk
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
@ -240,7 +248,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
)
else:
nextvalues = values_trajs[t + 1]
nonterminal = 1.0 - dones_trajs[t]
nonterminal = 1.0 - terminated_trajs[t]
# delta = r + gamma*V(st+1) - V(st)
delta = (
reward_trajs[t] * self.reward_scale_const
@ -374,10 +382,12 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
run_results.append(
{
"itr": self.itr,
"step": cnt_train_step,
}
)
if self.itr % self.log_freq == 0:
time = timer()
run_results[-1]["time"] = time
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
@ -398,11 +408,12 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"total env step": cnt_train_step,
"loss": loss,
"pg loss": pg_loss,
"value loss": v_loss,
@ -422,17 +433,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["pg_loss"] = pg_loss
run_results[-1]["value_loss"] = v_loss
run_results[-1]["bc_loss"] = bc_loss
run_results[-1]["std"] = std
run_results[-1]["approx_kl"] = approx_kl
run_results[-1]["ratio"] = ratio
run_results[-1]["clip_frac"] = np.mean(clipfracs)
run_results[-1]["explained_variance"] = explained_var
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -80,12 +80,12 @@ class TrainQSMDiffusionAgent(TrainAgent):
next_obs_buffer = deque(maxlen=self.buffer_size)
action_buffer = deque(maxlen=self.buffer_size)
reward_buffer = deque(maxlen=self.buffer_size)
done_buffer = deque(maxlen=self.buffer_size)
first_buffer = deque(maxlen=self.buffer_size)
terminated_buffer = deque(maxlen=self.buffer_size)
# Start training loop
timer = Timer()
run_results = []
cnt_train_step = 0
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
@ -109,10 +109,9 @@ class TrainQSMDiffusionAgent(TrainAgent):
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
reward_trajs = np.empty((0, self.n_envs))
# if done at the end of last iteration, the envs are just reset
firsts_trajs[0] = done_venv
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -137,22 +136,33 @@ class TrainQSMDiffusionAgent(TrainAgent):
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
done_venv = terminated_venv | truncated_venv
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = done_venv
# add to buffer
obs_buffer.append(prev_obs_venv["state"])
next_obs_buffer.append(obs_venv["state"])
action_buffer.append(action_venv)
reward_buffer.append(reward_venv * self.scale_reward_factor)
done_buffer.append(done_venv)
first_buffer.append(firsts_trajs[step])
if not eval_mode:
obs_venv_copy = obs_venv.copy()
for i in range(self.n_envs):
if truncated_venv[i]:
obs_venv_copy["state"][i] = info_venv[i]["final_obs"][
"state"
]
obs_buffer.append(prev_obs_venv["state"])
next_obs_buffer.append(obs_venv_copy["state"])
action_buffer.append(action_venv)
reward_buffer.append(reward_venv * self.scale_reward_factor)
terminated_buffer.append(terminated_venv)
firsts_trajs[step + 1] = done_venv
# update for next step
prev_obs_venv = obs_venv
# count steps --- not acounting for done within action chunk
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
@ -192,13 +202,15 @@ class TrainQSMDiffusionAgent(TrainAgent):
# Update models
if not eval_mode:
num_batch = int(
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
)
obs_trajs = np.array(deepcopy(obs_buffer))
action_trajs = np.array(deepcopy(action_buffer))
next_obs_trajs = np.array(deepcopy(next_obs_buffer))
reward_trajs = np.array(deepcopy(reward_buffer))
done_trajs = np.array(deepcopy(done_buffer))
first_trajs = np.array(deepcopy(first_buffer))
terminated_trajs = np.array(deepcopy(terminated_buffer))
# flatten
obs_trajs = einops.rearrange(
@ -214,16 +226,8 @@ class TrainQSMDiffusionAgent(TrainAgent):
"s e h d -> (s e) h d",
)
reward_trajs = reward_trajs.reshape(-1)
done_trajs = done_trajs.reshape(-1)
first_trajs = first_trajs.reshape(-1)
num_batch = int(
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
)
terminated_trajs = terminated_trajs.reshape(-1)
for _ in range(num_batch):
# Sample batch
inds = np.random.choice(len(obs_trajs), self.batch_size)
obs_b = torch.from_numpy(obs_trajs[inds]).float().to(self.device)
next_obs_b = (
@ -232,37 +236,34 @@ class TrainQSMDiffusionAgent(TrainAgent):
actions_b = (
torch.from_numpy(action_trajs[inds]).float().to(self.device)
)
reward_b = (
rewards_b = (
torch.from_numpy(reward_trajs[inds]).float().to(self.device)
)
done_b = torch.from_numpy(done_trajs[inds]).float().to(self.device)
terminated_b = (
torch.from_numpy(terminated_trajs[inds]).float().to(self.device)
)
# update critic q function
critic_loss = self.model.loss_critic(
loss_critic = self.model.loss_critic(
{"state": obs_b},
{"state": next_obs_b},
actions_b,
reward_b,
done_b,
rewards_b,
terminated_b,
self.gamma,
)
self.critic_optimizer.zero_grad()
critic_loss.backward()
loss_critic.backward()
self.critic_optimizer.step()
# update target q function
self.model.update_target_critic(self.critic_tau)
loss_critic = critic_loss.detach()
# Update policy with collected trajectories
loss = self.model.loss_actor(
loss_actor = self.model.loss_actor(
{"state": obs_b},
actions_b,
self.q_grad_coeff,
)
self.actor_optimizer.zero_grad()
loss.backward()
loss_actor.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
@ -270,6 +271,9 @@ class TrainQSMDiffusionAgent(TrainAgent):
)
self.actor_optimizer.step()
# update target critic
self.model.update_target_critic(self.critic_tau)
# Update lr
self.actor_lr_scheduler.step()
self.critic_lr_scheduler.step()
@ -282,10 +286,12 @@ class TrainQSMDiffusionAgent(TrainAgent):
run_results.append(
{
"itr": self.itr,
"step": cnt_train_step,
}
)
if self.itr % self.log_freq == 0:
time = timer()
run_results[-1]["time"] = time
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
@ -306,12 +312,13 @@ class TrainQSMDiffusionAgent(TrainAgent):
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"total env step": cnt_train_step,
"loss - actor": loss_actor,
"loss - critic": loss_critic,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
@ -319,10 +326,7 @@ class TrainQSMDiffusionAgent(TrainAgent):
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["loss_critic"] = loss_critic
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,404 @@
"""
Reinforcement Learning with Prior Data (RLPD) agent training script.
Does not support image observations right now.
"""
import os
import pickle
import numpy as np
import torch
import logging
import wandb
import hydra
from collections import deque
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.finetune.train_agent import TrainAgent
from util.scheduler import CosineAnnealingWarmupRestarts
class TrainRLPDAgent(TrainAgent):
    """Reinforcement Learning with Prior Data (RLPD) finetuning agent.

    Keeps two data sources: a fixed offline (prior) dataset loaded once at
    the start of `run`, and an online FIFO replay buffer filled from the
    vectorized environments. Each critic batch is half offline / half online
    samples. Updates follow a SAC-style recipe: the critic (an ensemble) is
    updated `critic_num_update` times per iteration, the actor once, and the
    entropy temperature alpha is learned via `log_alpha`.

    Does not support image observations.
    """

    def __init__(self, cfg):
        """Build the offline dataset, optimizers, LR schedulers, and the
        learnable entropy temperature.

        Args:
            cfg: hydra config; reads `cfg.offline_dataset` and the
                `cfg.train.*` fields referenced below.
        """
        super().__init__(cfg)

        # Build dataset --- offline (prior) data that seeds half of every critic batch
        self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset)

        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
        self.gamma = cfg.train.gamma

        # Optimizer
        self.actor_optimizer = torch.optim.AdamW(
            self.model.network.parameters(),
            lr=cfg.train.actor_lr,
            weight_decay=cfg.train.actor_weight_decay,
        )
        self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
            self.actor_optimizer,
            first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=cfg.train.actor_lr,
            min_lr=cfg.train.actor_lr_scheduler.min_lr,
            warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
            gamma=1.0,
        )
        # Critic ensemble parameters are exposed as a dict of tensors; see
        # linked PyTorch issue for why .values() is passed to the optimizer.
        self.critic_optimizer = torch.optim.AdamW(
            self.model.ensemble_params.values(),  # https://github.com/pytorch/pytorch/issues/120581
            lr=cfg.train.critic_lr,
            weight_decay=cfg.train.critic_weight_decay,
        )
        self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
            self.critic_optimizer,
            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=cfg.train.critic_lr,
            min_lr=cfg.train.critic_lr_scheduler.min_lr,
            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
            gamma=1.0,
        )

        # Polyak (EMA) rate used when updating the target critic
        self.target_ema_rate = cfg.train.target_ema_rate

        # Reward scale
        self.scale_reward_factor = cfg.train.scale_reward_factor

        # Number of critic updates per iteration (UTD-style ratio)
        self.critic_num_update = cfg.train.critic_num_update

        # Buffer size
        self.buffer_size = cfg.train.buffer_size

        # Eval episodes
        self.n_eval_episode = cfg.train.n_eval_episode

        # Exploration steps at the beginning - using randomly sampled action
        self.n_explore_steps = cfg.train.n_explore_steps

        # Initialize temperature parameter for entropy
        init_temperature = cfg.train.init_temperature
        self.log_alpha = torch.tensor(np.log(init_temperature)).to(self.device)
        self.log_alpha.requires_grad = True
        self.target_entropy = cfg.train.target_entropy
        self.log_alpha_optimizer = torch.optim.Adam(
            [self.log_alpha],
            lr=cfg.train.critic_lr,
        )

    def run(self):
        """Main loop: collect env transitions into the online buffer, then
        update critic (many times), actor (once), and temperature per
        iteration using 50/50 offline/online batches."""
        # make a FIFO replay buffer for obs, action, and reward
        obs_buffer = deque(maxlen=self.buffer_size)
        next_obs_buffer = deque(maxlen=self.buffer_size)
        action_buffer = deque(maxlen=self.buffer_size)
        reward_buffer = deque(maxlen=self.buffer_size)
        terminated_buffer = deque(maxlen=self.buffer_size)

        # load offline dataset into replay buffer
        # batch_size=len(dataset) so the single loop iteration grabs everything
        dataloader_offline = torch.utils.data.DataLoader(
            self.dataset_offline,
            batch_size=len(self.dataset_offline),
            drop_last=False,
        )
        for batch in dataloader_offline:
            actions, states_and_next, rewards, terminated = batch
            states = states_and_next["state"]
            next_states = states_and_next["next_state"]
            obs_buffer_off = states.cpu().numpy()
            next_obs_buffer_off = next_states.cpu().numpy()
            action_buffer_off = actions.cpu().numpy()
            reward_buffer_off = rewards.cpu().numpy().flatten()
            terminated_buffer_off = terminated.cpu().numpy().flatten()

        # Start training loop
        timer = Timer()
        run_results = []
        cnt_train_step = 0
        done_venv = np.zeros((1, self.n_envs))
        while self.itr < self.n_train_itr:
            if self.itr % 1000 == 0:
                print(f"Finished training iteration {self.itr} of {self.n_train_itr}")

            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
            options_venv = [{} for _ in range(self.n_envs)]
            if self.itr % self.render_freq == 0 and self.render_video:
                for env_ind in range(self.n_render):
                    options_venv[env_ind]["video_path"] = os.path.join(
                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
                    )

            # Define train or eval - all envs restart
            eval_mode = (
                self.itr % self.val_freq == 0
                and self.itr >= self.n_explore_steps
                and not self.force_train
            )
            n_steps = (
                self.n_steps if not eval_mode else int(1e5)
            )  # large number for eval mode
            self.model.eval() if eval_mode else self.model.train()

            # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning
            firsts_trajs = np.zeros((n_steps + 1, self.n_envs))
            if self.reset_at_iteration or eval_mode or self.itr == 0:
                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
                firsts_trajs[0] = 1
            else:
                # if done at the end of last iteration, then the envs are just reset
                firsts_trajs[0] = done_venv
            reward_trajs = np.zeros((n_steps, self.n_envs))

            # Collect a set of trajectories from env
            cnt_episode = 0
            for step in range(n_steps):

                # Select action --- uniform random during the initial exploration phase
                if self.itr < self.n_explore_steps:
                    action_venv = self.venv.action_space.sample()
                else:
                    with torch.no_grad():
                        cond = {
                            "state": torch.from_numpy(prev_obs_venv["state"])
                            .float()
                            .to(self.device)
                        }
                        samples = (
                            self.model(
                                cond=cond,
                                deterministic=eval_mode,
                            )
                            .cpu()
                            .numpy()
                        )  # n_env x horizon x act
                    action_venv = samples[:, : self.act_steps]

                # Apply multi-step action
                obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
                    self.venv.step(action_venv)
                )
                done_venv = terminated_venv | truncated_venv
                reward_trajs[step] = reward_venv
                firsts_trajs[step + 1] = done_venv

                # add to buffer in train mode
                if not eval_mode:
                    for i in range(self.n_envs):
                        obs_buffer.append(prev_obs_venv["state"][i])
                        # on truncation the env auto-resets, so the true next
                        # obs is stashed in info under "final_obs"
                        if truncated_venv[i]:
                            next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
                        else:
                            next_obs_buffer.append(obs_venv["state"][i])
                        action_buffer.append(action_venv[i])
                    reward_buffer.extend(
                        (reward_venv * self.scale_reward_factor).tolist()
                    )
                    terminated_buffer.extend(terminated_venv.tolist())

                # update for next step
                prev_obs_venv = obs_venv

                # count steps --- not acounting for done within action chunk
                cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0

                # check if enough eval episodes are done
                cnt_episode += np.sum(done_venv)
                if eval_mode and cnt_episode >= self.n_eval_episode:
                    break

            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
            episodes_start_end = []
            for env_ind in range(self.n_envs):
                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
                for i in range(len(env_steps) - 1):
                    start = env_steps[i]
                    end = env_steps[i + 1]
                    if end - start > 1:
                        episodes_start_end.append((env_ind, start, end - 1))
            if len(episodes_start_end) > 0:
                reward_trajs_split = [
                    reward_trajs[start : end + 1, env_ind]
                    for env_ind, start, end in episodes_start_end
                ]
                num_episode_finished = len(reward_trajs_split)
                episode_reward = np.array(
                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
                )
                episode_best_reward = np.array(
                    [
                        np.max(reward_traj) / self.act_steps
                        for reward_traj in reward_trajs_split
                    ]
                )
                avg_episode_reward = np.mean(episode_reward)
                avg_best_reward = np.mean(episode_best_reward)
                success_rate = np.mean(
                    episode_best_reward >= self.best_reward_threshold_for_success
                )
            else:
                episode_reward = np.array([])
                num_episode_finished = 0
                avg_episode_reward = 0
                avg_best_reward = 0
                success_rate = 0

            # Update models
            if not eval_mode and self.itr >= self.n_explore_steps:

                # Update critic more frequently
                for _ in range(self.critic_num_update):

                    # Sample from OFFLINE buffer --- half of the batch
                    inds = np.random.choice(len(obs_buffer_off), self.batch_size // 2)
                    obs_b_off = (
                        torch.from_numpy(obs_buffer_off[inds]).float().to(self.device)
                    )
                    next_obs_b_off = (
                        torch.from_numpy(next_obs_buffer_off[inds])
                        .float()
                        .to(self.device)
                    )
                    actions_b_off = (
                        torch.from_numpy(action_buffer_off[inds])
                        .float()
                        .to(self.device)
                    )
                    rewards_b_off = (
                        torch.from_numpy(reward_buffer_off[inds])
                        .float()
                        .to(self.device)
                    )
                    terminated_b_off = (
                        torch.from_numpy(terminated_buffer_off[inds])
                        .float()
                        .to(self.device)
                    )

                    # Sample from ONLINE buffer --- the other half
                    inds = np.random.choice(len(obs_buffer), self.batch_size // 2)
                    obs_b_on = (
                        torch.from_numpy(np.array([obs_buffer[i] for i in inds]))
                        .float()
                        .to(self.device)
                    )
                    next_obs_b_on = (
                        torch.from_numpy(np.array([next_obs_buffer[i] for i in inds]))
                        .float()
                        .to(self.device)
                    )
                    actions_b_on = (
                        torch.from_numpy(np.array([action_buffer[i] for i in inds]))
                        .float()
                        .to(self.device)
                    )
                    rewards_b_on = (
                        torch.from_numpy(np.array([reward_buffer[i] for i in inds]))
                        .float()
                        .to(self.device)
                    )
                    terminated_b_on = (
                        torch.from_numpy(np.array([terminated_buffer[i] for i in inds]))
                        .float()
                        .to(self.device)
                    )

                    # merge offline and online data
                    obs_b = torch.cat([obs_b_off, obs_b_on], dim=0)
                    next_obs_b = torch.cat([next_obs_b_off, next_obs_b_on], dim=0)
                    actions_b = torch.cat([actions_b_off, actions_b_on], dim=0)
                    rewards_b = torch.cat([rewards_b_off, rewards_b_on], dim=0)
                    terminated_b = torch.cat([terminated_b_off, terminated_b_on], dim=0)

                    # Update critic
                    alpha = self.log_alpha.exp().item()
                    loss_critic = self.model.loss_critic(
                        {"state": obs_b},
                        {"state": next_obs_b},
                        actions_b,
                        rewards_b,
                        terminated_b,
                        self.gamma,
                        alpha,
                    )
                    self.critic_optimizer.zero_grad()
                    loss_critic.backward()
                    self.critic_optimizer.step()

                    # Update target critic every critic update
                    self.model.update_target_critic(self.target_ema_rate)

                # Update actor once with the final batch
                loss_actor = self.model.loss_actor(
                    {"state": obs_b},
                    alpha,
                )
                self.actor_optimizer.zero_grad()
                loss_actor.backward()
                self.actor_optimizer.step()

                # Update temperature parameter
                self.log_alpha_optimizer.zero_grad()
                loss_alpha = self.model.loss_temperature(
                    {"state": obs_b},
                    self.log_alpha.exp(),  # with grad
                    self.target_entropy,
                )
                loss_alpha.backward()
                self.log_alpha_optimizer.step()

                # Update lr
                self.actor_lr_scheduler.step()
                self.critic_lr_scheduler.step()

            # Save model
            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
                self.save_model()

            # Log loss and save metrics
            run_results.append(
                {
                    "itr": self.itr,
                    "step": cnt_train_step,
                }
            )
            if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps:
                time = timer()
                if eval_mode:
                    log.info(
                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
                    )
                    if self.use_wandb:
                        wandb.log(
                            {
                                "success rate - eval": success_rate,
                                "avg episode reward - eval": avg_episode_reward,
                                "avg best reward - eval": avg_best_reward,
                                "num episode - eval": num_episode_finished,
                            },
                            step=self.itr,
                            commit=False,
                        )
                    run_results[-1]["eval_success_rate"] = success_rate
                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
                    run_results[-1]["eval_best_reward"] = avg_best_reward
                else:
                    log.info(
                        f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | alpha {alpha:8.4f} | t:{time:8.4f}"
                    )
                    if self.use_wandb:
                        wandb.log(
                            {
                                "total env step": cnt_train_step,
                                "loss - actor": loss_actor,
                                "loss - critic": loss_critic,
                                "entropy coeff": alpha,
                                "avg episode reward - train": avg_episode_reward,
                                "num episode - train": num_episode_finished,
                            },
                            step=self.itr,
                            commit=True,
                        )
                    run_results[-1]["train_episode_reward"] = avg_episode_reward
                with open(self.result_path, "wb") as f:
                    pickle.dump(run_results, f)
            self.itr += 1

View File

@ -19,7 +19,6 @@ from util.scheduler import CosineAnnealingWarmupRestarts
class TrainRWRDiffusionAgent(TrainAgent):
def __init__(self, cfg):
super().__init__(cfg)
@ -52,14 +51,13 @@ class TrainRWRDiffusionAgent(TrainAgent):
self.update_epochs = cfg.train.update_epochs
def run(self):
# Start training loop
timer = Timer()
run_results = []
cnt_train_step = 0
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
# Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
@ -79,23 +77,24 @@ class TrainRWRDiffusionAgent(TrainAgent):
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
# if done at the end of last iteration, the envs are just reset
firsts_trajs[0] = done_venv
# Holder
obs_trajs = {
"state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
"state": np.zeros(
(self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim)
)
}
samples_trajs = np.empty(
samples_trajs = np.zeros(
(
0,
self.n_steps,
self.n_envs,
self.horizon_steps,
self.action_dim,
)
)
reward_trajs = np.empty((0, self.n_envs))
reward_trajs = np.zeros((self.n_steps, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
@ -118,19 +117,25 @@ class TrainRWRDiffusionAgent(TrainAgent):
.numpy()
) # n_env x horizon x act
action_venv = samples[:, : self.act_steps]
samples_trajs = np.vstack((samples_trajs, samples[None]))
samples_trajs[step] = samples
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
self.venv.step(action_venv)
)
obs_trajs["state"] = np.vstack(
(obs_trajs["state"], prev_obs_venv["state"][None])
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
done_venv = terminated_venv | truncated_venv
# save
obs_trajs["state"][step] = prev_obs_venv["state"]
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = done_venv
# update for next step
prev_obs_venv = obs_venv
# count steps --- not acounting for done within action chunk
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
@ -157,20 +162,23 @@ class TrainRWRDiffusionAgent(TrainAgent):
num_episode_finished = len(reward_trajs_split)
# Compute episode returns
discounted_reward_trajs_split = [
[
self.gamma**t * r
for t, r in zip(
list(range(end - start + 1)),
reward_trajs[start : end + 1, env_ind],
)
]
for env_ind, start, end in episodes_start_end
]
returns_trajs_split = [
np.cumsum(y[::-1])[::-1] for y in discounted_reward_trajs_split
np.zeros_like(reward_trajs) for reward_trajs in reward_trajs_split
]
for traj_rewards, traj_returns in zip(
reward_trajs_split, returns_trajs_split
):
prev_return = 0
for t in range(len(traj_rewards)):
traj_returns[-t - 1] = (
traj_rewards[-t - 1] + self.gamma * prev_return
)
prev_return = traj_returns[-t - 1]
# Note: concatenation is okay here since we are concatenating
# states and actions later on, in the same order
returns_trajs_split = np.concatenate(returns_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
@ -195,7 +203,6 @@ class TrainRWRDiffusionAgent(TrainAgent):
# Update models
if not eval_mode:
# Tensorize data and put them to device
# k for environment step
obs_k = {
@ -230,7 +237,6 @@ class TrainRWRDiffusionAgent(TrainAgent):
total_steps = len(rewards_k_scaled)
inds_k = np.arange(total_steps)
for _ in range(self.update_epochs):
# for each epoch, go through all data in batches
np.random.shuffle(inds_k)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
@ -267,10 +273,12 @@ class TrainRWRDiffusionAgent(TrainAgent):
run_results.append(
{
"itr": self.itr,
"step": cnt_train_step,
}
)
if self.itr % self.log_freq == 0:
time = timer()
run_results[-1]["time"] = time
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
@ -291,11 +299,12 @@ class TrainRWRDiffusionAgent(TrainAgent):
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"total env step": cnt_train_step,
"loss": loss,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
@ -303,9 +312,7 @@ class TrainRWRDiffusionAgent(TrainAgent):
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,335 @@
"""
Soft Actor Critic (SAC) agent training script.
Does not support image observations right now.
"""
import os
import pickle
import numpy as np
import torch
import logging
import wandb
from collections import deque
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.finetune.train_agent import TrainAgent
class TrainSACAgent(TrainAgent):
    """Soft Actor Critic (SAC) finetuning agent with a FIFO replay buffer.

    Collects multi-step (action-chunk) transitions from vectorized envs into
    online deques and performs SAC updates: frequent critic updates, delayed
    actor updates, and a learned entropy temperature (alpha).

    Does not support image observations.
    """

    def __init__(self, cfg):
        """Set up optimizers, update frequencies, replay buffer sizing, and
        the learnable entropy temperature.

        Args:
            cfg: hydra config; reads the `cfg.train.*` fields referenced below.
        """
        super().__init__(cfg)

        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
        self.gamma = cfg.train.gamma

        # Optimizer
        self.actor_optimizer = torch.optim.Adam(
            self.model.network.parameters(),
            lr=cfg.train.actor_lr,
        )
        self.critic_optimizer = torch.optim.Adam(
            self.model.critic.parameters(),
            lr=cfg.train.critic_lr,
        )

        # Polyak (EMA) rate used when updating the target critic
        self.target_ema_rate = cfg.train.target_ema_rate

        # Reward scale
        self.scale_reward_factor = cfg.train.scale_reward_factor

        # Actor/critic update frequency - assume single env
        self.critic_update_freq = int(
            cfg.train.batch_size / cfg.train.critic_replay_ratio
        )
        self.actor_update_freq = int(
            cfg.train.batch_size / cfg.train.actor_replay_ratio
        )

        # Buffer size
        self.buffer_size = cfg.train.buffer_size

        # Eval episodes
        self.n_eval_episode = cfg.train.n_eval_episode

        # Exploration steps at the beginning - using randomly sampled action
        self.n_explore_steps = cfg.train.n_explore_steps

        # Initialize temperature parameter for entropy
        init_temperature = cfg.train.init_temperature
        self.log_alpha = torch.tensor(np.log(init_temperature)).to(self.device)
        self.log_alpha.requires_grad = True
        self.target_entropy = cfg.train.target_entropy
        self.log_alpha_optimizer = torch.optim.Adam(
            [self.log_alpha],
            lr=cfg.train.critic_lr,
        )

    def run(self):
        """Main loop: explore / collect transitions, then run SAC updates at
        the configured critic/actor frequencies, log, and checkpoint."""
        # make a FIFO replay buffer for obs, action, and reward
        obs_buffer = deque(maxlen=self.buffer_size)
        next_obs_buffer = deque(maxlen=self.buffer_size)
        action_buffer = deque(maxlen=self.buffer_size)
        reward_buffer = deque(maxlen=self.buffer_size)
        terminated_buffer = deque(maxlen=self.buffer_size)

        # Start training loop
        timer = Timer()
        run_results = []
        cnt_train_step = 0
        done_venv = np.zeros((1, self.n_envs))
        # FIX: pre-initialize logged quantities so the train-logging branch
        # cannot hit a NameError on iterations where the critic-update gate
        # (self.itr % self.critic_update_freq) did not fire.
        loss_critic = 0
        loss_actor = 0
        alpha = self.log_alpha.exp().item()
        while self.itr < self.n_train_itr:
            if self.itr % 1000 == 0:
                print(f"Finished training iteration {self.itr} of {self.n_train_itr}")

            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
            options_venv = [{} for _ in range(self.n_envs)]
            if self.itr % self.render_freq == 0 and self.render_video:
                for env_ind in range(self.n_render):
                    options_venv[env_ind]["video_path"] = os.path.join(
                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
                    )

            # Define train or eval - all envs restart
            eval_mode = (
                self.itr % self.val_freq == 0
                and self.itr > self.n_explore_steps
                and not self.force_train
            )
            n_steps = (
                self.n_steps if not eval_mode else int(1e5)
            )  # large number for eval mode
            self.model.eval() if eval_mode else self.model.train()

            # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning
            # FIX: size trajectory holders by n_steps (not self.n_steps) ---
            # in eval mode the loop may run past self.n_steps before enough
            # episodes finish, which previously indexed out of bounds.
            firsts_trajs = np.zeros((n_steps + 1, self.n_envs))
            if self.reset_at_iteration or eval_mode or self.itr == 0:
                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
                firsts_trajs[0] = 1
            else:
                # if done at the end of last iteration, the envs are just reset
                firsts_trajs[0] = done_venv
            reward_trajs = np.zeros((n_steps, self.n_envs))

            # Collect a set of trajectories from env
            cnt_episode = 0
            for step in range(n_steps):

                # Select action --- uniform random during the initial exploration phase
                if self.itr < self.n_explore_steps:
                    action_venv = self.venv.action_space.sample()
                else:
                    with torch.no_grad():
                        cond = {
                            "state": torch.from_numpy(prev_obs_venv["state"])
                            .float()
                            .to(self.device)
                        }
                        samples = (
                            self.model(
                                cond=cond,
                                deterministic=eval_mode,
                            )
                            .cpu()
                            .numpy()
                        )  # n_env x horizon x act
                    action_venv = samples[:, : self.act_steps]

                # Apply multi-step action
                obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
                    self.venv.step(action_venv)
                )
                done_venv = terminated_venv | truncated_venv
                reward_trajs[step] = reward_venv
                firsts_trajs[step + 1] = done_venv

                # add to buffer in train mode
                if not eval_mode:
                    for i in range(self.n_envs):
                        obs_buffer.append(prev_obs_venv["state"][i])
                        if "final_obs" in info_venv[i]:  # truncated
                            next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
                        else:  # first obs in new episode
                            next_obs_buffer.append(obs_venv["state"][i])
                        action_buffer.append(action_venv[i])
                    reward_buffer.extend(
                        (reward_venv * self.scale_reward_factor).tolist()
                    )
                    terminated_buffer.extend(terminated_venv.tolist())

                # update for next step
                prev_obs_venv = obs_venv

                # count steps --- not acounting for done within action chunk
                cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0

                # check if enough eval episodes are done
                cnt_episode += np.sum(done_venv)
                if eval_mode and cnt_episode >= self.n_eval_episode:
                    break

            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
            episodes_start_end = []
            for env_ind in range(self.n_envs):
                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
                for i in range(len(env_steps) - 1):
                    start = env_steps[i]
                    end = env_steps[i + 1]
                    if end - start > 1:
                        episodes_start_end.append((env_ind, start, end - 1))
            if len(episodes_start_end) > 0:
                reward_trajs_split = [
                    reward_trajs[start : end + 1, env_ind]
                    for env_ind, start, end in episodes_start_end
                ]
                num_episode_finished = len(reward_trajs_split)
                episode_reward = np.array(
                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
                )
                episode_best_reward = np.array(
                    [
                        np.max(reward_traj) / self.act_steps
                        for reward_traj in reward_trajs_split
                    ]
                )
                avg_episode_reward = np.mean(episode_reward)
                avg_best_reward = np.mean(episode_best_reward)
                success_rate = np.mean(
                    episode_best_reward >= self.best_reward_threshold_for_success
                )
            else:
                episode_reward = np.array([])
                num_episode_finished = 0
                avg_episode_reward = 0
                avg_best_reward = 0
                success_rate = 0

            # Update models
            if (
                not eval_mode
                and self.itr > self.n_explore_steps
                and self.itr % self.critic_update_freq == 0
            ):
                inds = np.random.choice(len(obs_buffer), self.batch_size, replace=False)
                obs_b = (
                    torch.from_numpy(np.array([obs_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                next_obs_b = (
                    torch.from_numpy(np.array([next_obs_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                actions_b = (
                    torch.from_numpy(np.array([action_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                rewards_b = (
                    torch.from_numpy(np.array([reward_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                terminated_b = (
                    torch.from_numpy(np.array([terminated_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )

                # Update critic
                alpha = self.log_alpha.exp().item()
                loss_critic = self.model.loss_critic(
                    {"state": obs_b},
                    {"state": next_obs_b},
                    actions_b,
                    rewards_b,
                    terminated_b,
                    self.gamma,
                    alpha,
                )
                self.critic_optimizer.zero_grad()
                loss_critic.backward()
                self.critic_optimizer.step()

                # Update target critic every critic update
                self.model.update_target_critic(self.target_ema_rate)

                # Delay update actor
                loss_actor = 0
                if self.itr % self.actor_update_freq == 0:
                    for _ in range(2):
                        loss_actor = self.model.loss_actor(
                            {"state": obs_b},
                            alpha,
                        )
                        self.actor_optimizer.zero_grad()
                        loss_actor.backward()
                        self.actor_optimizer.step()

                    # Update temperature parameter
                    # NOTE(review): alpha is updated once per actor-update
                    # iteration, after the doubled actor step --- confirm this
                    # placement against the intended SAC recipe.
                    self.log_alpha_optimizer.zero_grad()
                    loss_alpha = self.model.loss_temperature(
                        {"state": obs_b},
                        self.log_alpha.exp(),  # with grad
                        self.target_entropy,
                    )
                    loss_alpha.backward()
                    self.log_alpha_optimizer.step()

            # Save model
            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
                self.save_model()

            # Log loss and save metrics
            run_results.append(
                {
                    "itr": self.itr,
                    "step": cnt_train_step,
                }
            )
            if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps:
                time = timer()
                if eval_mode:
                    log.info(
                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
                    )
                    if self.use_wandb:
                        wandb.log(
                            {
                                "success rate - eval": success_rate,
                                "avg episode reward - eval": avg_episode_reward,
                                "avg best reward - eval": avg_best_reward,
                                "num episode - eval": num_episode_finished,
                            },
                            step=self.itr,
                            commit=False,
                        )
                    run_results[-1]["eval_success_rate"] = success_rate
                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
                    run_results[-1]["eval_best_reward"] = avg_best_reward
                else:
                    log.info(
                        f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | alpha {alpha:8.4f} | t {time:8.4f}"
                    )
                    if self.use_wandb:
                        wandb_log_dict = {
                            "total env step": cnt_train_step,
                            "loss - critic": loss_critic,
                            "entropy coeff": alpha,
                            "avg episode reward - train": avg_episode_reward,
                            "num episode - train": num_episode_finished,
                        }
                        if loss_actor is not None:
                            wandb_log_dict["loss - actor"] = loss_actor
                        wandb.log(
                            wandb_log_dict,
                            step=self.itr,
                            commit=True,
                        )
                    run_results[-1]["train_episode_reward"] = avg_episode_reward
                with open(self.result_path, "wb") as f:
                    pickle.dump(run_results, f)
            self.itr += 1

View File

@ -16,7 +16,6 @@ env_name: avoiding-m5
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
@ -102,7 +101,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -16,7 +16,6 @@ env_name: avoiding-m5
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
@ -94,7 +93,7 @@ model:
learn_fixed_std: False
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]

View File

@ -16,7 +16,6 @@ env_name: avoiding-m5
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
@ -95,7 +94,7 @@ model:
num_modes: ${num_modes}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]

View File

@ -16,7 +16,6 @@ env_name: avoiding-m5
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
@ -102,7 +101,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -16,7 +16,6 @@ env_name: avoiding-m5
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
@ -94,7 +93,7 @@ model:
learn_fixed_std: False
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]

View File

@ -16,7 +16,6 @@ env_name: avoiding-m5
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
@ -95,7 +94,7 @@ model:
num_modes: ${num_modes}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]

View File

@ -16,7 +16,6 @@ env_name: avoiding-m5
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
@ -102,7 +101,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -16,7 +16,6 @@ env_name: avoiding-m5
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
@ -94,7 +93,7 @@ model:
learn_fixed_std: False
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]

View File

@ -16,7 +16,6 @@ env_name: avoiding-m5
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
@ -95,7 +94,7 @@ model:
num_modes: ${num_modes}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]

View File

@ -15,7 +15,6 @@ env: avoid
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
horizon_steps: 4
cond_steps: 1
@ -50,7 +49,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -15,7 +15,6 @@ env: avoid
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
@ -47,7 +46,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -15,7 +15,6 @@ env: avoid
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
num_modes: 5
@ -49,7 +48,7 @@ model:
num_modes: ${num_modes}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -15,7 +15,6 @@ env: avoid
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
horizon_steps: 4
cond_steps: 1
@ -50,7 +49,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -15,7 +15,6 @@ env: avoid
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
@ -47,7 +46,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -15,7 +15,6 @@ env: avoid
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
num_modes: 5
@ -49,7 +48,7 @@ model:
num_modes: ${num_modes}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -15,7 +15,6 @@ env: avoid
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
horizon_steps: 4
cond_steps: 1
@ -50,7 +49,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -15,7 +15,6 @@ env: avoid
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
@ -47,7 +46,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -15,7 +15,6 @@ env: avoid
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
num_modes: 5
@ -49,7 +48,7 @@ model:
num_modes: ${num_modes}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
cond_steps: 1
horizon_steps: 8
@ -59,7 +58,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -105,7 +104,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -107,7 +106,7 @@ model:
cond_predict_scale: True
groupnorm_eps: 1e-4
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
@ -98,7 +97,7 @@ model:
std_max: 0.2
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -105,7 +104,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -106,7 +105,7 @@ model:
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
@ -98,7 +97,7 @@ model:
std_max: 0.2
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -105,7 +104,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -107,7 +106,7 @@ model:
cond_predict_scale: True
groupnorm_eps: 1e-4
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
@ -98,7 +97,7 @@ model:
std_max: 0.2
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -105,7 +104,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -107,7 +106,7 @@ model:
cond_predict_scale: True
groupnorm_eps: 1e-4
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
@ -98,7 +97,7 @@ model:
std_max: 0.2
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -105,7 +104,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -107,7 +106,7 @@ model:
cond_predict_scale: True
groupnorm_eps: 1e-4
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
@ -98,7 +97,7 @@ model:
std_max: 0.2
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -105,7 +104,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
@ -106,7 +105,7 @@ model:
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
@ -98,7 +97,7 @@ model:
std_max: 0.2
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -16,7 +16,6 @@ randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
@ -52,7 +51,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
@ -54,7 +53,7 @@ model:
cond_predict_scale: True
groupnorm_eps: 1e-4 # not important
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
@ -49,7 +48,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -16,7 +16,6 @@ randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
@ -52,7 +51,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
@ -53,7 +52,7 @@ model:
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
@ -49,7 +48,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -16,7 +16,6 @@ randomness: low
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
@ -52,7 +51,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: low
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
@ -54,7 +53,7 @@ model:
cond_predict_scale: True
groupnorm_eps: 1e-4 # not important
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: low
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
@ -49,7 +48,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -16,7 +16,6 @@ randomness: med
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
@ -52,7 +51,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: med
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
@ -53,7 +52,7 @@ model:
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: med
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
@ -49,7 +48,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -16,7 +16,6 @@ randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
@ -52,7 +51,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
@ -53,7 +52,7 @@ model:
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
@ -49,7 +48,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -16,7 +16,6 @@ randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
@ -52,7 +51,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
@ -53,7 +52,7 @@ model:
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -16,7 +16,6 @@ randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
@ -49,7 +48,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -0,0 +1,61 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
denoising_steps: 20
cond_steps: 1
horizon_steps: 1
act_steps: 1
n_steps: 1000 # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
render_num: 0
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
#
network_path: ${base_policy_path}
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,54 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent
name: ${env_name}_eval_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path:
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
cond_steps: 1
horizon_steps: 1
act_steps: 1
n_steps: 1000 # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
render_num: 0
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
model:
_target_: model.common.gaussian.GaussianModel
#
network_path: ${base_policy_path}
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256, 256]
activation_type: Mish
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -15,7 +15,6 @@ device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
@ -54,7 +53,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}

View File

@ -0,0 +1,54 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent
name: ${env_name}_eval_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
cond_steps: 1
horizon_steps: 1
act_steps: 1
n_steps: 1000 # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
render_num: 0
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
model:
_target_: model.common.gaussian.GaussianModel
#
network_path: ${base_policy_path}
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256, 256]
activation_type: Mish
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -0,0 +1,117 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
name: ${env_name}_calql_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path:
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
cond_steps: 1
horizon_steps: 1
act_steps: 1
env:
n_envs: 1
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: calql-${env_name}
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 10000
n_steps: 1 # not used
n_episode_per_epoch: 1
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 3e-4
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 3e-4
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
log_freq: 1
# CalQL specific
train_online: True
batch_size: 256
n_random_actions: 4
target_ema_rate: 0.005
scale_reward_factor: 1.0
num_update: 1000
buffer_size: 1000000
online_utd_ratio: 1
n_eval_episode: 10
n_explore_steps: 0
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
init_temperature: 1
automatic_entropy_tuning: True
model:
_target_: model.rl.gaussian_calql.CalQL_Gaussian
randn_clip_value: 3
cql_min_q_weight: 5.0
tanh_output: True
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256]
activation_type: ReLU
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
std_max: 7.3891
std_min: 0.0067
critic:
_target_: model.common.critic.CriticObsAct
mlp_dims: [256, 256]
activation_type: ReLU
use_layernorm: True
double_q: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
action_dim: ${action_dim}
action_steps: ${act_steps}
horizon_steps: ${horizon_steps}
device: ${device}
offline_dataset:
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
dataset_path: ${offline_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}
discount_factor: ${train.gamma}
get_mc_return: True

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
@ -68,7 +68,7 @@ train:
max_adv_weight: 100
beta: 10
buffer_size: 5000
batch_size: 256
batch_size: 1000
replay_ratio: 64
critic_update_ratio: 4
@ -82,7 +82,7 @@ model:
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
time_dim: 16
mlp_dims: [512, 512, 512]

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
@ -65,11 +65,12 @@ train:
num: 0
# DIPO specific
scale_reward_factor: 0.01
eta: 0.0001
target_ema_rate: 0.005
buffer_size: 1000000
action_lr: 0.0001
action_gradient_steps: 10
buffer_size: 400000
batch_size: 5000
replay_ratio: 64
batch_size: 1000
model:
_target_: model.diffusion.diffusion_dipo.DIPODiffusion
@ -81,7 +82,7 @@ model:
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
time_dim: 16
mlp_dims: [512, 512, 512]

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
@ -65,10 +65,11 @@ train:
num: 0
# DQL specific
scale_reward_factor: 0.01
target_ema_rate: 0.005
buffer_size: 1000000
eta: 1.0
buffer_size: 400000
batch_size: 5000
replay_ratio: 64
replay_ratio: 16
batch_size: 1000
model:
_target_: model.diffusion.diffusion_dql.DQLDiffusion
@ -80,7 +81,7 @@ model:
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
time_dim: 16
mlp_dims: [512, 512, 512]

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
@ -69,9 +69,9 @@ train:
eval_sample_num: 20 # how many samples to score during eval
critic_tau: 0.001 # rate of target q network update
use_expectile_exploration: True
buffer_size: 5000
batch_size: 512
replay_ratio: 16
buffer_size: 25000 # * n_envs
replay_ratio: 128
batch_size: 1000
model:
_target_: model.diffusion.diffusion_idql.IDQLDiffusion
@ -83,7 +83,7 @@ model:
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
time_dim: 16
mlp_dims: [512, 512, 512]

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
@ -93,7 +93,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_ppo_exact_diffusion_agent.TrainPPOExactDiffusionAgent
name: ${env_name}_ppo_exact_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
@ -87,7 +87,6 @@ model:
sde_min_beta: 1e-10
sde_probability_flow: True
#
gamma_denoising: 0.99
clip_ploss_coef: 0.01
min_sampling_denoising_std: 0.1
min_logprob_denoising_std: 0.1
@ -101,7 +100,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
@ -65,11 +65,11 @@ train:
num: 0
# QSM specific
scale_reward_factor: 0.01
q_grad_coeff: 50
critic_tau: 0.005 # rate of target q network update
buffer_size: 5000
batch_size: 256
replay_ratio: 32
q_grad_coeff: 10
critic_tau: 0.005
buffer_size: 25000
replay_ratio: 16
batch_size: 1000
model:
_target_: model.diffusion.diffusion_qsm.QSMDiffusion
@ -81,7 +81,7 @@ model:
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
time_dim: 16
mlp_dims: [512, 512, 512]

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
@ -73,7 +73,7 @@ model:
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
time_dim: 16
mlp_dims: [512, 512, 512]

View File

@ -0,0 +1,109 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
base_policy_path:
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
cond_steps: 1
horizon_steps: 1
act_steps: 1
env:
n_envs: 1
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: ibrl-${env_name}
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 300000
n_steps: 1
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-4
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
save_model_freq: 50000
val_freq: 2000
render:
freq: 1
num: 0
log_freq: 200
# IBRL specific
batch_size: 256
target_ema_rate: 0.01
scale_reward_factor: 1
critic_num_update: 5
buffer_size: 300000
n_eval_episode: 10
n_explore_steps: 0
update_freq: 2
model:
_target_: model.rl.gaussian_ibrl.IBRL_Gaussian
randn_clip_value: 3
n_critics: 5
soft_action_sample: True
soft_action_sample_beta: 10
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256, 256]
activation_type: Mish
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
critic:
_target_: model.common.critic.CriticObsAct
mlp_dims: [256, 256, 256]
activation_type: ReLU
use_layernorm: True
double_q: False # use ensemble
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
action_dim: ${action_dim}
action_steps: ${act_steps}
horizon_steps: ${horizon_steps}
device: ${device}
offline_dataset:
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
dataset_path: ${offline_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}
max_n_episodes: 50

View File

@ -6,14 +6,14 @@ hydra:
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_nopre_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 20
cond_steps: 1
@ -86,7 +86,7 @@ model:
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
time_dim: 16
mlp_dims: [512, 512, 512]

View File

@ -6,14 +6,14 @@ hydra:
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_nopre_ppo_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 1
act_steps: 1
@ -79,10 +79,10 @@ model:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
residual_style: False # with new logvar head
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

View File

@ -0,0 +1,109 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
name: ${env_name}_rlpd_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
cond_steps: 1
horizon_steps: 1
act_steps: 1
env:
n_envs: 1
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: rlpd-${env_name}
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 250000
n_steps: 1
gamma: 0.99
actor_lr: 3e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 3e-4
critic_lr: 3e-4
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 3e-4
save_model_freq: 50000
val_freq: 5000
render:
freq: 1
num: 0
log_freq: 200
# RLPD specific
batch_size: 256
target_ema_rate: 0.005
scale_reward_factor: 1
critic_num_update: 20
buffer_size: 1000000
n_eval_episode: 10
n_explore_steps: 5000
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
init_temperature: 1
model:
_target_: model.rl.gaussian_rlpd.RLPD_Gaussian
randn_clip_value: 10
tanh_output: True # squash after sampling
backup_entropy: True
n_critics: 10 # Ensemble size for critic models
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256]
activation_type: ReLU
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
std_max: 7.3891
std_min: 0.0067
critic:
_target_: model.common.critic.CriticObsAct
mlp_dims: [256, 256]
activation_type: ReLU
use_layernorm: True
double_q: False # use ensemble
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
action_dim: ${action_dim}
action_steps: ${act_steps}
horizon_steps: ${horizon_steps}
device: ${device}
offline_dataset:
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
dataset_path: ${offline_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,89 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_sac_agent.TrainSACAgent
name: ${env_name}_sac_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
seed: 42
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
cond_steps: 1
horizon_steps: 1
act_steps: 1
env:
n_envs: 1
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: sac-gym-${env_name}
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000000
n_steps: 1
gamma: 0.99
actor_lr: 3e-4
critic_lr: 1e-3
save_model_freq: 100000
val_freq: 10000
render:
freq: 1
num: 0
log_freq: 200
# SAC specific
batch_size: 256
target_ema_rate: 0.005
scale_reward_factor: 1
critic_replay_ratio: 256
actor_replay_ratio: 128
buffer_size: 1000000
n_eval_episode: 10
n_explore_steps: 5000
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
init_temperature: 1
model:
_target_: model.rl.gaussian_sac.SAC_Gaussian
randn_clip_value: 10
tanh_output: True # squash after sampling
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256]
activation_type: ReLU
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
std_max: 7.3891
std_min: 0.0067
critic: # no layernorm
_target_: model.common.critic.CriticObsAct
mlp_dims: [256, 256]
activation_type: ReLU
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
action_dim: ${action_dim}
action_steps: ${act_steps}
horizon_steps: ${horizon_steps}
device: ${device}

View File

@ -0,0 +1,117 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
name: ${env_name}_calql_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path:
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
seed: 42
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
cond_steps: 1
horizon_steps: 1
act_steps: 1
env:
n_envs: 1
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: calql-${env_name}
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 10000
n_steps: 1 # not used
n_episode_per_epoch: 1
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 3e-4
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 3e-4
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
log_freq: 1
# CalQL specific
train_online: True
batch_size: 256
n_random_actions: 4
target_ema_rate: 0.005
scale_reward_factor: 1.0
num_update: 1000
buffer_size: 1000000
online_utd_ratio: 1
n_eval_episode: 10
n_explore_steps: 0
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
init_temperature: 1
automatic_entropy_tuning: True
model:
_target_: model.rl.gaussian_calql.CalQL_Gaussian
randn_clip_value: 3
cql_min_q_weight: 5.0
tanh_output: True
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256]
activation_type: ReLU
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
std_max: 7.3891
std_min: 0.0067
critic:
_target_: model.common.critic.CriticObsAct
mlp_dims: [256, 256]
activation_type: ReLU
use_layernorm: True
double_q: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
action_dim: ${action_dim}
action_steps: ${act_steps}
horizon_steps: ${horizon_steps}
device: ${device}
offline_dataset:
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
dataset_path: ${offline_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}
discount_factor: ${train.gamma}
get_mc_return: True

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
@ -68,7 +68,7 @@ train:
max_adv_weight: 100
beta: 10
buffer_size: 5000
batch_size: 256
batch_size: 1000
replay_ratio: 64
critic_update_ratio: 4
@ -82,7 +82,7 @@ model:
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
time_dim: 16
mlp_dims: [512, 512, 512]

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
@ -65,11 +65,12 @@ train:
num: 0
# DIPO specific
scale_reward_factor: 0.01
eta: 0.0001
target_ema_rate: 0.005
buffer_size: 1000000
action_lr: 0.0001
action_gradient_steps: 10
buffer_size: 400000
batch_size: 5000
replay_ratio: 64
batch_size: 1000
model:
_target_: model.diffusion.diffusion_dipo.DIPODiffusion
@ -81,7 +82,7 @@ model:
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
time_dim: 16
mlp_dims: [512, 512, 512]

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
@ -65,10 +65,11 @@ train:
num: 0
# DQL specific
scale_reward_factor: 0.01
target_ema_rate: 0.005
buffer_size: 1000000
eta: 1.0
buffer_size: 400000
batch_size: 5000
replay_ratio: 64
replay_ratio: 16
batch_size: 1000
model:
_target_: model.diffusion.diffusion_dql.DQLDiffusion
@ -80,7 +81,7 @@ model:
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
time_dim: 16
mlp_dims: [512, 512, 512]

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
@ -69,9 +69,9 @@ train:
eval_sample_num: 20 # how many samples to score during eval
critic_tau: 0.001 # rate of target q network update
use_expectile_exploration: True
buffer_size: 5000
batch_size: 512
replay_ratio: 16
buffer_size: 25000 # * n_envs
replay_ratio: 128
batch_size: 1000
model:
_target_: model.diffusion.diffusion_idql.IDQLDiffusion
@ -83,7 +83,7 @@ model:
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
time_dim: 16
mlp_dims: [512, 512, 512]

View File

@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
seed: 42
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
@ -93,7 +93,7 @@ model:
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}

Some files were not shown because too many files have changed in this diff Show More