diff --git a/.gitignore b/.gitignore index f145a12..b45dd92 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ checkpoints/ out/ err/ *.pkl +*.sh # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/agent/dataset/sequence.py b/agent/dataset/sequence.py index 08349a3..b0edabf 100644 --- a/agent/dataset/sequence.py +++ b/agent/dataset/sequence.py @@ -11,10 +11,15 @@ import torch import logging import pickle import random +from tqdm import tqdm log = logging.getLogger(__name__) Batch = namedtuple("Batch", "actions conditions") +Transition = namedtuple("Transition", "actions conditions rewards dones") +TransitionWithReturn = namedtuple( + "Transition", "actions conditions rewards dones reward_to_gos" +) class StitchedSequenceDataset(torch.utils.data.Dataset): @@ -49,6 +54,8 @@ class StitchedSequenceDataset(torch.utils.data.Dataset): self.img_cond_steps = img_cond_steps self.device = device self.use_img = use_img + self.max_n_episodes = max_n_episodes + self.dataset_path = dataset_path # Load dataset to device specified if dataset_path.endswith(".npz"): @@ -87,7 +94,7 @@ class StitchedSequenceDataset(torch.utils.data.Dataset): """ start, num_before_start = self.indices[idx] end = start + self.horizon_steps - states = self.states[(start - num_before_start) : end] + states = self.states[(start - num_before_start) : (start + 1)] actions = self.actions[start:end] states = torch.stack( [ @@ -116,9 +123,9 @@ class StitchedSequenceDataset(torch.utils.data.Dataset): indices = [] cur_traj_index = 0 for traj_length in traj_lengths: - max_start = cur_traj_index + traj_length - horizon_steps + 1 + max_start = cur_traj_index + traj_length - horizon_steps indices += [ - (i, i - cur_traj_index) for i in range(cur_traj_index, max_start) + (i, i - cur_traj_index) for i in range(cur_traj_index, max_start + 1) ] cur_traj_index += traj_length return indices @@ -135,3 +142,151 @@ class StitchedSequenceDataset(torch.utils.data.Dataset): def __len__(self): return 
len(self.indices) + + +class StitchedSequenceQLearningDataset(StitchedSequenceDataset): + """ + Extends StitchedSequenceDataset to include rewards and dones for Q learning + + Do not load the last step of **truncated** episodes since we do not have the correct next state for the final step of each episode. Truncation can be determined by terminal=False but end of episode. + """ + + def __init__( + self, + dataset_path, + max_n_episodes=10000, + discount_factor=1.0, + device="cuda:0", + get_mc_return=False, + **kwargs, + ): + if dataset_path.endswith(".npz"): + dataset = np.load(dataset_path, allow_pickle=False) + elif dataset_path.endswith(".pkl"): + with open(dataset_path, "rb") as f: + dataset = pickle.load(f) + else: + raise ValueError(f"Unsupported file format: {dataset_path}") + traj_lengths = dataset["traj_lengths"][:max_n_episodes] + total_num_steps = np.sum(traj_lengths) + + # discount factor + self.discount_factor = discount_factor + + # rewards and dones(terminals) + self.rewards = ( + torch.from_numpy(dataset["rewards"][:total_num_steps]).float().to(device) + ) + log.info(f"Rewards shape/type: {self.rewards.shape, self.rewards.dtype}") + self.dones = ( + torch.from_numpy(dataset["terminals"][:total_num_steps]).to(device).float() + ) + log.info(f"Dones shape/type: {self.dones.shape, self.dones.dtype}") + + super().__init__( + dataset_path=dataset_path, + max_n_episodes=max_n_episodes, + device=device, + **kwargs, + ) + log.info(f"Total number of transitions using: {len(self)}") + + # compute discounted reward-to-go for each trajectory + self.get_mc_return = get_mc_return + if get_mc_return: + self.reward_to_go = torch.zeros_like(self.rewards) + cumulative_traj_length = np.cumsum(traj_lengths) + prev_traj_length = 0 + for i, traj_length in tqdm( + enumerate(cumulative_traj_length), desc="Computing reward-to-go" + ): + traj_rewards = self.rewards[prev_traj_length:traj_length] + returns = torch.zeros_like(traj_rewards) + prev_return = 0 + for t in 
range(len(traj_rewards)): + returns[-t - 1] = ( + traj_rewards[-t - 1] + self.discount_factor * prev_return + ) + prev_return = returns[-t - 1] + self.reward_to_go[prev_traj_length:traj_length] = returns + prev_traj_length = traj_length + log.info(f"Computed reward-to-go for each trajectory.") + + def make_indices(self, traj_lengths, horizon_steps): + """ + skip last step of truncated episodes + """ + num_skip = 0 + indices = [] + cur_traj_index = 0 + for traj_length in traj_lengths: + max_start = cur_traj_index + traj_length - horizon_steps + if not self.dones[cur_traj_index + traj_length - 1]: # truncation + max_start -= 1 + num_skip += 1 + indices += [ + (i, i - cur_traj_index) for i in range(cur_traj_index, max_start + 1) + ] + cur_traj_index += traj_length + log.info(f"Number of transitions skipped due to truncation: {num_skip}") + return indices + + def __getitem__(self, idx): + start, num_before_start = self.indices[idx] + end = start + self.horizon_steps + states = self.states[(start - num_before_start) : (start + 1)] + actions = self.actions[start:end] + rewards = self.rewards[start : (start + 1)] + dones = self.dones[start : (start + 1)] + + # Account for action horizon + if idx < len(self.indices) - self.horizon_steps: + next_states = self.states[ + (start - num_before_start + self.horizon_steps) : start + + 1 + + self.horizon_steps + ] # even if this uses the first state(s) of the next episode, done=True will prevent bootstrapping. We have already filtered out cases where done=False but end of episode (truncation). 
+ else: + # prevents indexing error, but ignored since done=True + next_states = torch.zeros_like(states) + + # stack obs history + states = torch.stack( + [ + states[max(num_before_start - t, 0)] + for t in reversed(range(self.cond_steps)) + ] + ) # more recent is at the end + next_states = torch.stack( + [ + next_states[max(num_before_start - t, 0)] + for t in reversed(range(self.cond_steps)) + ] + ) # more recent is at the end + conditions = {"state": states, "next_state": next_states} + if self.use_img: + images = self.images[(start - num_before_start) : end] + images = torch.stack( + [ + images[max(num_before_start - t, 0)] + for t in reversed(range(self.img_cond_steps)) + ] + ) + conditions["rgb"] = images + if self.get_mc_return: + reward_to_gos = self.reward_to_go[start : (start + 1)] + batch = TransitionWithReturn( + actions, + conditions, + rewards, + dones, + reward_to_gos, + ) + else: + batch = Transition( + actions, + conditions, + rewards, + dones, + ) + return batch diff --git a/agent/eval/eval_diffusion_agent.py b/agent/eval/eval_diffusion_agent.py index 9daa70f..00ffab5 100644 --- a/agent/eval/eval_diffusion_agent.py +++ b/agent/eval/eval_diffusion_agent.py @@ -36,7 +36,7 @@ class EvalDiffusionAgent(EvalAgent): firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 - reward_trajs = np.empty((0, self.n_envs)) + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -57,9 +57,13 @@ class EvalDiffusionAgent(EvalAgent): action_venv = output_venv[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - firsts_trajs[step + 1] = done_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) + ) + reward_trajs[step] = 
reward_venv + firsts_trajs[step + 1] = terminated_venv | truncated_venv + + # update for next step prev_obs_venv = obs_venv # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. diff --git a/agent/eval/eval_diffusion_img_agent.py b/agent/eval/eval_diffusion_img_agent.py index e8eacbe..6002413 100644 --- a/agent/eval/eval_diffusion_img_agent.py +++ b/agent/eval/eval_diffusion_img_agent.py @@ -40,7 +40,7 @@ class EvalImgDiffusionAgent(EvalAgent): firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 - reward_trajs = np.empty((0, self.n_envs)) + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -60,9 +60,13 @@ class EvalImgDiffusionAgent(EvalAgent): action_venv = output_venv[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - firsts_trajs[step + 1] = done_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) + ) + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = terminated_venv | truncated_venv + + # update for next step prev_obs_venv = obs_venv # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
diff --git a/agent/eval/eval_gaussian_agent.py b/agent/eval/eval_gaussian_agent.py index 4dc124b..96b304d 100644 --- a/agent/eval/eval_gaussian_agent.py +++ b/agent/eval/eval_gaussian_agent.py @@ -36,7 +36,7 @@ class EvalGaussianAgent(EvalAgent): firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 - reward_trajs = np.empty((0, self.n_envs)) + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -55,9 +55,13 @@ class EvalGaussianAgent(EvalAgent): action_venv = output_venv[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - firsts_trajs[step + 1] = done_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) + ) + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = terminated_venv | truncated_venv + + # update for next step prev_obs_venv = obs_venv # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
diff --git a/agent/eval/eval_gaussian_img_agent.py b/agent/eval/eval_gaussian_img_agent.py index 247c0f8..2a8e54a 100644 --- a/agent/eval/eval_gaussian_img_agent.py +++ b/agent/eval/eval_gaussian_img_agent.py @@ -40,7 +40,7 @@ class EvalImgGaussianAgent(EvalAgent): firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 - reward_trajs = np.empty((0, self.n_envs)) + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -58,9 +58,13 @@ class EvalImgGaussianAgent(EvalAgent): action_venv = output_venv[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - firsts_trajs[step + 1] = done_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) + ) + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = terminated_venv | truncated_venv + + # update for next step prev_obs_venv = obs_venv # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
diff --git a/agent/finetune/train_awr_diffusion_agent.py b/agent/finetune/train_awr_diffusion_agent.py index 4ec3cf1..e6c3850 100644 --- a/agent/finetune/train_awr_diffusion_agent.py +++ b/agent/finetune/train_awr_diffusion_agent.py @@ -26,7 +26,7 @@ from util.scheduler import CosineAnnealingWarmupRestarts def td_values( states, rewards, - dones, + terminateds, state_values, gamma=0.99, alpha=0.95, @@ -43,21 +43,20 @@ def td_values( """ sample_count = len(states) tds = np.zeros_like(state_values, dtype=np.float32) - dones[-1] = 1 - next_value = 1 - dones[-1] + next_value = state_values[-1].copy() + next_value[terminateds[-1]] = 0.0 val = 0.0 for i in range(sample_count - 1, -1, -1): - # next_value = 0.0 if dones[i] else state_values[i + 1] # get next_value for vectorized if i < sample_count - 1: next_value = state_values[i + 1] - next_value = next_value * (1 - dones[i]) + next_value = next_value * (1 - terminateds[i]) state_value = state_values[i] error = rewards[i] + gamma * next_value - state_value - val = alpha * error + gamma * lam * (1 - dones[i]) * val + val = alpha * error + gamma * lam * (1 - terminateds[i]) * val tds[i] = val + state_value return tds @@ -127,12 +126,12 @@ class TrainAWRDiffusionAgent(TrainAgent): obs_buffer = deque(maxlen=self.buffer_size) action_buffer = deque(maxlen=self.buffer_size) reward_buffer = deque(maxlen=self.buffer_size) - done_buffer = deque(maxlen=self.buffer_size) - first_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) # Start training loop timer = Timer() run_results = [] + cnt_train_step = 0 last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: @@ -156,10 +155,9 @@ class TrainAWRDiffusionAgent(TrainAgent): prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 else: - firsts_trajs[0] = ( - done_venv # if done at the end of last iteration, then the envs are just reset - ) - reward_trajs = np.empty((0, 
self.n_envs)) + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -184,21 +182,26 @@ class TrainAWRDiffusionAgent(TrainAgent): action_venv = samples[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step( - action_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) ) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv # add to buffer - obs_buffer.append(prev_obs_venv["state"]) - action_buffer.append(action_venv) - reward_buffer.append(reward_venv * self.scale_reward_factor) - done_buffer.append(done_venv) - first_buffer.append(firsts_trajs[step]) + if not eval_mode: + obs_buffer.append(prev_obs_venv["state"]) + action_buffer.append(action_venv) + reward_buffer.append(reward_venv * self.scale_reward_factor) + terminated_buffer.append(terminated_venv) - firsts_trajs[step + 1] = done_venv + # update for next step prev_obs_venv = obs_venv + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
episodes_start_end = [] for env_ind in range(self.n_envs): @@ -240,7 +243,7 @@ class TrainAWRDiffusionAgent(TrainAgent): if not eval_mode: obs_trajs = np.array(deepcopy(obs_buffer)) # assume only state reward_trajs = np.array(deepcopy(reward_buffer)) - dones_trajs = np.array(deepcopy(done_buffer)) + terminated_trajs = np.array(deepcopy(terminated_buffer)) obs_t = einops.rearrange( torch.from_numpy(obs_trajs).float().to(self.device), "s e h d -> (s e) h d", @@ -248,7 +251,9 @@ class TrainAWRDiffusionAgent(TrainAgent): values_trajs = np.array( self.model.critic({"state": obs_t}).detach().cpu().numpy() ).reshape(-1, self.n_envs) - td_trajs = td_values(obs_trajs, reward_trajs, dones_trajs, values_trajs) + td_trajs = td_values( + obs_trajs, reward_trajs, terminated_trajs, values_trajs + ) td_t = torch.from_numpy(td_trajs.flatten()).float().to(self.device) # Update critic @@ -268,7 +273,7 @@ class TrainAWRDiffusionAgent(TrainAgent): obs_trajs = np.array(deepcopy(obs_buffer)) samples_trajs = np.array(deepcopy(action_buffer)) reward_trajs = np.array(deepcopy(reward_buffer)) - dones_trajs = np.array(deepcopy(done_buffer)) + terminated_trajs = np.array(deepcopy(terminated_buffer)) obs_t = einops.rearrange( torch.from_numpy(obs_trajs).float().to(self.device), "s e h d -> (s e) h d", @@ -276,7 +281,9 @@ class TrainAWRDiffusionAgent(TrainAgent): values_trajs = np.array( self.model.critic({"state": obs_t}).detach().cpu().numpy() ).reshape(-1, self.n_envs) - td_trajs = td_values(obs_trajs, reward_trajs, dones_trajs, values_trajs) + td_trajs = td_values( + obs_trajs, reward_trajs, terminated_trajs, values_trajs + ) advantages_trajs = td_trajs - values_trajs # flatten @@ -315,13 +322,13 @@ class TrainAWRDiffusionAgent(TrainAgent): advantages_b_scaled.clamp_(max=self.max_adv_weight) # Update policy with collected trajectories - loss = self.model.loss( + loss_actor = self.model.loss( actions_b, obs_b, advantages_b_scaled.detach(), ) self.actor_optimizer.zero_grad() - loss.backward() 
+ loss_actor.backward() if self.itr >= self.n_critic_warmup_itr: if self.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_( @@ -341,10 +348,12 @@ class TrainAWRDiffusionAgent(TrainAgent): run_results.append( { "itr": self.itr, + "step": cnt_train_step, } ) if self.itr % self.log_freq == 0: time = timer() + run_results[-1]["time"] = time if eval_mode: log.info( f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" @@ -365,12 +374,13 @@ class TrainAWRDiffusionAgent(TrainAgent): run_results[-1]["eval_best_reward"] = avg_best_reward else: log.info( - f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}" + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" ) if self.use_wandb: wandb.log( { - "loss": loss, + "total env step": cnt_train_step, + "loss - actor": loss_actor, "loss - critic": loss_critic, "avg episode reward - train": avg_episode_reward, "num episode - train": num_episode_finished, @@ -378,10 +388,7 @@ class TrainAWRDiffusionAgent(TrainAgent): step=self.itr, commit=True, ) - run_results[-1]["loss"] = loss - run_results[-1]["loss_critic"] = loss_critic run_results[-1]["train_episode_reward"] = avg_episode_reward - run_results[-1]["time"] = time with open(self.result_path, "wb") as f: pickle.dump(run_results, f) self.itr += 1 diff --git a/agent/finetune/train_calql_agent.py b/agent/finetune/train_calql_agent.py new file mode 100644 index 0000000..cd96d0b --- /dev/null +++ b/agent/finetune/train_calql_agent.py @@ -0,0 +1,501 @@ +""" +Reinforcement Learning with Prior Data (RLPD) agent training script. + +Does not support image observations right now. 
+"""
+
+import os
+import pickle
+import numpy as np
+import torch
+import logging
+import wandb
+import hydra
+from collections import deque
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from agent.finetune.train_agent import TrainAgent
+from util.scheduler import CosineAnnealingWarmupRestarts
+
+
+class TrainCalQLAgent(TrainAgent):
+    def __init__(self, cfg):
+        super().__init__(cfg)
+        assert self.n_envs == 1, "Cal-QL only supports single env for now"
+
+        # Train mode (offline or online)
+        self.train_online = cfg.train.train_online
+
+        # Build dataset
+        self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset)
+
+        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
+        self.gamma = cfg.train.gamma
+
+        # Optimizer
+        self.actor_optimizer = torch.optim.AdamW(
+            self.model.network.parameters(),
+            lr=cfg.train.actor_lr,
+            weight_decay=cfg.train.actor_weight_decay,
+        )
+        self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.actor_optimizer,
+            first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.actor_lr,
+            min_lr=cfg.train.actor_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+        self.critic_optimizer = torch.optim.AdamW(
+            self.model.critic.parameters(),
+            lr=cfg.train.critic_lr,
+            weight_decay=cfg.train.critic_weight_decay,
+        )
+        self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.critic_optimizer,
+            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.critic_lr,
+            min_lr=cfg.train.critic_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+
+        # Target critic EMA update rate (soft target update coefficient)
+        self.target_ema_rate = cfg.train.target_ema_rate
+
+        # Number of random actions to sample for Cal-QL
+        self.n_random_actions = cfg.train.n_random_actions
+
+        # Reward scale
+        self.scale_reward_factor = 
cfg.train.scale_reward_factor
+
+        # Number of critic updates
+        self.num_update = cfg.train.num_update
+
+        # Buffer size
+        self.buffer_size = cfg.train.buffer_size
+
+        # Online only configs
+        if self.train_online:
+            # number of episodes to collect per epoch for training
+            self.n_episode_per_epoch = cfg.train.n_episode_per_epoch
+            # UTD ratio
+            self.online_utd_ratio = cfg.train.online_utd_ratio
+
+        # Eval episodes
+        self.n_eval_episode = cfg.train.n_eval_episode
+
+        # Exploration steps at the beginning - using randomly sampled action
+        self.n_explore_steps = cfg.train.n_explore_steps
+
+        # Initialize temperature parameter for entropy
+        init_temperature = cfg.train.init_temperature
+        self.log_alpha = torch.tensor(np.log(init_temperature)).to(self.device)
+        self.log_alpha.requires_grad = True
+        self.automatic_entropy_tuning = cfg.train.automatic_entropy_tuning
+        self.target_entropy = cfg.train.target_entropy
+        self.log_alpha_optimizer = torch.optim.Adam(
+            [self.log_alpha],
+            lr=cfg.train.critic_lr,
+        )
+
+    def run(self):
+        # make a FIFO replay buffer for obs, action, and reward
+        obs_buffer = deque(maxlen=self.buffer_size)
+        next_obs_buffer = deque(maxlen=self.buffer_size)
+        action_buffer = deque(maxlen=self.buffer_size)
+        reward_buffer = deque(maxlen=self.buffer_size)
+        reward_to_go_buffer = deque(maxlen=self.buffer_size)
+        terminated_buffer = deque(maxlen=self.buffer_size)
+        if not self.train_online:
+            obs_array = np.array(obs_buffer)
+            next_obs_array = np.array(next_obs_buffer)
+            actions_array = np.array(action_buffer)
+            rewards_array = np.array(reward_buffer)
+            reward_to_go_array = np.array(reward_to_go_buffer)
+            terminated_array = np.array(terminated_buffer)
+
+        # load offline dataset into replay buffer
+        dataloader_offline = torch.utils.data.DataLoader(
+            self.dataset_offline,
+            batch_size=len(self.dataset_offline),
+            drop_last=False,
+        )
+        for batch in dataloader_offline:
+            actions, states_and_next, rewards, terminated, reward_to_go = batch
+            states = 
states_and_next["state"] + next_states = states_and_next["next_state"] + obs_buffer_off = states.cpu().numpy() + next_obs_buffer_off = next_states.cpu().numpy() + action_buffer_off = actions.cpu().numpy() + reward_buffer_off = rewards.cpu().numpy().flatten() + reward_to_go_buffer_off = reward_to_go.cpu().numpy().flatten() + terminated_buffer_off = terminated.cpu().numpy().flatten() + + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of {self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr >= self.n_explore_steps + and not self.force_train + ) + # during eval, we collect a fixed number of episodes, so we set n_steps to a large value + if eval_mode: + n_steps = int(1e5) + elif not self.train_online: + n_steps = 0 + else: + n_steps = int(1e5) # use episodes + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.zeros((n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((n_steps, self.n_envs)) + + # Collect a set 
of trajectories from env
+            cnt_episode = 0
+            for step in range(n_steps):
+                if step % 100 == 0:
+                    print(f"Completed environment step {step}")
+
+                # Select action
+                if self.itr < self.n_explore_steps:
+                    action_venv = self.venv.action_space.sample()
+                else:
+                    with torch.no_grad():
+                        cond = {
+                            "state": torch.from_numpy(prev_obs_venv["state"])
+                            .float()
+                            .to(self.device)
+                        }
+                        samples = (
+                            self.model(
+                                cond=cond,
+                                deterministic=eval_mode,
+                            )
+                            .cpu()
+                            .numpy()
+                        )  # n_env x horizon x act
+                        action_venv = samples[:, : self.act_steps]
+
+                # Apply multi-step action
+                obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
+                    self.venv.step(action_venv)
+                )
+                done_venv = terminated_venv | truncated_venv
+                reward_trajs[step] = reward_venv
+                firsts_trajs[step + 1] = done_venv
+
+                # add to buffer in train mode
+                if not eval_mode:
+                    for i in range(self.n_envs):
+                        obs_buffer.append(prev_obs_venv["state"][i])
+                        if truncated_venv[i]:
+                            next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
+                        else:  # regular next obs, or reset obs if terminated (ignored since done=True)
+                            next_obs_buffer.append(obs_venv["state"][i])
+                        action_buffer.append(action_venv[i])
+                    reward_buffer.extend(
+                        (reward_venv * self.scale_reward_factor).tolist()
+                    )
+                    terminated_buffer.extend(terminated_venv.tolist())
+
+                # update for next step
+                prev_obs_venv = obs_venv
+
+                # count steps --- not accounting for done within action chunk
+                cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
+
+                # check if enough eval episodes are done
+                cnt_episode += np.sum(done_venv)
+                if eval_mode and cnt_episode >= self.n_eval_episode:
+                    break
+                if not eval_mode and cnt_episode >= self.n_episode_per_epoch:
+                    break
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
+ episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + + # compute episode returns + returns_trajs_split = [ + np.zeros_like(reward_trajs) for reward_trajs in reward_trajs_split + ] + for traj_rewards, traj_returns in zip( + reward_trajs_split, returns_trajs_split + ): + prev_return = 0 + for t in range(len(traj_rewards)): + traj_returns[-t - 1] = ( + traj_rewards[-t - 1] + self.gamma * prev_return + ) + prev_return = traj_returns[-t - 1] + + # flatten (note: only works for single env!) + returns_trajs_split = np.concatenate(returns_trajs_split) + + # extend buffer + reward_to_go_buffer.extend(returns_trajs_split) + + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # Update models + if not eval_mode and self.itr >= self.n_explore_steps: + # TODO: is this slow in online? 
+ if self.train_online: + obs_array = np.array(obs_buffer) + next_obs_array = np.array(next_obs_buffer) + actions_array = np.array(action_buffer) + rewards_array = np.array(reward_buffer) + reward_to_go_array = np.array(reward_to_go_buffer) + terminated_array = np.array(terminated_buffer) + + # override num_update + if self.train_online: + num_update = len(reward_trajs) # assume one env! + else: + num_update = self.num_update + for _ in range(num_update): + # Sample from OFFLINE buffer + inds = np.random.choice( + len(obs_buffer_off), + self.batch_size // 2 if self.train_online else self.batch_size, + ) + obs_b = ( + torch.from_numpy(obs_buffer_off[inds]).float().to(self.device) + ) + next_obs_b = ( + torch.from_numpy(next_obs_buffer_off[inds]) + .float() + .to(self.device) + ) + actions_b = ( + torch.from_numpy(action_buffer_off[inds]) + .float() + .to(self.device) + ) + rewards_b = ( + torch.from_numpy(reward_buffer_off[inds]) + .float() + .to(self.device) + ) + terminated_b = ( + torch.from_numpy(terminated_buffer_off[inds]) + .float() + .to(self.device) + ) + reward_to_go_b = ( + torch.from_numpy(reward_to_go_buffer_off[inds]) + .float() + .to(self.device) + ) + + # Sample from ONLINE buffer + if self.train_online: + inds = np.random.choice(len(obs_buffer), self.batch_size // 2) + obs_b_on = ( + torch.from_numpy(obs_array[inds]).float().to(self.device) + ) + next_obs_b_on = ( + torch.from_numpy(next_obs_array[inds]) + .float() + .to(self.device) + ) + actions_b_on = ( + torch.from_numpy(actions_array[inds]) + .float() + .to(self.device) + ) + rewards_b_on = ( + torch.from_numpy(rewards_array[inds]) + .float() + .to(self.device) + ) + terminated_b_on = ( + torch.from_numpy(terminated_array[inds]) + .float() + .to(self.device) + ) + reward_to_go_b_on = ( + torch.from_numpy(reward_to_go_array[inds]) + .float() + .to(self.device) + ) + + # merge offline and online data + obs_b = torch.cat([obs_b, obs_b_on], dim=0) + next_obs_b = torch.cat([next_obs_b, 
next_obs_b_on], dim=0) + actions_b = torch.cat([actions_b, actions_b_on], dim=0) + rewards_b = torch.cat([rewards_b, rewards_b_on], dim=0) + terminated_b = torch.cat([terminated_b, terminated_b_on], dim=0) + reward_to_go_b = torch.cat( + [reward_to_go_b, reward_to_go_b_on], dim=0 + ) + + # Get a random action for Cal-QL + random_actions = ( + torch.rand( + ( + self.batch_size, + self.n_random_actions, + self.horizon_steps, + self.action_dim, + ) + ).to(self.device) + * 2 + - 1 + ) # scale to [-1, 1] + + # Update critic + alpha = self.log_alpha.exp().item() + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + random_actions, + rewards_b, + reward_to_go_b, + terminated_b, + self.gamma, + alpha, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic + self.model.update_target_critic(self.target_ema_rate) + + # Update actor + loss_actor = self.model.loss_actor( + {"state": obs_b}, + alpha, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + self.actor_optimizer.step() + + # Update temperature parameter + if self.automatic_entropy_tuning: + self.log_alpha_optimizer.zero_grad() + loss_alpha = self.model.loss_temperature( + {"state": obs_b}, + self.log_alpha.exp(), # with grad + self.target_entropy, + ) + loss_alpha.backward() + self.log_alpha_optimizer.step() + + # Update lr + self.actor_lr_scheduler.step() + self.critic_lr_scheduler.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": cnt_train_step, + } + ) + if self.itr % self.log_freq == 0 and self.itr >= self.n_explore_steps: + time = timer() + run_results[-1]["time"] = time + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) 
+ if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | alpha {alpha:8.4f} | t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "total env step": cnt_train_step, + "loss - actor": loss_actor, + "loss - critic": loss_critic, + "entropy coeff": alpha, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + }, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/agent/finetune/train_dipo_diffusion_agent.py b/agent/finetune/train_dipo_diffusion_agent.py index 1726dff..dcd7638 100644 --- a/agent/finetune/train_dipo_diffusion_agent.py +++ b/agent/finetune/train_dipo_diffusion_agent.py @@ -65,11 +65,14 @@ class TrainDIPODiffusionAgent(TrainAgent): gamma=1.0, ) + # target update rate + self.target_ema_rate = cfg.train.target_ema_rate + # Buffer size self.buffer_size = cfg.train.buffer_size - # Perturbation scale - self.eta = cfg.train.eta + # Action gradient scaling + self.action_lr = cfg.train.action_lr # Updates self.replay_ratio = cfg.train.replay_ratio @@ -80,6 +83,9 @@ class TrainDIPODiffusionAgent(TrainAgent): # Apply action gradient many steps self.action_gradient_steps = cfg.train.action_gradient_steps + # Max grad norm for action + self.action_grad_norm = self.action_dim * self.act_steps * 0.1 + def run(self): # make a FIFO replay buffer 
for obs, action, and reward @@ -87,12 +93,12 @@ class TrainDIPODiffusionAgent(TrainAgent): next_obs_buffer = deque(maxlen=self.buffer_size) action_buffer = deque(maxlen=self.buffer_size) reward_buffer = deque(maxlen=self.buffer_size) - done_buffer = deque(maxlen=self.buffer_size) - first_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) # Start training loop timer = Timer() run_results = [] + cnt_train_step = 0 last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: @@ -116,10 +122,9 @@ class TrainDIPODiffusionAgent(TrainAgent): prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 else: - firsts_trajs[0] = ( - done_venv # if done at the end of last iteration, then the envs are just reset - ) - reward_trajs = np.empty((0, self.n_envs)) + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -144,23 +149,33 @@ class TrainDIPODiffusionAgent(TrainAgent): action_venv = samples[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step( - action_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) ) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv # add to buffer - for i in range(self.n_envs): - obs_buffer.append(prev_obs_venv["state"][i]) - next_obs_buffer.append(obs_venv["state"][i]) - action_buffer.append(action_venv[i]) - reward_buffer.append(reward_venv[i] * self.scale_reward_factor) - done_buffer.append(done_venv[i]) - first_buffer.append(firsts_trajs[step]) + if not eval_mode: + for i in range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) 
+ if truncated_venv[i]: # truncated + next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) + else: + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.extend( + (reward_venv * self.scale_reward_factor).tolist() + ) + terminated_buffer.extend(terminated_venv.tolist()) - firsts_trajs[step + 1] = done_venv + # update for next step prev_obs_venv = obs_venv + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. episodes_start_end = [] for env_ind in range(self.n_envs): @@ -200,40 +215,31 @@ class TrainDIPODiffusionAgent(TrainAgent): # Update models if not eval_mode: - num_batch = self.replay_ratio + num_batch = int( + self.n_steps * self.n_envs / self.batch_size * self.replay_ratio + ) + # only worth converting first with parallel envs - large number of updates below + obs_array = np.array(obs_buffer) + next_obs_array = np.array(next_obs_buffer) + action_array = np.array(action_buffer) + reward_array = np.array(reward_buffer) + terminated_array = np.array(terminated_buffer) # Critic learning for _ in range(num_batch): - # Sample batch inds = np.random.choice(len(obs_buffer), self.batch_size) - obs_b = ( - torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds])) - .float() - .to(self.device) - ) + obs_b = torch.from_numpy(obs_array[inds]).float().to(self.device) next_obs_b = ( - torch.from_numpy( - np.vstack([next_obs_buffer[i][None] for i in inds]) - ) - .float() - .to(self.device) + torch.from_numpy(next_obs_array[inds]).float().to(self.device) ) actions_b = ( - torch.from_numpy( - np.vstack([action_buffer[i][None] for i in inds]) - ) - .float() - .to(self.device) + torch.from_numpy(action_array[inds]).float().to(self.device) ) rewards_b 
= ( - torch.from_numpy(np.vstack([reward_buffer[i] for i in inds])) - .float() - .to(self.device) + torch.from_numpy(reward_array[inds]).float().to(self.device) ) - dones_b = ( - torch.from_numpy(np.vstack([done_buffer[i] for i in inds])) - .float() - .to(self.device) + terminated_b = ( + torch.from_numpy(terminated_array[inds]).float().to(self.device) ) # Update critic @@ -242,78 +248,77 @@ class TrainDIPODiffusionAgent(TrainAgent): {"state": next_obs_b}, actions_b, rewards_b, - dones_b, + terminated_b, self.gamma, ) self.critic_optimizer.zero_grad() loss_critic.backward() self.critic_optimizer.step() - # Actor learning - for _ in range(num_batch): - # Sample batch - inds = np.random.choice(len(obs_buffer), self.batch_size) - obs_b = ( - torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds])) - .float() - .to(self.device) - ) - actions_b = ( - torch.from_numpy( - np.vstack([action_buffer[i][None] for i in inds]) - ) - .float() - .to(self.device) - ) - - # Replace actions in buffer with guided actions - guided_action_list = [] - - # get Q-perturbed actions by optimizing - actions_flat = actions_b.reshape(actions_b.shape[0], -1) - actions_optim = torch.optim.Adam( - [actions_flat], lr=self.eta, eps=1e-5 - ) - for _ in range(self.action_gradient_steps): - actions_flat.requires_grad_(True) - q_values_1, q_values_2 = self.model.critic( - {"state": obs_b}, actions_flat - ) - q_values = torch.min(q_values_1, q_values_2) - action_opt_loss = -q_values.sum() - - actions_optim.zero_grad() - action_opt_loss.backward(torch.ones_like(action_opt_loss)) - - # get the perturbed action - actions_optim.step() - - actions_flat.requires_grad_(False) - actions_flat.clamp_(-1.0, 1.0) - guided_action = actions_flat.detach() - guided_action = guided_action.reshape( - guided_action.shape[0], -1, self.action_dim - ) - guided_action_list.append(guided_action) - guided_action_stacked = torch.cat(guided_action_list, 0) - - # Add to buffer (need separate indices since we're working 
with a limited subset) - for i, i_buf in enumerate(inds): - action_buffer[i_buf] = ( - guided_action_stacked[i].detach().cpu().numpy() - ) - - # Update policy with collected trajectories - loss = self.model.loss(guided_action.detach(), {"state": obs_b}) - self.actor_optimizer.zero_grad() - loss.backward() + # Actor learning + loss_actor = 0.0 if self.itr >= self.n_critic_warmup_itr: + inds = np.random.choice(len(obs_buffer), self.batch_size) + obs_b = ( + torch.from_numpy(obs_array[inds]).float().to(self.device) + ) + actions_b = ( + torch.from_numpy(action_array[inds]).float().to(self.device) + ) + + # get Q-perturbed actions by optimizing + actions_flat = actions_b.reshape(len(actions_b), -1) + actions_optim = torch.optim.Adam( + [actions_flat], lr=self.action_lr, eps=1e-5 + ) + for _ in range(self.action_gradient_steps): + actions_flat.requires_grad_(True) + q_values_1, q_values_2 = self.model.critic( + {"state": obs_b}, actions_flat + ) + q_values = torch.min(q_values_1, q_values_2) + action_opt_loss = -q_values.sum() + + actions_optim.zero_grad() + action_opt_loss.backward(torch.ones_like(action_opt_loss)) + torch.nn.utils.clip_grad_norm_( + [actions_flat], + max_norm=self.action_grad_norm, + norm_type=2, + ) + actions_optim.step() + + actions_flat.requires_grad_(False) + actions_flat.clamp_(-1.0, 1.0) + guided_action = actions_flat.reshape( + len(actions_flat), self.horizon_steps, self.action_dim + ) + guided_action_np = guided_action.detach().cpu().numpy() + + # Add back to buffer + action_array[inds] = guided_action_np + + # Update policy with collected trajectories + loss_actor = self.model.loss( + guided_action.detach(), {"state": obs_b} + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() if self.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_( self.model.actor.parameters(), self.max_grad_norm ) self.actor_optimizer.step() + # Update target critic and actor + self.model.update_target_critic(self.target_ema_rate) + 
self.model.update_target_actor(self.target_ema_rate) + + # convert back to buffer + action_buffer = deque( + [action for action in action_array], maxlen=self.buffer_size + ) + # Update lr self.actor_lr_scheduler.step() self.critic_lr_scheduler.step() @@ -326,10 +331,12 @@ class TrainDIPODiffusionAgent(TrainAgent): run_results.append( { "itr": self.itr, + "step": cnt_train_step, } ) if self.itr % self.log_freq == 0: time = timer() + run_results[-1]["time"] = time if eval_mode: log.info( f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" @@ -350,23 +357,19 @@ class TrainDIPODiffusionAgent(TrainAgent): run_results[-1]["eval_best_reward"] = avg_best_reward else: log.info( - f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}" + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss - critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" ) if self.use_wandb: - wandb.log( - { - "loss": loss, - "loss - critic": loss_critic, - "avg episode reward - train": avg_episode_reward, - "num episode - train": num_episode_finished, - }, - step=self.itr, - commit=True, - ) - run_results[-1]["loss"] = loss - run_results[-1]["loss_critic"] = loss_critic + wandb_log = { + "total env step": cnt_train_step, + "loss - critic": loss_critic, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + } + if type(loss_actor) == torch.Tensor: + wandb_log["loss - actor"] = loss_actor + wandb.log(wandb_log, step=self.itr, commit=True) run_results[-1]["train_episode_reward"] = avg_episode_reward - run_results[-1]["time"] = time with open(self.result_path, "wb") as f: pickle.dump(run_results, f) self.itr += 1 diff --git a/agent/finetune/train_dql_diffusion_agent.py b/agent/finetune/train_dql_diffusion_agent.py index 2e6e4e0..d7786fe 100644 --- a/agent/finetune/train_dql_diffusion_agent.py +++ 
b/agent/finetune/train_dql_diffusion_agent.py @@ -77,6 +77,9 @@ class TrainDQLDiffusionAgent(TrainAgent): # Updates self.replay_ratio = cfg.train.replay_ratio + # critic target update rate + self.target_ema_rate = cfg.train.target_ema_rate + def run(self): # make a FIFO replay buffer for obs, action, and reward @@ -84,12 +87,12 @@ class TrainDQLDiffusionAgent(TrainAgent): next_obs_buffer = deque(maxlen=self.buffer_size) action_buffer = deque(maxlen=self.buffer_size) reward_buffer = deque(maxlen=self.buffer_size) - done_buffer = deque(maxlen=self.buffer_size) - first_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) # Start training loop timer = Timer() run_results = [] + cnt_train_step = 0 last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: @@ -113,10 +116,9 @@ class TrainDQLDiffusionAgent(TrainAgent): prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 else: - firsts_trajs[0] = ( - done_venv # if done at the end of last iteration, then the envs are just reset - ) - reward_trajs = np.empty((0, self.n_envs)) + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -141,23 +143,33 @@ class TrainDQLDiffusionAgent(TrainAgent): action_venv = samples[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step( - action_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) ) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv # add to buffer - for i in range(self.n_envs): - obs_buffer.append(prev_obs_venv["state"][i]) - 
next_obs_buffer.append(obs_venv["state"][i]) - action_buffer.append(action_venv[i]) - reward_buffer.append(reward_venv[i] * self.scale_reward_factor) - done_buffer.append(done_venv[i]) - first_buffer.append(firsts_trajs[step]) + if not eval_mode: + for i in range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) + if truncated_venv[i]: # truncated + next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) + else: + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.extend( + (reward_venv * self.scale_reward_factor).tolist() + ) + terminated_buffer.extend(terminated_venv.tolist()) - firsts_trajs[step + 1] = done_venv + # update for next step prev_obs_venv = obs_venv + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
episodes_start_end = [] for env_ind in range(self.n_envs): @@ -197,41 +209,24 @@ class TrainDQLDiffusionAgent(TrainAgent): # Update models if not eval_mode: - num_batch = self.replay_ratio + num_batch = int( + self.n_steps * self.n_envs / self.batch_size * self.replay_ratio + ) + # only worth converting first with parallel envs - large number of updates below + obs_array = np.array(obs_buffer) + next_obs_array = np.array(next_obs_buffer) + action_array = np.array(action_buffer) + reward_array = np.array(reward_buffer) + terminated_array = np.array(terminated_buffer) # Critic learning for _ in range(num_batch): - # Sample batch inds = np.random.choice(len(obs_buffer), self.batch_size) - obs_b = ( - torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds])) - .float() - .to(self.device) - ) - next_obs_b = ( - torch.from_numpy( - np.vstack([next_obs_buffer[i][None] for i in inds]) - ) - .float() - .to(self.device) - ) - actions_b = ( - torch.from_numpy( - np.vstack([action_buffer[i][None] for i in inds]) - ) - .float() - .to(self.device) - ) - rewards_b = ( - torch.from_numpy(np.vstack([reward_buffer[i] for i in inds])) - .float() - .to(self.device) - ) - dones_b = ( - torch.from_numpy(np.vstack([done_buffer[i] for i in inds])) - .float() - .to(self.device) - ) + obs_b = torch.from_numpy(obs_array[inds]).float().to(self.device) + next_obs_b = torch.from_numpy(next_obs_array[inds]).float().to(self.device) + actions_b = torch.from_numpy(action_array[inds]).float().to(self.device) + rewards_b = torch.from_numpy(reward_array[inds]).float().to(self.device) + terminated_b = torch.from_numpy(terminated_array[inds]).float().to(self.device) # Update critic loss_critic = self.model.loss_critic( @@ -239,39 +234,30 @@ class TrainDQLDiffusionAgent(TrainAgent): {"state": next_obs_b}, actions_b, rewards_b, - dones_b, + terminated_b, self.gamma, ) self.critic_optimizer.zero_grad() loss_critic.backward() self.critic_optimizer.step() - # get the new action and q values - samples 
= self.model.forward_train( - cond={"state": obs_b}, - deterministic=eval_mode, - ) - action_venv = samples[:, : self.act_steps] # n_env x horizon x act - q_values_b = self.model.critic({"state": obs_b}, action_venv) - q1_new_action, q2_new_action = q_values_b - # Update policy with collected trajectories self.actor_optimizer.zero_grad() - actor_loss = self.model.loss_actor( + loss_actor = self.model.loss_actor( {"state": obs_b}, - actions_b, - q1_new_action, - q2_new_action, self.eta, + self.act_steps, ) - actor_loss.backward() + loss_actor.backward() if self.itr >= self.n_critic_warmup_itr: if self.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_( self.model.actor.parameters(), self.max_grad_norm ) self.actor_optimizer.step() - loss = actor_loss + + # update target + self.model.update_target_critic(self.target_ema_rate) # Update lr self.actor_lr_scheduler.step() @@ -285,10 +271,12 @@ class TrainDQLDiffusionAgent(TrainAgent): run_results.append( { "itr": self.itr, + "step": cnt_train_step, } ) if self.itr % self.log_freq == 0: time = timer() + run_results[-1]["time"] = time if eval_mode: log.info( f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" @@ -309,12 +297,13 @@ class TrainDQLDiffusionAgent(TrainAgent): run_results[-1]["eval_best_reward"] = avg_best_reward else: log.info( - f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}" + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" ) if self.use_wandb: wandb.log( { - "loss": loss, + "total env step": cnt_train_step, + "loss - actor": loss_actor, "loss - critic": loss_critic, "avg episode reward - train": avg_episode_reward, "num episode - train": num_episode_finished, @@ -322,10 +311,7 @@ class TrainDQLDiffusionAgent(TrainAgent): step=self.itr, commit=True, ) - run_results[-1]["loss"] = loss - 
run_results[-1]["loss_critic"] = loss_critic run_results[-1]["train_episode_reward"] = avg_episode_reward - run_results[-1]["time"] = time with open(self.result_path, "wb") as f: pickle.dump(run_results, f) self.itr += 1 diff --git a/agent/finetune/train_ibrl_agent.py b/agent/finetune/train_ibrl_agent.py new file mode 100644 index 0000000..0f9a06d --- /dev/null +++ b/agent/finetune/train_ibrl_agent.py @@ -0,0 +1,352 @@ +""" +Imitation Bootstrapped Reinforcement Learning (IBRL) agent training script. + +Does not support image observations right now. +""" + +import os +import pickle +import numpy as np +import torch +import logging +import wandb +import hydra +from collections import deque + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_agent import TrainAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainIBRLAgent(TrainAgent): + def __init__(self, cfg): + super().__init__(cfg) + + # Build dataset + self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset) + + # note the discount factor gamma here is applied to reward every act_steps, instead of every env step + self.gamma = cfg.train.gamma + + # Optimizer + self.actor_optimizer = torch.optim.AdamW( + self.model.network.parameters(), + lr=cfg.train.actor_lr, + weight_decay=cfg.train.actor_weight_decay, + ) + self.actor_lr_scheduler = CosineAnnealingWarmupRestarts( + self.actor_optimizer, + first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.actor_lr, + min_lr=cfg.train.actor_lr_scheduler.min_lr, + warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps, + gamma=1.0, + ) + self.critic_optimizer = torch.optim.AdamW( + self.model.ensemble_params.values(), # https://github.com/pytorch/pytorch/issues/120581 + lr=cfg.train.critic_lr, + weight_decay=cfg.train.critic_weight_decay, + ) + self.critic_lr_scheduler = CosineAnnealingWarmupRestarts( + self.critic_optimizer, + 
first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.critic_lr, + min_lr=cfg.train.critic_lr_scheduler.min_lr, + warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + # Perturbation scale + self.target_ema_rate = cfg.train.target_ema_rate + + # Reward scale + self.scale_reward_factor = cfg.train.scale_reward_factor + + # Number of critic updates + self.critic_num_update = cfg.train.critic_num_update + + # Update frequency + self.update_freq = cfg.train.update_freq + + # Buffer size + self.buffer_size = cfg.train.buffer_size + + # Eval episodes + self.n_eval_episode = cfg.train.n_eval_episode + + # Exploration steps at the beginning - using randomly sampled action + self.n_explore_steps = cfg.train.n_explore_steps + + def run(self): + # make a FIFO replay buffer for obs, action, and reward + obs_buffer = deque(maxlen=self.buffer_size) + next_obs_buffer = deque(maxlen=self.buffer_size) + action_buffer = deque(maxlen=self.buffer_size) + reward_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) + + # load offline dataset into replay buffer + dataloader_offline = torch.utils.data.DataLoader( + self.dataset_offline, + batch_size=len(self.dataset_offline), + drop_last=False, + ) + for batch in dataloader_offline: + actions, states_and_next, rewards, terminated = batch + states = states_and_next["state"] + next_states = states_and_next["next_state"] + obs_buffer.extend(states.cpu().numpy()) + next_obs_buffer.extend(next_states.cpu().numpy()) + action_buffer.extend(actions.cpu().numpy()) + reward_buffer.extend(rewards.cpu().numpy().flatten()) + terminated_buffer.extend(terminated.cpu().numpy().flatten()) + + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of 
{self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr > self.n_explore_steps + and not self.force_train + ) + n_steps = ( + self.n_steps if not eval_mode else int(1e5) + ) # large number for eval mode + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.zeros((n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((n_steps, self.n_envs)) + + # Collect a set of trajectories from env + cnt_episode = 0 + for step in range(n_steps): + + # Select action + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = ( + self.model( + cond=cond, + deterministic=eval_mode, + ) + .cpu() + .numpy() + ) # n_env x horizon x act + action_venv = samples[:, : self.act_steps] + + # Apply multi-step action + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) + ) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv + + # add to buffer in train mode + if not eval_mode: + for i in range(self.n_envs): + 
obs_buffer.append(prev_obs_venv["state"][i]) + if "final_obs" in info_venv[i]: # truncated + next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) + terminated_venv[i] = False + else: # first obs in new episode + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.extend( + (reward_venv * self.scale_reward_factor).tolist() + ) + terminated_buffer.append(terminated_venv.tolist()) + + # update for next step + prev_obs_venv = obs_venv + + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + + # check if enough eval episodes are done + cnt_episode += np.sum(done_venv) + if eval_mode and cnt_episode >= self.n_eval_episode: + break + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. + episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # 
Update models + if ( + not eval_mode + and self.itr > self.n_explore_steps + and self.itr % self.update_freq == 0 + ): + # Update critic more frequently + for _ in range(self.critic_num_update): + # Sample from online buffer + inds = np.random.choice(len(obs_buffer), self.batch_size) + obs_b = ( + torch.from_numpy(np.array([obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + next_obs_b = ( + torch.from_numpy(np.array([next_obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + actions_b = ( + torch.from_numpy(np.array([action_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + rewards_b = ( + torch.from_numpy(np.array([reward_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + terminated_b = ( + torch.from_numpy(np.array([terminated_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + rewards_b, + terminated_b, + self.gamma, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic every critic update + self.model.update_target_critic(self.target_ema_rate) + + # Update actor once with the final batch + loss_actor = self.model.loss_actor( + {"state": obs_b}, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + self.actor_optimizer.step() + + # Update target actor + self.model.update_target_actor(self.target_ema_rate) + + # Update lr + self.actor_lr_scheduler.step() + self.critic_lr_scheduler.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": cnt_train_step, + } + ) + if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps: + time = timer() + run_results[-1]["time"] = time + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward 
{avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "total env step": cnt_train_step, + "loss - actor": loss_actor, + "loss - critic": loss_critic, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + }, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/agent/finetune/train_idql_diffusion_agent.py b/agent/finetune/train_idql_diffusion_agent.py index e6dd168..7ce132c 100644 --- a/agent/finetune/train_idql_diffusion_agent.py +++ b/agent/finetune/train_idql_diffusion_agent.py @@ -102,12 +102,12 @@ class TrainIDQLDiffusionAgent(TrainAgent): next_obs_buffer = deque(maxlen=self.buffer_size) action_buffer = deque(maxlen=self.buffer_size) reward_buffer = deque(maxlen=self.buffer_size) - done_buffer = deque(maxlen=self.buffer_size) - first_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) # Start training loop timer = Timer() run_results = [] + cnt_train_step = 0 last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: @@ -131,10 +131,9 @@ class TrainIDQLDiffusionAgent(TrainAgent): prev_obs_venv = 
self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 else: - firsts_trajs[0] = ( - done_venv # if done at the end of last iteration, then the envs are just reset - ) - reward_trajs = np.empty((0, self.n_envs)) + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -161,22 +160,33 @@ class TrainIDQLDiffusionAgent(TrainAgent): action_venv = samples[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step( - action_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) ) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv # add to buffer - obs_buffer.append(prev_obs_venv["state"]) - next_obs_buffer.append(obs_venv["state"]) - action_buffer.append(action_venv) - reward_buffer.append(reward_venv * self.scale_reward_factor) - done_buffer.append(done_venv) - first_buffer.append(firsts_trajs[step]) + if not eval_mode: + obs_venv_copy = obs_venv.copy() + for i in range(self.n_envs): + if truncated_venv[i]: + obs_venv_copy["state"][i] = info_venv[i]["final_obs"][ + "state" + ] + obs_buffer.append(prev_obs_venv["state"]) + next_obs_buffer.append(obs_venv_copy["state"]) + action_buffer.append(action_venv) + reward_buffer.append(reward_venv * self.scale_reward_factor) + terminated_buffer.append(terminated_venv) - firsts_trajs[step + 1] = done_venv + # update for next step prev_obs_venv = obs_venv + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. 
Only count episodes that finish within the iteration. episodes_start_end = [] for env_ind in range(self.n_envs): @@ -216,13 +226,15 @@ class TrainIDQLDiffusionAgent(TrainAgent): # Update models if not eval_mode: + num_batch = int( + self.n_steps * self.n_envs / self.batch_size * self.replay_ratio + ) obs_trajs = np.array(deepcopy(obs_buffer)) action_trajs = np.array(deepcopy(action_buffer)) next_obs_trajs = np.array(deepcopy(next_obs_buffer)) reward_trajs = np.array(deepcopy(reward_buffer)) - done_trajs = np.array(deepcopy(done_buffer)) - first_trajs = np.array(deepcopy(first_buffer)) + terminated_trajs = np.array(deepcopy(terminated_buffer)) # flatten obs_trajs = einops.rearrange( @@ -238,13 +250,7 @@ class TrainIDQLDiffusionAgent(TrainAgent): "s e h d -> (s e) h d", ) reward_trajs = reward_trajs.reshape(-1) - done_trajs = done_trajs.reshape(-1) - first_trajs = first_trajs.reshape(-1) - - num_batch = int( - self.n_steps * self.n_envs / self.batch_size * self.replay_ratio - ) - + terminated_trajs = terminated_trajs.reshape(-1) for _ in range(num_batch): # Sample batch @@ -259,7 +265,9 @@ class TrainIDQLDiffusionAgent(TrainAgent): reward_b = ( torch.from_numpy(reward_trajs[inds]).float().to(self.device) ) - done_b = torch.from_numpy(done_trajs[inds]).float().to(self.device) + terminated_b = ( + torch.from_numpy(terminated_trajs[inds]).float().to(self.device) + ) # update critic value function critic_loss_v = self.model.loss_critic_v( @@ -275,7 +283,7 @@ class TrainIDQLDiffusionAgent(TrainAgent): {"state": next_obs_b}, actions_b, reward_b, - done_b, + terminated_b, self.gamma, ) self.critic_q_optimizer.zero_grad() @@ -284,16 +292,15 @@ class TrainIDQLDiffusionAgent(TrainAgent): # update target q function self.model.update_target_critic(self.critic_tau) - loss_critic = critic_loss_q.detach() + critic_loss_v.detach() # Update policy with collected trajectories - no weighting - loss = self.model.loss( + loss_actor = self.model.loss( actions_b, {"state": obs_b}, ) 
self.actor_optimizer.zero_grad() - loss.backward() + loss_actor.backward() if self.itr >= self.n_critic_warmup_itr: if self.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_( @@ -314,10 +321,12 @@ class TrainIDQLDiffusionAgent(TrainAgent): run_results.append( { "itr": self.itr, + "step": cnt_train_step, } ) if self.itr % self.log_freq == 0: time = timer() + run_results[-1]["time"] = time if eval_mode: log.info( f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" @@ -338,12 +347,13 @@ class TrainIDQLDiffusionAgent(TrainAgent): run_results[-1]["eval_best_reward"] = avg_best_reward else: log.info( - f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}" + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" ) if self.use_wandb: wandb.log( { - "loss": loss, + "total env step": cnt_train_step, + "loss - actor": loss_actor, "loss - critic": loss_critic, "avg episode reward - train": avg_episode_reward, "num episode - train": num_episode_finished, @@ -351,10 +361,7 @@ class TrainIDQLDiffusionAgent(TrainAgent): step=self.itr, commit=True, ) - run_results[-1]["loss"] = loss - run_results[-1]["loss_critic"] = loss_critic run_results[-1]["train_episode_reward"] = avg_episode_reward - run_results[-1]["time"] = time with open(self.result_path, "wb") as f: pickle.dump(run_results, f) self.itr += 1 diff --git a/agent/finetune/train_ppo_diffusion_agent.py b/agent/finetune/train_ppo_diffusion_agent.py index 73878ce..ee073b3 100644 --- a/agent/finetune/train_ppo_diffusion_agent.py +++ b/agent/finetune/train_ppo_diffusion_agent.py @@ -50,6 +50,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent): # Start training loop timer = Timer() run_results = [] + cnt_train_step = 0 last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: @@ -68,34 +69,36 @@ class 
TrainPPODiffusionAgent(TrainPPOAgent): last_itr_eval = eval_mode # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode - dones_trajs = np.zeros((self.n_steps, self.n_envs)) firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) if self.reset_at_iteration or eval_mode or last_itr_eval: prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 else: - firsts_trajs[0] = ( - done_venv # if done at the end of last iteration, then the envs are just reset - ) + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv # Holder obs_trajs = { - "state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim)) + "state": np.zeros( + (self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim) + ) } - chains_trajs = np.empty( + chains_trajs = np.zeros( ( - 0, + self.n_steps, self.n_envs, self.model.ft_denoising_steps + 1, self.horizon_steps, self.action_dim, ) ) - reward_trajs = np.empty((0, self.n_envs)) - obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim)) - obs_full_trajs = np.vstack( - (obs_full_trajs, prev_obs_venv["state"][:, -1][None]) - ) # save current obs + terminated_trajs = np.zeros((self.n_steps, self.n_envs)) + reward_trajs = np.zeros((self.n_steps, self.n_envs)) + if self.save_full_observations: # state-only + obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim)) + obs_full_trajs = np.vstack( + (obs_full_trajs, prev_obs_venv["state"][:, -1][None]) + ) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -123,9 +126,10 @@ class TrainPPODiffusionAgent(TrainPPOAgent): action_venv = output_venv[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step( - action_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) ) + done_venv = terminated_venv | truncated_venv if self.save_full_observations: # state-only obs_full_venv 
= np.array( [info["full_obs"]["state"] for info in info_venv] @@ -133,15 +137,18 @@ class TrainPPODiffusionAgent(TrainPPOAgent): obs_full_trajs = np.vstack( (obs_full_trajs, obs_full_venv.transpose(1, 0, 2)) ) - obs_trajs["state"] = np.vstack( - (obs_trajs["state"], prev_obs_venv["state"][None]) - ) - chains_trajs = np.vstack((chains_trajs, chains_venv[None])) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - dones_trajs[step] = done_venv + obs_trajs["state"][step] = prev_obs_venv["state"] + chains_trajs[step] = chains_venv + reward_trajs[step] = reward_venv + terminated_trajs[step] = terminated_venv firsts_trajs[step + 1] = done_venv + + # update for next step prev_obs_venv = obs_venv + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
episodes_start_end = [] for env_ind in range(self.n_envs): @@ -238,7 +245,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent): ) reward_trajs = reward_trajs_transpose.T - # bootstrap value with GAE if not done - apply reward scaling with constant if specified + # bootstrap value with GAE if not terminal - apply reward scaling with constant if specified obs_venv_ts = { "state": torch.from_numpy(obs_venv["state"]) .float() @@ -256,7 +263,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent): ) else: nextvalues = values_trajs[t + 1] - nonterminal = 1.0 - dones_trajs[t] + nonterminal = 1.0 - terminated_trajs[t] # delta = r + gamma*V(st+1) - V(st) delta = ( reward_trajs[t] * self.reward_scale_const @@ -405,6 +412,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent): run_results.append( { "itr": self.itr, + "step": cnt_train_step, } ) if self.save_trajs: @@ -414,6 +422,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent): run_results[-1]["reward_trajs"] = reward_trajs if self.itr % self.log_freq == 0: time = timer() + run_results[-1]["time"] = time if eval_mode: log.info( f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" @@ -434,11 +443,12 @@ class TrainPPODiffusionAgent(TrainPPOAgent): run_results[-1]["eval_best_reward"] = avg_best_reward else: log.info( - f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}" + f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}" ) if self.use_wandb: wandb.log( { + "total env step": cnt_train_step, "loss": loss, "pg loss": pg_loss, "value loss": v_loss, @@ -459,17 +469,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent): step=self.itr, commit=True, ) - run_results[-1]["loss"] = loss - 
run_results[-1]["pg_loss"] = pg_loss - run_results[-1]["value_loss"] = v_loss - run_results[-1]["bc_loss"] = bc_loss - run_results[-1]["eta"] = eta - run_results[-1]["approx_kl"] = approx_kl - run_results[-1]["ratio"] = ratio - run_results[-1]["clip_frac"] = np.mean(clipfracs) - run_results[-1]["explained_variance"] = explained_var run_results[-1]["train_episode_reward"] = avg_episode_reward - run_results[-1]["time"] = time with open(self.result_path, "wb") as f: pickle.dump(run_results, f) self.itr += 1 diff --git a/agent/finetune/train_ppo_diffusion_img_agent.py b/agent/finetune/train_ppo_diffusion_img_agent.py index 2abb148..9d47b0d 100644 --- a/agent/finetune/train_ppo_diffusion_img_agent.py +++ b/agent/finetune/train_ppo_diffusion_img_agent.py @@ -40,6 +40,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): # Start training loop timer = Timer() run_results = [] + cnt_train_step = 0 last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: @@ -58,31 +59,32 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): last_itr_eval = eval_mode # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode - dones_trajs = np.zeros((self.n_steps, self.n_envs)) firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) if self.reset_at_iteration or eval_mode or last_itr_eval: prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 else: - firsts_trajs[0] = ( - done_venv # if done at the end of last iteration, then the envs are just reset - ) + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv # Holder obs_trajs = { - k: np.empty((0, self.n_envs, self.n_cond_step, *self.obs_dims[k])) + k: np.zeros( + (self.n_steps, self.n_envs, self.n_cond_step, *self.obs_dims[k]) + ) for k in self.obs_dims } - chains_trajs = np.empty( + chains_trajs = np.zeros( ( - 0, + self.n_steps, self.n_envs, self.model.ft_denoising_steps + 1, 
self.horizon_steps, self.action_dim, ) ) - reward_trajs = np.empty((0, self.n_envs)) + terminated_trajs = np.zeros((self.n_steps, self.n_envs)) + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -111,17 +113,23 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): action_venv = output_venv[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step( - action_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) ) + done_venv = terminated_venv | truncated_venv for k in obs_trajs: - obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None])) - chains_trajs = np.vstack((chains_trajs, chains_venv[None])) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - dones_trajs[step] = done_venv + obs_trajs[k][step] = prev_obs_venv[k] + chains_trajs[step] = chains_venv + reward_trajs[step] = reward_venv + terminated_trajs[step] = terminated_venv firsts_trajs[step + 1] = done_venv + + # update for next step prev_obs_venv = obs_venv + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
episodes_start_end = [] for env_ind in range(self.n_envs): @@ -235,7 +243,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): ) reward_trajs = reward_trajs_transpose.T - # bootstrap value with GAE if not done - apply reward scaling with constant if specified + # bootstrap value with GAE if not terminal - apply reward scaling with constant if specified obs_venv_ts = { key: torch.from_numpy(obs_venv[key]).float().to(self.device) for key in self.obs_dims @@ -252,7 +260,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): ) else: nextvalues = values_trajs[t + 1] - nonterminal = 1.0 - dones_trajs[t] + nonterminal = 1.0 - terminated_trajs[t] # delta = r + gamma*V(st+1) - V(st) delta = ( reward_trajs[t] * self.reward_scale_const @@ -398,10 +406,12 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): run_results.append( { "itr": self.itr, + "step": cnt_train_step, } ) if self.itr % self.log_freq == 0: time = timer() + run_results[-1]["time"] = time if eval_mode: log.info( f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" @@ -422,11 +432,12 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): run_results[-1]["eval_best_reward"] = avg_best_reward else: log.info( - f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}" + f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}" ) if self.use_wandb: wandb.log( { + "total env step": cnt_train_step, "loss": loss, "pg loss": pg_loss, "value loss": v_loss, @@ -447,17 +458,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): step=self.itr, commit=True, ) - run_results[-1]["loss"] = loss - run_results[-1]["pg_loss"] = pg_loss - 
run_results[-1]["value_loss"] = v_loss - run_results[-1]["bc_loss"] = bc_loss - run_results[-1]["eta"] = eta - run_results[-1]["approx_kl"] = approx_kl - run_results[-1]["ratio"] = ratio - run_results[-1]["clip_frac"] = np.mean(clipfracs) - run_results[-1]["explained_variance"] = explained_var run_results[-1]["train_episode_reward"] = avg_episode_reward - run_results[-1]["time"] = time with open(self.result_path, "wb") as f: pickle.dump(run_results, f) self.itr += 1 diff --git a/agent/finetune/train_ppo_exact_diffusion_agent.py b/agent/finetune/train_ppo_exact_diffusion_agent.py index 00c4c90..920b03f 100644 --- a/agent/finetune/train_ppo_exact_diffusion_agent.py +++ b/agent/finetune/train_ppo_exact_diffusion_agent.py @@ -32,6 +32,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): # Start training loop timer = Timer() run_results = [] + cnt_train_step = 0 last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: @@ -50,42 +51,39 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): last_itr_eval = eval_mode # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode - dones_trajs = np.zeros((self.n_steps, self.n_envs)) firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) if self.reset_at_iteration or eval_mode or last_itr_eval: prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 else: - firsts_trajs[0] = ( - done_venv # if done at the end of last iteration, then the envs are just reset - ) + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv # Holder obs_trajs = { - "state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim)) + "state": np.zeros( + (self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim) + ) } - samples_trajs = np.empty( + samples_trajs = np.zeros( ( - 0, + self.n_steps, self.n_envs, self.horizon_steps, self.action_dim, ) ) - chains_trajs = np.empty( + chains_trajs 
= np.zeros( ( - 0, + self.n_steps, self.n_envs, self.model.ft_denoising_steps + 1, self.horizon_steps, self.action_dim, ) ) - reward_trajs = np.empty((0, self.n_envs)) - obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim)) - obs_full_trajs = np.vstack( - (obs_full_trajs, prev_obs_venv["state"][:, -1][None]) - ) # save current obs + terminated_trajs = np.zeros((self.n_steps, self.n_envs)) + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -111,28 +109,25 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): samples.chains.cpu().numpy() ) # n_env x denoising x horizon x act action_venv = output_venv[:, : self.act_steps] - samples_trajs = np.vstack((samples_trajs, output_venv[None])) + samples_trajs[step] = output_venv # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step( - action_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) ) - if self.save_full_observations: # state-only - obs_full_venv = np.array( - [info["full_obs"]["state"] for info in info_venv] - ) # n_envs x act_steps x obs_dim - obs_full_trajs = np.vstack( - (obs_full_trajs, obs_full_venv.transpose(1, 0, 2)) - ) - obs_trajs["state"] = np.vstack( - (obs_trajs["state"], prev_obs_venv["state"][None]) - ) - chains_trajs = np.vstack((chains_trajs, chains_venv[None])) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - dones_trajs[step] = done_venv + done_venv = terminated_venv | truncated_venv + obs_trajs["state"][step] = prev_obs_venv["state"] + chains_trajs[step] = chains_venv + reward_trajs[step] = reward_venv + terminated_trajs[step] = terminated_venv firsts_trajs[step + 1] = done_venv + + # update for next step prev_obs_venv = obs_venv + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + # Summarize episode reward --- this 
needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. episodes_start_end = [] for env_ind in range(self.n_envs): @@ -214,7 +209,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): ) reward_trajs = reward_trajs_transpose.T - # bootstrap value with GAE if not done - apply reward scaling with constant if specified + # bootstrap value with GAE if not terminal - apply reward scaling with constant if specified obs_venv_ts = { "state": torch.from_numpy(obs_venv["state"]) .float() @@ -232,7 +227,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): ) else: nextvalues = values_trajs[t + 1] - nonterminal = 1.0 - dones_trajs[t] + nonterminal = 1.0 - terminated_trajs[t] # delta = r + gamma*V(st+1) - V(st) delta = ( reward_trajs[t] * self.reward_scale_const @@ -343,20 +338,6 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y ) - # Plot state trajectories (only in D3IL) - if ( - self.itr % self.render_freq == 0 - and self.n_render > 0 - and self.traj_plotter is not None - ): - self.traj_plotter( - obs_full_trajs=obs_full_trajs, - n_render=self.n_render, - max_episode_steps=self.max_episode_steps, - render_dir=self.render_dir, - itr=self.itr, - ) - # Update lr if self.itr >= self.n_critic_warmup_itr: self.actor_lr_scheduler.step() @@ -370,16 +351,17 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): run_results.append( { "itr": self.itr, + "step": cnt_train_step, } ) if self.save_trajs: - run_results[-1]["obs_full_trajs"] = obs_full_trajs run_results[-1]["obs_trajs"] = obs_trajs run_results[-1]["action_trajs"] = samples_trajs run_results[-1]["chains_trajs"] = chains_trajs run_results[-1]["reward_trajs"] = reward_trajs if self.itr % self.log_freq == 0: time = timer() + run_results[-1]["time"] = time if eval_mode: log.info( f"eval: success rate {success_rate:8.4f} | avg episode 
reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" @@ -400,11 +382,12 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): run_results[-1]["eval_best_reward"] = avg_best_reward else: log.info( - f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" + f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" ) if self.use_wandb: wandb.log( { + "total env step": cnt_train_step, "loss": loss, "pg loss": pg_loss, "value loss": v_loss, @@ -417,15 +400,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): step=self.itr, commit=True, ) - run_results[-1]["loss"] = loss - run_results[-1]["pg_loss"] = pg_loss - run_results[-1]["value_loss"] = v_loss - run_results[-1]["approx_kl"] = approx_kl - run_results[-1]["ratio"] = ratio - run_results[-1]["clip_frac"] = np.mean(clipfracs) - run_results[-1]["explained_variance"] = explained_var run_results[-1]["train_episode_reward"] = avg_episode_reward - run_results[-1]["time"] = time with open(self.result_path, "wb") as f: pickle.dump(run_results, f) self.itr += 1 diff --git a/agent/finetune/train_ppo_gaussian_agent.py b/agent/finetune/train_ppo_gaussian_agent.py index cfc1546..2ad38bd 100644 --- a/agent/finetune/train_ppo_gaussian_agent.py +++ b/agent/finetune/train_ppo_gaussian_agent.py @@ -27,6 +27,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent): # Start training loop timer = Timer() run_results = [] + cnt_train_step = 0 last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: @@ -45,33 +46,35 @@ class TrainPPOGaussianAgent(TrainPPOAgent): last_itr_eval = eval_mode # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode - dones_trajs = np.zeros((self.n_steps, self.n_envs)) firsts_trajs = np.zeros((self.n_steps + 1, 
self.n_envs)) if self.reset_at_iteration or eval_mode or last_itr_eval: prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 else: - firsts_trajs[0] = ( - done_venv # if done at the end of last iteration, then the envs are just reset - ) + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv # Holder obs_trajs = { - "state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim)) + "state": np.zeros( + (self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim) + ) } - samples_trajs = np.empty( + samples_trajs = np.zeros( ( - 0, + self.n_steps, self.n_envs, self.horizon_steps, self.action_dim, ) ) - reward_trajs = np.empty((0, self.n_envs)) - obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim)) - obs_full_trajs = np.vstack( - (obs_full_trajs, prev_obs_venv["state"][:, -1][None]) - ) # save current obs + reward_trajs = np.zeros((self.n_steps, self.n_envs)) + terminated_trajs = np.zeros((self.n_steps, self.n_envs)) + if self.save_full_observations: + obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim)) + obs_full_trajs = np.vstack( + (obs_full_trajs, prev_obs_venv["state"][:, -1][None]) + ) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -93,9 +96,10 @@ class TrainPPOGaussianAgent(TrainPPOAgent): action_venv = output_venv[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step( - action_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) ) + done_venv = terminated_venv | truncated_venv if self.save_full_observations: # state-only obs_full_venv = np.array( [info["full_obs"]["state"] for info in info_venv] @@ -103,15 +107,18 @@ class TrainPPOGaussianAgent(TrainPPOAgent): obs_full_trajs = np.vstack( (obs_full_trajs, obs_full_venv.transpose(1, 0, 2)) ) - obs_trajs["state"] = np.vstack( - (obs_trajs["state"], prev_obs_venv["state"][None]) - ) - 
samples_trajs = np.vstack((samples_trajs, output_venv[None])) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - dones_trajs[step] = done_venv + obs_trajs["state"][step] = prev_obs_venv["state"] + samples_trajs[step] = output_venv + reward_trajs[step] = reward_venv + terminated_trajs[step] = terminated_venv firsts_trajs[step + 1] = done_venv + + # update for next step prev_obs_venv = obs_venv + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. episodes_start_end = [] for env_ind in range(self.n_envs): @@ -221,7 +228,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent): ) else: nextvalues = values_trajs[t + 1] - nonterminal = 1.0 - dones_trajs[t] + nonterminal = 1.0 - terminated_trajs[t] # delta = r + gamma*V(st+1) - V(st) delta = ( reward_trajs[t] * self.reward_scale_const @@ -363,6 +370,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent): run_results.append( { "itr": self.itr, + "step": cnt_train_step, } ) if self.save_trajs: @@ -372,6 +380,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent): run_results[-1]["reward_trajs"] = reward_trajs if self.itr % self.log_freq == 0: time = timer() + run_results[-1]["time"] = time if eval_mode: log.info( f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" @@ -392,11 +401,12 @@ class TrainPPOGaussianAgent(TrainPPOAgent): run_results[-1]["eval_best_reward"] = avg_best_reward else: log.info( - f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | ent {-entropy_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" + f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | ent 
{-entropy_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" ) if self.use_wandb: wandb.log( { + "total env step": cnt_train_step, "loss": loss, "pg loss": pg_loss, "value loss": v_loss, @@ -412,16 +422,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent): step=self.itr, commit=True, ) - run_results[-1]["loss"] = loss - run_results[-1]["pg_loss"] = pg_loss - run_results[-1]["value_loss"] = v_loss - run_results[-1]["entropy_loss"] = entropy_loss - run_results[-1]["approx_kl"] = approx_kl - run_results[-1]["ratio"] = ratio - run_results[-1]["clip_frac"] = np.mean(clipfracs) - run_results[-1]["explained_variance"] = explained_var run_results[-1]["train_episode_reward"] = avg_episode_reward - run_results[-1]["time"] = time with open(self.result_path, "wb") as f: pickle.dump(run_results, f) self.itr += 1 diff --git a/agent/finetune/train_ppo_gaussian_img_agent.py b/agent/finetune/train_ppo_gaussian_img_agent.py index eabd531..de1dbb9 100644 --- a/agent/finetune/train_ppo_gaussian_img_agent.py +++ b/agent/finetune/train_ppo_gaussian_img_agent.py @@ -40,6 +40,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): # Start training loop timer = Timer() run_results = [] + cnt_train_step = 0 last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: @@ -58,30 +59,31 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): last_itr_eval = eval_mode # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode - dones_trajs = np.zeros((self.n_steps, self.n_envs)) firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) if self.reset_at_iteration or eval_mode or last_itr_eval: prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 else: - firsts_trajs[0] = ( - done_venv # if done at the end of last iteration, then the envs are just reset - ) + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv # Holder obs_trajs = { - k: 
np.empty((0, self.n_envs, self.n_cond_step, *self.obs_dims[k])) + k: np.zeros( + (self.n_steps, self.n_envs, self.n_cond_step, *self.obs_dims[k]) + ) for k in self.obs_dims } - samples_trajs = np.empty( + samples_trajs = np.zeros( ( - 0, + self.n_steps, self.n_envs, self.horizon_steps, self.action_dim, ) ) - reward_trajs = np.empty((0, self.n_envs)) + terminated_trajs = np.zeros((self.n_steps, self.n_envs)) + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -104,17 +106,23 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): action_venv = output_venv[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step( - action_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) ) + done_venv = terminated_venv | truncated_venv for k in obs_trajs: - obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None])) - samples_trajs = np.vstack((samples_trajs, output_venv[None])) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - dones_trajs[step] = done_venv + obs_trajs[k][step] = prev_obs_venv[k] + samples_trajs[step] = output_venv + reward_trajs[step] = reward_venv + terminated_trajs[step] = terminated_venv firsts_trajs[step + 1] = done_venv + + # update for next step prev_obs_venv = obs_venv + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
episodes_start_end = [] for env_ind in range(self.n_envs): @@ -240,7 +248,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): ) else: nextvalues = values_trajs[t + 1] - nonterminal = 1.0 - dones_trajs[t] + nonterminal = 1.0 - terminated_trajs[t] # delta = r + gamma*V(st+1) - V(st) delta = ( reward_trajs[t] * self.reward_scale_const @@ -374,10 +382,12 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): run_results.append( { "itr": self.itr, + "step": cnt_train_step, } ) if self.itr % self.log_freq == 0: time = timer() + run_results[-1]["time"] = time if eval_mode: log.info( f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" @@ -398,11 +408,12 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): run_results[-1]["eval_best_reward"] = avg_best_reward else: log.info( - f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" + f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" ) if self.use_wandb: wandb.log( { + "total env step": cnt_train_step, "loss": loss, "pg loss": pg_loss, "value loss": v_loss, @@ -422,17 +433,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): step=self.itr, commit=True, ) - run_results[-1]["loss"] = loss - run_results[-1]["pg_loss"] = pg_loss - run_results[-1]["value_loss"] = v_loss - run_results[-1]["bc_loss"] = bc_loss - run_results[-1]["std"] = std - run_results[-1]["approx_kl"] = approx_kl - run_results[-1]["ratio"] = ratio - run_results[-1]["clip_frac"] = np.mean(clipfracs) - run_results[-1]["explained_variance"] = explained_var run_results[-1]["train_episode_reward"] = avg_episode_reward - run_results[-1]["time"] = time with open(self.result_path, "wb") as f: pickle.dump(run_results, f) self.itr += 1 
diff --git a/agent/finetune/train_qsm_diffusion_agent.py b/agent/finetune/train_qsm_diffusion_agent.py index fa80dd9..53f93e7 100644 --- a/agent/finetune/train_qsm_diffusion_agent.py +++ b/agent/finetune/train_qsm_diffusion_agent.py @@ -80,12 +80,12 @@ class TrainQSMDiffusionAgent(TrainAgent): next_obs_buffer = deque(maxlen=self.buffer_size) action_buffer = deque(maxlen=self.buffer_size) reward_buffer = deque(maxlen=self.buffer_size) - done_buffer = deque(maxlen=self.buffer_size) - first_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) # Start training loop timer = Timer() run_results = [] + cnt_train_step = 0 last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: @@ -109,10 +109,9 @@ class TrainQSMDiffusionAgent(TrainAgent): prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 else: - firsts_trajs[0] = ( - done_venv # if done at the end of last iteration, then the envs are just reset - ) - reward_trajs = np.empty((0, self.n_envs)) + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -137,22 +136,33 @@ class TrainQSMDiffusionAgent(TrainAgent): action_venv = samples[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step( - action_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) ) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv # add to buffer - obs_buffer.append(prev_obs_venv["state"]) - next_obs_buffer.append(obs_venv["state"]) - action_buffer.append(action_venv) - reward_buffer.append(reward_venv * self.scale_reward_factor) - 
done_buffer.append(done_venv) - first_buffer.append(firsts_trajs[step]) + if not eval_mode: + obs_venv_copy = obs_venv.copy() + for i in range(self.n_envs): + if truncated_venv[i]: + obs_venv_copy["state"][i] = info_venv[i]["final_obs"][ + "state" + ] + obs_buffer.append(prev_obs_venv["state"]) + next_obs_buffer.append(obs_venv_copy["state"]) + action_buffer.append(action_venv) + reward_buffer.append(reward_venv * self.scale_reward_factor) + terminated_buffer.append(terminated_venv) - firsts_trajs[step + 1] = done_venv + # update for next step prev_obs_venv = obs_venv + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. episodes_start_end = [] for env_ind in range(self.n_envs): @@ -192,13 +202,15 @@ class TrainQSMDiffusionAgent(TrainAgent): # Update models if not eval_mode: + num_batch = int( + self.n_steps * self.n_envs / self.batch_size * self.replay_ratio + ) obs_trajs = np.array(deepcopy(obs_buffer)) action_trajs = np.array(deepcopy(action_buffer)) next_obs_trajs = np.array(deepcopy(next_obs_buffer)) reward_trajs = np.array(deepcopy(reward_buffer)) - done_trajs = np.array(deepcopy(done_buffer)) - first_trajs = np.array(deepcopy(first_buffer)) + terminated_trajs = np.array(deepcopy(terminated_buffer)) # flatten obs_trajs = einops.rearrange( @@ -214,16 +226,8 @@ class TrainQSMDiffusionAgent(TrainAgent): "s e h d -> (s e) h d", ) reward_trajs = reward_trajs.reshape(-1) - done_trajs = done_trajs.reshape(-1) - first_trajs = first_trajs.reshape(-1) - - num_batch = int( - self.n_steps * self.n_envs / self.batch_size * self.replay_ratio - ) - + terminated_trajs = terminated_trajs.reshape(-1) for _ in range(num_batch): - - # Sample batch inds = np.random.choice(len(obs_trajs), self.batch_size) obs_b 
= torch.from_numpy(obs_trajs[inds]).float().to(self.device) next_obs_b = ( @@ -232,37 +236,34 @@ class TrainQSMDiffusionAgent(TrainAgent): actions_b = ( torch.from_numpy(action_trajs[inds]).float().to(self.device) ) - reward_b = ( + rewards_b = ( torch.from_numpy(reward_trajs[inds]).float().to(self.device) ) - done_b = torch.from_numpy(done_trajs[inds]).float().to(self.device) + terminated_b = ( + torch.from_numpy(terminated_trajs[inds]).float().to(self.device) + ) # update critic q function - critic_loss = self.model.loss_critic( + loss_critic = self.model.loss_critic( {"state": obs_b}, {"state": next_obs_b}, actions_b, - reward_b, - done_b, + rewards_b, + terminated_b, self.gamma, ) self.critic_optimizer.zero_grad() - critic_loss.backward() + loss_critic.backward() self.critic_optimizer.step() - # update target q function - self.model.update_target_critic(self.critic_tau) - - loss_critic = critic_loss.detach() - # Update policy with collected trajectories - loss = self.model.loss_actor( + loss_actor = self.model.loss_actor( {"state": obs_b}, actions_b, self.q_grad_coeff, ) self.actor_optimizer.zero_grad() - loss.backward() + loss_actor.backward() if self.itr >= self.n_critic_warmup_itr: if self.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_( @@ -270,6 +271,9 @@ class TrainQSMDiffusionAgent(TrainAgent): ) self.actor_optimizer.step() + # update target critic + self.model.update_target_critic(self.critic_tau) + # Update lr self.actor_lr_scheduler.step() self.critic_lr_scheduler.step() @@ -282,10 +286,12 @@ class TrainQSMDiffusionAgent(TrainAgent): run_results.append( { "itr": self.itr, + "step": cnt_train_step, } ) if self.itr % self.log_freq == 0: time = timer() + run_results[-1]["time"] = time if eval_mode: log.info( f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" @@ -306,12 +312,13 @@ class TrainQSMDiffusionAgent(TrainAgent): run_results[-1]["eval_best_reward"] = 
avg_best_reward else: log.info( - f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}" + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" ) if self.use_wandb: wandb.log( { - "loss": loss, + "total env step": cnt_train_step, + "loss - actor": loss_actor, "loss - critic": loss_critic, "avg episode reward - train": avg_episode_reward, "num episode - train": num_episode_finished, @@ -319,10 +326,7 @@ class TrainQSMDiffusionAgent(TrainAgent): step=self.itr, commit=True, ) - run_results[-1]["loss"] = loss - run_results[-1]["loss_critic"] = loss_critic run_results[-1]["train_episode_reward"] = avg_episode_reward - run_results[-1]["time"] = time with open(self.result_path, "wb") as f: pickle.dump(run_results, f) self.itr += 1 diff --git a/agent/finetune/train_rlpd_agent.py b/agent/finetune/train_rlpd_agent.py new file mode 100644 index 0000000..99e0f22 --- /dev/null +++ b/agent/finetune/train_rlpd_agent.py @@ -0,0 +1,404 @@ +""" +Reinforcement Learning with Prior Data (RLPD) agent training script. + +Does not support image observations right now. 
+""" + +import os +import pickle +import numpy as np +import torch +import logging +import wandb +import hydra +from collections import deque + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_agent import TrainAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainRLPDAgent(TrainAgent): + def __init__(self, cfg): + super().__init__(cfg) + + # Build dataset + self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset) + + # note the discount factor gamma here is applied to reward every act_steps, instead of every env step + self.gamma = cfg.train.gamma + + # Optimizer + self.actor_optimizer = torch.optim.AdamW( + self.model.network.parameters(), + lr=cfg.train.actor_lr, + weight_decay=cfg.train.actor_weight_decay, + ) + self.actor_lr_scheduler = CosineAnnealingWarmupRestarts( + self.actor_optimizer, + first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.actor_lr, + min_lr=cfg.train.actor_lr_scheduler.min_lr, + warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps, + gamma=1.0, + ) + self.critic_optimizer = torch.optim.AdamW( + self.model.ensemble_params.values(), # https://github.com/pytorch/pytorch/issues/120581 + lr=cfg.train.critic_lr, + weight_decay=cfg.train.critic_weight_decay, + ) + self.critic_lr_scheduler = CosineAnnealingWarmupRestarts( + self.critic_optimizer, + first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.critic_lr, + min_lr=cfg.train.critic_lr_scheduler.min_lr, + warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + # Perturbation scale + self.target_ema_rate = cfg.train.target_ema_rate + + # Reward scale + self.scale_reward_factor = cfg.train.scale_reward_factor + + # Number of critic updates + self.critic_num_update = cfg.train.critic_num_update + + # Buffer size + self.buffer_size = cfg.train.buffer_size + + # Eval episodes + 
self.n_eval_episode = cfg.train.n_eval_episode + + # Exploration steps at the beginning - using randomly sampled action + self.n_explore_steps = cfg.train.n_explore_steps + + # Initialize temperature parameter for entropy + init_temperature = cfg.train.init_temperature + self.log_alpha = torch.tensor(np.log(init_temperature)).to(self.device) + self.log_alpha.requires_grad = True + self.target_entropy = cfg.train.target_entropy + self.log_alpha_optimizer = torch.optim.Adam( + [self.log_alpha], + lr=cfg.train.critic_lr, + ) + + def run(self): + # make a FIFO replay buffer for obs, action, and reward + obs_buffer = deque(maxlen=self.buffer_size) + next_obs_buffer = deque(maxlen=self.buffer_size) + action_buffer = deque(maxlen=self.buffer_size) + reward_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) + + # load offline dataset into replay buffer + dataloader_offline = torch.utils.data.DataLoader( + self.dataset_offline, + batch_size=len(self.dataset_offline), + drop_last=False, + ) + for batch in dataloader_offline: + actions, states_and_next, rewards, terminated = batch + states = states_and_next["state"] + next_states = states_and_next["next_state"] + obs_buffer_off = states.cpu().numpy() + next_obs_buffer_off = next_states.cpu().numpy() + action_buffer_off = actions.cpu().numpy() + reward_buffer_off = rewards.cpu().numpy().flatten() + terminated_buffer_off = terminated.cpu().numpy().flatten() + + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of {self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and 
self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr >= self.n_explore_steps + and not self.force_train + ) + n_steps = ( + self.n_steps if not eval_mode else int(1e5) + ) # large number for eval mode + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.zeros((n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, then the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((n_steps, self.n_envs)) + + # Collect a set of trajectories from env + cnt_episode = 0 + for step in range(n_steps): + + # Select action + if self.itr < self.n_explore_steps: + action_venv = self.venv.action_space.sample() + else: + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = ( + self.model( + cond=cond, + deterministic=eval_mode, + ) + .cpu() + .numpy() + ) # n_env x horizon x act + action_venv = samples[:, : self.act_steps] + + # Apply multi-step action + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) + ) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv + + # add to buffer in train mode + if not eval_mode: + for i in range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) + if truncated_venv[i]: + next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) + else: + next_obs_buffer.append(obs_venv["state"][i]) + 
action_buffer.append(action_venv[i]) + reward_buffer.extend( + (reward_venv * self.scale_reward_factor).tolist() + ) + terminated_buffer.extend(terminated_venv.tolist()) + + # update for next step + prev_obs_venv = obs_venv + + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + + # check if enough eval episodes are done + cnt_episode += np.sum(done_venv) + if eval_mode and cnt_episode >= self.n_eval_episode: + break + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. + episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # Update models + if not eval_mode and self.itr >= self.n_explore_steps: + + # Update critic more frequently + for _ in range(self.critic_num_update): + + # Sample from OFFLINE buffer + inds = np.random.choice(len(obs_buffer_off), self.batch_size // 2) + obs_b_off = ( 
+ torch.from_numpy(obs_buffer_off[inds]).float().to(self.device) + ) + next_obs_b_off = ( + torch.from_numpy(next_obs_buffer_off[inds]) + .float() + .to(self.device) + ) + actions_b_off = ( + torch.from_numpy(action_buffer_off[inds]) + .float() + .to(self.device) + ) + rewards_b_off = ( + torch.from_numpy(reward_buffer_off[inds]) + .float() + .to(self.device) + ) + terminated_b_off = ( + torch.from_numpy(terminated_buffer_off[inds]) + .float() + .to(self.device) + ) + + # Sample from ONLINE buffer + inds = np.random.choice(len(obs_buffer), self.batch_size // 2) + obs_b_on = ( + torch.from_numpy(np.array([obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + next_obs_b_on = ( + torch.from_numpy(np.array([next_obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + actions_b_on = ( + torch.from_numpy(np.array([action_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + rewards_b_on = ( + torch.from_numpy(np.array([reward_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + terminated_b_on = ( + torch.from_numpy(np.array([terminated_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + + # merge offline and online data + obs_b = torch.cat([obs_b_off, obs_b_on], dim=0) + next_obs_b = torch.cat([next_obs_b_off, next_obs_b_on], dim=0) + actions_b = torch.cat([actions_b_off, actions_b_on], dim=0) + rewards_b = torch.cat([rewards_b_off, rewards_b_on], dim=0) + terminated_b = torch.cat([terminated_b_off, terminated_b_on], dim=0) + + # Update critic + alpha = self.log_alpha.exp().item() + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + rewards_b, + terminated_b, + self.gamma, + alpha, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic every critic update + self.model.update_target_critic(self.target_ema_rate) + + # Update actor once with the final batch + loss_actor = self.model.loss_actor( + {"state": 
obs_b}, + alpha, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + self.actor_optimizer.step() + + # Update temperature parameter + self.log_alpha_optimizer.zero_grad() + loss_alpha = self.model.loss_temperature( + {"state": obs_b}, + self.log_alpha.exp(), # with grad + self.target_entropy, + ) + loss_alpha.backward() + self.log_alpha_optimizer.step() + + # Update lr + self.actor_lr_scheduler.step() + self.critic_lr_scheduler.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": cnt_train_step, + } + ) + if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps: + time = timer() + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | alpha {alpha:8.4f} | t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "total env step": cnt_train_step, + "loss - actor": loss_actor, + "loss - critic": loss_critic, + "entropy coeff": alpha, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + }, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + 
pickle.dump(run_results, f) + self.itr += 1 diff --git a/agent/finetune/train_rwr_diffusion_agent.py b/agent/finetune/train_rwr_diffusion_agent.py index 84ab2a5..8034da4 100644 --- a/agent/finetune/train_rwr_diffusion_agent.py +++ b/agent/finetune/train_rwr_diffusion_agent.py @@ -19,7 +19,6 @@ from util.scheduler import CosineAnnealingWarmupRestarts class TrainRWRDiffusionAgent(TrainAgent): - def __init__(self, cfg): super().__init__(cfg) @@ -52,14 +51,13 @@ class TrainRWRDiffusionAgent(TrainAgent): self.update_epochs = cfg.train.update_epochs def run(self): - # Start training loop timer = Timer() run_results = [] + cnt_train_step = 0 last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: - # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env options_venv = [{} for _ in range(self.n_envs)] if self.itr % self.render_freq == 0 and self.render_video: @@ -79,23 +77,24 @@ class TrainRWRDiffusionAgent(TrainAgent): prev_obs_venv = self.reset_env_all(options_venv=options_venv) firsts_trajs[0] = 1 else: - firsts_trajs[0] = ( - done_venv # if done at the end of last iteration, then the envs are just reset - ) + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv # Holder obs_trajs = { - "state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim)) + "state": np.zeros( + (self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim) + ) } - samples_trajs = np.empty( + samples_trajs = np.zeros( ( - 0, + self.n_steps, self.n_envs, self.horizon_steps, self.action_dim, ) ) - reward_trajs = np.empty((0, self.n_envs)) + reward_trajs = np.zeros((self.n_steps, self.n_envs)) # Collect a set of trajectories from env for step in range(self.n_steps): @@ -118,19 +117,25 @@ class TrainRWRDiffusionAgent(TrainAgent): .numpy() ) # n_env x horizon x act action_venv = samples[:, : 
self.act_steps] - samples_trajs = np.vstack((samples_trajs, samples[None])) + samples_trajs[step] = samples # Apply multi-step action - obs_venv, reward_venv, done_venv, info_venv = self.venv.step( - action_venv + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) ) - obs_trajs["state"] = np.vstack( - (obs_trajs["state"], prev_obs_venv["state"][None]) - ) - reward_trajs = np.vstack((reward_trajs, reward_venv[None])) + done_venv = terminated_venv | truncated_venv + + # save + obs_trajs["state"][step] = prev_obs_venv["state"] + reward_trajs[step] = reward_venv firsts_trajs[step + 1] = done_venv + + # update for next step prev_obs_venv = obs_venv + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
episodes_start_end = [] for env_ind in range(self.n_envs): @@ -157,20 +162,23 @@ class TrainRWRDiffusionAgent(TrainAgent): num_episode_finished = len(reward_trajs_split) # Compute episode returns - discounted_reward_trajs_split = [ - [ - self.gamma**t * r - for t, r in zip( - list(range(end - start + 1)), - reward_trajs[start : end + 1, env_ind], - ) - ] - for env_ind, start, end in episodes_start_end - ] returns_trajs_split = [ - np.cumsum(y[::-1])[::-1] for y in discounted_reward_trajs_split + np.zeros_like(reward_trajs) for reward_trajs in reward_trajs_split ] + for traj_rewards, traj_returns in zip( + reward_trajs_split, returns_trajs_split + ): + prev_return = 0 + for t in range(len(traj_rewards)): + traj_returns[-t - 1] = ( + traj_rewards[-t - 1] + self.gamma * prev_return + ) + prev_return = traj_returns[-t - 1] + + # Note: concatenation is okay here since we are concatenating + # states and actions later on, in the same order returns_trajs_split = np.concatenate(returns_trajs_split) + episode_reward = np.array( [np.sum(reward_traj) for reward_traj in reward_trajs_split] ) @@ -195,7 +203,6 @@ class TrainRWRDiffusionAgent(TrainAgent): # Update models if not eval_mode: - # Tensorize data and put them to device # k for environment step obs_k = { @@ -230,7 +237,6 @@ class TrainRWRDiffusionAgent(TrainAgent): total_steps = len(rewards_k_scaled) inds_k = np.arange(total_steps) for _ in range(self.update_epochs): - # for each epoch, go through all data in batches np.random.shuffle(inds_k) num_batch = max(1, total_steps // self.batch_size) # skip last ones @@ -267,10 +273,12 @@ class TrainRWRDiffusionAgent(TrainAgent): run_results.append( { "itr": self.itr, + "step": cnt_train_step, } ) if self.itr % self.log_freq == 0: time = timer() + run_results[-1]["time"] = time if eval_mode: log.info( f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" @@ -291,11 +299,12 @@ class 
TrainRWRDiffusionAgent(TrainAgent): run_results[-1]["eval_best_reward"] = avg_best_reward else: log.info( - f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}" + f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" ) if self.use_wandb: wandb.log( { + "total env step": cnt_train_step, "loss": loss, "avg episode reward - train": avg_episode_reward, "num episode - train": num_episode_finished, @@ -303,9 +312,7 @@ class TrainRWRDiffusionAgent(TrainAgent): step=self.itr, commit=True, ) - run_results[-1]["loss"] = loss run_results[-1]["train_episode_reward"] = avg_episode_reward - run_results[-1]["time"] = time with open(self.result_path, "wb") as f: pickle.dump(run_results, f) self.itr += 1 diff --git a/agent/finetune/train_sac_agent.py b/agent/finetune/train_sac_agent.py new file mode 100644 index 0000000..8484dc2 --- /dev/null +++ b/agent/finetune/train_sac_agent.py @@ -0,0 +1,335 @@ +""" +Soft Actor Critic (SAC) agent training script. + +Does not support image observations right now. 
+""" + +import os +import pickle +import numpy as np +import torch +import logging +import wandb +from collections import deque + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_agent import TrainAgent + + +class TrainSACAgent(TrainAgent): + def __init__(self, cfg): + super().__init__(cfg) + + # note the discount factor gamma here is applied to reward every act_steps, instead of every env step + self.gamma = cfg.train.gamma + + # Optimizer + self.actor_optimizer = torch.optim.Adam( + self.model.network.parameters(), + lr=cfg.train.actor_lr, + ) + self.critic_optimizer = torch.optim.Adam( + self.model.critic.parameters(), + lr=cfg.train.critic_lr, + ) + + # Perturbation scale + self.target_ema_rate = cfg.train.target_ema_rate + + # Reward scale + self.scale_reward_factor = cfg.train.scale_reward_factor + + # Actor/critic update frequency - assume single env + self.critic_update_freq = int( + cfg.train.batch_size / cfg.train.critic_replay_ratio + ) + self.actor_update_freq = int( + cfg.train.batch_size / cfg.train.actor_replay_ratio + ) + + # Buffer size + self.buffer_size = cfg.train.buffer_size + + # Eval episodes + self.n_eval_episode = cfg.train.n_eval_episode + + # Exploration steps at the beginning - using randomly sampled action + self.n_explore_steps = cfg.train.n_explore_steps + + # Initialize temperature parameter for entropy + init_temperature = cfg.train.init_temperature + self.log_alpha = torch.tensor(np.log(init_temperature)).to(self.device) + self.log_alpha.requires_grad = True + self.target_entropy = cfg.train.target_entropy + self.log_alpha_optimizer = torch.optim.Adam( + [self.log_alpha], + lr=cfg.train.critic_lr, + ) + + def run(self): + # make a FIFO replay buffer for obs, action, and reward + obs_buffer = deque(maxlen=self.buffer_size) + next_obs_buffer = deque(maxlen=self.buffer_size) + action_buffer = deque(maxlen=self.buffer_size) + reward_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = 
deque(maxlen=self.buffer_size) + + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of {self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr > self.n_explore_steps + and not self.force_train + ) + n_steps = ( + self.n_steps if not eval_mode else int(1e5) + ) # large number for eval mode + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((self.n_steps, self.n_envs)) + + # Collect a set of trajectories from env + cnt_episode = 0 + for step in range(n_steps): + + # Select action + if self.itr < self.n_explore_steps: + action_venv = self.venv.action_space.sample() + else: + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = ( + self.model( + cond=cond, + deterministic=eval_mode, + ) + .cpu() + .numpy() + ) # n_env x horizon x act + action_venv = samples[:, : 
self.act_steps] + + # Apply multi-step action + obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( + self.venv.step(action_venv) + ) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv + + # add to buffer in train mode + if not eval_mode: + for i in range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) + if "final_obs" in info_venv[i]: # truncated + next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) + else: # first obs in new episode + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.extend( + (reward_venv * self.scale_reward_factor).tolist() + ) + terminated_buffer.extend(terminated_venv.tolist()) + + # update for next step + prev_obs_venv = obs_venv + + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + + # check if enough eval episodes are done + cnt_episode += np.sum(done_venv) + if eval_mode and cnt_episode >= self.n_eval_episode: + break + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
+ episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # Update models + if ( + not eval_mode + and self.itr > self.n_explore_steps + and self.itr % self.critic_update_freq == 0 + ): + inds = np.random.choice(len(obs_buffer), self.batch_size, replace=False) + obs_b = ( + torch.from_numpy(np.array([obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + next_obs_b = ( + torch.from_numpy(np.array([next_obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + actions_b = ( + torch.from_numpy(np.array([action_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + rewards_b = ( + torch.from_numpy(np.array([reward_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + terminated_b = ( + torch.from_numpy(np.array([terminated_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + + # Update critic + alpha = self.log_alpha.exp().item() + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + rewards_b, + terminated_b, + 
self.gamma, + alpha, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic every critic update + self.model.update_target_critic(self.target_ema_rate) + + # Delay update actor + loss_actor = 0 + if self.itr % self.actor_update_freq == 0: + for _ in range(2): + loss_actor = self.model.loss_actor( + {"state": obs_b}, + alpha, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + self.actor_optimizer.step() + + # Update temperature parameter + self.log_alpha_optimizer.zero_grad() + loss_alpha = self.model.loss_temperature( + {"state": obs_b}, + self.log_alpha.exp(), # with grad + self.target_entropy, + ) + loss_alpha.backward() + self.log_alpha_optimizer.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": cnt_train_step, + } + ) + if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps: + time = timer() + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | alpha {alpha:8.4f} | t {time:8.4f}" + ) + if self.use_wandb: + wandb_log_dict = { + "total env step": cnt_train_step, + "loss - critic": loss_critic, + "entropy coeff": alpha, + 
"avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + } + if loss_actor is not None: + wandb_log_dict["loss - actor"] = loss_actor + wandb.log( + wandb_log_dict, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/cfg/d3il/finetune/avoid_m1/ft_ppo_diffusion_mlp.yaml b/cfg/d3il/finetune/avoid_m1/ft_ppo_diffusion_mlp.yaml index da3b099..380cb8d 100644 --- a/cfg/d3il/finetune/avoid_m1/ft_ppo_diffusion_mlp.yaml +++ b/cfg/d3il/finetune/avoid_m1/ft_ppo_diffusion_mlp.yaml @@ -16,7 +16,6 @@ env_name: avoiding-m5 mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -102,7 +101,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/d3il/finetune/avoid_m1/ft_ppo_gaussian_mlp.yaml b/cfg/d3il/finetune/avoid_m1/ft_ppo_gaussian_mlp.yaml index 13526f1..a8a4645 100644 --- a/cfg/d3il/finetune/avoid_m1/ft_ppo_gaussian_mlp.yaml +++ b/cfg/d3il/finetune/avoid_m1/ft_ppo_gaussian_mlp.yaml @@ -16,7 +16,6 @@ env_name: avoiding-m5 mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -94,7 +93,7 @@ model: learn_fixed_std: False cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/d3il/finetune/avoid_m1/ft_ppo_gmm_mlp.yaml 
b/cfg/d3il/finetune/avoid_m1/ft_ppo_gmm_mlp.yaml index 11e98dd..32b4ff6 100644 --- a/cfg/d3il/finetune/avoid_m1/ft_ppo_gmm_mlp.yaml +++ b/cfg/d3il/finetune/avoid_m1/ft_ppo_gmm_mlp.yaml @@ -16,7 +16,6 @@ env_name: avoiding-m5 mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -95,7 +94,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/d3il/finetune/avoid_m2/ft_ppo_diffusion_mlp.yaml b/cfg/d3il/finetune/avoid_m2/ft_ppo_diffusion_mlp.yaml index 7dc7454..e6e70f0 100644 --- a/cfg/d3il/finetune/avoid_m2/ft_ppo_diffusion_mlp.yaml +++ b/cfg/d3il/finetune/avoid_m2/ft_ppo_diffusion_mlp.yaml @@ -16,7 +16,6 @@ env_name: avoiding-m5 mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -102,7 +101,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/d3il/finetune/avoid_m2/ft_ppo_gaussian_mlp.yaml b/cfg/d3il/finetune/avoid_m2/ft_ppo_gaussian_mlp.yaml index 86743b8..218bd5d 100644 --- a/cfg/d3il/finetune/avoid_m2/ft_ppo_gaussian_mlp.yaml +++ b/cfg/d3il/finetune/avoid_m2/ft_ppo_gaussian_mlp.yaml @@ -16,7 +16,6 @@ env_name: avoiding-m5 mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -94,7 +93,7 @@ model: learn_fixed_std: False cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} 
horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/d3il/finetune/avoid_m2/ft_ppo_gmm_mlp.yaml b/cfg/d3il/finetune/avoid_m2/ft_ppo_gmm_mlp.yaml index 960fc60..47e8ce6 100644 --- a/cfg/d3il/finetune/avoid_m2/ft_ppo_gmm_mlp.yaml +++ b/cfg/d3il/finetune/avoid_m2/ft_ppo_gmm_mlp.yaml @@ -16,7 +16,6 @@ env_name: avoiding-m5 mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -95,7 +94,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/d3il/finetune/avoid_m3/ft_ppo_diffusion_mlp.yaml b/cfg/d3il/finetune/avoid_m3/ft_ppo_diffusion_mlp.yaml index c22d961..ccdfd2e 100644 --- a/cfg/d3il/finetune/avoid_m3/ft_ppo_diffusion_mlp.yaml +++ b/cfg/d3il/finetune/avoid_m3/ft_ppo_diffusion_mlp.yaml @@ -16,7 +16,6 @@ env_name: avoiding-m5 mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -102,7 +101,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/d3il/finetune/avoid_m3/ft_ppo_gaussian_mlp.yaml b/cfg/d3il/finetune/avoid_m3/ft_ppo_gaussian_mlp.yaml index 4147674..562b4e0 100644 --- a/cfg/d3il/finetune/avoid_m3/ft_ppo_gaussian_mlp.yaml +++ b/cfg/d3il/finetune/avoid_m3/ft_ppo_gaussian_mlp.yaml @@ -16,7 +16,6 @@ env_name: avoiding-m5 mode: d58_r12 # M3, desired modes 
5 and 8, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -94,7 +93,7 @@ model: learn_fixed_std: False cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/d3il/finetune/avoid_m3/ft_ppo_gmm_mlp.yaml b/cfg/d3il/finetune/avoid_m3/ft_ppo_gmm_mlp.yaml index 476fb33..7bc41af 100644 --- a/cfg/d3il/finetune/avoid_m3/ft_ppo_gmm_mlp.yaml +++ b/cfg/d3il/finetune/avoid_m3/ft_ppo_gmm_mlp.yaml @@ -16,7 +16,6 @@ env_name: avoiding-m5 mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -95,7 +94,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml b/cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml index 8ac6f58..e9e46bd 100644 --- a/cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml +++ b/cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml @@ -15,7 +15,6 @@ env: avoid mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -50,7 +49,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml b/cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml index c4fd533..ad49b8f 100644 --- 
a/cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml +++ b/cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml @@ -15,7 +15,6 @@ env: avoid mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 @@ -47,7 +46,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml b/cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml index d980c50..eae560a 100644 --- a/cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml +++ b/cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml @@ -15,7 +15,6 @@ env: avoid mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 num_modes: 5 @@ -49,7 +48,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml b/cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml index 6ba8992..80c6c81 100644 --- a/cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml +++ b/cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml @@ -15,7 +15,6 @@ env: avoid mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -50,7 +49,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml 
b/cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml index 922d9d9..fc93398 100644 --- a/cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml +++ b/cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml @@ -15,7 +15,6 @@ env: avoid mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 @@ -47,7 +46,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml b/cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml index 52eb8f9..792fab5 100644 --- a/cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml +++ b/cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml @@ -15,7 +15,6 @@ env: avoid mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 num_modes: 5 @@ -49,7 +48,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml b/cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml index 7567116..292493c 100644 --- a/cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml +++ b/cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml @@ -15,7 +15,6 @@ env: avoid mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -50,7 +49,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: 
${action_dim} diff --git a/cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml b/cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml index fa58f74..587d0ba 100644 --- a/cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml +++ b/cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml @@ -15,7 +15,6 @@ env: avoid mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 @@ -47,7 +46,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml b/cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml index a9b24bf..07f5a0c 100644 --- a/cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml +++ b/cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml @@ -15,7 +15,6 @@ env: avoid mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2 obs_dim: 4 action_dim: 2 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 num_modes: 5 @@ -49,7 +48,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/furniture/eval/one_leg_low/eval_diffusion_mlp.yaml b/cfg/furniture/eval/one_leg_low/eval_diffusion_mlp.yaml index 6cb3c88..2f8d0ef 100644 --- a/cfg/furniture/eval/one_leg_low/eval_diffusion_mlp.yaml +++ b/cfg/furniture/eval/one_leg_low/eval_diffusion_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 cond_steps: 1 horizon_steps: 8 @@ -59,7 +58,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} 
+ action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_mlp.yaml index 0aaf430..c39af92 100644 --- a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_mlp.yaml +++ b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -105,7 +104,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml index 0008395..69011ec 100644 --- a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -107,7 +106,7 @@ model: cond_predict_scale: True groupnorm_eps: 1e-4 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/lamp_low/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/lamp_low/ft_ppo_gaussian_mlp.yaml index 1e20c5d..c2fd417 100644 --- a/cfg/furniture/finetune/lamp_low/ft_ppo_gaussian_mlp.yaml +++ b/cfg/furniture/finetune/lamp_low/ft_ppo_gaussian_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 8 act_steps: 8 @@ -98,7 +97,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_mlp.yaml index ca49f91..3d0b203 100644 --- a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_mlp.yaml +++ b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -105,7 +104,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml index 9ea2f1b..ce73c44 100644 --- a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -106,7 +105,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: 
${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/lamp_med/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/lamp_med/ft_ppo_gaussian_mlp.yaml index 6db6e73..bdf198e 100644 --- a/cfg/furniture/finetune/lamp_med/ft_ppo_gaussian_mlp.yaml +++ b/cfg/furniture/finetune/lamp_med/ft_ppo_gaussian_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 8 act_steps: 8 @@ -98,7 +97,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml index b801d5e..44cd23a 100644 --- a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml +++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -105,7 +104,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml index 5282f8b..54082c5 100644 --- a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: 
${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -107,7 +106,7 @@ model: cond_predict_scale: True groupnorm_eps: 1e-4 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_gaussian_mlp.yaml index 4a672cf..ba0186e 100644 --- a/cfg/furniture/finetune/one_leg_low/ft_ppo_gaussian_mlp.yaml +++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_gaussian_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 8 act_steps: 8 @@ -98,7 +97,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_mlp.yaml index 77f6193..cdca086 100644 --- a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_mlp.yaml +++ b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -105,7 +104,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: 
${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml index 38d7288..9484c40 100644 --- a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -107,7 +106,7 @@ model: cond_predict_scale: True groupnorm_eps: 1e-4 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/one_leg_med/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/one_leg_med/ft_ppo_gaussian_mlp.yaml index b3927ee..638df5e 100644 --- a/cfg/furniture/finetune/one_leg_med/ft_ppo_gaussian_mlp.yaml +++ b/cfg/furniture/finetune/one_leg_med/ft_ppo_gaussian_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 8 act_steps: 8 @@ -98,7 +97,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_mlp.yaml index 34adf89..7d3ecb6 100644 --- a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_mlp.yaml +++ b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: 
${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -105,7 +104,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml index fdc44f6..519d59f 100644 --- a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -107,7 +106,7 @@ model: cond_predict_scale: True groupnorm_eps: 1e-4 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/round_table_low/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/round_table_low/ft_ppo_gaussian_mlp.yaml index fc90dd0..c963c98 100644 --- a/cfg/furniture/finetune/round_table_low/ft_ppo_gaussian_mlp.yaml +++ b/cfg/furniture/finetune/round_table_low/ft_ppo_gaussian_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 8 act_steps: 8 @@ -98,7 +97,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: 
model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_mlp.yaml index 1a1c27b..d9dae77 100644 --- a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_mlp.yaml +++ b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -105,7 +104,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml index 3e32d0a..659fd30 100644 --- a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -106,7 +105,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/finetune/round_table_med/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/round_table_med/ft_ppo_gaussian_mlp.yaml index 7ced6f3..ff4f671 100644 --- a/cfg/furniture/finetune/round_table_med/ft_ppo_gaussian_mlp.yaml +++ 
b/cfg/furniture/finetune/round_table_med/ft_ppo_gaussian_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 8 act_steps: 8 @@ -98,7 +97,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/furniture/pretrain/lamp_low/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/lamp_low/pre_diffusion_mlp.yaml index 775916f..4b655d4 100644 --- a/cfg/furniture/pretrain/lamp_low/pre_diffusion_mlp.yaml +++ b/cfg/furniture/pretrain/lamp_low/pre_diffusion_mlp.yaml @@ -16,7 +16,6 @@ randomness: low env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 8 cond_steps: 1 @@ -52,7 +51,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/furniture/pretrain/lamp_low/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/lamp_low/pre_diffusion_unet.yaml index fc22513..45b0d2e 100644 --- a/cfg/furniture/pretrain/lamp_low/pre_diffusion_unet.yaml +++ b/cfg/furniture/pretrain/lamp_low/pre_diffusion_unet.yaml @@ -16,7 +16,6 @@ randomness: low env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 16 cond_steps: 1 @@ -54,7 +53,7 @@ model: cond_predict_scale: True groupnorm_eps: 1e-4 # not important cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff 
--git a/cfg/furniture/pretrain/lamp_low/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/lamp_low/pre_gaussian_mlp.yaml index 2d700be..bc451c7 100644 --- a/cfg/furniture/pretrain/lamp_low/pre_gaussian_mlp.yaml +++ b/cfg/furniture/pretrain/lamp_low/pre_gaussian_mlp.yaml @@ -16,7 +16,6 @@ randomness: low env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} horizon_steps: 8 cond_steps: 1 @@ -49,7 +48,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/furniture/pretrain/lamp_med/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/lamp_med/pre_diffusion_mlp.yaml index 827739c..d24459b 100644 --- a/cfg/furniture/pretrain/lamp_med/pre_diffusion_mlp.yaml +++ b/cfg/furniture/pretrain/lamp_med/pre_diffusion_mlp.yaml @@ -16,7 +16,6 @@ randomness: med env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 8 cond_steps: 1 @@ -52,7 +51,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/furniture/pretrain/lamp_med/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/lamp_med/pre_diffusion_unet.yaml index 5d7bc9f..478562f 100644 --- a/cfg/furniture/pretrain/lamp_med/pre_diffusion_unet.yaml +++ b/cfg/furniture/pretrain/lamp_med/pre_diffusion_unet.yaml @@ -16,7 +16,6 @@ randomness: med env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 16 cond_steps: 1 @@ -53,7 +52,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + 
action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/furniture/pretrain/lamp_med/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/lamp_med/pre_gaussian_mlp.yaml index 8bed467..e74dcd4 100644 --- a/cfg/furniture/pretrain/lamp_med/pre_gaussian_mlp.yaml +++ b/cfg/furniture/pretrain/lamp_med/pre_gaussian_mlp.yaml @@ -16,7 +16,6 @@ randomness: med env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} horizon_steps: 8 cond_steps: 1 @@ -49,7 +48,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/furniture/pretrain/one_leg_low/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_mlp.yaml index c996393..e751175 100644 --- a/cfg/furniture/pretrain/one_leg_low/pre_diffusion_mlp.yaml +++ b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_mlp.yaml @@ -16,7 +16,6 @@ randomness: low env: ${task}_${randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 8 cond_steps: 1 @@ -52,7 +51,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/furniture/pretrain/one_leg_low/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_unet.yaml index 253b89b..e981119 100644 --- a/cfg/furniture/pretrain/one_leg_low/pre_diffusion_unet.yaml +++ b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_unet.yaml @@ -16,7 +16,6 @@ randomness: low env: ${task}_${randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 16 cond_steps: 1 @@ -54,7 +53,7 @@ model: 
cond_predict_scale: True groupnorm_eps: 1e-4 # not important cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/furniture/pretrain/one_leg_low/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/one_leg_low/pre_gaussian_mlp.yaml index fe6da75..ff9919c 100644 --- a/cfg/furniture/pretrain/one_leg_low/pre_gaussian_mlp.yaml +++ b/cfg/furniture/pretrain/one_leg_low/pre_gaussian_mlp.yaml @@ -16,7 +16,6 @@ randomness: low env: ${task}_${randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} horizon_steps: 8 cond_steps: 1 @@ -49,7 +48,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/furniture/pretrain/one_leg_med/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_mlp.yaml index 57bad4b..1ce6cd9 100644 --- a/cfg/furniture/pretrain/one_leg_med/pre_diffusion_mlp.yaml +++ b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_mlp.yaml @@ -16,7 +16,6 @@ randomness: med env: ${task}_${randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 8 cond_steps: 1 @@ -52,7 +51,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/furniture/pretrain/one_leg_med/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_unet.yaml index d588590..bca3be5 100644 --- a/cfg/furniture/pretrain/one_leg_med/pre_diffusion_unet.yaml +++ b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_unet.yaml @@ -16,7 +16,6 @@ randomness: med env: 
${task}_${randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 16 cond_steps: 1 @@ -53,7 +52,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/furniture/pretrain/one_leg_med/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/one_leg_med/pre_gaussian_mlp.yaml index df64f22..4cce367 100644 --- a/cfg/furniture/pretrain/one_leg_med/pre_gaussian_mlp.yaml +++ b/cfg/furniture/pretrain/one_leg_med/pre_gaussian_mlp.yaml @@ -16,7 +16,6 @@ randomness: med env: ${task}_${randomness}_dim obs_dim: 58 action_dim: 10 -transition_dim: ${action_dim} horizon_steps: 8 cond_steps: 1 @@ -49,7 +48,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/furniture/pretrain/round_table_low/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/round_table_low/pre_diffusion_mlp.yaml index 9930155..5ac604e 100644 --- a/cfg/furniture/pretrain/round_table_low/pre_diffusion_mlp.yaml +++ b/cfg/furniture/pretrain/round_table_low/pre_diffusion_mlp.yaml @@ -16,7 +16,6 @@ randomness: low env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 8 cond_steps: 1 @@ -52,7 +51,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/furniture/pretrain/round_table_low/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/round_table_low/pre_diffusion_unet.yaml index a9a9de1..9d20e49 100644 --- 
a/cfg/furniture/pretrain/round_table_low/pre_diffusion_unet.yaml +++ b/cfg/furniture/pretrain/round_table_low/pre_diffusion_unet.yaml @@ -16,7 +16,6 @@ randomness: low env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 16 cond_steps: 1 @@ -53,7 +52,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/furniture/pretrain/round_table_low/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/round_table_low/pre_gaussian_mlp.yaml index 2d308d5..218559e 100644 --- a/cfg/furniture/pretrain/round_table_low/pre_gaussian_mlp.yaml +++ b/cfg/furniture/pretrain/round_table_low/pre_gaussian_mlp.yaml @@ -16,7 +16,6 @@ randomness: low env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} horizon_steps: 8 cond_steps: 1 @@ -49,7 +48,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/furniture/pretrain/round_table_med/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/round_table_med/pre_diffusion_mlp.yaml index a8f0d83..d4e0c56 100644 --- a/cfg/furniture/pretrain/round_table_med/pre_diffusion_mlp.yaml +++ b/cfg/furniture/pretrain/round_table_med/pre_diffusion_mlp.yaml @@ -16,7 +16,6 @@ randomness: med env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 8 cond_steps: 1 @@ -52,7 +51,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: 
${action_dim} diff --git a/cfg/furniture/pretrain/round_table_med/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/round_table_med/pre_diffusion_unet.yaml index e0c351a..1965d40 100644 --- a/cfg/furniture/pretrain/round_table_med/pre_diffusion_unet.yaml +++ b/cfg/furniture/pretrain/round_table_med/pre_diffusion_unet.yaml @@ -16,7 +16,6 @@ randomness: med env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 16 cond_steps: 1 @@ -53,7 +52,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/furniture/pretrain/round_table_med/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/round_table_med/pre_gaussian_mlp.yaml index 543e16c..1ade284 100644 --- a/cfg/furniture/pretrain/round_table_med/pre_gaussian_mlp.yaml +++ b/cfg/furniture/pretrain/round_table_med/pre_gaussian_mlp.yaml @@ -16,7 +16,6 @@ randomness: med env: ${task}_${randomness}_dim obs_dim: 44 action_dim: 10 -transition_dim: ${action_dim} horizon_steps: 8 cond_steps: 1 @@ -49,7 +48,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/gym/eval/halfcheetah-v2/eval_diffusion_mlp.yaml b/cfg/gym/eval/halfcheetah-v2/eval_diffusion_mlp.yaml new file mode 100644 index 0000000..bfef1e1 --- /dev/null +++ b/cfg/gym/eval/halfcheetah-v2/eval_diffusion_mlp.yaml @@ -0,0 +1,61 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent + +name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} 
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +denoising_steps: 20 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +n_steps: 1000 # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation. +render_num: 0 + +env: + n_envs: 40 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + # + network_path: ${base_policy_path} + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + activation_type: ReLU + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/eval/halfcheetah-v2/eval_gaussian_mlp.yaml b/cfg/gym/eval/halfcheetah-v2/eval_gaussian_mlp.yaml new file mode 100644 index 0000000..62c62bb --- /dev/null +++ b/cfg/gym/eval/halfcheetah-v2/eval_gaussian_mlp.yaml @@ -0,0 +1,54 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent + +name: 
${env_name}_eval_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +n_steps: 1000 # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation. +render_num: 0 + +env: + n_envs: 40 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +model: + _target_: model.common.gaussian.GaussianModel + # + network_path: ${base_policy_path} + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + horizon_steps: ${horizon_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/eval/hopper-v2/eval_diffusion_mlp.yaml b/cfg/gym/eval/hopper-v2/eval_diffusion_mlp.yaml index bd4af16..754ed1e 100644 --- a/cfg/gym/eval/hopper-v2/eval_diffusion_mlp.yaml +++ b/cfg/gym/eval/hopper-v2/eval_diffusion_mlp.yaml @@ -15,7 +15,6 @@ device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -54,7 +53,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: 
${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml b/cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml new file mode 100644 index 0000000..897e06a --- /dev/null +++ b/cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml @@ -0,0 +1,54 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent + +name: ${env_name}_eval_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +n_steps: 1000 # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation. 
+render_num: 0 + +env: + n_envs: 40 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +model: + _target_: model.common.gaussian.GaussianModel + # + network_path: ${base_policy_path} + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + horizon_steps: ${horizon_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml b/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml new file mode 100644 index 0000000..cef9f0f --- /dev/null +++ b/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml @@ -0,0 +1,117 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: 
${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 10000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 10 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git 
a/cfg/gym/finetune/halfcheetah-v2/ft_awr_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_awr_diffusion_mlp.yaml index 4a99ae1..676abdd 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ft_awr_diffusion_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ft_awr_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: halfcheetah-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -68,7 +68,7 @@ train: max_adv_weight: 100 beta: 10 buffer_size: 5000 - batch_size: 256 + batch_size: 1000 replay_ratio: 64 critic_update_ratio: 4 @@ -82,7 +82,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_dipo_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_dipo_diffusion_mlp.yaml index f132db1..651588f 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ft_dipo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ft_dipo_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} 
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: halfcheetah-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -65,11 +65,12 @@ train: num: 0 # DIPO specific scale_reward_factor: 0.01 - eta: 0.0001 + target_ema_rate: 0.005 + buffer_size: 1000000 + action_lr: 0.0001 action_gradient_steps: 10 - buffer_size: 400000 - batch_size: 5000 replay_ratio: 64 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dipo.DIPODiffusion @@ -81,7 +82,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_dql_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_dql_diffusion_mlp.yaml index 48a907a..fd1d917 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ft_dql_diffusion_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ft_dql_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: halfcheetah-medium-v2 
obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -65,10 +65,11 @@ train: num: 0 # DQL specific scale_reward_factor: 0.01 + target_ema_rate: 0.005 + buffer_size: 1000000 eta: 1.0 - buffer_size: 400000 - batch_size: 5000 - replay_ratio: 64 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dql.DQLDiffusion @@ -80,7 +81,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_idql_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_idql_diffusion_mlp.yaml index 2c82fdd..0d115d0 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ft_idql_diffusion_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ft_idql_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: halfcheetah-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -69,9 +69,9 @@ train: eval_sample_num: 20 # how many samples to score during eval critic_tau: 0.001 # rate of target q network update use_expectile_exploration: True - buffer_size: 5000 - batch_size: 512 - replay_ratio: 16 + buffer_size: 25000 # * n_envs + replay_ratio: 128 + batch_size: 
1000 model: _target_: model.diffusion.diffusion_idql.IDQLDiffusion @@ -83,7 +83,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml index 1363b2d..fbcea35 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: halfcheetah-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -93,7 +93,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_exact_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_exact_diffusion_mlp.yaml index 2a71ecf..d28fc90 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_exact_diffusion_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_exact_diffusion_mlp.yaml @@ 
-6,15 +6,15 @@ hydra: _target_: agent.finetune.train_ppo_exact_diffusion_agent.TrainPPOExactDiffusionAgent name: ${env_name}_ppo_exact_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: halfcheetah-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -87,7 +87,6 @@ model: sde_min_beta: 1e-10 sde_probability_flow: True # - gamma_denoising: 0.99 clip_ploss_coef: 0.01 min_sampling_denoising_std: 0.1 min_logprob_denoising_std: 0.1 @@ -101,7 +100,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_qsm_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_qsm_diffusion_mlp.yaml index bdf4a1b..ace0ab9 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ft_qsm_diffusion_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ft_qsm_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: 
${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: halfcheetah-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -65,11 +65,11 @@ train: num: 0 # QSM specific scale_reward_factor: 0.01 - q_grad_coeff: 50 - critic_tau: 0.005 # rate of target q network update - buffer_size: 5000 - batch_size: 256 - replay_ratio: 32 + q_grad_coeff: 10 + critic_tau: 0.005 + buffer_size: 25000 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_qsm.QSMDiffusion @@ -81,7 +81,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_rwr_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_rwr_diffusion_mlp.yaml index ccb3a63..817d89c 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ft_rwr_diffusion_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ft_rwr_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: halfcheetah-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} 
denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -73,7 +73,7 @@ model: network: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml new file mode 100644 index 0000000..adfec91 --- /dev/null +++ b/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml @@ -0,0 +1,109 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +base_policy_path: +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: ibrl-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 300000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 2000 + render: + freq: 1 + num: 0 + log_freq: 
200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 5 + buffer_size: 300000 + n_eval_episode: 10 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.rl.gaussian_ibrl.IBRL_Gaussian + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 50 \ No newline at end of file diff --git a/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml index 05a60c8..9be391c 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml @@ -6,14 +6,14 @@ hydra: _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent name: ${env_name}_nopre_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: halfcheetah-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: 
${action_dim} denoising_steps: 20 ft_denoising_steps: 20 cond_steps: 1 @@ -86,7 +86,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml index 4eb1cbb..f09c664 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml @@ -6,14 +6,14 @@ hydra: _target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent name: ${env_name}_nopre_ppo_gaussian_mlp_ta${horizon_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: halfcheetah-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 1 act_steps: 1 @@ -79,10 +79,10 @@ model: _target_: model.common.mlp_gaussian.Gaussian_MLP mlp_dims: [512, 512, 512] activation_type: ReLU - residual_style: True + residual_style: False # with new logvar head cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml new file mode 100644 index 0000000..898cf9b --- /dev/null +++ b/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml @@ -0,0 +1,109 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: 
${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 250000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 50000 + val_freq: 5000 + render: + freq: 1 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.005 + scale_reward_factor: 1 + critic_num_update: 20 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 5000 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + tanh_output: True # squash after sampling + backup_entropy: True + n_critics: 10 # Ensemble size for critic models + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: 
${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml new file mode 100644 index 0000000..8051c73 --- /dev/null +++ b/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml @@ -0,0 +1,89 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_sac_agent.TrainSACAgent + +name: ${env_name}_sac_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: sac-gym-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + critic_lr: 1e-3 + save_model_freq: 100000 + 
val_freq: 10000 + render: + freq: 1 + num: 0 + log_freq: 200 + # SAC specific + batch_size: 256 + target_ema_rate: 0.005 + scale_reward_factor: 1 + critic_replay_ratio: 256 + actor_replay_ratio: 128 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 5000 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_sac.SAC_Gaussian + randn_clip_value: 10 + tanh_output: True # squash after sampling + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: # no layernorm + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} diff --git a/cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml b/cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml new file mode 100644 index 0000000..10204ba --- /dev/null +++ b/cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml @@ -0,0 +1,117 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + 
mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 10000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 10 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: 
${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/gym/finetune/hopper-v2/ft_awr_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_awr_diffusion_mlp.yaml index 6fb1f04..ee7193f 100644 --- a/cfg/gym/finetune/hopper-v2/ft_awr_diffusion_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/ft_awr_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -68,7 +68,7 @@ train: max_adv_weight: 100 beta: 10 buffer_size: 5000 - batch_size: 256 + batch_size: 1000 replay_ratio: 64 critic_update_ratio: 4 @@ -82,7 +82,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/hopper-v2/ft_dipo_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_dipo_diffusion_mlp.yaml index 0bee961..8ca4f63 100644 --- a/cfg/gym/finetune/hopper-v2/ft_dipo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/ft_dipo_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent name: 
${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -65,11 +65,12 @@ train: num: 0 # DIPO specific scale_reward_factor: 0.01 - eta: 0.0001 + target_ema_rate: 0.005 + buffer_size: 1000000 + action_lr: 0.0001 action_gradient_steps: 10 - buffer_size: 400000 - batch_size: 5000 replay_ratio: 64 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dipo.DIPODiffusion @@ -81,7 +82,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/hopper-v2/ft_dql_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_dql_diffusion_mlp.yaml index 4ca0a9d..2dd7572 100644 --- a/cfg/gym/finetune/hopper-v2/ft_dql_diffusion_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/ft_dql_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt 
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -65,10 +65,11 @@ train: num: 0 # DQL specific scale_reward_factor: 0.01 + target_ema_rate: 0.005 + buffer_size: 1000000 eta: 1.0 - buffer_size: 400000 - batch_size: 5000 - replay_ratio: 64 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dql.DQLDiffusion @@ -80,7 +81,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/hopper-v2/ft_idql_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_idql_diffusion_mlp.yaml index eb345d8..42ef50b 100644 --- a/cfg/gym/finetune/hopper-v2/ft_idql_diffusion_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/ft_idql_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -69,9 +69,9 @@ train: eval_sample_num: 20 # how many samples to score during eval critic_tau: 0.001 # rate of target q network update use_expectile_exploration: True - buffer_size: 5000 - 
batch_size: 512 - replay_ratio: 16 + buffer_size: 25000 # * n_envs + replay_ratio: 128 + batch_size: 1000 model: _target_: model.diffusion.diffusion_idql.IDQLDiffusion @@ -83,7 +83,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml index 3467f91..d4b9597 100644 --- a/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -93,7 +93,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/gym/finetune/hopper-v2/ft_ppo_exact_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_ppo_exact_diffusion_mlp.yaml index 3f44524..323a8ed 100644 --- a/cfg/gym/finetune/hopper-v2/ft_ppo_exact_diffusion_mlp.yaml +++ 
b/cfg/gym/finetune/hopper-v2/ft_ppo_exact_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_ppo_exact_diffusion_agent.TrainPPOExactDiffusionAgent name: ${env_name}_ppo_exact_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -87,7 +87,6 @@ model: sde_min_beta: 1e-10 sde_probability_flow: True # - gamma_denoising: 0.99 clip_ploss_coef: 0.01 min_sampling_denoising_std: 0.1 min_logprob_denoising_std: 0.1 @@ -100,7 +99,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/gym/finetune/hopper-v2/ft_qsm_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_qsm_diffusion_mlp.yaml index 3656300..41270be 100644 --- a/cfg/gym/finetune/hopper-v2/ft_qsm_diffusion_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/ft_qsm_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: 
${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -65,11 +65,11 @@ train: num: 0 # QSM specific scale_reward_factor: 0.01 - q_grad_coeff: 50 - critic_tau: 0.005 # rate of target q network update - buffer_size: 5000 - batch_size: 256 - replay_ratio: 32 + q_grad_coeff: 10 + critic_tau: 0.005 + buffer_size: 25000 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_qsm.QSMDiffusion @@ -81,7 +81,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/hopper-v2/ft_rwr_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_rwr_diffusion_mlp.yaml index 2cdc50d..7335253 100644 --- a/cfg/gym/finetune/hopper-v2/ft_rwr_diffusion_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/ft_rwr_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ 
-73,7 +73,7 @@ model: network: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml b/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml new file mode 100644 index 0000000..1737a1e --- /dev/null +++ b/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml @@ -0,0 +1,108 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +base_policy_path: +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: ibrl-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 250000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 2000 + render: + freq: 1 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + 
scale_reward_factor: 1 + critic_num_update: 5 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.rl.gaussian_ibrl.IBRL_Gaussian + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 0.1 + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml index 89c75c1..3f26654 100644 --- a/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml @@ -6,14 +6,14 @@ hydra: _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent name: ${env_name}_nopre_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 20 cond_steps: 1 @@ -86,7 +86,7 @@ model: actor: _target_: 
model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml b/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml index 17865c5..57eafcb 100644 --- a/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml @@ -6,14 +6,14 @@ hydra: _target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent name: ${env_name}_nopre_ppo_gaussian_mlp_ta${horizon_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 1 act_steps: 1 @@ -79,10 +79,10 @@ model: _target_: model.common.mlp_gaussian.Gaussian_MLP mlp_dims: [512, 512, 512] activation_type: ReLU - residual_style: True + residual_style: False # with new logvar head cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml b/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml new file mode 100644 index 0000000..7a33bde --- /dev/null +++ b/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml @@ -0,0 +1,109 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: 
${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 250000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 50000 + val_freq: 5000 + render: + freq: 1 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.005 + scale_reward_factor: 1 + critic_num_update: 20 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 5000 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + tanh_output: True # squash after sampling + backup_entropy: True + n_critics: 10 # Ensemble size for critic models + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + 
use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/hopper-v2/sac_mlp.yaml b/cfg/gym/finetune/hopper-v2/sac_mlp.yaml new file mode 100644 index 0000000..6d44909 --- /dev/null +++ b/cfg/gym/finetune/hopper-v2/sac_mlp.yaml @@ -0,0 +1,89 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_sac_agent.TrainSACAgent + +name: ${env_name}_sac_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: sac-gym-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + critic_lr: 1e-3 + save_model_freq: 100000 + val_freq: 10000 + render: + freq: 1 + num: 0 + log_freq: 200 + # SAC specific + batch_size: 256 + target_ema_rate: 0.005 + scale_reward_factor: 1 + critic_replay_ratio: 256 + 
actor_replay_ratio: 128 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 5000 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_sac.SAC_Gaussian + randn_clip_value: 10 + tanh_output: True # squash after sampling + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: # no layernorm + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} diff --git a/cfg/gym/finetune/walker2d-v2/ft_awr_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_awr_diffusion_mlp.yaml index a529e3d..7d23ba1 100644 --- a/cfg/gym/finetune/walker2d-v2/ft_awr_diffusion_mlp.yaml +++ b/cfg/gym/finetune/walker2d-v2/ft_awr_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: walker2d-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -68,7 +68,7 @@ train: max_adv_weight: 100 beta: 10 buffer_size: 5000 - batch_size: 256 + batch_size: 1000 replay_ratio: 64 
critic_update_ratio: 4 @@ -82,7 +82,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/walker2d-v2/ft_dipo_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_dipo_diffusion_mlp.yaml index 7ef6db2..5cc2417 100644 --- a/cfg/gym/finetune/walker2d-v2/ft_dipo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/walker2d-v2/ft_dipo_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: walker2d-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -65,11 +65,12 @@ train: num: 0 # DIPO specific scale_reward_factor: 0.01 - eta: 0.0001 + target_ema_rate: 0.005 + buffer_size: 1000000 + action_lr: 0.0001 action_gradient_steps: 10 - buffer_size: 400000 - batch_size: 5000 replay_ratio: 64 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dipo.DIPODiffusion @@ -81,7 +82,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/walker2d-v2/ft_dql_diffusion_mlp.yaml 
b/cfg/gym/finetune/walker2d-v2/ft_dql_diffusion_mlp.yaml index e6b8d9e..a764cf0 100644 --- a/cfg/gym/finetune/walker2d-v2/ft_dql_diffusion_mlp.yaml +++ b/cfg/gym/finetune/walker2d-v2/ft_dql_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: walker2d-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -65,10 +65,11 @@ train: num: 0 # DQL specific scale_reward_factor: 0.01 + target_ema_rate: 0.005 + buffer_size: 1000000 eta: 1.0 - buffer_size: 400000 - batch_size: 5000 - replay_ratio: 64 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dql.DQLDiffusion @@ -80,7 +81,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/walker2d-v2/ft_idql_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_idql_diffusion_mlp.yaml index 3c46b89..bea8ae9 100644 --- a/cfg/gym/finetune/walker2d-v2/ft_idql_diffusion_mlp.yaml +++ b/cfg/gym/finetune/walker2d-v2/ft_idql_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: 
${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: walker2d-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -69,9 +69,9 @@ train: eval_sample_num: 20 # how many samples to score during eval critic_tau: 0.001 # rate of target q network update use_expectile_exploration: True - buffer_size: 5000 - batch_size: 512 - replay_ratio: 16 + buffer_size: 25000 # * n_envs + replay_ratio: 128 + batch_size: 1000 model: _target_: model.diffusion.diffusion_idql.IDQLDiffusion @@ -83,7 +83,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml index 00dd321..9158042 100644 --- a/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt normalization_path: 
${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: walker2d-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -93,7 +93,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/gym/finetune/walker2d-v2/ft_qsm_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_qsm_diffusion_mlp.yaml index e91d9ff..4bb6915 100644 --- a/cfg/gym/finetune/walker2d-v2/ft_qsm_diffusion_mlp.yaml +++ b/cfg/gym/finetune/walker2d-v2/ft_qsm_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: walker2d-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -65,11 +65,11 @@ train: num: 0 # QSM specific scale_reward_factor: 0.01 - q_grad_coeff: 50 - critic_tau: 0.005 # rate of target q network update - buffer_size: 5000 - batch_size: 256 - replay_ratio: 32 + q_grad_coeff: 10 + critic_tau: 0.005 + buffer_size: 25000 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_qsm.QSMDiffusion @@ -81,7 +81,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: 
${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml new file mode 100644 index 0000000..42dcdf5 --- /dev/null +++ b/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml @@ -0,0 +1,103 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: walker2d-medium-v2 +obs_dim: 17 +action_dim: 6 +denoising_steps: 20 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 40 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-gym-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_critic_warmup_itr: 5 + n_steps: 2000 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # RLPD specific + batch_size: 512 + entropy_temperature: 1.0 # alpha in RLPD paper + target_ema_rate: 0.005 # 
rho in RLPD paper + scale_reward_factor: 1.0 # multiply reward by this amount for more stable value estimation + replay_ratio: 64 # number of batches to sample for each learning update + buffer_size: 1000000 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 3 + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + action_dim: ${action_dim} + action_steps: ${act_steps} + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + use_layernorm: True + horizon_steps: ${horizon_steps} + device: ${device} + n_critics: 2 # Ensemble size for critic models + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/walker2d-v2/ft_rwr_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_rwr_diffusion_mlp.yaml index 6be6eb6..f78103a 100644 --- a/cfg/gym/finetune/walker2d-v2/ft_rwr_diffusion_mlp.yaml +++ b/cfg/gym/finetune/walker2d-v2/ft_rwr_diffusion_mlp.yaml @@ -6,15 +6,15 @@ hydra: _target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 
device: cuda:0 env_name: walker2d-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -73,7 +73,7 @@ model: network: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml index dafbea6..6530d49 100644 --- a/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml @@ -6,14 +6,14 @@ hydra: _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent name: ${env_name}_nopre_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: walker2d-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 20 cond_steps: 1 @@ -86,7 +86,7 @@ model: actor: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml index a644eda..dff57a3 100644 --- a/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml +++ b/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml @@ -6,14 +6,14 @@ hydra: _target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent name: ${env_name}_nopre_ppo_gaussian_mlp_ta${horizon_steps} -logdir: 
${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +seed: 42 device: cuda:0 env_name: walker2d-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 1 act_steps: 1 @@ -79,10 +79,10 @@ model: _target_: model.common.mlp_gaussian.Gaussian_MLP mlp_dims: [512, 512, 512] activation_type: ReLU - residual_style: True + residual_style: False # with new logvar head cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml b/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml new file mode 100644 index 0000000..7dfd7ed --- /dev/null +++ b/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml @@ -0,0 +1,113 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: 
${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 100 + n_steps: 1 # not used + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 10 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/gym/pretrain/halfcheetah-medium-v2/pre_diffusion_mlp.yaml 
b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_diffusion_mlp.yaml index 6e20b5f..612bc5e 100644 --- a/cfg/gym/pretrain/halfcheetah-medium-v2/pre_diffusion_mlp.yaml +++ b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_diffusion_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: halfcheetah-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -44,7 +43,7 @@ model: network: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml new file mode 100644 index 0000000..9a7f5bf --- /dev/null +++ b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml @@ -0,0 +1,59 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:0 +env: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 500 + batch_size: 128 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 100 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: 
${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml b/cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml new file mode 100644 index 0000000..24f8957 --- /dev/null +++ b/cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml @@ -0,0 +1,113 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 100 + n_steps: 1 # not used + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 10 + val_freq: 10 
+ render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/gym/pretrain/hopper-medium-v2/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/hopper-medium-v2/pre_diffusion_mlp.yaml index c1428a6..4b05bba 100644 --- a/cfg/gym/pretrain/hopper-medium-v2/pre_diffusion_mlp.yaml +++ b/cfg/gym/pretrain/hopper-medium-v2/pre_diffusion_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: hopper-medium-v2 obs_dim: 11 action_dim: 3 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -44,7 +43,7 @@ model: network: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: 
${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml new file mode 100644 index 0000000..5a11734 --- /dev/null +++ b/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml @@ -0,0 +1,59 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:0 +env: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 500 + batch_size: 128 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 100 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/walker2d-medium-v2/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/walker2d-medium-v2/pre_diffusion_mlp.yaml index 893caa5..8118aba 100644 --- a/cfg/gym/pretrain/walker2d-medium-v2/pre_diffusion_mlp.yaml +++ 
b/cfg/gym/pretrain/walker2d-medium-v2/pre_diffusion_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: walker2d-medium-v2 obs_dim: 17 action_dim: 6 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -44,7 +43,7 @@ model: network: _target_: model.diffusion.mlp_diffusion.DiffusionMLP horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} time_dim: 16 mlp_dims: [512, 512, 512] diff --git a/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml new file mode 100644 index 0000000..dca4923 --- /dev/null +++ b/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml @@ -0,0 +1,59 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:0 +env: walker2d-medium-v2 +obs_dim: 17 +action_dim: 6 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain-gaussian + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 3000 + batch_size: 128 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 3000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 100 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256] + activation_type: ReLU + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: 
agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/eval/can/eval_diffusion_mlp.yaml b/cfg/robomimic/eval/can/eval_diffusion_mlp.yaml index 4d98a76..d19359e 100644 --- a/cfg/robomimic/eval/can/eval_diffusion_mlp.yaml +++ b/cfg/robomimic/eval/can/eval_diffusion_mlp.yaml @@ -7,7 +7,7 @@ _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} -base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt # use 8000 for comparing policy parameterizations +base_policy_path: robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -58,7 +57,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/eval/can/eval_diffusion_mlp_img.yaml b/cfg/robomimic/eval/can/eval_diffusion_mlp_img.yaml index fb302fc..b100545 100644 --- a/cfg/robomimic/eval/can/eval_diffusion_mlp_img.yaml +++ b/cfg/robomimic/eval/can/eval_diffusion_mlp_img.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 9 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 100 cond_steps: 1 img_cond_steps: 1 @@ -90,7 +89,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} 
horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/eval/can/eval_gaussian_mlp.yaml b/cfg/robomimic/eval/can/eval_gaussian_mlp.yaml index 4ac40b7..25a3719 100644 --- a/cfg/robomimic/eval/can/eval_gaussian_mlp.yaml +++ b/cfg/robomimic/eval/can/eval_gaussian_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -55,6 +54,6 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/eval/can/eval_gaussian_mlp_img.yaml b/cfg/robomimic/eval/can/eval_gaussian_mlp_img.yaml index 1492304..7aa0269 100644 --- a/cfg/robomimic/eval/can/eval_gaussian_mlp_img.yaml +++ b/cfg/robomimic/eval/can/eval_gaussian_mlp_img.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 9 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 img_cond_steps: 1 horizon_steps: 4 @@ -82,6 +81,6 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/eval/square/eval_diffusion_mlp.yaml b/cfg/robomimic/eval/square/eval_diffusion_mlp.yaml new file mode 100644 index 0000000..759c653 --- /dev/null +++ b/cfg/robomimic/eval/square/eval_diffusion_mlp.yaml @@ -0,0 +1,66 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent + +name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: 
${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +n_steps: 400 # each episode takes max_episode_steps / act_steps steps +render_num: 0 + +env: + n_envs: 50 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 400 + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + randn_clip_value: 3 + # + network_path: ${base_policy_path} + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/eval/square/eval_gaussian_mlp.yaml b/cfg/robomimic/eval/square/eval_gaussian_mlp.yaml new file mode 100644 index 0000000..3e6a089 --- /dev/null +++ b/cfg/robomimic/eval/square/eval_gaussian_mlp.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent + +name: 
${env_name}_eval_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +n_steps: 400 # each episode takes max_episode_steps / act_steps steps +render_num: 0 + +env: + n_envs: 50 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 400 + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +model: + _target_: model.common.gaussian.GaussianModel + randn_clip_value: 3 + # + network_path: ${base_policy_path} + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: true + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + horizon_steps: ${horizon_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/finetune/can/calql_mlp_online.yaml b/cfg/robomimic/finetune/can/calql_mlp_online.yaml new file mode 100644 index 0000000..8fc1a3c --- /dev/null +++ b/cfg/robomimic/finetune/can/calql_mlp_online.yaml @@ -0,0 +1,122 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: +robomimic_env_cfg_path: 
cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 10000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + 
tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml index 4b095d6..2a8343a 100644 --- a/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -73,7 +72,7 @@ train: max_adv_weight: 100 beta: 10 buffer_size: 3000 - batch_size: 256 + batch_size: 1000 replay_ratio: 64 critic_update_ratio: 4 @@ -91,7 +90,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/robomimic/finetune/can/ft_dipo_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_dipo_diffusion_mlp.yaml index a6876c4..55ebccc 100644 --- a/cfg/robomimic/finetune/can/ft_dipo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_dipo_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} 
denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -70,11 +69,12 @@ train: num: 0 # DIPO specific scale_reward_factor: 1 - eta: 0.0001 + target_ema_rate: 0.005 + buffer_size: 1000000 + action_lr: 0.0001 action_gradient_steps: 10 - buffer_size: 400000 - batch_size: 5000 - replay_ratio: 64 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dipo.DIPODiffusion @@ -90,7 +90,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml index 9086fe9..ed9c90f 100644 --- a/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -48,7 +47,7 @@ wandb: train: n_train_itr: 300 - n_critic_warmup_itr: 2 + n_critic_warmup_itr: 5 n_steps: 300 gamma: 0.999 actor_lr: 1e-5 @@ -70,10 +69,11 @@ train: num: 0 # DQL specific scale_reward_factor: 1 + target_ema_rate: 0.005 + buffer_size: 1000000 eta: 1.0 - buffer_size: 400000 - batch_size: 5000 - replay_ratio: 64 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dql.DQLDiffusion @@ -89,7 +89,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml index d0ee1c5..24bb53a 100644 --- a/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml +++ 
b/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -74,9 +73,9 @@ train: eval_sample_num: 10 # how many samples to score during eval critic_tau: 0.001 # rate of target q network update use_expectile_exploration: True - buffer_size: 3000 - batch_size: 512 - replay_ratio: 16 + buffer_size: 5000 # * n_envs + replay_ratio: 128 + batch_size: 1000 model: _target_: model.diffusion.diffusion_idql.IDQLDiffusion @@ -92,7 +91,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic_q: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml index e91c255..ba1fa16 100644 --- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -97,7 +96,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml index ef77c2f..0873cb4 100644 --- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml +++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 9 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -140,7 +139,7 @@ 
model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.ViTCritic spatial_emb: 128 diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet.yaml index 6c94bf6..6f3c0ce 100644 --- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet.yaml +++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -100,7 +99,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/robomimic/finetune/can/ft_ppo_exact_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_ppo_exact_diffusion_mlp.yaml index 617bdab..8a97a68 100644 --- a/cfg/robomimic/finetune/can/ft_ppo_exact_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_ppo_exact_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -92,7 +91,6 @@ model: sde_min_beta: 1e-10 sde_probability_flow: True # - gamma_denoising: 0.99 clip_ploss_coef: 0.01 min_sampling_denoising_std: 0.1 min_logprob_denoising_std: 0.1 @@ -105,7 +103,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp.yaml b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp.yaml index cc267e8..1f093e2 100644 --- 
a/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -91,7 +90,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp_img.yaml b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp_img.yaml index ea9b229..fcba3e6 100644 --- a/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp_img.yaml +++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp_img.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 9 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 img_cond_steps: 1 horizon_steps: 4 @@ -122,7 +121,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.ViTCritic spatial_emb: 128 diff --git a/cfg/robomimic/finetune/can/ft_ppo_gaussian_transformer.yaml b/cfg/robomimic/finetune/can/ft_ppo_gaussian_transformer.yaml index c4930af..3b6254d 100644 --- a/cfg/robomimic/finetune/can/ft_ppo_gaussian_transformer.yaml +++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_transformer.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -92,7 +91,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/robomimic/finetune/can/ft_ppo_gmm_mlp.yaml 
b/cfg/robomimic/finetune/can/ft_ppo_gmm_mlp.yaml index bdfe130..1e7beb2 100644 --- a/cfg/robomimic/finetune/can/ft_ppo_gmm_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_ppo_gmm_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -92,7 +91,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/robomimic/finetune/can/ft_ppo_gmm_transformer.yaml b/cfg/robomimic/finetune/can/ft_ppo_gmm_transformer.yaml index ea2a4ce..1aa6b45 100644 --- a/cfg/robomimic/finetune/can/ft_ppo_gmm_transformer.yaml +++ b/cfg/robomimic/finetune/can/ft_ppo_gmm_transformer.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -93,7 +92,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs mlp_dims: [256, 256, 256] diff --git a/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml index 9d7396e..591f3a9 100644 --- a/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -70,11 +69,11 @@ train: num: 0 # QSM specific scale_reward_factor: 1 - q_grad_coeff: 50 + q_grad_coeff: 10 critic_tau: 0.005 # rate of target q network update - buffer_size: 3000 - batch_size: 256 - replay_ratio: 32 + buffer_size: 5000 # * n_envs + replay_ratio: 16 + batch_size: 
1000 model: _target_: model.diffusion.diffusion_qsm.QSMDiffusion @@ -90,7 +89,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml index 9fe8610..5037605 100644 --- a/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -82,7 +81,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/can/ibrl_mlp.yaml b/cfg/robomimic/finetune/can/ibrl_mlp.yaml new file mode 100644 index 0000000..7aa8d24 --- /dev/null +++ b/cfg/robomimic/finetune/can/ibrl_mlp.yaml @@ -0,0 +1,115 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 250 # IBRL uses 200 + reset_at_iteration: False + save_video: False + 
best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: ibrl-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 100000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.rl.gaussian_ibrl.IBRL_Gaussian + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + dropout: 0.5 + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} 
+ cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git a/cfg/robomimic/finetune/can/rlpd_mlp.yaml b/cfg/robomimic/finetune/can/rlpd_mlp.yaml new file mode 100644 index 0000000..4f5a948 --- /dev/null +++ b/cfg/robomimic/finetune/can/rlpd_mlp.yaml @@ -0,0 +1,114 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 100000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 
0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + backup_entropy: True + n_critics: 5 + tanh_output: True + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml index 18d016f..bddd57c 100644 --- a/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -73,7 +72,7 @@ train: max_adv_weight: 100 beta: 10 buffer_size: 3000 - batch_size: 256 + batch_size: 1000 replay_ratio: 64 critic_update_ratio: 4 @@ -91,13 +90,13 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: 
model.common.critic.CriticObs - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/lift/ft_dipo_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_dipo_diffusion_mlp.yaml index 613e4ee..a3c635f 100644 --- a/cfg/robomimic/finetune/lift/ft_dipo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_dipo_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -48,7 +47,7 @@ wandb: train: n_train_itr: 300 - n_critic_warmup_itr: 2 + n_critic_warmup_itr: 5 n_steps: 300 gamma: 0.999 actor_lr: 1e-5 @@ -70,11 +69,12 @@ train: num: 0 # DIPO specific scale_reward_factor: 1 - eta: 0.0001 + target_ema_rate: 0.005 + buffer_size: 1000000 + action_lr: 0.0001 action_gradient_steps: 10 - buffer_size: 400000 - batch_size: 5000 - replay_ratio: 64 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dipo.DIPODiffusion @@ -90,7 +90,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml index 9705f53..e0353b6 100644 --- a/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -48,7 +47,7 @@ wandb: train: n_train_itr: 300 - n_critic_warmup_itr: 2 + n_critic_warmup_itr: 5 n_steps: 300 
gamma: 0.999 actor_lr: 1e-5 @@ -70,10 +69,11 @@ train: num: 0 # DQL specific scale_reward_factor: 1 + target_ema_rate: 0.005 + buffer_size: 1000000 eta: 1.0 - buffer_size: 400000 - batch_size: 5000 - replay_ratio: 64 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dql.DQLDiffusion @@ -89,7 +89,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml index ef5c502..a0e2567 100644 --- a/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -74,9 +73,9 @@ train: eval_sample_num: 10 # how many samples to score during eval critic_tau: 0.001 # rate of target q network update use_expectile_exploration: True - buffer_size: 3000 - batch_size: 512 - replay_ratio: 16 + buffer_size: 5000 # * n_envs + replay_ratio: 128 + batch_size: 1000 model: _target_: model.diffusion.diffusion_idql.IDQLDiffusion @@ -92,7 +91,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic_q: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml index 35a0318..b505b81 100644 --- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 
-transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -97,7 +96,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml index 03f9f0b..d46c44b 100644 --- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml +++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 9 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -140,7 +139,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.ViTCritic spatial_emb: 128 diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet.yaml index 451e9ec..6550645 100644 --- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet.yaml +++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -100,7 +99,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp.yaml b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp.yaml index 9134bcd..6bab450 100644 --- 
a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -91,7 +90,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp_img.yaml b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp_img.yaml index 26a9a52..6f589c3 100644 --- a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp_img.yaml +++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp_img.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 9 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 img_cond_steps: 1 horizon_steps: 4 @@ -122,7 +121,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.ViTCritic spatial_emb: 128 diff --git a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_transformer.yaml b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_transformer.yaml index 3e32e7e..fff3c02 100644 --- a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_transformer.yaml +++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_transformer.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -92,7 +91,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git 
a/cfg/robomimic/finetune/lift/ft_ppo_gmm_mlp.yaml b/cfg/robomimic/finetune/lift/ft_ppo_gmm_mlp.yaml index 84e0c78..1b31a4e 100644 --- a/cfg/robomimic/finetune/lift/ft_ppo_gmm_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_ppo_gmm_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -92,7 +91,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/lift/ft_ppo_gmm_transformer.yaml b/cfg/robomimic/finetune/lift/ft_ppo_gmm_transformer.yaml index bee15ff..4a89144 100644 --- a/cfg/robomimic/finetune/lift/ft_ppo_gmm_transformer.yaml +++ b/cfg/robomimic/finetune/lift/ft_ppo_gmm_transformer.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -93,7 +92,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml index 6a7c1c7..8262daa 100644 --- a/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -70,11 +69,11 @@ train: num: 0 # QSM specific scale_reward_factor: 1 - q_grad_coeff: 50 + q_grad_coeff: 10 critic_tau: 0.005 # rate of target q network update - buffer_size: 
3000 - batch_size: 256 - replay_ratio: 32 + buffer_size: 5000 # * n_envs + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_qsm.QSMDiffusion @@ -90,7 +89,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml index ff0f7fa..fa6b4ca 100644 --- a/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -82,7 +81,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/lift/rlpd_mlp.yaml b/cfg/robomimic/finetune/lift/rlpd_mlp.yaml new file mode 100644 index 0000000..a90e2fd --- /dev/null +++ b/cfg/robomimic/finetune/lift/rlpd_mlp.yaml @@ -0,0 +1,114 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: lift +obs_dim: 19 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: 
${env_name} + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 250000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 50000 + val_freq: 5000 + render: + freq: 1 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.005 + scale_reward_factor: 1 + critic_num_update: 5 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 5000 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + backup_entropy: True + n_critics: 5 + tanh_output: True + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + 
+offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/calql_mlp_online.yaml b/cfg/robomimic/finetune/square/calql_mlp_online.yaml new file mode 100644 index 0000000..22ebae4 --- /dev/null +++ b/cfg/robomimic/finetune/square/calql_mlp_online.yaml @@ -0,0 +1,122 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 400 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 10000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + 
critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml index 7a44806..c5b2e39 100644 --- a/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 
4 @@ -73,7 +72,7 @@ train: max_adv_weight: 100 beta: 10 buffer_size: 3000 - batch_size: 256 + batch_size: 1000 replay_ratio: 64 critic_update_ratio: 4 @@ -92,13 +91,13 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/square/ft_dipo_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_dipo_diffusion_mlp.yaml index 6ab3d87..8e7598e 100644 --- a/cfg/robomimic/finetune/square/ft_dipo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_dipo_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -70,11 +69,12 @@ train: num: 0 # DIPO specific scale_reward_factor: 1 - eta: 0.0001 + target_ema_rate: 0.005 + buffer_size: 1000000 + action_lr: 0.0001 action_gradient_steps: 10 - buffer_size: 400000 - batch_size: 5000 - replay_ratio: 64 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dipo.DIPODiffusion @@ -91,7 +91,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml index f545be8..350bfe6 100644 --- a/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 
env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -48,7 +47,7 @@ wandb: train: n_train_itr: 300 - n_critic_warmup_itr: 2 + n_critic_warmup_itr: 5 n_steps: 400 gamma: 0.999 actor_lr: 1e-5 @@ -70,10 +69,11 @@ train: num: 0 # DQL specific scale_reward_factor: 1 + target_ema_rate: 0.005 + buffer_size: 1000000 eta: 1.0 - buffer_size: 400000 - batch_size: 5000 - replay_ratio: 64 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dql.DQLDiffusion @@ -90,7 +90,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml index 48adb2a..87f1e5b 100644 --- a/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -74,9 +73,9 @@ train: eval_sample_num: 10 # how many samples to score during eval critic_tau: 0.001 # rate of target q network update use_expectile_exploration: True - buffer_size: 3000 - batch_size: 512 - replay_ratio: 16 + buffer_size: 5000 # * n_envs + replay_ratio: 128 + batch_size: 1000 model: _target_: model.diffusion.diffusion_idql.IDQLDiffusion @@ -93,7 +92,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic_q: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml 
b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml index f8b8bbc..47c539e 100644 --- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -98,7 +97,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml index fa1db67..51d3e3a 100644 --- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml +++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 9 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -140,7 +139,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.ViTCritic spatial_emb: 128 diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet.yaml index 10aa67a..794017a 100644 --- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet.yaml +++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -100,7 +99,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + 
action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp.yaml b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp.yaml index d705195..e5f382c 100644 --- a/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -91,7 +90,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp_img.yaml b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp_img.yaml index 0ffddd7..7ed1e91 100644 --- a/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp_img.yaml +++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp_img.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 9 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 img_cond_steps: 1 horizon_steps: 4 @@ -122,7 +121,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.ViTCritic spatial_emb: 128 diff --git a/cfg/robomimic/finetune/square/ft_ppo_gaussian_transformer.yaml b/cfg/robomimic/finetune/square/ft_ppo_gaussian_transformer.yaml index ac4cb99..e5ca94b 100644 --- a/cfg/robomimic/finetune/square/ft_ppo_gaussian_transformer.yaml +++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_transformer.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 
4 @@ -92,7 +91,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/square/ft_ppo_gmm_mlp.yaml b/cfg/robomimic/finetune/square/ft_ppo_gmm_mlp.yaml index 2f85676..e7f14ca 100644 --- a/cfg/robomimic/finetune/square/ft_ppo_gmm_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_ppo_gmm_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -92,7 +91,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/square/ft_ppo_gmm_transformer.yaml b/cfg/robomimic/finetune/square/ft_ppo_gmm_transformer.yaml index d0c82db..b5f3157 100644 --- a/cfg/robomimic/finetune/square/ft_ppo_gmm_transformer.yaml +++ b/cfg/robomimic/finetune/square/ft_ppo_gmm_transformer.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 4 act_steps: 4 @@ -93,7 +92,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml index 90e72d7..1ad16d7 100644 --- a/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml @@ -16,7 +16,6 @@ 
device: cuda:0 env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -70,11 +69,11 @@ train: num: 0 # QSM specific scale_reward_factor: 1 - q_grad_coeff: 50 + q_grad_coeff: 10 critic_tau: 0.005 # rate of target q network update - buffer_size: 3000 - batch_size: 256 - replay_ratio: 32 + buffer_size: 5000 # * n_envs + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_qsm.QSMDiffusion @@ -91,7 +90,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml index dbc8924..2d34101 100644 --- a/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 4 @@ -83,7 +82,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/square/ibrl_mlp.yaml b/cfg/robomimic/finetune/square/ibrl_mlp.yaml new file mode 100644 index 0000000..6e34653 --- /dev/null +++ b/cfg/robomimic/finetune/square/ibrl_mlp.yaml @@ -0,0 +1,115 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: +robomimic_env_cfg_path: 
cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 350 # IBRL uses 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: ibrl-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 100000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.rl.gaussian_ibrl.IBRL_Gaussian + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + dropout: 0.5 + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + critic: + _target_: 
model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/rlpd_mlp.yaml b/cfg/robomimic/finetune/square/rlpd_mlp.yaml new file mode 100644 index 0000000..d62a41d --- /dev/null +++ b/cfg/robomimic/finetune/square/rlpd_mlp.yaml @@ -0,0 +1,114 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 400 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name} + run: 
${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 100000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + backup_entropy: True + n_critics: 5 + tanh_output: True + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml index 025fb48..61d7dec 100644 --- a/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 
env_name: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 8 @@ -76,7 +75,7 @@ train: max_adv_weight: 100 beta: 10 buffer_size: 3000 - batch_size: 256 + batch_size: 1000 replay_ratio: 64 critic_update_ratio: 4 @@ -94,13 +93,13 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml index 91d8e96..ec30a80 100644 --- a/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 8 @@ -73,11 +72,12 @@ train: num: 0 # DIPO specific scale_reward_factor: 1 - eta: 0.0001 + target_ema_rate: 0.005 + buffer_size: 1000000 + action_lr: 0.0001 action_gradient_steps: 10 - buffer_size: 400000 - batch_size: 5000 - replay_ratio: 64 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dipo.DIPODiffusion @@ -93,7 +93,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml index 72041f4..825e9d6 100644 --- 
a/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 8 @@ -73,10 +72,11 @@ train: num: 0 # DQL specific scale_reward_factor: 1 + target_ema_rate: 0.005 + buffer_size: 1000000 eta: 1.0 - buffer_size: 400000 - batch_size: 5000 - replay_ratio: 64 + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_dql.DQLDiffusion @@ -92,7 +92,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml index 9b670b9..db690f9 100644 --- a/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 8 @@ -77,9 +76,9 @@ train: eval_sample_num: 10 # how many samples to score during eval critic_tau: 0.001 # rate of target q network update use_expectile_exploration: True - buffer_size: 3000 - batch_size: 512 - replay_ratio: 16 + buffer_size: 5000 # * n_envs + replay_ratio: 128 + batch_size: 1000 model: _target_: model.diffusion.diffusion_idql.IDQLDiffusion @@ -95,7 +94,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic_q: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml 
b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml index e887fce..f0418c9 100644 --- a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -100,7 +99,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml index 19c2b72..ad22b83 100644 --- a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml +++ b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 18 action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 100 ft_denoising_steps: 5 cond_steps: 1 @@ -145,7 +144,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.ViTCritic spatial_emb: 128 diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_unet.yaml b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_unet.yaml index d17e184..5aec825 100644 --- a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_unet.yaml +++ b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_unet.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 20 ft_denoising_steps: 10 cond_steps: 1 @@ -103,7 +102,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * 
${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp.yaml b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp.yaml index 9a5650a..aa6338c 100644 --- a/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 8 act_steps: 8 @@ -94,7 +93,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp_img.yaml b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp_img.yaml index dac3e6e..286c7bb 100644 --- a/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp_img.yaml +++ b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp_img.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 18 action_dim: 14 -transition_dim: ${action_dim} cond_steps: 1 img_cond_steps: 1 horizon_steps: 8 @@ -127,7 +126,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.ViTCritic spatial_emb: 128 diff --git a/cfg/robomimic/finetune/transport/ft_ppo_gaussian_transformer.yaml b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_transformer.yaml index 9d4eaa0..2681560 100644 --- a/cfg/robomimic/finetune/transport/ft_ppo_gaussian_transformer.yaml +++ b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_transformer.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: 
transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 8 act_steps: 8 @@ -95,7 +94,7 @@ model: std_max: 0.2 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/transport/ft_ppo_gmm_mlp.yaml b/cfg/robomimic/finetune/transport/ft_ppo_gmm_mlp.yaml index 07e3cf5..b707736 100644 --- a/cfg/robomimic/finetune/transport/ft_ppo_gmm_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_ppo_gmm_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 8 act_steps: 8 @@ -95,7 +94,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/transport/ft_ppo_gmm_transformer.yaml b/cfg/robomimic/finetune/transport/ft_ppo_gmm_transformer.yaml index b35c5dc..f1b981b 100644 --- a/cfg/robomimic/finetune/transport/ft_ppo_gmm_transformer.yaml +++ b/cfg/robomimic/finetune/transport/ft_ppo_gmm_transformer.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} cond_steps: 1 horizon_steps: 8 act_steps: 8 @@ -96,7 +95,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} diff --git a/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml index 60fc95c..4072238 
100644 --- a/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 8 @@ -73,11 +72,11 @@ train: num: 0 # QSM specific scale_reward_factor: 1 - q_grad_coeff: 50 + q_grad_coeff: 10 critic_tau: 0.005 # rate of target q network update - buffer_size: 3000 - batch_size: 256 - replay_ratio: 32 + buffer_size: 5000 # * n_envs + replay_ratio: 16 + batch_size: 1000 model: _target_: model.diffusion.diffusion_qsm.QSMDiffusion @@ -93,7 +92,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml index 016ccee..af9e9cb 100644 --- a/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml @@ -16,7 +16,6 @@ device: cuda:0 env_name: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 20 cond_steps: 1 horizon_steps: 8 @@ -85,7 +84,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml b/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml new file mode 100644 index 0000000..3c610a1 --- /dev/null +++ b/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml @@ -0,0 +1,118 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + 
+name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 100 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 10 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: 
model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp.yaml index 80a5a6d..8834aee 100644 --- a/cfg/robomimic/pretrain/can/pre_diffusion_mlp.yaml +++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -48,7 +47,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp_img.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_img.yaml index b91268e..2ce38ef 100644 --- a/cfg/robomimic/pretrain/can/pre_diffusion_mlp_img.yaml +++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_img.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: can obs_dim: 9 # proprioception only action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 4 cond_steps: 1 @@ -72,7 +71,7 
@@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_unet.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_unet.yaml index 204ebf0..14aac61 100644 --- a/cfg/robomimic/pretrain/can/pre_diffusion_unet.yaml +++ b/cfg/robomimic/pretrain/can/pre_diffusion_unet.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -51,7 +50,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/pretrain/can/pre_gaussian_mlp.yaml b/cfg/robomimic/pretrain/can/pre_gaussian_mlp.yaml index c887e17..97800c2 100644 --- a/cfg/robomimic/pretrain/can/pre_gaussian_mlp.yaml +++ b/cfg/robomimic/pretrain/can/pre_gaussian_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 @@ -45,7 +44,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..f07ffd8 --- /dev/null +++ b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: 
${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: can +obs_dim: 23 +action_dim: 7 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 0 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 100 + min_lr: 1e-4 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + dropout: 0.5 + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/pre_gaussian_mlp_img.yaml b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_img.yaml index 0c8c8d6..f68d400 100644 --- a/cfg/robomimic/pretrain/can/pre_gaussian_mlp_img.yaml +++ b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_img.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: can obs_dim: 9 # proprioception only action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 img_cond_steps: 1 @@ -69,7 +68,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/can/pre_gaussian_transformer.yaml 
b/cfg/robomimic/pretrain/can/pre_gaussian_transformer.yaml index 328dc18..37e9fcf 100644 --- a/cfg/robomimic/pretrain/can/pre_gaussian_transformer.yaml +++ b/cfg/robomimic/pretrain/can/pre_gaussian_transformer.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 @@ -47,7 +46,7 @@ model: learn_fixed_std: False cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/can/pre_gmm_mlp.yaml b/cfg/robomimic/pretrain/can/pre_gmm_mlp.yaml index 5ec7d9b..ebef165 100644 --- a/cfg/robomimic/pretrain/can/pre_gmm_mlp.yaml +++ b/cfg/robomimic/pretrain/can/pre_gmm_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 num_modes: 5 @@ -47,7 +46,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/can/pre_gmm_transformer.yaml b/cfg/robomimic/pretrain/can/pre_gmm_transformer.yaml index c8a7157..3c698aa 100644 --- a/cfg/robomimic/pretrain/can/pre_gmm_transformer.yaml +++ b/cfg/robomimic/pretrain/can/pre_gmm_transformer.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: can obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 num_modes: 5 @@ -49,7 +48,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/lift/pre_diffusion_mlp.yaml b/cfg/robomimic/pretrain/lift/pre_diffusion_mlp.yaml index 63057ea..94c31d9 
100644 --- a/cfg/robomimic/pretrain/lift/pre_diffusion_mlp.yaml +++ b/cfg/robomimic/pretrain/lift/pre_diffusion_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -48,7 +47,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/pretrain/lift/pre_diffusion_mlp_img.yaml b/cfg/robomimic/pretrain/lift/pre_diffusion_mlp_img.yaml index 5a8de94..418daac 100644 --- a/cfg/robomimic/pretrain/lift/pre_diffusion_mlp_img.yaml +++ b/cfg/robomimic/pretrain/lift/pre_diffusion_mlp_img.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: lift obs_dim: 9 # proprioception only action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 4 cond_steps: 1 @@ -72,7 +71,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/pretrain/lift/pre_diffusion_unet.yaml b/cfg/robomimic/pretrain/lift/pre_diffusion_unet.yaml index 2322762..cb85f89 100644 --- a/cfg/robomimic/pretrain/lift/pre_diffusion_unet.yaml +++ b/cfg/robomimic/pretrain/lift/pre_diffusion_unet.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -50,7 +49,7 @@ model: n_groups: 8 smaller_encoder: False cond_predict_scale: True - transition_dim: ${transition_dim} + action_dim: ${action_dim} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp.yaml 
b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp.yaml index b363574..616c27c 100644 --- a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp.yaml +++ b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 @@ -45,7 +44,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..11d3f08 --- /dev/null +++ b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,59 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: lift +obs_dim: 19 +action_dim: 7 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + residual_style: False + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} + + +ema: + decay: 0.995 + +train_dataset: + 
_target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_img.yaml b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_img.yaml index af0f065..e31d30c 100644 --- a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_img.yaml +++ b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_img.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: lift obs_dim: 9 # proprioception only action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 img_cond_steps: 1 @@ -69,7 +68,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_transformer.yaml b/cfg/robomimic/pretrain/lift/pre_gaussian_transformer.yaml index f2c92e0..50add2d 100644 --- a/cfg/robomimic/pretrain/lift/pre_gaussian_transformer.yaml +++ b/cfg/robomimic/pretrain/lift/pre_gaussian_transformer.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 @@ -47,7 +46,7 @@ model: learn_fixed_std: False cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/lift/pre_gmm_mlp.yaml b/cfg/robomimic/pretrain/lift/pre_gmm_mlp.yaml index 5941572..24fd45a 100644 --- a/cfg/robomimic/pretrain/lift/pre_gmm_mlp.yaml +++ b/cfg/robomimic/pretrain/lift/pre_gmm_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 num_modes: 5 @@ -47,7 +46,7 @@ model: num_modes: ${num_modes} 
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/lift/pre_gmm_transformer.yaml b/cfg/robomimic/pretrain/lift/pre_gmm_transformer.yaml index f364b83..f24767a 100644 --- a/cfg/robomimic/pretrain/lift/pre_gmm_transformer.yaml +++ b/cfg/robomimic/pretrain/lift/pre_gmm_transformer.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: lift obs_dim: 19 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 num_modes: 5 @@ -49,7 +48,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml b/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml new file mode 100644 index 0000000..1cf5527 --- /dev/null +++ b/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml @@ -0,0 +1,118 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 400 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 
'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 100 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 10 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + 
discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp.yaml index 1f9a13b..4f4672b 100644 --- a/cfg/robomimic/pretrain/square/pre_diffusion_mlp.yaml +++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -49,7 +48,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp_img.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_img.yaml index f6853f4..8da14e0 100644 --- a/cfg/robomimic/pretrain/square/pre_diffusion_mlp_img.yaml +++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_img.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: square obs_dim: 9 # proprioception only action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 4 cond_steps: 1 @@ -72,7 +71,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_unet.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_unet.yaml index f74397c..b5a8faf 100644 --- a/cfg/robomimic/pretrain/square/pre_diffusion_unet.yaml +++ b/cfg/robomimic/pretrain/square/pre_diffusion_unet.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 4 cond_steps: 1 @@ -51,7 +50,7 @@ model: smaller_encoder: False 
cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/pretrain/square/pre_gaussian_mlp.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_mlp.yaml index 3e092af..652c7bd 100644 --- a/cfg/robomimic/pretrain/square/pre_gaussian_mlp.yaml +++ b/cfg/robomimic/pretrain/square/pre_gaussian_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 @@ -45,7 +44,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..38f3402 --- /dev/null +++ b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: square +obs_dim: 23 +action_dim: 7 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 0 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 100 + min_lr: 1e-4 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: 
model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + dropout: 0.5 + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/pre_gaussian_mlp_img.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_img.yaml index 065ff91..7bb5632 100644 --- a/cfg/robomimic/pretrain/square/pre_gaussian_mlp_img.yaml +++ b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_img.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: square obs_dim: 9 # proprioception only action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 img_cond_steps: 1 @@ -69,7 +68,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/square/pre_gaussian_transformer.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_transformer.yaml index c116189..a469afa 100644 --- a/cfg/robomimic/pretrain/square/pre_gaussian_transformer.yaml +++ b/cfg/robomimic/pretrain/square/pre_gaussian_transformer.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 @@ -47,7 +46,7 @@ model: learn_fixed_std: False cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/square/pre_gmm_mlp.yaml 
b/cfg/robomimic/pretrain/square/pre_gmm_mlp.yaml index 2efbaa1..315c980 100644 --- a/cfg/robomimic/pretrain/square/pre_gmm_mlp.yaml +++ b/cfg/robomimic/pretrain/square/pre_gmm_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 num_modes: 5 @@ -47,7 +46,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/square/pre_gmm_transformer.yaml b/cfg/robomimic/pretrain/square/pre_gmm_transformer.yaml index 41c3fb4..27afc79 100644 --- a/cfg/robomimic/pretrain/square/pre_gmm_transformer.yaml +++ b/cfg/robomimic/pretrain/square/pre_gmm_transformer.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: square obs_dim: 23 action_dim: 7 -transition_dim: ${action_dim} horizon_steps: 4 cond_steps: 1 num_modes: 5 @@ -49,7 +48,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/transport/pre_diffusion_mlp.yaml b/cfg/robomimic/pretrain/transport/pre_diffusion_mlp.yaml index 9d2425f..2c6f868 100644 --- a/cfg/robomimic/pretrain/transport/pre_diffusion_mlp.yaml +++ b/cfg/robomimic/pretrain/transport/pre_diffusion_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 8 cond_steps: 1 @@ -48,7 +47,7 @@ model: residual_style: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git 
a/cfg/robomimic/pretrain/transport/pre_diffusion_mlp_img.yaml b/cfg/robomimic/pretrain/transport/pre_diffusion_mlp_img.yaml index 2da0e07..54415dc 100644 --- a/cfg/robomimic/pretrain/transport/pre_diffusion_mlp_img.yaml +++ b/cfg/robomimic/pretrain/transport/pre_diffusion_mlp_img.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: transport obs_dim: 18 # proprioception only action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 100 horizon_steps: 8 cond_steps: 1 @@ -73,7 +72,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/pretrain/transport/pre_diffusion_unet.yaml b/cfg/robomimic/pretrain/transport/pre_diffusion_unet.yaml index f525a21..15dd4d9 100644 --- a/cfg/robomimic/pretrain/transport/pre_diffusion_unet.yaml +++ b/cfg/robomimic/pretrain/transport/pre_diffusion_unet.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} denoising_steps: 20 horizon_steps: 16 cond_steps: 1 @@ -51,7 +50,7 @@ model: smaller_encoder: False cond_predict_scale: True cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp.yaml b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp.yaml index 8c68ec6..08cb96f 100644 --- a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp.yaml +++ b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} horizon_steps: 8 cond_steps: 1 @@ -45,7 +44,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - 
transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml new file mode 100644 index 0000000..c7e0d9c --- /dev/null +++ b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: transport +obs_dim: 59 +action_dim: 14 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 0 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 100 + min_lr: 1e-4 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + dropout: 0.5 + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_img.yaml b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_img.yaml index 1f6ba93..1ce968b 100644 --- 
a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_img.yaml +++ b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_img.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: transport obs_dim: 18 # proprioception only action_dim: 14 -transition_dim: ${action_dim} horizon_steps: 8 cond_steps: 1 img_cond_steps: 1 @@ -70,7 +69,7 @@ model: img_cond_steps: ${img_cond_steps} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_transformer.yaml b/cfg/robomimic/pretrain/transport/pre_gaussian_transformer.yaml index 4beb459..ffb4be8 100644 --- a/cfg/robomimic/pretrain/transport/pre_gaussian_transformer.yaml +++ b/cfg/robomimic/pretrain/transport/pre_gaussian_transformer.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} horizon_steps: 8 cond_steps: 1 @@ -47,7 +46,7 @@ model: learn_fixed_std: False cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/transport/pre_gmm_mlp.yaml b/cfg/robomimic/pretrain/transport/pre_gmm_mlp.yaml index 13d1e24..dbe6290 100644 --- a/cfg/robomimic/pretrain/transport/pre_gmm_mlp.yaml +++ b/cfg/robomimic/pretrain/transport/pre_gmm_mlp.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} horizon_steps: 8 cond_steps: 1 num_modes: 5 @@ -47,7 +46,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/transport/pre_gmm_transformer.yaml 
b/cfg/robomimic/pretrain/transport/pre_gmm_transformer.yaml index 9255f10..17525d5 100644 --- a/cfg/robomimic/pretrain/transport/pre_gmm_transformer.yaml +++ b/cfg/robomimic/pretrain/transport/pre_gmm_transformer.yaml @@ -14,7 +14,6 @@ device: cuda:0 env: transport obs_dim: 59 action_dim: 14 -transition_dim: ${action_dim} horizon_steps: 8 cond_steps: 1 num_modes: 5 @@ -49,7 +48,7 @@ model: num_modes: ${num_modes} cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - transition_dim: ${transition_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/env/gym_utils/__init__.py b/env/gym_utils/__init__.py index 0b21423..cea639c 100644 --- a/env/gym_utils/__init__.py +++ b/env/gym_utils/__init__.py @@ -184,8 +184,8 @@ def make_async( # Create a fake env whose sole purpose is to provide # obs/action spaces and metadata. env = gym.Env() + observation_space = spaces.Dict() if shape_meta is not None: # rn only for images - observation_space = spaces.Dict() for key, value in shape_meta["obs"].items(): shape = value["shape"] if key.endswith("rgb"): @@ -194,18 +194,20 @@ def make_async( min_value, max_value = -1, 1 else: raise RuntimeError(f"Unsupported type {key}") - this_space = spaces.Box( + observation_space[key] = spaces.Box( low=min_value, high=max_value, shape=shape, dtype=np.float32, ) - observation_space[key] = this_space - env.observation_space = observation_space else: - env.observation_space = gym.spaces.Box( - -1, 1, shape=(obs_dim,), dtype=np.float64 + observation_space["state"] = gym.spaces.Box( + -1, + 1, + shape=(obs_dim,), + dtype=np.float32, ) + env.observation_space = observation_space env.action_space = gym.spaces.Box(-1, 1, shape=(action_dim,), dtype=np.int64) env.metadata = { "render.modes": ["human", "rgb_array", "depth_array"], diff --git a/env/gym_utils/async_vector_env.py b/env/gym_utils/async_vector_env.py index 92958a3..9fc9882 100644 --- a/env/gym_utils/async_vector_env.py +++ 
b/env/gym_utils/async_vector_env.py @@ -1,7 +1,10 @@ """ From gym==0.22.0 -Disable auto-reset after done. +Use terminated/truncated instead of done. + +Disable auto-reset after done. Reset in MultiStepWrapper instead. + Add reset_arg() that allows all environments with different options. Add reset_one_arg() that allows resetting a single environment with options. Add render(). @@ -398,8 +401,11 @@ class AsyncVectorEnv(VectorEnv): rewards : :obj:`np.ndarray`, dtype :obj:`np.float_` A vector of rewards from the vectorized environment. - dones : :obj:`np.ndarray`, dtype :obj:`np.bool_` - A vector whose entries indicate whether the episode has ended. + terminates : :obj:`np.ndarray`, dtype :obj:`np.bool_` + A vector whose entries indicate whether the episode has terminated (failed). + + truncates : :obj:`np.ndarray`, dtype :obj:`np.bool_` + A vector whose entries indicate whether the episode has been truncated (max episode length). infos : list of dict A list of auxiliary diagnostic information dicts from sub-environments. 
@@ -432,7 +438,7 @@ class AsyncVectorEnv(VectorEnv): results, successes = zip(*[pipe.recv() for pipe in self.parent_pipes]) self._raise_if_errors(successes) self._state = AsyncState.DEFAULT - observations_list, rewards, dones, infos = zip(*results) + observations_list, rewards, terminates, truncates, infos = zip(*results) if not self.shared_memory: self.observations = concatenate( @@ -444,7 +450,8 @@ class AsyncVectorEnv(VectorEnv): return ( deepcopy(self.observations) if self.copy else self.observations, np.array(rewards), - np.array(dones, dtype=np.bool_), + np.array(terminates, dtype=np.bool_), + np.array(truncates, dtype=np.bool_), infos, ) @@ -717,11 +724,8 @@ def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue): pipe.send((observation, True)) elif command == "step": - observation, reward, done, info = env.step(data) - # if done: - # info["terminal_observation"] = observation - # observation = env.reset() - pipe.send(((observation, reward, done, info), True)) + observation, reward, terminated, truncated, info = env.step(data) + pipe.send(((observation, reward, terminated, truncated, info), True)) elif command == "seed": env.seed(data) pipe.send((None, True)) @@ -789,14 +793,11 @@ def _worker_shared_memory(index, env_fn, pipe, parent_pipe, shared_memory, error ) pipe.send((None, True)) elif command == "step": - observation, reward, done, info = env.step(data) - # if done: - # info["terminal_observation"] = observation - # observation = env.reset() + observation, reward, terminated, truncated, info = env.step(data) write_to_shared_memory( observation_space, index, observation, shared_memory ) - pipe.send(((None, reward, done, info), True)) + pipe.send(((None, reward, terminated, truncated, info), True)) elif command == "seed": env.seed(data) pipe.send((None, True)) diff --git a/env/gym_utils/sync_vector_env.py b/env/gym_utils/sync_vector_env.py index 9804ca5..17ce127 100644 --- a/env/gym_utils/sync_vector_env.py +++ 
b/env/gym_utils/sync_vector_env.py @@ -73,7 +73,8 @@ class SyncVectorEnv(VectorEnv): self.single_observation_space, n=self.num_envs, fn=np.zeros ) self._rewards = np.zeros((self.num_envs,), dtype=np.float64) - self._dones = np.zeros((self.num_envs,), dtype=np.bool_) + self._terminates = np.zeros((self.num_envs,), dtype=np.bool_) + self._truncates = np.zeros((self.num_envs,), dtype=np.bool_) self._actions = None def seed(self, seed=None): @@ -99,7 +100,8 @@ class SyncVectorEnv(VectorEnv): seed = [seed + i for i in range(self.num_envs)] assert len(seed) == self.num_envs - self._dones[:] = False + self._terminates[:] = False + self._truncates[:] = False observations = [] data_list = [] for env, single_seed in zip(self.envs, seed): @@ -136,10 +138,13 @@ class SyncVectorEnv(VectorEnv): def step_wait(self): observations, infos = [], [] for i, (env, action) in enumerate(zip(self.envs, self._actions)): - observation, self._rewards[i], self._dones[i], info = env.step(action) - if self._dones[i]: - info["terminal_observation"] = observation - observation = env.reset() + ( + observation, + self._rewards[i], + self._terminates[i], + self._truncates[i], + info, + ) = env.step(action) observations.append(observation) infos.append(info) self.observations = concatenate( @@ -149,7 +154,8 @@ class SyncVectorEnv(VectorEnv): return ( deepcopy(self.observations) if self.copy else self.observations, np.copy(self._rewards), - np.copy(self._dones), + np.copy(self._terminates), + np.copy(self._truncates), infos, ) diff --git a/env/gym_utils/vector_env.py b/env/gym_utils/vector_env.py index e1cbc1b..d227d1a 100644 --- a/env/gym_utils/vector_env.py +++ b/env/gym_utils/vector_env.py @@ -102,8 +102,11 @@ class VectorEnv(gym.Env): rewards : :obj:`np.ndarray`, dtype :obj:`np.float_` A vector of rewards from the vectorized environment. - dones : :obj:`np.ndarray`, dtype :obj:`np.bool_` - A vector whose entries indicate whether the episode has ended. 
+ terminated : :obj:`np.ndarray`, dtype :obj:`np.bool_` + A vector whose entries indicate whether the episode has terminated (failed). + + truncated : :obj:`np.ndarray`, dtype :obj:`np.bool_` + A vector whose entries indicate whether the episode has been truncated (max episode length). infos : list of dict A list of auxiliary diagnostic information dicts from sub-environments. diff --git a/env/gym_utils/wrapper/d3il_lowdim.py b/env/gym_utils/wrapper/d3il_lowdim.py index 4e1f2f8..1030f7f 100644 --- a/env/gym_utils/wrapper/d3il_lowdim.py +++ b/env/gym_utils/wrapper/d3il_lowdim.py @@ -1,6 +1,8 @@ """ Environment wrapper for D3IL environments with state observations. +Also return done=False since we do not terminate episode early. + For consistency, we will use Dict{} for the observation space, with the key "state" for the state observation. """ @@ -73,7 +75,7 @@ class D3ilLowdimWrapper(gym.Env): # normalize obs = self.normalize_obs(obs) - return {"state": obs}, reward, done, info + return {"state": obs}, reward, False, info def render(self, mode="rgb_array"): h, w = self.render_hw diff --git a/env/gym_utils/wrapper/furniture.py b/env/gym_utils/wrapper/furniture.py index 24cb680..3c02895 100644 --- a/env/gym_utils/wrapper/furniture.py +++ b/env/gym_utils/wrapper/furniture.py @@ -121,7 +121,7 @@ class FurnitureRLSimEnvMultiStepWrapper(gym.Wrapper): action = self.normalizer(action, "actions", forward=False) # Step the environment n_action_steps times - obs, sparse_reward, dense_reward, done, info = self._inner_step(action) + obs, sparse_reward, dense_reward, info = self._inner_step(action) if self.sparse_reward: reward = sparse_reward.clone().cpu().numpy() else: @@ -129,17 +129,14 @@ class FurnitureRLSimEnvMultiStepWrapper(gym.Wrapper): # Only mark the environment as done if it times out, ignore done from inner steps truncated = self.env.env_steps >= self.max_env_steps - done = truncated nobs: np.ndarray = self.process_obs(obs) - done: np.ndarray = 
done.squeeze().cpu().numpy() + truncated: np.ndarray = truncated.squeeze().cpu().numpy() + terminated: np.ndarray = np.zeros_like(truncated, dtype=bool) - return {"state": nobs}, reward, done, info + return {"state": nobs}, reward, terminated, truncated, info def _inner_step(self, action_chunk: torch.Tensor): - dones = torch.zeros( - action_chunk.shape[0], dtype=torch.bool, device=action_chunk.device - ) dense_reward = torch.zeros(action_chunk.shape[0], device=action_chunk.device) sparse_reward = torch.zeros(action_chunk.shape[0], device=action_chunk.device) for i in range(self.n_action_steps): @@ -156,10 +153,8 @@ class FurnitureRLSimEnvMultiStepWrapper(gym.Wrapper): # assign "permanent" rewards dense_reward += self.best_reward - dones = dones | done.squeeze() - obs = stack_last_n_obs_dict(self.obs, self.n_obs_steps) - return obs, sparse_reward, dense_reward, dones, info + return obs, sparse_reward, dense_reward, info def process_obs(self, obs: torch.Tensor) -> np.ndarray: # Convert the robot state to have 6D pose diff --git a/env/gym_utils/wrapper/mujoco_locomotion_lowdim.py b/env/gym_utils/wrapper/mujoco_locomotion_lowdim.py index 1c0dc3d..5ae3fa9 100644 --- a/env/gym_utils/wrapper/mujoco_locomotion_lowdim.py +++ b/env/gym_utils/wrapper/mujoco_locomotion_lowdim.py @@ -57,12 +57,12 @@ class MujocoLocomotionLowdimWrapper(gym.Env): def normalize_obs(self, obs): return 2 * ((obs - self.obs_min) / (self.obs_max - self.obs_min + 1e-6) - 0.5) - def unnormaliza_action(self, action): + def unnormalize_action(self, action): action = (action + 1) / 2 # [-1, 1] -> [0, 1] return action * (self.action_max - self.action_min) + self.action_min def step(self, action): - raw_action = self.unnormaliza_action(action) + raw_action = self.unnormalize_action(action) raw_obs, reward, done, info = self.env.step(raw_action) # normalize diff --git a/env/gym_utils/wrapper/multi_step.py b/env/gym_utils/wrapper/multi_step.py index 758caa8..3c4e52f 100644 --- 
a/env/gym_utils/wrapper/multi_step.py +++ b/env/gym_utils/wrapper/multi_step.py @@ -138,22 +138,32 @@ class MultiStep(gym.Wrapper): """ if action.ndim == 1: # in case action_steps = 1 action = action[None] + truncated = False + terminated = False for act_step, act in enumerate(action): self.cnt += 1 - - if len(self.done) > 0 and self.done[-1]: - # termination + if terminated or truncated: break + + # done does not differentiate terminal and truncation observation, reward, done, info = self.env.step(act) self.obs.append(observation) self.action.append(act) self.reward.append(reward) - if ( - self.max_episode_steps is not None - ) and self.cnt >= self.max_episode_steps: - # truncation - done = True + + # in gym, timelimit wrapper is automatically used given env._spec.max_episode_steps + if "TimeLimit.truncated" not in info: + if done: + terminated = True + elif ( + self.max_episode_steps is not None + ) and self.cnt >= self.max_episode_steps: + truncated = True + else: + truncated = info["TimeLimit.truncated"] + terminated = done + done = truncated or terminated self.done.append(done) self._add_info(info) observation = self._get_obs(self.n_obs_steps) @@ -165,6 +175,12 @@ class MultiStep(gym.Wrapper): # In mujoco case, done can happen within the loop above if self.reset_within_step and self.done[-1]: + + # need to save old observation in the case of truncation only, for bootstrapping + if truncated: + info["final_obs"] = observation + + # reset observation = ( self.reset() ) # TODO: arguments? 
this cannot handle video recording right now since needs to pass in options @@ -173,7 +189,7 @@ class MultiStep(gym.Wrapper): # reset reward and done for next step self.reward = list() self.done = list() - return observation, reward, done, info + return observation, reward, terminated, truncated, info def _get_obs(self, n_steps=1): """ diff --git a/env/gym_utils/wrapper/robomimic_image.py b/env/gym_utils/wrapper/robomimic_image.py index 32dac4b..ebe6a39 100644 --- a/env/gym_utils/wrapper/robomimic_image.py +++ b/env/gym_utils/wrapper/robomimic_image.py @@ -1,6 +1,8 @@ """ Environment wrapper for Robomimic environments with image observations. +Also return done=False since we do not terminate episode early. + Modified from https://github.com/real-stanford/diffusion_policy/blob/main/diffusion_policy/env/robomimic/robomimic_image_wrapper.py """ @@ -158,7 +160,7 @@ class RobomimicImageWrapper(gym.Env): video_img = self.render(mode="rgb_array") self.video_writer.append_data(video_img) - return obs, reward, done, info + return obs, reward, False, info def render(self, mode="rgb_array"): h, w = self.render_hw diff --git a/env/gym_utils/wrapper/robomimic_lowdim.py b/env/gym_utils/wrapper/robomimic_lowdim.py index 3e7378a..2df2033 100644 --- a/env/gym_utils/wrapper/robomimic_lowdim.py +++ b/env/gym_utils/wrapper/robomimic_lowdim.py @@ -1,6 +1,8 @@ """ Environment wrapper for Robomimic environments with state observations. +Also return done=False since we do not terminate episode early. + Modified from https://github.com/real-stanford/diffusion_policy/blob/main/diffusion_policy/env/robomimic/robomimic_lowdim_wrapper.py For consistency, we will use Dict{} for the observation space, with the key "state" for the state observation. 
@@ -65,7 +67,7 @@ class RobomimicLowdimWrapper(gym.Env): low=low, high=high, shape=low.shape, - dtype=low.dtype, + dtype=np.float32, ) def normalize_obs(self, obs): @@ -129,7 +131,7 @@ class RobomimicLowdimWrapper(gym.Env): video_img = self.render(mode="rgb_array") self.video_writer.append_data(video_img) - return obs, reward, done, info + return obs, reward, False, info def render(self, mode="rgb_array"): h, w = self.render_hw diff --git a/model/common/critic.py b/model/common/critic.py index 5552f37..d912ad5 100644 --- a/model/common/critic.py +++ b/model/common/critic.py @@ -5,7 +5,6 @@ Critic networks. from typing import Union import torch -import copy import einops from copy import deepcopy @@ -28,20 +27,15 @@ class CriticObs(torch.nn.Module): super().__init__() mlp_dims = [cond_dim] + mlp_dims + [1] if residual_style: - self.Q1 = ResidualMLP( - mlp_dims, - activation_type=activation_type, - out_activation_type="Identity", - use_layernorm=use_layernorm, - ) + model = ResidualMLP else: - self.Q1 = MLP( - mlp_dims, - activation_type=activation_type, - out_activation_type="Identity", - use_layernorm=use_layernorm, - verbose=False, - ) + model = MLP + self.Q1 = model( + mlp_dims, + activation_type=activation_type, + out_activation_type="Identity", + use_layernorm=use_layernorm, + ) def forward(self, cond: Union[dict, torch.Tensor]): """ @@ -72,26 +66,28 @@ class CriticObsAct(torch.nn.Module): activation_type="Mish", use_layernorm=False, residual_tyle=False, + double_q=True, **kwargs, ): super().__init__() mlp_dims = [cond_dim + action_dim * action_steps] + mlp_dims + [1] if residual_tyle: - self.Q1 = ResidualMLP( - mlp_dims, - activation_type=activation_type, - out_activation_type="Identity", - use_layernorm=use_layernorm, - ) + model = ResidualMLP else: - self.Q1 = MLP( + model = MLP + self.Q1 = model( + mlp_dims, + activation_type=activation_type, + out_activation_type="Identity", + use_layernorm=use_layernorm, + ) + if double_q: + self.Q2 = model( mlp_dims, 
activation_type=activation_type, out_activation_type="Identity", use_layernorm=use_layernorm, - verbose=False, ) - self.Q2 = copy.deepcopy(self.Q1) def forward(self, cond: dict, action): """ @@ -108,9 +104,13 @@ class CriticObsAct(torch.nn.Module): action = action.view(B, -1) x = torch.cat((state, action), dim=-1) - q1 = self.Q1(x) - q2 = self.Q2(x) - return q1.squeeze(1), q2.squeeze(1) + if hasattr(self, "Q2"): + q1 = self.Q1(x) + q2 = self.Q2(x) + return q1.squeeze(1), q2.squeeze(1) + else: + q1 = self.Q1(x) + return q1.squeeze(1) class ViTCritic(CriticObs): diff --git a/model/common/gaussian.py b/model/common/gaussian.py index 3fea827..c763dfc 100644 --- a/model/common/gaussian.py +++ b/model/common/gaussian.py @@ -19,13 +19,16 @@ class GaussianModel(torch.nn.Module): network_path=None, device="cuda:0", randn_clip_value=10, + tanh_output=False, ): super().__init__() self.device = device self.network = network.to(device) if network_path is not None: checkpoint = torch.load( - network_path, map_location=self.device, weights_only=True + network_path, + map_location=self.device, + weights_only=True, ) self.load_state_dict( checkpoint["model"], @@ -40,12 +43,16 @@ class GaussianModel(torch.nn.Module): # Clip sampled randn (from standard deviation) such that the sampled action is not too far away from mean self.randn_clip_value = randn_clip_value + # Whether to apply tanh to the **sampled** action --- used in SAC + self.tanh_output = tanh_output + def loss( self, true_action, cond, ent_coef, ): + """no squashing""" B = len(true_action) dist = self.forward_train( cond, @@ -80,6 +87,8 @@ class GaussianModel(torch.nn.Module): cond, deterministic=False, network_override=None, + reparameterize=False, + get_logprob=False, ): B = len(cond["state"]) if "state" in cond else len(cond["rgb"]) T = self.horizon_steps @@ -88,9 +97,24 @@ class GaussianModel(torch.nn.Module): deterministic=deterministic, network_override=network_override, ) - sampled_action = dist.sample() + if 
reparameterize: + sampled_action = dist.rsample() + else: + sampled_action = dist.sample() sampled_action.clamp_( dist.loc - self.randn_clip_value * dist.scale, dist.loc + self.randn_clip_value * dist.scale, ) - return sampled_action.view(B, T, -1) + + if get_logprob: + log_prob = dist.log_prob(sampled_action) + + # For SAC/RLPD, squash mean after sampling here instead of right after model output as in PPO + if self.tanh_output: + sampled_action = torch.tanh(sampled_action) + log_prob -= torch.log(1 - sampled_action.pow(2) + 1e-6) + return sampled_action.view(B, T, -1), log_prob.sum(1, keepdim=False) + else: + if self.tanh_output: + sampled_action = torch.tanh(sampled_action) + return sampled_action.view(B, T, -1) diff --git a/model/common/mlp.py b/model/common/mlp.py index fe154d1..3322af9 100644 --- a/model/common/mlp.py +++ b/model/common/mlp.py @@ -7,7 +7,6 @@ Residual model is taken from https://github.com/ALRhub/d3il/blob/main/agents/mod import torch from torch import nn -from torch.nn.utils import spectral_norm from collections import OrderedDict import logging @@ -26,7 +25,6 @@ activation_dict = nn.ModuleDict( class MLP(nn.Module): - def __init__( self, dim_list, @@ -35,7 +33,9 @@ class MLP(nn.Module): activation_type="Tanh", out_activation_type="Identity", use_layernorm=False, - use_spectralnorm=False, + use_layernorm_final=False, + dropout=0, + use_drop_final=False, verbose=False, ): super(MLP, self).__init__() @@ -50,39 +50,25 @@ class MLP(nn.Module): o_dim = dim_list[idx + 1] if append_dim > 0 and idx in append_layers: i_dim += append_dim - linear_layer = nn.Linear(i_dim, o_dim) - if use_spectralnorm: - linear_layer = spectral_norm(linear_layer) - if idx == num_layer - 1: - module = nn.Sequential( - OrderedDict( - [ - ("linear_1", linear_layer), - ("act_1", activation_dict[out_activation_type]), - ] - ) - ) - else: - if use_layernorm: - module = nn.Sequential( - OrderedDict( - [ - ("linear_1", linear_layer), - ("norm_1", nn.LayerNorm(o_dim)), - 
("act_1", activation_dict[activation_type]), - ] - ) - ) - else: - module = nn.Sequential( - OrderedDict( - [ - ("linear_1", linear_layer), - ("act_1", activation_dict[activation_type]), - ] - ) - ) + + # Add module components + layers = [("linear_1", linear_layer)] + if use_layernorm and (idx < num_layer - 1 or use_layernorm_final): + layers.append(("norm_1", nn.LayerNorm(o_dim))) + if dropout > 0 and (idx < num_layer - 1 or use_drop_final): + layers.append(("dropout_1", nn.Dropout(dropout))) + + # add activation function + act = ( + activation_dict[activation_type] + if idx != num_layer - 1 + else activation_dict[out_activation_type] + ) + layers.append(("act_1", act)) + + # re-construct module + module = nn.Sequential(OrderedDict(layers)) self.moduleList.append(module) if verbose: logging.info(self.moduleList) @@ -109,6 +95,7 @@ class ResidualMLP(nn.Module): activation_type="Mish", out_activation_type="Identity", use_layernorm=False, + use_layernorm_final=False, ): super(ResidualMLP, self).__init__() hidden_dim = dim_list[1] @@ -126,6 +113,8 @@ class ResidualMLP(nn.Module): ] ) self.layers.append(nn.Linear(hidden_dim, dim_list[-1])) + if use_layernorm_final: + self.layers.append(nn.LayerNorm(dim_list[-1])) self.layers.append(activation_dict[out_activation_type]) def forward(self, x): diff --git a/model/common/mlp_gaussian.py b/model/common/mlp_gaussian.py index a60a1d8..e05dbed 100644 --- a/model/common/mlp_gaussian.py +++ b/model/common/mlp_gaussian.py @@ -18,7 +18,7 @@ class Gaussian_VisionMLP(nn.Module): def __init__( self, backbone, - transition_dim, + action_dim, horizon_steps, cond_dim, img_cond_steps=1, @@ -74,10 +74,10 @@ class Gaussian_VisionMLP(nn.Module): ) # head - self.transition_dim = transition_dim + self.action_dim = action_dim self.horizon_steps = horizon_steps input_dim = visual_feature_dim + cond_dim - output_dim = transition_dim * horizon_steps + output_dim = action_dim * horizon_steps if residual_style: model = ResidualMLP else: @@ -97,7 
+97,7 @@ class Gaussian_VisionMLP(nn.Module): ) elif learn_fixed_std: # initialize to fixed_std self.logvar = torch.nn.Parameter( - torch.log(torch.tensor([fixed_std**2 for _ in range(transition_dim)])), + torch.log(torch.tensor([fixed_std**2 for _ in range(action_dim)])), requires_grad=True, ) self.logvar_min = torch.nn.Parameter( @@ -159,19 +159,19 @@ class Gaussian_VisionMLP(nn.Module): x_encoded = torch.cat([feat, state], dim=-1) out_mean = self.mlp_mean(x_encoded) out_mean = torch.tanh(out_mean).view( - B, self.horizon_steps * self.transition_dim + B, self.horizon_steps * self.action_dim ) # tanh squashing in [-1, 1] if self.learn_fixed_std: out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max) out_scale = torch.exp(0.5 * out_logvar) - out_scale = out_scale.view(1, self.transition_dim) + out_scale = out_scale.view(1, self.action_dim) out_scale = out_scale.repeat(B, self.horizon_steps) elif self.use_fixed_std: out_scale = torch.ones_like(out_mean).to(device) * self.fixed_std else: out_logvar = self.mlp_logvar(x_encoded).view( - B, self.horizon_steps * self.transition_dim + B, self.horizon_steps * self.action_dim ) out_logvar = torch.clamp(out_logvar, self.logvar_min, self.logvar_max) out_scale = torch.exp(0.5 * out_logvar) @@ -179,48 +179,65 @@ class Gaussian_VisionMLP(nn.Module): class Gaussian_MLP(nn.Module): - def __init__( self, - transition_dim, + action_dim, horizon_steps, cond_dim, mlp_dims=[256, 256, 256], activation_type="Mish", + tanh_output=True, # sometimes we want to apply tanh after sampling instead of here, e.g., in SAC residual_style=False, use_layernorm=False, + dropout=0.0, fixed_std=None, learn_fixed_std=False, std_min=0.01, std_max=1, ): super().__init__() - self.transition_dim = transition_dim + self.action_dim = action_dim self.horizon_steps = horizon_steps input_dim = cond_dim - output_dim = transition_dim * horizon_steps + output_dim = action_dim * horizon_steps if residual_style: model = ResidualMLP else: model = MLP - 
self.mlp_mean = model( - [input_dim] + mlp_dims + [output_dim], - activation_type=activation_type, - out_activation_type="Identity", - use_layernorm=use_layernorm, - ) if fixed_std is None: + # learning std + self.mlp_base = model( + [input_dim] + mlp_dims, + activation_type=activation_type, + out_activation_type=activation_type, + use_layernorm=use_layernorm, + use_layernorm_final=use_layernorm, + ) + self.mlp_mean = MLP( + mlp_dims[-1:] + [output_dim], + out_activation_type="Identity", + ) self.mlp_logvar = MLP( - [input_dim] + mlp_dims[-1:] + [output_dim], + mlp_dims[-1:] + [output_dim], + out_activation_type="Identity", + ) + else: + # no separate head for mean and std + self.mlp_mean = model( + [input_dim] + mlp_dims + [output_dim], activation_type=activation_type, out_activation_type="Identity", use_layernorm=use_layernorm, + dropout=dropout, ) - elif learn_fixed_std: # initialize to fixed_std - self.logvar = torch.nn.Parameter( - torch.log(torch.tensor([fixed_std**2 for _ in range(transition_dim)])), - requires_grad=True, - ) + if learn_fixed_std: + # initialize to fixed_std + self.logvar = torch.nn.Parameter( + torch.log( + torch.tensor([fixed_std**2 for _ in range(action_dim)]) + ), + requires_grad=True, + ) self.logvar_min = torch.nn.Parameter( torch.log(torch.tensor(std_min**2)), requires_grad=False ) @@ -230,6 +247,7 @@ class Gaussian_MLP(nn.Module): self.use_fixed_std = fixed_std is not None self.fixed_std = fixed_std self.learn_fixed_std = learn_fixed_std + self.tanh_output = tanh_output def forward(self, cond): B = len(cond["state"]) @@ -239,22 +257,27 @@ class Gaussian_MLP(nn.Module): state = cond["state"].view(B, -1) # mlp + if hasattr(self, "mlp_base"): + state = self.mlp_base(state) out_mean = self.mlp_mean(state) - out_mean = torch.tanh(out_mean).view( - B, self.horizon_steps * self.transition_dim - ) # tanh squashing in [-1, 1] + if self.tanh_output: + out_mean = torch.tanh(out_mean) + out_mean = out_mean.view(B, self.horizon_steps * 
self.action_dim) if self.learn_fixed_std: out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max) out_scale = torch.exp(0.5 * out_logvar) - out_scale = out_scale.view(1, self.transition_dim) + out_scale = out_scale.view(1, self.action_dim) out_scale = out_scale.repeat(B, self.horizon_steps) elif self.use_fixed_std: out_scale = torch.ones_like(out_mean).to(device) * self.fixed_std else: out_logvar = self.mlp_logvar(state).view( - B, self.horizon_steps * self.transition_dim + B, self.horizon_steps * self.action_dim ) - out_logvar = torch.clamp(out_logvar, self.logvar_min, self.logvar_max) + out_logvar = torch.tanh(out_logvar) + out_logvar = self.logvar_min + 0.5 * (self.logvar_max - self.logvar_min) * ( + out_logvar + 1 + ) # put back to full range out_scale = torch.exp(0.5 * out_logvar) return out_mean, out_scale diff --git a/model/common/mlp_gmm.py b/model/common/mlp_gmm.py index 8844b63..e93fdc7 100644 --- a/model/common/mlp_gmm.py +++ b/model/common/mlp_gmm.py @@ -12,7 +12,7 @@ class GMM_MLP(nn.Module): def __init__( self, - transition_dim, + action_dim, horizon_steps, cond_dim=None, mlp_dims=[256, 256, 256], @@ -26,10 +26,10 @@ class GMM_MLP(nn.Module): std_max=1, ): super().__init__() - self.transition_dim = transition_dim + self.action_dim = action_dim self.horizon_steps = horizon_steps input_dim = cond_dim - output_dim = transition_dim * horizon_steps * num_modes + output_dim = action_dim * horizon_steps * num_modes self.num_modes = num_modes if residual_style: model = ResidualMLP @@ -54,7 +54,7 @@ class GMM_MLP(nn.Module): self.logvar = torch.nn.Parameter( torch.log( torch.tensor( - [fixed_std**2 for _ in range(transition_dim * num_modes)] + [fixed_std**2 for _ in range(action_dim * num_modes)] ) ), requires_grad=True, @@ -87,19 +87,19 @@ class GMM_MLP(nn.Module): # mlp out_mean = self.mlp_mean(state) out_mean = torch.tanh(out_mean).view( - B, self.num_modes, self.horizon_steps * self.transition_dim + B, self.num_modes, self.horizon_steps * 
self.action_dim ) # tanh squashing in [-1, 1] if self.learn_fixed_std: out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max) out_scale = torch.exp(0.5 * out_logvar) - out_scale = out_scale.view(1, self.num_modes, self.transition_dim) + out_scale = out_scale.view(1, self.num_modes, self.action_dim) out_scale = out_scale.repeat(B, 1, self.horizon_steps) elif self.use_fixed_std: out_scale = torch.ones_like(out_mean).to(device) * self.fixed_std else: out_logvar = self.mlp_logvar(state).view( - B, self.num_modes, self.horizon_steps * self.transition_dim + B, self.num_modes, self.horizon_steps * self.action_dim ) out_logvar = torch.clamp(out_logvar, self.logvar_min, self.logvar_max) out_scale = torch.exp(0.5 * out_logvar) diff --git a/model/common/transformer.py b/model/common/transformer.py index 1c38637..e93903d 100644 --- a/model/common/transformer.py +++ b/model/common/transformer.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) class Gaussian_Transformer(nn.Module): def __init__( self, - transition_dim, + action_dim, horizon_steps, cond_dim, transformer_embed_dim=256, @@ -32,16 +32,16 @@ class Gaussian_Transformer(nn.Module): ): super().__init__() - self.transition_dim = transition_dim + self.action_dim = action_dim self.horizon_steps = horizon_steps - output_dim = transition_dim + output_dim = action_dim if fixed_std is None: # learn the logvar output_dim *= 2 # mean and logvar logger.info("Using learned std") elif learn_fixed_std: # learn logvar self.logvar = torch.nn.Parameter( - torch.log(torch.tensor([fixed_std**2 for _ in range(transition_dim)])), + torch.log(torch.tensor([fixed_std**2 for _ in range(action_dim)])), requires_grad=True, ) logger.info(f"Using fixed std {fixed_std} with learning") @@ -81,19 +81,19 @@ class Gaussian_Transformer(nn.Module): out, _ = self.transformer(state) # (B,horizon,output_dim) # use the first half of the output as mean - out_mean = torch.tanh(out[:, :, : self.transition_dim]) - out_mean = 
out_mean.view(B, self.horizon_steps * self.transition_dim) + out_mean = torch.tanh(out[:, :, : self.action_dim]) + out_mean = out_mean.view(B, self.horizon_steps * self.action_dim) if self.learn_fixed_std: out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max) out_scale = torch.exp(0.5 * out_logvar) - out_scale = out_scale.view(1, self.transition_dim) + out_scale = out_scale.view(1, self.action_dim) out_scale = out_scale.repeat(B, self.horizon_steps) elif self.fixed_std is not None: out_scale = torch.ones_like(out_mean).to(device) * self.fixed_std else: - out_logvar = out[:, :, self.transition_dim :] - out_logvar = out_logvar.reshape(B, self.horizon_steps * self.transition_dim) + out_logvar = out[:, :, self.action_dim :] + out_logvar = out_logvar.reshape(B, self.horizon_steps * self.action_dim) out_logvar = torch.clamp(out_logvar, self.logvar_min, self.logvar_max) out_scale = torch.exp(0.5 * out_logvar) return out_mean, out_scale @@ -102,7 +102,7 @@ class Gaussian_Transformer(nn.Module): class GMM_Transformer(nn.Module): def __init__( self, - transition_dim, + action_dim, horizon_steps, cond_dim, num_modes=5, @@ -120,13 +120,12 @@ class GMM_Transformer(nn.Module): super().__init__() self.num_modes = num_modes - self.transition_dim = transition_dim + self.action_dim = action_dim self.horizon_steps = horizon_steps - output_dim = transition_dim * num_modes - # + num_modes # mean and modes + output_dim = action_dim * num_modes if fixed_std is None: - output_dim += num_modes * transition_dim # logvar for each mode + output_dim += num_modes * action_dim # logvar for each mode logger.info("Using learned std") elif ( learn_fixed_std @@ -134,7 +133,7 @@ class GMM_Transformer(nn.Module): self.logvar = torch.nn.Parameter( torch.log( torch.tensor( - [fixed_std**2 for _ in range(num_modes * transition_dim)] + [fixed_std**2 for _ in range(num_modes * action_dim)] ) ), requires_grad=True, @@ -179,32 +178,32 @@ class GMM_Transformer(nn.Module): ) # 
(B,horizon,output_dim), (B,horizon,emb_dim) # use the first half of the output as mean - out_mean = torch.tanh(out[:, :, : self.num_modes * self.transition_dim]) + out_mean = torch.tanh(out[:, :, : self.num_modes * self.action_dim]) out_mean = out_mean.reshape( - B, self.horizon_steps, self.num_modes, self.transition_dim + B, self.horizon_steps, self.num_modes, self.action_dim ) out_mean = out_mean.permute(0, 2, 1, 3) # flip horizons and modes out_mean = out_mean.reshape( - B, self.num_modes, self.horizon_steps * self.transition_dim + B, self.num_modes, self.horizon_steps * self.action_dim ) if self.learn_fixed_std: out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max) out_scale = torch.exp(0.5 * out_logvar) - out_scale = out_scale.view(1, self.num_modes, self.transition_dim) + out_scale = out_scale.view(1, self.num_modes, self.action_dim) out_scale = out_scale.repeat(B, 1, self.horizon_steps) elif self.fixed_std is not None: out_scale = torch.ones_like(out_mean).to(device) * self.fixed_std else: out_logvar = out[ - :, :, self.num_modes * self.transition_dim : -self.num_modes + :, :, self.num_modes * self.action_dim : -self.num_modes ] out_logvar = out_logvar.reshape( - B, self.horizon_steps, self.num_modes, self.transition_dim + B, self.horizon_steps, self.num_modes, self.action_dim ) out_logvar = out_logvar.permute(0, 2, 1, 3) # flip horizons and modes out_logvar = out_logvar.reshape( - B, self.num_modes, self.horizon_steps * self.transition_dim + B, self.num_modes, self.horizon_steps * self.action_dim ) out_logvar = torch.clamp(out_logvar, self.logvar_min, self.logvar_max) out_scale = torch.exp(0.5 * out_logvar) diff --git a/model/diffusion/diffusion.py b/model/diffusion/diffusion.py index 0b1e381..9d08c30 100644 --- a/model/diffusion/diffusion.py +++ b/model/diffusion/diffusion.py @@ -169,8 +169,11 @@ class DiffusionModel(nn.Module): # ---------- Sampling ----------# - def p_mean_var(self, x, t, cond, index=None): - noise = self.network(x, t, 
cond=cond) + def p_mean_var(self, x, t, cond, index=None, network_override=None): + if network_override is not None: + noise = network_override(x, t, cond=cond) + else: + noise = self.network(x, t, cond=cond) # Predict x_0 if self.predict_epsilon: @@ -228,7 +231,7 @@ class DiffusionModel(nn.Module): return mu, logvar @torch.no_grad() - def forward(self, cond): + def forward(self, cond, deterministic=True): """ Forward pass for sampling actions. Used in evaluating pre-trained/fine-tuned policy. Not modifying diffusion clipping diff --git a/model/diffusion/diffusion_dipo.py b/model/diffusion/diffusion_dipo.py index e4459dd..af4143d 100644 --- a/model/diffusion/diffusion_dipo.py +++ b/model/diffusion/diffusion_dipo.py @@ -5,6 +5,7 @@ Actor and Critic models for model-free online RL with DIffusion POlicy (DIPO). import torch import logging +import copy log = logging.getLogger(__name__) @@ -27,45 +28,67 @@ class DIPODiffusion(DiffusionModel): assert not self.use_ddim, "DQL does not support DDIM" self.critic = critic.to(self.device) + # target critic + self.critic_target = copy.deepcopy(self.critic) + # reassign actor self.actor = self.network + # target actor + self.actor_target = copy.deepcopy(self.actor) + # Minimum std used in denoising process when sampling action - helps exploration self.min_sampling_denoising_std = min_sampling_denoising_std # ---------- RL training ----------# - def loss_critic(self, obs, next_obs, actions, rewards, dones, gamma): + def loss_critic(self, obs, next_obs, actions, rewards, terminated, gamma): # get current Q-function current_q1, current_q2 = self.critic(obs, actions) - # get next Q-function - next_actions = self.forward( - cond=next_obs, - deterministic=False, - ) # forward() has no gradient, which is desired here. 
- next_q1, next_q2 = self.critic(next_obs, next_actions) - next_q = torch.min(next_q1, next_q2) + with torch.no_grad(): + # get next Q-function + next_actions = self.forward( + cond=next_obs, + deterministic=False, + ) # forward() has no gradient, which is desired here. + next_q1, next_q2 = self.critic_target(next_obs, next_actions) + next_q = torch.min(next_q1, next_q2) - # terminal state mask - mask = 1 - dones + # terminal state mask + mask = 1 - terminated - # flatten - rewards = rewards.view(-1) - next_q = next_q.view(-1) - mask = mask.view(-1) + # flatten + rewards = rewards.view(-1) + next_q = next_q.view(-1) + mask = mask.view(-1) - # target value - target_q = rewards + gamma * next_q * mask + # target value + target_q = rewards + gamma * next_q * mask # Update critic loss_critic = torch.mean((current_q1 - target_q) ** 2) + torch.mean( (current_q2 - target_q) ** 2 ) - return loss_critic + def update_target_critic(self, tau): + for target_param, source_param in zip( + self.critic_target.parameters(), self.critic.parameters() + ): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + def update_target_actor(self, tau): + for target_param, source_param in zip( + self.actor_target.parameters(), self.actor.parameters() + ): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + # ---------- Sampling ----------#`` # override @@ -75,6 +98,7 @@ class DIPODiffusion(DiffusionModel): cond, deterministic=False, ): + """Use target actor""" device = self.betas.device B = len(cond["state"]) @@ -87,6 +111,7 @@ class DIPODiffusion(DiffusionModel): x=x, t=t_b, cond=cond, + network_override=self.actor_target, ) std = torch.exp(0.5 * logvar) diff --git a/model/diffusion/diffusion_dql.py b/model/diffusion/diffusion_dql.py index f8c315c..2024416 100644 --- a/model/diffusion/diffusion_dql.py +++ b/model/diffusion/diffusion_dql.py @@ -6,6 +6,7 @@ Diffusion Q-Learning (DQL) import torch import logging 
import numpy as np +import copy log = logging.getLogger(__name__) @@ -28,6 +29,9 @@ class DQLDiffusion(DiffusionModel): assert not self.use_ddim, "DQL does not support DDIM" self.critic = critic.to(self.device) + # target critic + self.critic_target = copy.deepcopy(self.critic) + # reassign actor self.actor = self.network @@ -36,39 +40,46 @@ class DQLDiffusion(DiffusionModel): # ---------- RL training ----------# - def loss_critic(self, obs, next_obs, actions, rewards, dones, gamma): + def loss_critic(self, obs, next_obs, actions, rewards, terminated, gamma): # get current Q-function current_q1, current_q2 = self.critic(obs, actions) # get next Q-function - next_actions = self.forward( - cond=next_obs, - deterministic=False, - ) # forward() has no gradient, which is desired here. - next_q1, next_q2 = self.critic(next_obs, next_actions) - next_q = torch.min(next_q1, next_q2) + with torch.no_grad(): + next_actions = self.forward( + cond=next_obs, + deterministic=False, + ) # forward() has no gradient, which is desired here. 
+ next_q1, next_q2 = self.critic_target(next_obs, next_actions) + next_q = torch.min(next_q1, next_q2) - # terminal state mask - mask = 1 - dones + # terminal state mask + mask = 1 - terminated - # flatten - rewards = rewards.view(-1) - next_q = next_q.view(-1) - mask = mask.view(-1) + # flatten + rewards = rewards.view(-1) + next_q = next_q.view(-1) + mask = mask.view(-1) - # target value - target_q = rewards + gamma * next_q * mask + # target value + target_q = rewards + gamma * next_q * mask # Update critic loss_critic = torch.mean((current_q1 - target_q) ** 2) + torch.mean( (current_q2 - target_q) ** 2 ) - return loss_critic - def loss_actor(self, obs, actions, q1, q2, eta): - bc_loss = self.loss(actions, obs) + def loss_actor(self, obs, eta, act_steps): + action_new = self.forward_train( + cond=obs, + deterministic=False, + )[ + :, :act_steps + ] # with gradient + q1, q2 = self.critic(obs, action_new) + bc_loss = self.loss(action_new, obs) if np.random.uniform() > 0.5: q_loss = -q1.mean() / q2.abs().mean().detach() else: @@ -76,6 +87,14 @@ class DQLDiffusion(DiffusionModel): actor_loss = bc_loss + eta * q_loss return actor_loss + def update_target_critic(self, tau): + for target_param, source_param in zip( + self.critic_target.parameters(), self.critic.parameters() + ): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + # ---------- Sampling ----------#`` # override diff --git a/model/diffusion/diffusion_idql.py b/model/diffusion/diffusion_idql.py index 3b8fd28..a2343bf 100644 --- a/model/diffusion/diffusion_idql.py +++ b/model/diffusion/diffusion_idql.py @@ -20,11 +20,6 @@ def expectile_loss(diff, expectile=0.8): return weight * (diff**2) -def soft_update(target, source, tau): - for target_param, param in zip(target.parameters(), source.parameters()): - target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) - - class IDQLDiffusion(RWRDiffusion): def __init__( @@ -56,7 +51,6 @@ class 
IDQLDiffusion(RWRDiffusion): # compute advantage adv = q - v - return adv def loss_critic_v(self, obs, actions): @@ -64,10 +58,9 @@ class IDQLDiffusion(RWRDiffusion): # get the value loss v_loss = expectile_loss(adv).mean() - return v_loss - def loss_critic_q(self, obs, next_obs, actions, rewards, dones, gamma): + def loss_critic_q(self, obs, next_obs, actions, rewards, terminated, gamma): # get current Q-function current_q1, current_q2 = self.critic_q(obs, actions) @@ -77,7 +70,7 @@ class IDQLDiffusion(RWRDiffusion): next_v = self.critic_v(next_obs) # terminal state mask - mask = 1 - dones + mask = 1 - terminated # flatten rewards = rewards.view(-1) @@ -91,11 +84,15 @@ class IDQLDiffusion(RWRDiffusion): q_loss = torch.mean((current_q1 - discounted_q) ** 2) + torch.mean( (current_q2 - discounted_q) ** 2 ) - return q_loss def update_target_critic(self, tau): - soft_update(self.target_q, self.critic_q, tau) + for target_param, source_param in zip( + self.target_q.parameters(), self.critic_q.parameters() + ): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) # override def p_losses( @@ -116,10 +113,9 @@ class IDQLDiffusion(RWRDiffusion): # Loss with mask if self.predict_epsilon: - loss = F.mse_loss(x_recon, noise, reduction="none") + loss = F.mse_loss(x_recon, noise) else: - loss = F.mse_loss(x_recon, x_start, reduction="none") - loss = einops.reduce(loss, "b h d -> b", "mean") + loss = F.mse_loss(x_recon, x_start) return loss.mean() # ---------- Sampling ----------#`` @@ -190,4 +186,4 @@ class IDQLDiffusion(RWRDiffusion): # squeeze dummy dimension samples = samples_best[0] - return samples + return samples \ No newline at end of file diff --git a/model/diffusion/diffusion_ppo_exact.py b/model/diffusion/diffusion_ppo_exact.py index cd7858c..2c9df2a 100644 --- a/model/diffusion/diffusion_ppo_exact.py +++ b/model/diffusion/diffusion_ppo_exact.py @@ -14,15 +14,18 @@ import torch import logging log = logging.getLogger(__name__) 
-from .diffusion_ppo import PPODiffusion +from .diffusion_vpg import VPGDiffusion from .exact_likelihood import get_likelihood_fn -class PPOExactDiffusion(PPODiffusion): +class PPOExactDiffusion(VPGDiffusion): def __init__( self, sde, + clip_ploss_coef, + clip_vloss_coef=None, + norm_adv=True, sde_hutchinson_type="Rademacher", sde_rtol=1e-4, sde_atol=1e-4, @@ -41,6 +44,9 @@ class PPOExactDiffusion(PPODiffusion): self.betas, sde_min_beta, ) + self.clip_ploss_coef = clip_ploss_coef + self.clip_vloss_coef = clip_vloss_coef + self.norm_adv = norm_adv # set up likelihood function self.likelihood_fn = get_likelihood_fn( @@ -62,7 +68,6 @@ class PPOExactDiffusion(PPODiffusion): samples: (B x Ta x Da) """ - # TODO: image input return self.likelihood_fn( self.actor, self.actor_ft, diff --git a/model/diffusion/diffusion_qsm.py b/model/diffusion/diffusion_qsm.py index 3b833b7..15c6e7b 100644 --- a/model/diffusion/diffusion_qsm.py +++ b/model/diffusion/diffusion_qsm.py @@ -14,16 +14,6 @@ log = logging.getLogger(__name__) from model.diffusion.diffusion_rwr import RWRDiffusion -def expectile_loss(diff, expectile=0.8): - weight = torch.where(diff > 0, expectile, (1 - expectile)) - return weight * (diff**2) - - -def soft_update(target, source, tau): - for target_param, param in zip(target.parameters(), source.parameters()): - target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) - - class QSMDiffusion(RWRDiffusion): def __init__( @@ -34,6 +24,8 @@ class QSMDiffusion(RWRDiffusion): ): super().__init__(network=actor, **kwargs) self.critic_q = critic.to(self.device) + + # target critic self.target_q = copy.deepcopy(critic) # assign actor @@ -54,7 +46,6 @@ class QSMDiffusion(RWRDiffusion): x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) # get current value for noisy actions as the code does --- the algorthm block in the paper is wrong, it says using a_t, the final denoised action - # x_noisy_flat = torch.flatten(x_noisy, start_dim=-2) 
x_noisy.requires_grad_(True) current_q1, current_q2 = self.critic_q(obs, x_noisy) @@ -68,10 +59,10 @@ class QSMDiffusion(RWRDiffusion): # Loss with mask - align predicted noise with critic gradient of noisy actions # Note: the gradient of mu wrt. epsilon has a negative sign - loss = F.mse_loss(-x_recon, q_grad_coeff * gradient_q, reduction="none").mean() + loss = F.mse_loss(-x_recon, q_grad_coeff * gradient_q) return loss - def loss_critic(self, obs, next_obs, actions, rewards, dones, gamma): + def loss_critic(self, obs, next_obs, actions, rewards, terminated, gamma): # get current Q-function current_q1, current_q2 = self.critic_q(obs, actions) @@ -86,7 +77,7 @@ class QSMDiffusion(RWRDiffusion): next_q = torch.min(next_q1, next_q2) # terminal state mask - mask = 1 - dones + mask = 1 - terminated # flatten rewards = rewards.view(-1) @@ -104,4 +95,9 @@ class QSMDiffusion(RWRDiffusion): return loss_critic def update_target_critic(self, tau): - soft_update(self.target_q, self.critic_q, tau) + for target_param, source_param in zip( + self.target_q.parameters(), self.critic_q.parameters() + ): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) diff --git a/model/diffusion/diffusion_vpg.py b/model/diffusion/diffusion_vpg.py index de7f5e7..e61b716 100644 --- a/model/diffusion/diffusion_vpg.py +++ b/model/diffusion/diffusion_vpg.py @@ -298,7 +298,9 @@ class VPGDiffusion(DiffusionModel): # clamp action at final step if self.final_action_clip_value is not None and i == len(t_all) - 1: - x = torch.clamp(x, -self.final_action_clip_value, self.final_action_clip_value) + x = torch.clamp( + x, -self.final_action_clip_value, self.final_action_clip_value + ) if return_chain: if not self.use_ddim and t <= self.ft_denoising_steps: diff --git a/model/diffusion/mlp_diffusion.py b/model/diffusion/mlp_diffusion.py index 9bc940e..9ffb9ff 100644 --- a/model/diffusion/mlp_diffusion.py +++ b/model/diffusion/mlp_diffusion.py @@ -22,7 +22,7 @@ class 
VisionDiffusionMLP(nn.Module): def __init__( self, backbone, - transition_dim, + action_dim, horizon_steps, cond_dim, img_cond_steps=1, @@ -77,9 +77,9 @@ class VisionDiffusionMLP(nn.Module): # diffusion input_dim = ( - time_dim + transition_dim * horizon_steps + visual_feature_dim + cond_dim + time_dim + action_dim * horizon_steps + visual_feature_dim + cond_dim ) - output_dim = transition_dim * horizon_steps + output_dim = action_dim * horizon_steps self.time_embedding = nn.Sequential( SinusoidalPosEmb(time_dim), nn.Linear(time_dim, time_dim * 2), @@ -175,7 +175,7 @@ class DiffusionMLP(nn.Module): def __init__( self, - transition_dim, + action_dim, horizon_steps, cond_dim, time_dim=16, @@ -187,7 +187,7 @@ class DiffusionMLP(nn.Module): residual_style=False, ): super().__init__() - output_dim = transition_dim * horizon_steps + output_dim = action_dim * horizon_steps self.time_embedding = nn.Sequential( SinusoidalPosEmb(time_dim), nn.Linear(time_dim, time_dim * 2), @@ -204,9 +204,9 @@ class DiffusionMLP(nn.Module): activation_type=activation_type, out_activation_type="Identity", ) - input_dim = time_dim + transition_dim * horizon_steps + cond_mlp_dims[-1] + input_dim = time_dim + action_dim * horizon_steps + cond_mlp_dims[-1] else: - input_dim = time_dim + transition_dim * horizon_steps + cond_dim + input_dim = time_dim + action_dim * horizon_steps + cond_dim self.mlp_mean = model( [input_dim] + mlp_dims + [output_dim], activation_type=activation_type, diff --git a/model/diffusion/unet.py b/model/diffusion/unet.py index c5e6836..c45b9c8 100644 --- a/model/diffusion/unet.py +++ b/model/diffusion/unet.py @@ -120,7 +120,7 @@ class Unet1D(nn.Module): def __init__( self, - transition_dim, + action_dim, cond_dim=None, diffusion_step_embed_dim=32, dim=32, @@ -134,7 +134,7 @@ class Unet1D(nn.Module): groupnorm_eps=1e-5, ): super().__init__() - dims = [transition_dim, *map(lambda m: dim * m, dim_mults)] + dims = [action_dim, *map(lambda m: dim * m, dim_mults)] in_out = 
list(zip(dims[:-1], dims[1:])) log.info(f"Channel dimensions: {in_out}") @@ -259,7 +259,7 @@ class Unet1D(nn.Module): activation_type=activation_type, eps=groupnorm_eps, ), - nn.Conv1d(dim, transition_dim, 1), + nn.Conv1d(dim, action_dim, 1), ) def forward( diff --git a/model/rl/gaussian_calql.py b/model/rl/gaussian_calql.py new file mode 100644 index 0000000..14d87f0 --- /dev/null +++ b/model/rl/gaussian_calql.py @@ -0,0 +1,187 @@ +""" +Calibrated Conservative Q-Learning (CalQL) for Gaussian policy. + +""" + +import torch +import torch.nn as nn +import logging +from copy import deepcopy +import numpy as np +import einops + +from model.common.gaussian import GaussianModel + +log = logging.getLogger(__name__) + + +class CalQL_Gaussian(GaussianModel): + def __init__( + self, + actor, + critic, + network_path=None, + cql_clip_diff_min=-np.inf, + cql_clip_diff_max=np.inf, + cql_min_q_weight=5.0, + cql_n_actions=10, + **kwargs, + ): + super().__init__(network=actor, network_path=None, **kwargs) + self.cql_clip_diff_min = cql_clip_diff_min + self.cql_clip_diff_max = cql_clip_diff_max + self.cql_min_q_weight = cql_min_q_weight + self.cql_n_actions = cql_n_actions + + # initialize critic networks + self.critic = critic.to(self.device) + self.target_critic = deepcopy(critic).to(self.device) + + # Load pre-trained checkpoint - note we are also loading the pre-trained critic here + if network_path is not None: + checkpoint = torch.load( + network_path, + map_location=self.device, + weights_only=True, + ) + self.load_state_dict( + checkpoint["model"], + strict=True, + ) + log.info("Loaded actor from %s", network_path) + log.info( + f"Number of network parameters: {sum(p.numel() for p in self.parameters())}" + ) + + def loss_critic( + self, + obs, + next_obs, + actions, + random_actions, + rewards, + returns, + terminated, + gamma, + alpha, + ): + B = len(actions) + + # Get initial TD loss + q_data1, q_data2 = self.critic(obs, actions) + with torch.no_grad(): + # repeat for 
action samples + next_obs["state"] = next_obs["state"].repeat_interleave( + self.cql_n_actions, dim=0 + ) + + # Get the next actions and logprobs + next_actions, next_logprobs = self.forward( + next_obs, + deterministic=False, + get_logprob=True, + ) + next_q1, next_q2 = self.target_critic(next_obs, next_actions) + next_q = torch.min(next_q1, next_q2) + + # Reshape the next_q to match the number of samples + next_q = next_q.view(B, self.cql_n_actions) # (B, n_sample) + next_logprobs = next_logprobs.view(B, self.cql_n_actions) # (B, n_sample) + + # Get the max indices over the samples, and index into the next_q and next_log_probs + max_idx = torch.argmax(next_q, dim=1) + next_q = next_q[torch.arange(B), max_idx] + next_logprobs = next_logprobs[torch.arange(B), max_idx] + + # Get the target Q values + target_q = rewards + gamma * (1 - terminated) * next_q + + # Subtract the entropy bonus + target_q = target_q - alpha * next_logprobs + + # TD loss + td_loss_1 = nn.functional.mse_loss(q_data1, target_q) + td_loss_2 = nn.functional.mse_loss(q_data2, target_q) + + # Get actions and logprobs + log_rand_pi = 0.5 ** torch.prod(torch.tensor(random_actions.shape[-2:])) + pi_actions, log_pi = self.forward( + obs, + deterministic=False, + reparameterize=False, + get_logprob=True, + ) # no gradient + + # Random action Q values + n_random_actions = random_actions.shape[1] + obs_sample_state = { + "state": obs["state"].repeat_interleave(n_random_actions, dim=0) + } + random_actions = einops.rearrange(random_actions, "B N H A -> (B N) H A") + + # Get the random action Q-values + q_rand_1, q_rand_2 = self.critic(obs_sample_state, random_actions) + q_rand_1 = q_rand_1 - log_rand_pi + q_rand_2 = q_rand_2 - log_rand_pi + + # Reshape the random action Q values to match the number of samples + q_rand_1 = q_rand_1.view(B, n_random_actions) # (n_sample, B) + q_rand_2 = q_rand_2.view(B, n_random_actions) + + # Policy action Q values + q_pi_1, q_pi_2 = self.critic(obs, pi_actions) + q_pi_1 = 
q_pi_1 - log_pi + q_pi_2 = q_pi_2 - log_pi + + # Ensure calibration w.r.t. value function estimate + q_pi_1 = torch.max(q_pi_1, returns)[:, None] # (B, 1) + q_pi_2 = torch.max(q_pi_2, returns)[:, None] # (B, 1) + cat_q_1 = torch.cat([q_rand_1, q_pi_1], dim=-1) # (B, num_samples+1) + cql_qf1_ood = torch.logsumexp(cat_q_1, dim=-1) # max over num_samples + cat_q_2 = torch.cat([q_rand_2, q_pi_2], dim=-1) # (B, num_samples+1) + cql_qf2_ood = torch.logsumexp(cat_q_2, dim=-1) # sum over num_samples + + # Subtract the log likelihood of the data + cql_qf1_diff = torch.clamp( + cql_qf1_ood - q_data1, + min=self.cql_clip_diff_min, + max=self.cql_clip_diff_max, + ).mean() + cql_qf2_diff = torch.clamp( + cql_qf2_ood - q_data2, + min=self.cql_clip_diff_min, + max=self.cql_clip_diff_max, + ).mean() + cql_min_qf1_loss = cql_qf1_diff * self.cql_min_q_weight + cql_min_qf2_loss = cql_qf2_diff * self.cql_min_q_weight + + # Sum the two losses + critic_loss = td_loss_1 + td_loss_2 + cql_min_qf1_loss + cql_min_qf2_loss + return critic_loss + + def loss_actor(self, obs, alpha): + action, logprob = self.forward( + obs, + deterministic=False, + reparameterize=True, + get_logprob=True, + ) + q1, q2 = self.critic(obs, action) + actor_loss = -torch.min(q1, q2) + alpha * logprob + return actor_loss.mean() + + def loss_temperature(self, obs, alpha, target_entropy): + with torch.no_grad(): + _, logprob = self.forward( + obs, + deterministic=False, + get_logprob=True, + ) + loss_alpha = -torch.mean(alpha * (logprob + target_entropy)) + return loss_alpha + + def update_target_critic(self, tau): + for target_param, param in zip( + self.target_critic.parameters(), self.critic.parameters() + ): + target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) diff --git a/model/rl/gaussian_ibrl.py b/model/rl/gaussian_ibrl.py new file mode 100644 index 0000000..ce96232 --- /dev/null +++ b/model/rl/gaussian_ibrl.py @@ -0,0 +1,205 @@ +""" +Imitation Bootstrapped Reinforcement Learning (IBRL) 
for Gaussian policy. + +""" + +import torch +import torch.nn as nn +import logging +from copy import deepcopy + +from model.common.gaussian import GaussianModel + +log = logging.getLogger(__name__) + + +class IBRL_Gaussian(GaussianModel): + def __init__( + self, + actor, + critic, + n_critics, + soft_action_sample=False, + soft_action_sample_beta=0.1, + **kwargs, + ): + super().__init__(network=actor, **kwargs) + self.soft_action_sample = soft_action_sample + self.soft_action_sample_beta = soft_action_sample_beta + + # Set up target actor + self.target_actor = deepcopy(actor) + + # Frozen pre-trained policy + self.bc_policy = deepcopy(actor) + for param in self.bc_policy.parameters(): + param.requires_grad = False + + # initialize critic networks + self.critic_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.critic_networks = nn.ModuleList(self.critic_networks) + + # initialize target networks + self.target_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.target_networks = nn.ModuleList(self.target_networks) + + # Construct a "stateless" version of one of the models. It is "stateless" in the sense that the parameters are meta Tensors and do not have storage. 
+ base_model = deepcopy(self.critic_networks[0]) + self.base_model = base_model.to("meta") + self.ensemble_params, self.ensemble_buffers = torch.func.stack_module_state( + self.critic_networks + ) + + def critic_wrapper(self, params, buffers, data): + """for vmap""" + return torch.func.functional_call(self.base_model, (params, buffers), data) + + def get_random_indices(self, sz=None, num_ind=2): + """get num_ind random indices from a set of size sz (used for getting critic targets)""" + if sz is None: + sz = len(self.critic_networks) + perm = torch.randperm(sz) + ind = perm[:num_ind].to(self.device) + return ind + + def loss_critic( + self, + obs, + next_obs, + actions, + rewards, + terminated, + gamma, + ): + # get random critic index + q1_ind, q2_ind = self.get_random_indices() + with torch.no_grad(): + next_actions_bc = super().forward( + cond=next_obs, + deterministic=True, + network_override=self.bc_policy, + ) + next_actions_rl = super().forward( + cond=next_obs, + deterministic=False, + network_override=self.target_actor, + ) + + # get the BC Q value + next_q1_bc = self.target_networks[q1_ind](next_obs, next_actions_bc) + next_q2_bc = self.target_networks[q2_ind](next_obs, next_actions_bc) + next_q_bc = torch.min(next_q1_bc, next_q2_bc) + + # get the RL Q value + next_q1_rl = self.target_networks[q1_ind](next_obs, next_actions_rl) + next_q2_rl = self.target_networks[q2_ind](next_obs, next_actions_rl) + next_q_rl = torch.min(next_q1_rl, next_q2_rl) + + # take the max Q value + next_q = torch.where(next_q_bc > next_q_rl, next_q_bc, next_q_rl) + + # target value + target_q = rewards + gamma * (1 - terminated) * next_q # (B,) + + # run all critics in batch + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, actions) + ) # (n_critics, B) + loss_critic = torch.mean((current_q - target_q[None]) ** 2) + return loss_critic + + def loss_actor(self, obs): + action = super().forward( + obs, + 
deterministic=False, + reparameterize=True, + ) # use online policy only, also IBRL does not use tanh squashing + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, action) + ) # (n_critics, B) + current_q = current_q.min( + dim=0 + ).values # unlike RLPD, IBRL uses the min Q value for actor update + loss_actor = -torch.mean(current_q) + return loss_actor + + def update_target_critic(self, tau): + """need to use ensemble_params instead of critic_networks""" + for target_ind, target_critic in enumerate(self.target_networks): + for target_param_name, target_param in target_critic.named_parameters(): + source_param = self.ensemble_params[target_param_name][target_ind] + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + def update_target_actor(self, tau): + for target_param, source_param in zip( + self.target_actor.parameters(), self.network.parameters() + ): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + # ---------- Sampling ----------# + + def forward( + self, + cond, + deterministic=False, + reparameterize=False, + ): + """use both pre-trained and online policies""" + q1_ind, q2_ind = self.get_random_indices() + + # sample an action from the BC policy + bc_action = super().forward( + cond=cond, + deterministic=True, + network_override=self.bc_policy, + ) + + # sample an action from the RL policy + rl_action = super().forward( + cond=cond, + deterministic=deterministic, + reparameterize=reparameterize, + ) + + # compute Q value of BC policy + q_bc_1 = self.critic_networks[q1_ind](cond, bc_action) # (B,) + q_bc_2 = self.critic_networks[q2_ind](cond, bc_action) + q_bc = torch.min(q_bc_1, q_bc_2) + + # compute Q value of RL policy + q_rl_1 = self.critic_networks[q1_ind](cond, rl_action) + q_rl_2 = self.critic_networks[q2_ind](cond, rl_action) + q_rl = torch.min(q_rl_1, q_rl_2) + + # soft sample or greedy + if 
deterministic or not self.soft_action_sample:
+            action = torch.where(
+                (q_bc > q_rl)[:, None, None],
+                bc_action,
+                rl_action,
+            )
+        else:
+            # NOTE(review): softmax is applied on top of exp(beta * Q), which double-exponentiates; weights proportional to exp(beta * Q(a)) would come from softmax(beta * Q) directly — confirm intended
+            qw_bc = torch.exp(q_bc * self.soft_action_sample_beta)
+            qw_rl = torch.exp(q_rl * self.soft_action_sample_beta)
+            q_weights = torch.softmax(
+                torch.stack([qw_bc, qw_rl], dim=-1),
+                dim=-1,
+            )
+
+            # sample according to the weights
+            q_indices = torch.multinomial(q_weights, 1)
+            action = torch.where(
+                (q_indices == 0)[:, None],
+                bc_action,
+                rl_action,
+            )
+        return action
diff --git a/model/rl/gaussian_rlpd.py b/model/rl/gaussian_rlpd.py
new file mode 100644
index 0000000..cd278ae
--- /dev/null
+++ b/model/rl/gaussian_rlpd.py
@@ -0,0 +1,131 @@
+"""
+Reinforcement learning with prior data (RLPD) for Gaussian policy.
+
+Use ensemble of critics.
+
+"""
+
+import torch
+import torch.nn as nn
+import logging
+from copy import deepcopy
+
+from model.common.gaussian import GaussianModel
+
+log = logging.getLogger(__name__)
+
+
+class RLPD_Gaussian(GaussianModel):
+    def __init__(
+        self,
+        actor,
+        critic,
+        n_critics,
+        backup_entropy=False,
+        **kwargs,
+    ):
+        super().__init__(network=actor, **kwargs)
+        self.n_critics = n_critics
+        self.backup_entropy = backup_entropy
+
+        # initialize critic networks
+        self.critic_networks = [
+            deepcopy(critic).to(self.device) for _ in range(n_critics)
+        ]
+        self.critic_networks = nn.ModuleList(self.critic_networks)
+
+        # initialize target networks
+        self.target_networks = [
+            deepcopy(critic).to(self.device) for _ in range(n_critics)
+        ]
+        self.target_networks = nn.ModuleList(self.target_networks)
+
+        # Construct a "stateless" version of one of the models. It is "stateless" in the sense that the parameters are meta Tensors and do not have storage.
+ base_model = deepcopy(self.critic_networks[0]) + self.base_model = base_model.to("meta") + self.ensemble_params, self.ensemble_buffers = torch.func.stack_module_state( + self.critic_networks + ) + + def critic_wrapper(self, params, buffers, data): + """for vmap""" + return torch.func.functional_call(self.base_model, (params, buffers), data) + + def get_random_indices(self, sz=None, num_ind=2): + """get num_ind random indices from a set of size sz (used for getting critic targets)""" + if sz is None: + sz = len(self.critic_networks) + perm = torch.randperm(sz) + ind = perm[:num_ind].to(self.device) + return ind + + def loss_critic( + self, + obs, + next_obs, + actions, + rewards, + terminated, + gamma, + alpha, + ): + # get random critic index + q1_ind, q2_ind = self.get_random_indices() + with torch.no_grad(): + next_actions, next_logprobs = self.forward( + cond=next_obs, + deterministic=False, + get_logprob=True, + ) + next_q1 = self.target_networks[q1_ind](next_obs, next_actions) + next_q2 = self.target_networks[q2_ind](next_obs, next_actions) + next_q = torch.min(next_q1, next_q2) + + # target value + target_q = rewards + gamma * (1 - terminated) * next_q # (B,) + + # add entropy term to the target + if self.backup_entropy: + target_q = target_q + gamma * (1 - terminated) * alpha * ( + -next_logprobs + ) + + # run all critics in batch + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, actions) + ) # (n_critics, B) + loss_critic = torch.mean((current_q - target_q[None]) ** 2) + return loss_critic + + def loss_actor(self, obs, alpha): + action, logprob = self.forward( + obs, + deterministic=False, + reparameterize=True, + get_logprob=True, + ) + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, action) + ) # (n_critics, B) + current_q = current_q.mean(dim=0) + alpha * (-logprob) + loss_actor = -torch.mean(current_q) + 
return loss_actor
+
+    def loss_temperature(self, obs, alpha, target_entropy):
+        with torch.no_grad():
+            _, logprob = self.forward(
+                obs,
+                deterministic=False,
+                get_logprob=True,
+            )
+        loss_alpha = -torch.mean(alpha * (logprob + target_entropy))
+        return loss_alpha
+
+    def update_target_critic(self, tau):
+        """need to use ensemble_params instead of critic_networks"""
+        for target_ind, target_critic in enumerate(self.target_networks):
+            for target_param_name, target_param in target_critic.named_parameters():
+                source_param = self.ensemble_params[target_param_name][target_ind]
+                target_param.data.copy_(
+                    target_param.data * (1.0 - tau) + source_param.data * tau
+                )
diff --git a/model/rl/gaussian_sac.py b/model/rl/gaussian_sac.py
new file mode 100644
index 0000000..e32aadc
--- /dev/null
+++ b/model/rl/gaussian_sac.py
@@ -0,0 +1,88 @@
+"""
+Soft Actor Critic (SAC) with Gaussian policy.
+
+"""
+
+import torch
+import logging
+from copy import deepcopy
+import torch.nn.functional as F
+
+from model.common.gaussian import GaussianModel
+
+log = logging.getLogger(__name__)
+
+
+class SAC_Gaussian(GaussianModel):
+    def __init__(
+        self,
+        actor,
+        critic,
+        **kwargs,
+    ):
+        super().__init__(network=actor, **kwargs)
+
+        # initialize double critic networks
+        self.critic = critic.to(self.device)
+
+        # initialize double target networks
+        self.target_critic = deepcopy(self.critic).to(self.device)
+
+    def loss_critic(
+        self,
+        obs,
+        next_obs,
+        actions,
+        rewards,
+        terminated,
+        gamma,
+        alpha,
+    ):
+        with torch.no_grad():
+            next_actions, next_logprobs = self.forward(
+                cond=next_obs,
+                deterministic=False,
+                get_logprob=True,
+            )
+            next_q1, next_q2 = self.target_critic(
+                next_obs,
+                next_actions,
+            )
+            next_q = torch.min(next_q1, next_q2) - alpha * next_logprobs
+
+        # target value
+        target_q = rewards + gamma * next_q * (1 - terminated)
+        current_q1, current_q2 = self.critic(obs, actions)
+        loss_critic = F.mse_loss(current_q1, target_q) + F.mse_loss(
+            current_q2,
target_q + ) + return loss_critic + + def loss_actor(self, obs, alpha): + action, logprob = self.forward( + obs, + deterministic=False, + reparameterize=True, + get_logprob=True, + ) + current_q1, current_q2 = self.critic(obs, action) + loss_actor = -torch.min(current_q1, current_q2) + alpha * logprob + return loss_actor.mean() + + def loss_temperature(self, obs, alpha, target_entropy): + with torch.no_grad(): + _, logprob = self.forward( + obs, + deterministic=False, + get_logprob=True, + ) + loss_alpha = -torch.mean(alpha * (logprob + target_entropy)) + return loss_alpha + + def update_target_critic(self, tau): + for target_param, source_param in zip( + self.target_critic.parameters(), self.critic.parameters() + ): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) diff --git a/pyproject.toml b/pyproject.toml index 39f222d..0191c91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "dppo" -version = "0.1.0" +version = "0.5.0" description = "Fine-tuning diffusion policies with PPO." readme = "README.md" requires-python = ">=3.8" diff --git a/script/dataset/README.md b/script/dataset/README.md index 8d71e7b..0b92d8a 100644 --- a/script/dataset/README.md +++ b/script/dataset/README.md @@ -1,3 +1,18 @@ ## Data processing scripts -These are some scripts used for processing the raw datasets from the benchmarks. We already pre-processed them and provide the final datasets. These scripts are for information only. \ No newline at end of file +These are some scripts used for processing the raw datasets from the benchmarks. We already pre-processed them and provide the final datasets. 
+ +Gym and robomimic data +```console +python script/dataset/get_d4rl_dataset.py --env_name=hopper-medium-v2 --save_dir=data/gym/hopper-medium-v2 +python script/dataset/process_robomimic_dataset.py --load_path=../robomimic_raw_data/lift_low_dim_v141.hdf5 --save_dir=data/robomimic/lift --normalize +``` + +Raw robomimic data can be downloaded with a clone of the repository and then +```console +cd ~/robomimic/robomimic/scripts +python download_datasets.py --tasks all --dataset_types mh --hdf5_types low_dim # state-only policy +python download_datasets.py --tasks all --dataset_types mh --hdf5_types raw # pixel-based policy +# for pixel, replay the trajectories to extract image observations +python robomimic/scripts/dataset_states_to_obs.py --done_mode 2 --dataset datasets/can/mh/demo_v141.hdf5 --output_name image_v141.hdf5 --camera_names robot0_eye_in_hand --camera_height 96 --camera_width 96 --exclude-next-obs --n 100 +``` \ No newline at end of file diff --git a/script/dataset/get_d4rl_dataset.py b/script/dataset/get_d4rl_dataset.py index b34b557..9b11e0f 100644 --- a/script/dataset/get_d4rl_dataset.py +++ b/script/dataset/get_d4rl_dataset.py @@ -1,95 +1,79 @@ """ -Download D4RL dataset and save it into our custom format so it can be loaded for diffusion training. - +Download D4RL dataset and save it into our custom format for diffusion training. 
""" import os import logging import gym import random -from copy import deepcopy import numpy as np from tqdm import tqdm -import pickle - import d4rl.gym_mujoco # Import required to register environments +from copy import deepcopy def make_dataset(env_name, save_dir, save_name_prefix, val_split, logger): # Create the environment env = gym.make(env_name) - - # d4rl abides by the OpenAI gym interface env.reset() - env.step(env.action_space.sample()) - - # Each task is associated with a dataset - # dataset contains observations, actions, rewards, terminals, and infos + env.step( + env.action_space.sample() + ) # Interact with the environment to initialize it dataset = env.get_dataset() + + # rename observations to states + dataset["states"] = dataset.pop("observations") + logger.info("\n========== Basic Info ===========") logger.info(f"Keys in the dataset: {dataset.keys()}") - logger.info(f"Observation shape: {dataset['observations'].shape}") + logger.info(f"State shape: {dataset['states'].shape}") logger.info(f"Action shape: {dataset['actions'].shape}") + + # determine trajectories from terminals and timeouts terminal_indices = np.argwhere(dataset["terminals"])[:, 0] timeout_indices = np.argwhere(dataset["timeouts"])[:, 0] - obs_dim = dataset["observations"].shape[1] - action_dim = dataset["actions"].shape[1] - done_indices = np.concatenate([terminal_indices, timeout_indices]) - done_indices = np.sort(done_indices) - traj_lengths = [] - prev_index = 0 - for i in tqdm(range(len(done_indices))): - # get episode length - cur_index = done_indices[i] - traj_lengths.append(cur_index - prev_index + 1) - prev_index = cur_index + 1 - obs_min = np.min(dataset["observations"], axis=0) - obs_max = np.max(dataset["observations"], axis=0) + done_indices = np.sort(np.concatenate([terminal_indices, timeout_indices])) + traj_lengths = np.diff(np.concatenate([[0], done_indices + 1])) + + obs_min = np.min(dataset["states"], axis=0) + obs_max = np.max(dataset["states"], axis=0) 
action_min = np.min(dataset["actions"], axis=0) action_max = np.max(dataset["actions"], axis=0) - max_episode_steps = max(traj_lengths) - logger.info("total transitions: {}".format(np.sum(traj_lengths))) - logger.info("total trajectories: {}".format(len(traj_lengths))) - logger.info( - f"traj length mean/std: {np.mean(traj_lengths)}, {np.std(traj_lengths)}" - ) - logger.info(f"traj length min/max: {np.min(traj_lengths)}, {np.max(traj_lengths)}") - logger.info(f"obs min: {obs_min}") - logger.info(f"obs max: {obs_max}") - logger.info(f"action min: {action_min}") - logger.info(f"action max: {action_max}") - # Subsample episodes by taking the first ones + logger.info(f"Total transitions: {np.sum(traj_lengths)}") + logger.info(f"Total trajectories: {len(traj_lengths)}") + logger.info( + f"Trajectory length mean/std: {np.mean(traj_lengths)}, {np.std(traj_lengths)}" + ) + logger.info( + f"Trajectory length min/max: {np.min(traj_lengths)}, {np.max(traj_lengths)}" + ) + logger.info(f"obs min: {obs_min}, obs max: {obs_max}") + logger.info(f"action min: {action_min}, action max: {action_max}") + + # Subsample episodes if needed if args.max_episodes > 0: traj_lengths = traj_lengths[: args.max_episodes] done_indices = done_indices[: args.max_episodes] - max_episode_steps = max(traj_lengths) - # split indices in train and val + # Split into train and validation sets num_traj = len(traj_lengths) num_train = int(num_traj * (1 - val_split)) train_indices = random.sample(range(num_traj), k=num_train) - # do over all indices - out_train = {} - keys = [ - "observations", - "actions", - "rewards", - ] - out_train["observations"] = np.empty( - (0, max_episode_steps, dataset["observations"].shape[-1]) - ) - out_train["actions"] = np.empty( - (0, max_episode_steps, dataset["actions"].shape[-1]) - ) - out_train["rewards"] = np.empty((0, max_episode_steps)) - out_train["traj_length"] = [] + # Prepare data containers for train and validation sets + out_train = { + "states": [], + "actions": 
[], + "rewards": [], + "terminals": [], + "traj_lengths": [], + } out_val = deepcopy(out_train) prev_index = 0 train_episode_reward_all = [] val_episode_reward_all = [] - for i in tqdm(range(len(done_indices))): + for i, cur_index in tqdm(enumerate(done_indices), total=len(done_indices)): if i in train_indices: out = out_train episode_reward_all = train_episode_reward_all @@ -97,57 +81,65 @@ def make_dataset(env_name, save_dir, save_name_prefix, val_split, logger): out = out_val episode_reward_all = val_episode_reward_all - # get episode length - cur_index = done_indices[i] + # Get the trajectory length and slice traj_length = cur_index - prev_index + 1 + trajectory = { + key: dataset[key][prev_index : cur_index + 1] + for key in ["states", "actions", "rewards", "terminals"] + } - # Skip if the episode has no reward - if np.sum(dataset["rewards"][prev_index : cur_index + 1]) > 0: - out["traj_length"].append(traj_length) + # Skip if there is no reward in the episode + if np.sum(trajectory["rewards"]) > 0: + # Scale observations and actions + trajectory["states"] = ( + 2 * (trajectory["states"] - obs_min) / (obs_max - obs_min + 1e-6) - 1 + ) + trajectory["actions"] = ( + 2 + * (trajectory["actions"] - action_min) + / (action_max - action_min + 1e-6) + - 1 + ) - # apply padding to make all episodes have the same max steps - for key in keys: - traj = dataset[key][prev_index : cur_index + 1] - - # also scale - if key == "observations": - traj = 2 * (traj - obs_min) / (obs_max - obs_min + 1e-6) - 1 - elif key == "actions": - traj = ( - 2 * (traj - action_min) / (action_max - action_min + 1e-6) - 1 - ) - - if traj.ndim == 1: - traj = np.pad( - traj, - (0, max_episode_steps - len(traj)), - mode="constant", - constant_values=0, - ) - else: - traj = np.pad( - traj, - ((0, max_episode_steps - traj.shape[0]), (0, 0)), - mode="constant", - constant_values=0, - ) - out[key] = np.vstack((out[key], traj[None])) - - # check reward - 
episode_reward_all.append(np.sum(out["rewards"][-1])) + for key in ["states", "actions", "rewards", "terminals"]: + out[key].append(trajectory[key]) + out["traj_lengths"].append(traj_length) + episode_reward_all.append(np.sum(trajectory["rewards"])) else: - print(f"skipping {i} / {len(done_indices)}") + logger.info(f"Skipping trajectory {i} due to zero rewards.") - # update prev index prev_index = cur_index + 1 - # Save to np file - save_train_path = os.path.join(save_dir, save_name_prefix + "train.npz") - save_val_path = os.path.join(save_dir, save_name_prefix + "val.npz") - with open(save_train_path, "wb") as f: - pickle.dump(out_train, f) - with open(save_val_path, "wb") as f: - pickle.dump(out_val, f) + # Concatenate trajectories + for key in ["states", "actions", "rewards", "terminals"]: + out_train[key] = np.concatenate(out_train[key], axis=0) + + # Only concatenate validation set if it exists + if val_split > 0: + out_val[key] = np.concatenate(out_val[key], axis=0) + + # Save train dataset to npz files + train_save_path = os.path.join(save_dir, save_name_prefix + "train.npz") + np.savez_compressed( + train_save_path, + states=np.array(out_train["states"]), + actions=np.array(out_train["actions"]), + rewards=np.array(out_train["rewards"]), + terminals=np.array(out_train["terminals"]), + traj_lengths=np.array(out_train["traj_lengths"]), + ) + + # Save validation dataset to npz files + val_save_path = os.path.join(save_dir, save_name_prefix + "val.npz") + np.savez_compressed( + val_save_path, + states=np.array(out_val["states"]), + actions=np.array(out_val["actions"]), + rewards=np.array(out_val["rewards"]), + terminals=np.array(out_val["terminals"]), + traj_lengths=np.array(out_val["traj_lengths"]), + ) + normalization_save_path = os.path.join( save_dir, save_name_prefix + "normalization.npz" ) @@ -159,83 +151,49 @@ def make_dataset(env_name, save_dir, save_name_prefix, val_split, logger): action_max=action_max, ) - # debug + # Logging summary statistics 
logger.info("\n========== Final ===========") logger.info( - f"Train - Number of episodes and transitions: {len(out_train['traj_length'])}, {np.sum(out_train['traj_length'])}" + f"Train - Trajectories: {len(out_train['traj_lengths'])}, Transitions: {np.sum(out_train['traj_lengths'])}" ) logger.info( - f"Val - Number of episodes and transitions: {len(out_val['traj_length'])}, {np.sum(out_val['traj_length'])}" + f"Val - Trajectories: {len(out_val['traj_lengths'])}, Transitions: {np.sum(out_val['traj_lengths'])}" ) logger.info( - f"Train - Mean/Std trajectory length: {np.mean(out_train['traj_length'])}, {np.std(out_train['traj_length'])}" + f"Train - Mean/Std trajectory length: {np.mean(out_train['traj_lengths'])}, {np.std(out_train['traj_lengths'])}" ) - logger.info( - f"Train - Max/Min trajectory length: {np.max(out_train['traj_length'])}, {np.min(out_train['traj_length'])}" + ( + logger.info( + f"Val - Mean/Std trajectory length: {np.mean(out_val['traj_lengths'])}, {np.std(out_val['traj_lengths'])}" + ) + if val_split > 0 + else None ) - if val_split > 0: - logger.info( - f"Val - Mean/Std trajectory length: {np.mean(out_val['traj_length'])}, {np.std(out_val['traj_length'])}" - ) - logger.info( - f"Val - Max/Min trajectory length: {np.max(out_val['traj_length'])}, {np.min(out_val['traj_length'])}" - ) - logger.info( - f"Train - Mean/Std episode reward: {np.mean(train_episode_reward_all)}, {np.std(train_episode_reward_all)}" - ) - if val_split > 0: - logger.info( - f"Val - Mean/Std episode reward: {np.mean(val_episode_reward_all)}, {np.std(val_episode_reward_all)}" - ) - for obs_dim_ind in range(obs_dim): - obs = out_train["observations"][:, :, obs_dim_ind] - logger.info( - f"Train - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}" - ) - for action_dim_ind in range(action_dim): - action = out_train["actions"][:, :, action_dim_ind] - logger.info( - f"Train - Action dim {action_dim_ind+1} mean {np.mean(action)} std 
{np.std(action)} min {np.min(action)} max {np.max(action)}" - ) - if val_split > 0: - for obs_dim_ind in range(obs_dim): - obs = out_val["observations"][:, :, obs_dim_ind] - logger.info( - f"Val - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}" - ) - for action_dim_ind in range(action_dim): - action = out_val["actions"][:, :, action_dim_ind] - logger.info( - f"Val - Action dim {action_dim_ind+1} mean {np.mean(action)} std {np.std(action)} min {np.min(action)} max {np.max(action)}" - ) if __name__ == "__main__": import argparse + import datetime parser = argparse.ArgumentParser() parser.add_argument("--env_name", type=str, default="hopper-medium-v2") parser.add_argument("--save_dir", type=str, default=".") parser.add_argument("--save_name_prefix", type=str, default="") - parser.add_argument("--val_split", type=float, default="0.2") - parser.add_argument("--max_episodes", type=int, default="-1") + parser.add_argument("--val_split", type=float, default=0) + parser.add_argument("--max_episodes", type=int, default=-1) args = parser.parse_args() - import datetime - - # import logging.config - if args.max_episodes > 0: - args.save_name_prefix += f"max_episodes_{args.max_episodes}_" os.makedirs(args.save_dir, exist_ok=True) log_path = os.path.join( args.save_dir, args.save_name_prefix + f"_{datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}.log", ) + logger = logging.getLogger("get_D4RL_dataset") logger.setLevel(logging.INFO) file_handler = logging.FileHandler(log_path) - file_handler.setLevel(logging.INFO) # Set the minimum level for this handler + file_handler.setLevel(logging.INFO) formatter = logging.Formatter( "%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) diff --git a/script/dataset/process_robomimic_dataset.py b/script/dataset/process_robomimic_dataset.py index e973cd1..2abb2a6 100644 --- a/script/dataset/process_robomimic_dataset.py +++ b/script/dataset/process_robomimic_dataset.py @@ -3,6 +3,8 @@ 
Process robomimic dataset and save it into our custom format so it can be loaded Using some code from robomimic/robomimic/scripts/get_dataset_info.py +Since we do not terminate episode early and cumulate reward when the goal is reached, we set terminals to all False. + can-mh: total transitions: 62756 total trajectories: 300 @@ -76,31 +78,20 @@ robomimic dataset normalizes action to [-1, 1], observation roughly? to [-1, 1]. """ + import numpy as np from tqdm import tqdm -import pickle - -try: - import h5py # not included in pyproject.toml -except: - print("Installing h5py") - os.system("pip install h5py") +import h5py import os import random from copy import deepcopy import logging -def make_dataset( - load_path, - save_dir, - save_name_prefix, - val_split, - normalize, -): +def make_dataset(load_path, save_dir, save_name_prefix, val_split, normalize): # Load hdf5 file from load_path with h5py.File(load_path, "r") as f: - # put demonstration list in increasing episode order + # Sort demonstrations in increasing episode order demos = sorted(list(f["data"].keys())) inds = np.argsort([int(elem[5:]) for elem in demos]) demos = [demos[i] for i in inds] @@ -108,7 +99,7 @@ def make_dataset( if args.max_episodes > 0: demos = demos[: args.max_episodes] - # From generate_paper_configs.py: default observation is eef pose, gripper finger position, and object information, all of which are low-dim. 
+ # Default low-dimensional observation keys low_dim_obs_names = [ "robot0_eef_pos", "robot0_eef_quat", @@ -120,23 +111,28 @@ def make_dataset( "robot1_eef_quat", "robot1_gripper_qpos", ] - if args.cameras is None: # state-only + if args.cameras is None: low_dim_obs_names.append("object") + + # Calculate dimensions for observations and actions obs_dim = 0 for low_dim_obs_name in low_dim_obs_names: - dim = f["data/demo_0/obs/{}".format(low_dim_obs_name)].shape[1] + dim = f[f"data/demo_0/obs/{low_dim_obs_name}"].shape[1] obs_dim += dim logging.info(f"Using {low_dim_obs_name} with dim {dim} for observation") + action_dim = f["data/demo_0/actions"].shape[1] logging.info(f"Total low-dim observation dim: {obs_dim}") logging.info(f"Action dim: {action_dim}") - # get basic stats + # Initialize variables for tracking trajectory statistics traj_lengths = [] obs_min = np.zeros((obs_dim)) obs_max = np.zeros((obs_dim)) action_min = np.zeros((action_dim)) action_max = np.zeros((action_dim)) + + # Process each demo for ep in demos: traj_lengths.append(f[f"data/{ep}/actions"].shape[0]) obs = np.hstack( @@ -145,96 +141,47 @@ def make_dataset( for low_dim_obs_name in low_dim_obs_names ] ) - actions = f[f"data/{ep}/actions"] + actions = f[f"data/{ep}/actions"][()] obs_min = np.minimum(obs_min, np.min(obs, axis=0)) obs_max = np.maximum(obs_max, np.max(obs, axis=0)) action_min = np.minimum(action_min, np.min(actions, axis=0)) action_max = np.maximum(action_max, np.max(actions, axis=0)) - traj_lengths = np.array(traj_lengths) - max_traj_length = np.max(traj_lengths) - # report statistics on the data + traj_lengths = np.array(traj_lengths) + + # Report statistics logging.info("===== Basic stats =====") - logging.info("total transitions: {}".format(np.sum(traj_lengths))) - logging.info("total trajectories: {}".format(traj_lengths.shape[0])) + logging.info(f"Total transitions: {np.sum(traj_lengths)}") + logging.info(f"Total trajectories: {len(traj_lengths)}") logging.info( - f"traj length 
mean/std: {np.mean(traj_lengths)}, {np.std(traj_lengths)}" + f"Traj length mean/std: {np.mean(traj_lengths)}, {np.std(traj_lengths)}" ) logging.info( - f"traj length min/max: {np.min(traj_lengths)}, {np.max(traj_lengths)}" + f"Traj length min/max: {np.min(traj_lengths)}, {np.max(traj_lengths)}" ) logging.info(f"obs min: {obs_min}") logging.info(f"obs max: {obs_max}") logging.info(f"action min: {action_min}") logging.info(f"action max: {action_max}") - # deal with images - if args.cameras is not None: - img_shapes = [] - img_names = [] # not necessary but keep old implementation - for camera in args.cameras: - if f"{camera}_image" in f["data/demo_0/obs"]: - img_shape = f["data/demo_0/obs/{}_image".format(camera)].shape[1:] - img_shapes.append(img_shape) - img_names.append(f"{camera}_image") - # ensure all images have the same height and width - assert all( - [ - img_shape[0] == img_shapes[0][0] - and img_shape[1] == img_shapes[0][1] - for img_shape in img_shapes - ] - ) - combined_img_shape = ( - img_shapes[0][0], - img_shapes[0][1], - sum([img_shape[2] for img_shape in img_shapes]), - ) - logging.info(f"Image shapes: {img_shapes}") - - # split indices in train and val + # Split indices into train and validation sets num_traj = len(traj_lengths) num_train = int(num_traj * (1 - val_split)) train_indices = random.sample(range(num_traj), k=num_train) - # do over all indices - out_train = {} - keys = [ - "observations", - "actions", - "rewards", - ] - if args.cameras is not None: - keys.append("images") - out_train["observations"] = np.empty((0, max_traj_length, obs_dim)) - out_train["actions"] = np.empty((0, max_traj_length, action_dim)) - out_train["rewards"] = np.empty((0, max_traj_length)) - out_train["traj_length"] = [] - if args.cameras is not None: - out_train["images"] = np.empty( - ( - 0, - max_traj_length, - *combined_img_shape, - ), - dtype=np.uint8, - ) + # Initialize output dictionaries for train and val sets + out_train = {"states": [], "actions": [], 
"rewards": [], "traj_lengths": []} out_val = deepcopy(out_train) - train_episode_reward_all = [] - val_episode_reward_all = [] + + # Process each demo for i in tqdm(range(len(demos))): ep = demos[i] - if i in train_indices: - out = out_train - else: - out = out_val + out = out_train if i in train_indices else out_val - # get episode length + # Get trajectory data traj_length = f[f"data/{ep}"].attrs["num_samples"] - out["traj_length"].append(traj_length) - # print("Episode:", i, "Trajectory length:", traj_length) + out["traj_lengths"].append(traj_length) - # extract raw_actions = f[f"data/{ep}/actions"][()] rewards = f[f"data/{ep}/rewards"][()] raw_obs = np.hstack( @@ -242,9 +189,9 @@ def make_dataset( f[f"data/{ep}/obs/{low_dim_obs_name}"][()] for low_dim_obs_name in low_dim_obs_names ] - ) # not normalized + ) - # scale to [-1, 1] for both ob and action + # Normalize if specified if normalize: obs = 2 * (raw_obs - obs_min) / (obs_max - obs_min + 1e-6) - 1 actions = ( @@ -255,128 +202,60 @@ def make_dataset( obs = raw_obs actions = raw_actions - data_traj = { - "observations": obs, - "actions": actions, - "rewards": rewards, - } - if args.cameras is not None: # no normalization - data_traj["images"] = np.concatenate( - ( - [ - f["data/{}/obs/{}".format(ep, img_name)][()] - for img_name in img_names - ] - ), - axis=-1, - ) + # Store trajectories in output dictionary + out["states"].append(obs) + out["actions"].append(actions) + out["rewards"].append(rewards) - # apply padding to make all episodes have the same max steps - # later when we load this dataset, we will use the traj_length to slice the data - for key in keys: - traj = data_traj[key] - if traj.ndim == 1: - pad_width = (0, max_traj_length - len(traj)) - elif traj.ndim == 2: - pad_width = ((0, max_traj_length - traj.shape[0]), (0, 0)) - elif traj.ndim == 4: - pad_width = ( - (0, max_traj_length - traj.shape[0]), - (0, 0), - (0, 0), - (0, 0), - ) - else: - raise ValueError("Unsupported dimension") - traj = 
np.pad( - traj, - pad_width, - mode="constant", - constant_values=0, - ) - out[key] = np.vstack((out[key], traj[None])) + # Concatenate trajectories (no padding) + for key in ["states", "actions", "rewards"]: + out_train[key] = np.concatenate(out_train[key], axis=0) - # check reward - if i in train_indices: - train_episode_reward_all.append(np.sum(data_traj["rewards"])) - else: - val_episode_reward_all.append(np.sum(data_traj["rewards"])) + # Only concatenate validation set if it exists + if val_split > 0: + out_val[key] = np.concatenate(out_val[key], axis=0) - # Save to np file - save_train_path = os.path.join(save_dir, save_name_prefix + "train.npz") - save_val_path = os.path.join(save_dir, save_name_prefix + "val.npz") - with open(save_train_path, "wb") as f: - pickle.dump(out_train, f) - with open(save_val_path, "wb") as f: - pickle.dump(out_val, f) - if normalize: - normalization_save_path = os.path.join( - save_dir, save_name_prefix + "normalization.npz" - ) - np.savez( - normalization_save_path, - obs_min=obs_min, - obs_max=obs_max, - action_min=action_min, - action_max=action_max, + # Save datasets as npz files + train_save_path = os.path.join(save_dir, save_name_prefix + "train.npz") + np.savez_compressed( + train_save_path, + states=np.array(out_train["states"]), + actions=np.array(out_train["actions"]), + rewards=np.array(out_train["rewards"]), + terminals=np.array([False] * len(out_train["states"])), + traj_lengths=np.array(out_train["traj_lengths"]), ) - # debug - logging.info("\n========== Final ===========") - logging.info( - f"Train - Number of episodes and transitions: {len(out_train['traj_length'])}, {np.sum(out_train['traj_length'])}" - ) - logging.info( - f"Val - Number of episodes and transitions: {len(out_val['traj_length'])}, {np.sum(out_val['traj_length'])}" - ) - logging.info( - f"Train - Mean/Std trajectory length: {np.mean(out_train['traj_length'])}, {np.std(out_train['traj_length'])}" - ) - logging.info( - f"Train - Max/Min trajectory 
length: {np.max(out_train['traj_length'])}, {np.min(out_train['traj_length'])}" - ) - logging.info( - f"Train - Mean/Std episode reward: {np.mean(train_episode_reward_all)}, {np.std(train_episode_reward_all)}" - ) - if val_split > 0: - logging.info( - f"Val - Mean/Std trajectory length: {np.mean(out_val['traj_length'])}, {np.std(out_val['traj_length'])}" + val_save_path = os.path.join(save_dir, save_name_prefix + "val.npz") + np.savez_compressed( + val_save_path, + states=np.array(out_val["states"]), + actions=np.array(out_val["actions"]), + rewards=np.array(out_val["rewards"]), + terminals=np.array([False] * len(out_val["states"])), + traj_lengths=np.array(out_val["traj_lengths"]), ) - logging.info( - f"Val - Max/Min trajectory length: {np.max(out_val['traj_length'])}, {np.min(out_val['traj_length'])}" - ) - logging.info( - f"Val - Mean/Std episode reward: {np.mean(val_episode_reward_all)}, {np.std(val_episode_reward_all)}" - ) - for obs_dim_ind in range(obs_dim): - obs = out_train["observations"][:, :, obs_dim_ind] - logging.info( - f"Train - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}" - ) - for action_dim_ind in range(action_dim): - action = out_train["actions"][:, :, action_dim_ind] - logging.info( - f"Train - Action dim {action_dim_ind+1} mean {np.mean(action)} std {np.std(action)} min {np.min(action)} max {np.max(action)}" - ) - if val_split > 0: - for obs_dim_ind in range(obs_dim): - obs = out_val["observations"][:, :, obs_dim_ind] - logging.info( - f"Val - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}" + + # Save normalization stats if required + if normalize: + normalization_save_path = os.path.join( + save_dir, save_name_prefix + "normalization.npz" ) - for action_dim_ind in range(action_dim): - action = out_val["actions"][:, :, action_dim_ind] - logging.info( - f"Val - Action dim {action_dim_ind+1} mean {np.mean(action)} std {np.std(action)} min 
{np.min(action)} max {np.max(action)}" + np.savez_compressed( + normalization_save_path, + obs_min=obs_min, + obs_max=obs_max, + action_min=action_min, + action_max=action_max, ) - # logging.info("Train - Observation shape:", out_train["observations"].shape) - # logging.info("Train - Action shape:", out_train["actions"].shape) - # logging.info("Train - Reward shape:", out_train["rewards"].shape) - # logging.info("Val - Observation shape:", out_val["observations"].shape) - # logging.info("Val - Action shape:", out_val["actions"].shape) - # logging.info("Val - Reward shape:", out_val["rewards"].shape) - # if use_img: - # logging.info("Image shapes:", img_shapes) + + # Logging final information + logging.info( + f"Train - Trajectories: {len(out_train['traj_lengths'])}, Transitions: {np.sum(out_train['traj_lengths'])}" + ) + logging.info( + f"Val - Trajectories: {len(out_val['traj_lengths'])}, Transitions: {np.sum(out_val['traj_lengths'])}" + ) if __name__ == "__main__": @@ -386,7 +265,7 @@ if __name__ == "__main__": parser.add_argument("--load_path", type=str, default=".") parser.add_argument("--save_dir", type=str, default=".") parser.add_argument("--save_name_prefix", type=str, default="") - parser.add_argument("--val_split", type=float, default="0.2") + parser.add_argument("--val_split", type=float, default="0") parser.add_argument("--max_episodes", type=int, default="-1") parser.add_argument("--normalize", action="store_true") parser.add_argument("--cameras", nargs="*", default=None) @@ -394,9 +273,6 @@ if __name__ == "__main__": import datetime - if args.max_episodes > 0: - args.save_name_prefix += f"max_episodes_{args.max_episodes}_" - os.makedirs(args.save_dir, exist_ok=True) log_path = os.path.join( args.save_dir,