From dc8e0c9edce7ac2b2ff112abe460e1c21b0b3bdc Mon Sep 17 00:00:00 2001 From: "Allen Z. Ren" Date: Wed, 30 Oct 2024 19:58:06 -0400 Subject: [PATCH] v0.6 (#18) * Sampling over both env and denoising steps in DPPO updates (#13) * sample one from each chain * full random sampling * Add Proficient Human (PH) Configs and Pipeline (#16) * fix missing cfg * add ph config * fix how terminated flags are added to buffer in ibrl * add ph config * offline calql for 1M gradient updates * bug fix: number of calql online gradient steps is the number of new transitions collected * add sample config for DPPO with ta=1 * Sampling over both env and denoising steps in DPPO updates (#13) * sample one from each chain * full random sampling * fix diffusion loss when predicting initial noise * fix dppo inds * fix typo * remove print statement --------- Co-authored-by: Justin M. Lidard Co-authored-by: allenzren * update robomimic configs * better calql formulation * optimize calql and ibrl training * optimize data transfer in ppo agents * add kitchen configs * re-organize config folders, rerun calql and rlpd * add scratch gym locomotion configs * add kitchen installation dependencies * use truncated for termination in furniture env * update furniture and gym configs * update README and dependencies with kitchen * add url for new data and checkpoints * update demo RL configs * update batch sizes for furniture unet configs * raise error about dropout in residual mlp * fix observation bug in bc loss --------- Co-authored-by: Justin Lidard <60638575+jlidard@users.noreply.github.com> Co-authored-by: Justin M. 
Lidard --- README.md | 8 +- agent/finetune/train_calql_agent.py | 16 +- agent/finetune/train_ibrl_agent.py | 14 +- agent/finetune/train_ppo_diffusion_agent.py | 52 +++--- .../finetune/train_ppo_diffusion_img_agent.py | 36 ++-- .../train_ppo_exact_diffusion_agent.py | 13 +- agent/finetune/train_ppo_gaussian_agent.py | 17 +- .../finetune/train_ppo_gaussian_img_agent.py | 15 +- .../lamp_low/ft_ppo_diffusion_unet.yaml | 2 +- .../lamp_med/ft_ppo_diffusion_unet.yaml | 2 +- .../one_leg_low/ft_ppo_diffusion_mlp.yaml | 2 +- .../one_leg_low/ft_ppo_diffusion_unet.yaml | 2 +- .../one_leg_med/ft_ppo_diffusion_unet.yaml | 2 +- .../ft_ppo_diffusion_unet.yaml | 2 +- .../ft_ppo_diffusion_unet.yaml | 2 +- .../eval/kitchen-v0/eval_diffusion_mlp.yaml | 61 +++++++ .../halfcheetah-v2/calql_mlp_online.yaml | 4 +- .../halfcheetah-v2/ft_ppo_diffusion_mlp.yaml | 2 +- .../ft_ppo_diffusion_mlp_ta1.yaml | 108 ++++++++++++ cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml | 4 +- .../hopper-v2/ft_ppo_diffusion_mlp.yaml | 2 +- cfg/gym/finetune/hopper-v2/sac_mlp.yaml | 89 ---------- .../kitchen-complete-v0/calql_mlp_online.yaml | 116 +++++++++++++ .../ft_ppo_diffusion_mlp.yaml | 108 ++++++++++++ .../kitchen-complete-v0/ibrl_mlp.yaml | 109 ++++++++++++ .../calql_mlp_online.yaml | 25 ++- .../ft_ppo_diffusion_mlp.yaml | 108 ++++++++++++ .../ibrl_mlp.yaml | 41 ++--- .../kitchen-partial-v0/calql_mlp_online.yaml | 116 +++++++++++++ .../ft_ppo_diffusion_mlp.yaml | 108 ++++++++++++ .../finetune/kitchen-partial-v0/ibrl_mlp.yaml | 109 ++++++++++++ .../walker2d-v2/ft_ppo_diffusion_mlp.yaml | 2 +- cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml | 103 ------------ .../calql_mlp_offline.yaml | 2 +- .../calql_mlp_offline.yaml | 113 +++++++++++++ .../pre_diffusion_mlp.yaml | 66 ++++++++ .../kitchen-complete-v0/pre_gaussian_mlp.yaml | 60 +++++++ .../calql_mlp_offline.yaml | 30 ++-- .../kitchen-mixed-v0/pre_diffusion_mlp.yaml | 66 ++++++++ .../kitchen-mixed-v0/pre_gaussian_mlp.yaml | 59 +++++++ 
.../kitchen-partial-v0/calql_mlp_offline.yaml | 113 +++++++++++++ .../kitchen-partial-v0/pre_diffusion_mlp.yaml | 66 ++++++++ .../kitchen-partial-v0/pre_gaussian_mlp.yaml | 59 +++++++ .../halfcheetah-v2/ppo_diffusion_mlp.yaml | 6 +- .../halfcheetah-v2/ppo_gaussian_mlp.yaml | 2 +- .../halfcheetah-v2/rlpd_mlp.yaml | 2 +- .../halfcheetah-v2/sac_mlp.yaml | 2 +- .../scratch/hopper-v2/awr_diffusion_mlp.yaml | 99 +++++++++++ .../scratch/hopper-v2/dipo_diffusion_mlp.yaml | 101 +++++++++++ .../scratch/hopper-v2/dql_diffusion_mlp.yaml | 100 +++++++++++ .../scratch/hopper-v2/idql_diffusion_mlp.yaml | 108 ++++++++++++ .../hopper-v2/ppo_diffusion_mlp.yaml | 12 +- .../hopper-v2/ppo_gaussian_mlp.yaml | 2 +- .../scratch/hopper-v2/qsm_diffusion_mlp.yaml | 100 +++++++++++ .../scratch/hopper-v2/rwr_diffusion_mlp.yaml | 84 +++++++++ .../scratch/kitchen-complete-v0/rlpd_mlp.yaml | 109 ++++++++++++ .../kitchen-mixed-v0}/rlpd_mlp.yaml | 34 ++-- .../scratch/kitchen-partial-v0/rlpd_mlp.yaml | 109 ++++++++++++ .../walker2d-v2/ppo_diffusion_mlp.yaml | 6 +- .../walker2d-v2/ppo_gaussian_mlp.yaml | 2 +- .../finetune/can/calql_mlp_online.yaml | 4 +- .../finetune/can/calql_mlp_online_ph.yaml | 122 ++++++++++++++ .../finetune/can/ft_awr_diffusion_mlp.yaml | 4 +- .../finetune/can/ft_dql_diffusion_mlp.yaml | 2 +- .../finetune/can/ft_idql_diffusion_mlp.yaml | 2 +- .../finetune/can/ft_ppo_diffusion_mlp.yaml | 6 +- .../can/ft_ppo_diffusion_mlp_img.yaml | 16 +- .../can/ft_ppo_diffusion_mlp_ta1.yaml | 111 ++++++++++++ .../can/ft_ppo_diffusion_mlp_ta1_ph.yaml | 111 ++++++++++++ .../finetune/can/ft_qsm_diffusion_mlp.yaml | 2 +- .../finetune/can/ft_rwr_diffusion_mlp.yaml | 4 +- cfg/robomimic/finetune/can/ibrl_mlp.yaml | 4 +- cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml | 115 +++++++++++++ .../finetune/lift/ft_awr_diffusion_mlp.yaml | 2 +- .../finetune/lift/ft_dql_diffusion_mlp.yaml | 2 +- .../finetune/lift/ft_idql_diffusion_mlp.yaml | 2 +- .../finetune/lift/ft_ppo_diffusion_mlp.yaml | 10 +- 
.../lift/ft_ppo_diffusion_mlp_img.yaml | 16 +- .../finetune/lift/ft_qsm_diffusion_mlp.yaml | 2 +- .../finetune/lift/ft_rwr_diffusion_mlp.yaml | 2 +- .../finetune/square/calql_mlp_online.yaml | 4 +- .../finetune/square/calql_mlp_online_ph.yaml | 122 ++++++++++++++ .../finetune/square/ft_awr_diffusion_mlp.yaml | 2 +- .../finetune/square/ft_dql_diffusion_mlp.yaml | 2 +- .../square/ft_idql_diffusion_mlp.yaml | 2 +- .../finetune/square/ft_ppo_diffusion_mlp.yaml | 8 +- .../square/ft_ppo_diffusion_mlp_img.yaml | 12 +- .../square/ft_ppo_diffusion_mlp_ta1.yaml | 112 ++++++++++++ .../square/ft_ppo_diffusion_mlp_ta1_ph.yaml | 112 ++++++++++++ .../finetune/square/ft_qsm_diffusion_mlp.yaml | 2 +- .../finetune/square/ft_rwr_diffusion_mlp.yaml | 2 +- cfg/robomimic/finetune/square/ibrl_mlp.yaml | 4 +- .../finetune/square/ibrl_mlp_ph.yaml | 115 +++++++++++++ .../transport/ft_awr_diffusion_mlp.yaml | 8 +- .../transport/ft_dipo_diffusion_mlp.yaml | 12 +- .../transport/ft_dql_diffusion_mlp.yaml | 10 +- .../transport/ft_idql_diffusion_mlp.yaml | 6 +- .../transport/ft_ppo_diffusion_mlp.yaml | 12 +- .../transport/ft_ppo_diffusion_mlp_img.yaml | 18 +- .../transport/ft_qsm_diffusion_mlp.yaml | 6 +- .../transport/ft_rwr_diffusion_mlp.yaml | 6 +- .../pretrain/can/calql_mlp_offline.yaml | 10 +- .../pretrain/can/calql_mlp_offline_ph.yaml | 118 +++++++++++++ .../pretrain/can/pre_diffusion_mlp_ta1.yaml | 65 +++++++ .../can/pre_diffusion_mlp_ta1_ph.yaml | 65 +++++++ .../pre_gaussian_mlp_ta1_ph.yaml} | 14 +- .../pretrain/square/calql_mlp_offline.yaml | 10 +- .../pretrain/square/calql_mlp_offline_ph.yaml | 118 +++++++++++++ .../square/pre_diffusion_mlp_ta1.yaml | 66 ++++++++ .../square/pre_diffusion_mlp_ta1_ph.yaml | 66 ++++++++ .../pre_gaussian_mlp_ta1_ph.yaml} | 13 +- .../{finetune => scratch}/can/rlpd_mlp.yaml | 2 +- cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml | 114 +++++++++++++ .../square/rlpd_mlp.yaml | 2 +- cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml | 114 +++++++++++++ 
env/gym_utils/__init__.py | 4 +- env/gym_utils/wrapper/furniture.py | 5 +- model/common/mlp.py | 5 + model/common/mlp_gaussian.py | 5 +- model/diffusion/diffusion_ppo.py | 37 ++-- model/diffusion/diffusion_vpg.py | 65 +++++++ model/rl/gaussian_calql.py | 35 ++-- model/rl/gaussian_ibrl.py | 2 +- model/rl/gaussian_ppo.py | 36 ++-- pyproject.toml | 9 +- script/download_url.py | 159 +++++++++++++++++- 126 files changed, 4614 insertions(+), 553 deletions(-) create mode 100644 cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml create mode 100644 cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml delete mode 100644 cfg/gym/finetune/hopper-v2/sac_mlp.yaml create mode 100644 cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml create mode 100644 cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml create mode 100644 cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml rename cfg/gym/finetune/{hopper-v2 => kitchen-mixed-v0}/calql_mlp_online.yaml (86%) create mode 100644 cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml rename cfg/gym/finetune/{hopper-v2 => kitchen-mixed-v0}/ibrl_mlp.yaml (78%) create mode 100644 cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml create mode 100644 cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml create mode 100644 cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml delete mode 100644 cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml create mode 100644 cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml create mode 100644 cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml create mode 100644 cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml rename cfg/gym/pretrain/{hopper-medium-v2 => kitchen-mixed-v0}/calql_mlp_offline.yaml (85%) create mode 100644 cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml create mode 100644 cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml create mode 100644 cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml create mode 
100644 cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml create mode 100644 cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml rename cfg/gym/{finetune => scratch}/halfcheetah-v2/ppo_diffusion_mlp.yaml (96%) rename cfg/gym/{finetune => scratch}/halfcheetah-v2/ppo_gaussian_mlp.yaml (97%) rename cfg/gym/{finetune => scratch}/halfcheetah-v2/rlpd_mlp.yaml (98%) rename cfg/gym/{finetune => scratch}/halfcheetah-v2/sac_mlp.yaml (98%) create mode 100644 cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml create mode 100644 cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml create mode 100644 cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml create mode 100644 cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml rename cfg/gym/{finetune => scratch}/hopper-v2/ppo_diffusion_mlp.yaml (95%) rename cfg/gym/{finetune => scratch}/hopper-v2/ppo_gaussian_mlp.yaml (97%) create mode 100644 cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml create mode 100644 cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml create mode 100644 cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml rename cfg/gym/{finetune/hopper-v2 => scratch/kitchen-mixed-v0}/rlpd_mlp.yaml (83%) create mode 100644 cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml rename cfg/gym/{finetune => scratch}/walker2d-v2/ppo_diffusion_mlp.yaml (96%) rename cfg/gym/{finetune => scratch}/walker2d-v2/ppo_gaussian_mlp.yaml (97%) create mode 100644 cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml create mode 100644 cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml create mode 100644 cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml create mode 100644 cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml create mode 100644 cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml create mode 100644 cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml create mode 100644 cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml create mode 100644 cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml create mode 100644 
cfg/robomimic/pretrain/can/calql_mlp_offline_ph.yaml create mode 100644 cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1.yaml create mode 100644 cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1_ph.yaml rename cfg/robomimic/pretrain/{transport/pre_gaussian_mlp_ibrl.yaml => can/pre_gaussian_mlp_ta1_ph.yaml} (85%) create mode 100644 cfg/robomimic/pretrain/square/calql_mlp_offline_ph.yaml create mode 100644 cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1.yaml create mode 100644 cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1_ph.yaml rename cfg/robomimic/pretrain/{lift/pre_gaussian_mlp_ibrl.yaml => square/pre_gaussian_mlp_ta1_ph.yaml} (84%) rename cfg/robomimic/{finetune => scratch}/can/rlpd_mlp.yaml (98%) create mode 100644 cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml rename cfg/robomimic/{finetune => scratch}/square/rlpd_mlp.yaml (98%) create mode 100644 cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml diff --git a/README.md b/README.md index f4f3428..2d64563 100644 --- a/README.md +++ b/README.md @@ -31,12 +31,11 @@ conda activate dppo pip install -e . ``` -3. Install specific environment dependencies (Gym / Robomimic / D3IL / Furniture-Bench) or all dependencies +3. Install specific environment dependencies (Gym / Kitchen / Robomimic / D3IL / Furniture-Bench) or all dependencies (except for Kitchen, which has dependency conflicts with other tasks). ```console -pip install -e .[gym] # or [robomimic], [d3il], [furniture] -pip install -e .[all] +pip install -e .[gym] # or [kitchen], [robomimic], [d3il], [furniture] +pip install -e .[all] # except for Kitchen ``` - 4. [Install MuJoCo for Gym and/or Robomimic](installation/install_mujoco.md). [Install D3IL](installation/install_d3il.md). 
[Install IsaacGym and Furniture-Bench](installation/install_furniture.md) @@ -161,6 +160,7 @@ Our diffusion implementation is mostly based on [Diffuser](https://github.com/ja * `model.min_sampling_denoising_std`: , minimum amount of noise when sampling at a denoising step * `model.min_logprob_denoising_std`: , minimum standard deviation when evaluating likelihood at a denoising step * `model.clip_ploss_coef`: PPO clipping ratio +* `train.batch_size`: you may notice the batch size is rather large --- this is due to the PPO update being in expectation over both environment steps and denoising steps (new in v0.6). ### DDIM fine-tuning diff --git a/agent/finetune/train_calql_agent.py b/agent/finetune/train_calql_agent.py index cd96d0b..c71f488 100644 --- a/agent/finetune/train_calql_agent.py +++ b/agent/finetune/train_calql_agent.py @@ -82,8 +82,6 @@ class TrainCalQLAgent(TrainAgent): if self.train_online: # number of episode to colect per epoch for training self.n_episode_per_epoch = cfg.train.n_episode_per_epoch - # UTD ratio - self.online_utd_ratio = cfg.train.online_utd_ratio # Eval episodes self.n_eval_episode = cfg.train.n_eval_episode @@ -204,9 +202,13 @@ class TrainCalQLAgent(TrainAgent): action_venv = samples[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( - self.venv.step(action_venv) - ) + ( + obs_venv, + reward_venv, + terminated_venv, + truncated_venv, + info_venv, + ) = self.venv.step(action_venv) done_venv = terminated_venv | truncated_venv reward_trajs[step] = reward_venv firsts_trajs[step + 1] = done_venv @@ -308,7 +310,8 @@ class TrainCalQLAgent(TrainAgent): # override num_update if self.train_online: - num_update = len(reward_trajs) # assume one env! 
+ # the amount of new transitions(single env) + num_update = len(reward_trajs_split[0]) else: num_update = self.num_update for _ in range(num_update): @@ -413,7 +416,6 @@ class TrainCalQLAgent(TrainAgent): reward_to_go_b, terminated_b, self.gamma, - alpha, ) self.critic_optimizer.zero_grad() loss_critic.backward() diff --git a/agent/finetune/train_ibrl_agent.py b/agent/finetune/train_ibrl_agent.py index 0f9a06d..6de7d77 100644 --- a/agent/finetune/train_ibrl_agent.py +++ b/agent/finetune/train_ibrl_agent.py @@ -145,7 +145,6 @@ class TrainIBRLAgent(TrainAgent): # Collect a set of trajectories from env cnt_episode = 0 for step in range(n_steps): - # Select action with torch.no_grad(): cond = { @@ -164,9 +163,13 @@ class TrainIBRLAgent(TrainAgent): action_venv = samples[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( - self.venv.step(action_venv) - ) + ( + obs_venv, + reward_venv, + terminated_venv, + truncated_venv, + info_venv, + ) = self.venv.step(action_venv) done_venv = terminated_venv | truncated_venv reward_trajs[step] = reward_venv firsts_trajs[step + 1] = done_venv @@ -177,14 +180,13 @@ class TrainIBRLAgent(TrainAgent): obs_buffer.append(prev_obs_venv["state"][i]) if "final_obs" in info_venv[i]: # truncated next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) - terminated_venv[i] = False else: # first obs in new episode next_obs_buffer.append(obs_venv["state"][i]) action_buffer.append(action_venv[i]) reward_buffer.extend( (reward_venv * self.scale_reward_factor).tolist() ) - terminated_buffer.append(terminated_venv.tolist()) + terminated_buffer.extend(terminated_venv.tolist()) # update for next step prev_obs_venv = obs_venv diff --git a/agent/finetune/train_ppo_diffusion_agent.py b/agent/finetune/train_ppo_diffusion_agent.py index ee073b3..998c638 100644 --- a/agent/finetune/train_ppo_diffusion_agent.py +++ b/agent/finetune/train_ppo_diffusion_agent.py @@ -19,7 +19,6 @@ from 
util.scheduler import CosineAnnealingWarmupRestarts class TrainPPODiffusionAgent(TrainPPOAgent): - def __init__(self, cfg): super().__init__(cfg) @@ -46,7 +45,6 @@ class TrainPPODiffusionAgent(TrainPPOAgent): ) def run(self): - # Start training loop timer = Timer() run_results = [] @@ -54,7 +52,6 @@ class TrainPPODiffusionAgent(TrainPPOAgent): last_itr_eval = False done_venv = np.zeros((1, self.n_envs)) while self.itr < self.n_train_itr: - # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env options_venv = [{} for _ in range(self.n_envs)] if self.itr % self.render_freq == 0 and self.render_video: @@ -126,9 +123,13 @@ class TrainPPODiffusionAgent(TrainPPOAgent): action_venv = output_venv[:, : self.act_steps] # Apply multi-step action - obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = ( - self.venv.step(action_venv) - ) + ( + obs_venv, + reward_venv, + terminated_venv, + truncated_venv, + info_venv, + ) = self.venv.step(action_venv) done_venv = terminated_venv | truncated_venv if self.save_full_observations: # state-only obs_full_venv = np.array( @@ -285,40 +286,45 @@ class TrainPPODiffusionAgent(TrainPPOAgent): ) } chains_k = einops.rearrange( - torch.tensor(chains_trajs).float().to(self.device), + torch.tensor(chains_trajs, device=self.device).float(), "s e t h d -> (s e) t h d", ) returns_k = ( - torch.tensor(returns_trajs).float().to(self.device).reshape(-1) + torch.tensor(returns_trajs, device=self.device).float().reshape(-1) ) values_k = ( - torch.tensor(values_trajs).float().to(self.device).reshape(-1) + torch.tensor(values_trajs, device=self.device).float().reshape(-1) ) advantages_k = ( - torch.tensor(advantages_trajs).float().to(self.device).reshape(-1) + torch.tensor(advantages_trajs, device=self.device) + .float() + .reshape(-1) ) - logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device) + logprobs_k = 
torch.tensor(logprobs_trajs, device=self.device).float() # Update policy and critic - total_steps = self.n_steps * self.n_envs - inds_k = np.arange(total_steps) + total_steps = self.n_steps * self.n_envs * self.model.ft_denoising_steps clipfracs = [] for update_epoch in range(self.update_epochs): - # for each epoch, go through all data in batches flag_break = False - np.random.shuffle(inds_k) + inds_k = torch.randperm(total_steps, device=self.device) num_batch = max(1, total_steps // self.batch_size) # skip last ones for batch in range(num_batch): start = batch * self.batch_size end = start + self.batch_size inds_b = inds_k[start:end] # b for batch - obs_b = {"state": obs_k["state"][inds_b]} - chains_b = chains_k[inds_b] - returns_b = returns_k[inds_b] - values_b = values_k[inds_b] - advantages_b = advantages_k[inds_b] - logprobs_b = logprobs_k[inds_b] + batch_inds_b, denoising_inds_b = torch.unravel_index( + inds_b, + (self.n_steps * self.n_envs, self.model.ft_denoising_steps), + ) + obs_b = {"state": obs_k["state"][batch_inds_b]} + chains_prev_b = chains_k[batch_inds_b, denoising_inds_b] + chains_next_b = chains_k[batch_inds_b, denoising_inds_b + 1] + returns_b = returns_k[batch_inds_b] + values_b = values_k[batch_inds_b] + advantages_b = advantages_k[batch_inds_b] + logprobs_b = logprobs_k[batch_inds_b, denoising_inds_b] # get loss ( @@ -332,7 +338,9 @@ class TrainPPODiffusionAgent(TrainPPOAgent): eta, ) = self.model.loss( obs_b, - chains_b, + chains_prev_b, + chains_next_b, + denoising_inds_b, returns_b, values_b, advantages_b, diff --git a/agent/finetune/train_ppo_diffusion_img_agent.py b/agent/finetune/train_ppo_diffusion_img_agent.py index 9d47b0d..9eb6696 100644 --- a/agent/finetune/train_ppo_diffusion_img_agent.py +++ b/agent/finetune/train_ppo_diffusion_img_agent.py @@ -283,40 +283,44 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): for k in obs_trajs } chains_k = einops.rearrange( - torch.tensor(chains_trajs).float().to(self.device), + 
torch.tensor(chains_trajs, device=self.device).float(), "s e t h d -> (s e) t h d", ) returns_k = ( - torch.tensor(returns_trajs).float().to(self.device).reshape(-1) + torch.tensor(returns_trajs, device=self.device).float().reshape(-1) ) values_k = ( - torch.tensor(values_trajs).float().to(self.device).reshape(-1) + torch.tensor(values_trajs, device=self.device).float().reshape(-1) ) advantages_k = ( - torch.tensor(advantages_trajs).float().to(self.device).reshape(-1) + torch.tensor(advantages_trajs, device=self.device).float().reshape(-1) ) - logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device) + logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float() # Update policy and critic - total_steps = self.n_steps * self.n_envs - inds_k = np.arange(total_steps) + total_steps = self.n_steps * self.n_envs * self.model.ft_denoising_steps clipfracs = [] for update_epoch in range(self.update_epochs): # for each epoch, go through all data in batches flag_break = False - np.random.shuffle(inds_k) + inds_k = torch.randperm(total_steps, device=self.device) num_batch = max(1, total_steps // self.batch_size) # skip last ones for batch in range(num_batch): start = batch * self.batch_size end = start + self.batch_size inds_b = inds_k[start:end] # b for batch - obs_b = {k: obs_k[k][inds_b] for k in obs_k} - chains_b = chains_k[inds_b] - returns_b = returns_k[inds_b] - values_b = values_k[inds_b] - advantages_b = advantages_k[inds_b] - logprobs_b = logprobs_k[inds_b] + batch_inds_b, denoising_inds_b = torch.unravel_index( + inds_b, + (self.n_steps * self.n_envs, self.model.ft_denoising_steps), + ) + obs_b = {k: obs_k[k][batch_inds_b] for k in obs_k} + chains_prev_b = chains_k[batch_inds_b, denoising_inds_b] + chains_next_b = chains_k[batch_inds_b, denoising_inds_b + 1] + returns_b = returns_k[batch_inds_b] + values_b = values_k[batch_inds_b] + advantages_b = advantages_k[batch_inds_b] + logprobs_b = logprobs_k[batch_inds_b, denoising_inds_b] # get loss ( @@ 
-330,7 +334,9 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): eta, ) = self.model.loss( obs_b, - chains_b, + chains_prev_b, + chains_next_b, + denoising_inds_b, returns_b, values_b, advantages_b, diff --git a/agent/finetune/train_ppo_exact_diffusion_agent.py b/agent/finetune/train_ppo_exact_diffusion_agent.py index 920b03f..6fa425f 100644 --- a/agent/finetune/train_ppo_exact_diffusion_agent.py +++ b/agent/finetune/train_ppo_exact_diffusion_agent.py @@ -249,29 +249,28 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): ) } samples_k = einops.rearrange( - torch.tensor(samples_trajs).float().to(self.device), + torch.tensor(samples_trajs, device=self.device).float(), "s e h d -> (s e) h d", ) returns_k = ( - torch.tensor(returns_trajs).float().to(self.device).reshape(-1) + torch.tensor(returns_trajs, device=self.device).float().reshape(-1) ) values_k = ( - torch.tensor(values_trajs).float().to(self.device).reshape(-1) + torch.tensor(values_trajs, device=self.device).float().reshape(-1) ) advantages_k = ( - torch.tensor(advantages_trajs).float().to(self.device).reshape(-1) + torch.tensor(advantages_trajs, device=self.device).float().reshape(-1) ) - logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device) + logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float() # Update policy and critic total_steps = self.n_steps * self.n_envs - inds_k = np.arange(total_steps) clipfracs = [] for update_epoch in range(self.update_epochs): # for each epoch, go through all data in batches flag_break = False - np.random.shuffle(inds_k) + inds_k = torch.randperm(total_steps, device=self.device) num_batch = max(1, total_steps // self.batch_size) # skip last ones for batch in range(num_batch): start = batch * self.batch_size diff --git a/agent/finetune/train_ppo_gaussian_agent.py b/agent/finetune/train_ppo_gaussian_agent.py index 2ad38bd..be1d754 100644 --- a/agent/finetune/train_ppo_gaussian_agent.py +++ 
b/agent/finetune/train_ppo_gaussian_agent.py @@ -210,7 +210,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent): ) reward_trajs = reward_trajs_transpose.T - # bootstrap value with GAE if not done - apply reward scaling with constant if specified + # bootstrap value with GAE if not terminal - apply reward scaling with constant if specified obs_venv_ts = { "state": torch.from_numpy(obs_venv["state"]) .float() @@ -250,31 +250,28 @@ class TrainPPOGaussianAgent(TrainPPOAgent): ) } samples_k = einops.rearrange( - torch.tensor(samples_trajs).float().to(self.device), + torch.tensor(samples_trajs, device=self.device).float(), "s e h d -> (s e) h d", ) returns_k = ( - torch.tensor(returns_trajs).float().to(self.device).reshape(-1) + torch.tensor(returns_trajs, device=self.device).float().reshape(-1) ) values_k = ( - torch.tensor(values_trajs).float().to(self.device).reshape(-1) + torch.tensor(values_trajs, device=self.device).float().reshape(-1) ) advantages_k = ( - torch.tensor(advantages_trajs).float().to(self.device).reshape(-1) - ) - logprobs_k = ( - torch.tensor(logprobs_trajs).float().to(self.device).reshape(-1) + torch.tensor(advantages_trajs, device=self.device).float().reshape(-1) ) + logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float() # Update policy and critic total_steps = self.n_steps * self.n_envs - inds_k = np.arange(total_steps) clipfracs = [] for update_epoch in range(self.update_epochs): # for each epoch, go through all data in batches flag_break = False - np.random.shuffle(inds_k) + inds_k = torch.randperm(total_steps, device=self.device) num_batch = max(1, total_steps // self.batch_size) # skip last ones for batch in range(num_batch): start = batch * self.batch_size diff --git a/agent/finetune/train_ppo_gaussian_img_agent.py b/agent/finetune/train_ppo_gaussian_img_agent.py index de1dbb9..3c404ed 100644 --- a/agent/finetune/train_ppo_gaussian_img_agent.py +++ b/agent/finetune/train_ppo_gaussian_img_agent.py @@ -231,7 +231,7 @@ class 
TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): ) reward_trajs = reward_trajs_transpose.T - # bootstrap value with GAE if not done - apply reward scaling with constant if specified + # bootstrap value with GAE if not terminal - apply reward scaling with constant if specified obs_venv_ts = { key: torch.from_numpy(obs_venv[key]).float().to(self.device) for key in self.obs_dims @@ -271,29 +271,28 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): for k in obs_trajs } samples_k = einops.rearrange( - torch.tensor(samples_trajs).float().to(self.device), + torch.tensor(samples_trajs, device=self.device).float(), "s e h d -> (s e) h d", ) returns_k = ( - torch.tensor(returns_trajs).float().to(self.device).reshape(-1) + torch.tensor(returns_trajs, device=self.device).float().reshape(-1) ) values_k = ( - torch.tensor(values_trajs).float().to(self.device).reshape(-1) + torch.tensor(values_trajs, device=self.device).float().reshape(-1) ) advantages_k = ( - torch.tensor(advantages_trajs).float().to(self.device).reshape(-1) + torch.tensor(advantages_trajs, device=self.device).float().reshape(-1) ) - logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device) + logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float() # Update policy and critic total_steps = self.n_steps * self.n_envs - inds_k = np.arange(total_steps) clipfracs = [] for update_epoch in range(self.update_epochs): # for each epoch, go through all data in batches flag_break = False - np.random.shuffle(inds_k) + inds_k = torch.randperm(total_steps, device=self.device) num_batch = max(1, total_steps // self.batch_size) # skip last ones for batch in range(num_batch): start = batch * self.batch_size diff --git a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml index 69011ec..bb58881 100644 --- a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml @@ 
-68,7 +68,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 8800 + batch_size: 40000 update_epochs: 5 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml index ce73c44..183c2c9 100644 --- a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml @@ -68,7 +68,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 8800 + batch_size: 40000 update_epochs: 5 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml index 44cd23a..92be054 100644 --- a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml +++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml @@ -68,7 +68,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 8800 + batch_size: 17600 update_epochs: 5 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml index 54082c5..79d7a6e 100644 --- a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml @@ -68,7 +68,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 8800 + batch_size: 30000 update_epochs: 5 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml index 9484c40..86d1406 100644 --- a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml @@ -68,7 +68,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - 
batch_size: 8800 + batch_size: 30000 update_epochs: 5 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml index 519d59f..52753d7 100644 --- a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml @@ -68,7 +68,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 8800 + batch_size: 40000 update_epochs: 5 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml index 659fd30..a5f3f96 100644 --- a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml +++ b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml @@ -68,7 +68,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 8800 + batch_size: 40000 update_epochs: 5 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml b/cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml new file mode 100644 index 0000000..f74e0fc --- /dev/null +++ b/cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml @@ -0,0 +1,61 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent + +name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-mixed-v0 +obs_dim: 60 +action_dim: 9 +denoising_steps: 20 +cond_steps: 1 +horizon_steps: 4 +act_steps: 4 + +n_steps: 70 +render_num: 0 + +env: + n_envs: 40 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + 
best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + # + network_path: ${base_policy_path} + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [256, 256, 256] + cond_mlp_dims: [128, 32] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml b/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml index cef9f0f..311d4a1 100644 --- a/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml @@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent name: ${env_name}_calql_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} -base_policy_path: +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_calql_mlp_ta1/2024-09-29_22-59-08_42/checkpoint/state_49.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz @@ -92,7 +92,7 @@ model: tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml 
b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml index fbcea35..8e395ff 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml @@ -68,7 +68,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 5000 + batch_size: 50000 update_epochs: 5 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml new file mode 100644 index 0000000..cba7754 --- /dev/null +++ b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml @@ -0,0 +1,108 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent + +name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta1_td20/2024-09-29_02-13-10_42/checkpoint/state_1000.pt +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: halfcheetah-medium-v2 +obs_dim: 17 +action_dim: 6 +denoising_steps: 20 +ft_denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 20 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 501 + 
n_critic_warmup_itr: 0 + n_steps: 1000 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # PPO specific + reward_scale_running: True + reward_scale_const: 1.0 + gae_lambda: 0.95 + batch_size: 10000 + update_epochs: 5 + vf_coef: 0.5 + target_kl: 1 + +model: + _target_: model.diffusion.diffusion_ppo.PPODiffusion + # HP to tune + gamma_denoising: 0.99 + clip_ploss_coef: 0.01 + clip_ploss_coef_base: 0.01 + clip_ploss_coef_rate: 3 + randn_clip_value: 3 + min_sampling_denoising_std: 0.1 + min_logprob_denoising_std: 0.1 + # + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + activation_type: ReLU + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObs + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + ft_denoising_steps: ${ft_denoising_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml index adfec91..7ab10bd 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml +++ b/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml @@ -8,7 +8,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent name: ${env_name}_ibrl_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} normalization_path: 
${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz -base_policy_path: +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_gaussian_mlp_ta1/2024-09-28_18-48-54_42/checkpoint/state_500.pt offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz seed: 42 @@ -87,7 +87,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct mlp_dims: [256, 256, 256] diff --git a/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml index d4b9597..5cea98a 100644 --- a/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml @@ -68,7 +68,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 5000 + batch_size: 50000 update_epochs: 5 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/gym/finetune/hopper-v2/sac_mlp.yaml b/cfg/gym/finetune/hopper-v2/sac_mlp.yaml deleted file mode 100644 index 6d44909..0000000 --- a/cfg/gym/finetune/hopper-v2/sac_mlp.yaml +++ /dev/null @@ -1,89 +0,0 @@ -defaults: - - _self_ -hydra: - run: - dir: ${logdir} -_target_: agent.finetune.train_sac_agent.TrainSACAgent - -name: ${env_name}_sac_mlp_ta${horizon_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} -normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz -offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz - -seed: 42 -device: cuda:0 -env_name: hopper-medium-v2 -obs_dim: 11 -action_dim: 3 -cond_steps: 1 -horizon_steps: 1 -act_steps: 1 - -env: - n_envs: 1 - name: ${env_name} - max_episode_steps: 1000 - reset_at_iteration: False - save_video: False - best_reward_threshold_for_success: 3 - wrappers: - mujoco_locomotion_lowdim: - normalization_path: ${normalization_path} - multi_step: - n_obs_steps: 
${cond_steps} - n_action_steps: ${act_steps} - max_episode_steps: ${env.max_episode_steps} - reset_within_step: True - -wandb: - entity: ${oc.env:DPPO_WANDB_ENTITY} - project: sac-gym-${env_name} - run: ${now:%H-%M-%S}_${name} - -train: - n_train_itr: 1000000 - n_steps: 1 - gamma: 0.99 - actor_lr: 3e-4 - critic_lr: 1e-3 - save_model_freq: 100000 - val_freq: 10000 - render: - freq: 1 - num: 0 - log_freq: 200 - # SAC specific - batch_size: 256 - target_ema_rate: 0.005 - scale_reward_factor: 1 - critic_replay_ratio: 256 - actor_replay_ratio: 128 - buffer_size: 1000000 - n_eval_episode: 10 - n_explore_steps: 5000 - target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} - init_temperature: 1 - -model: - _target_: model.rl.gaussian_sac.SAC_Gaussian - randn_clip_value: 10 - tanh_output: True # squash after sampling - actor: - _target_: model.common.mlp_gaussian.Gaussian_MLP - mlp_dims: [256, 256] - activation_type: ReLU - tanh_output: False # squash after sampling instead - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - horizon_steps: ${horizon_steps} - - std_max: 7.3891 - std_min: 0.0067 - critic: # no layernorm - _target_: model.common.critic.CriticObsAct - mlp_dims: [256, 256] - activation_type: ReLU - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - action_dim: ${action_dim} - action_steps: ${act_steps} - horizon_steps: ${horizon_steps} - device: ${device} diff --git a/cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml b/cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml new file mode 100644 index 0000000..1d2eb3a --- /dev/null +++ b/cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml @@ -0,0 +1,116 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: 
${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_calql_mlp_ta1/2024-10-26_01-01-33_42/checkpoint/state_999.pt +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-complete-v0 +obs_dim: 60 +action_dim: 9 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 10000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: 
${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml new file mode 100644 index 0000000..c73997a --- /dev/null +++ b/cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml @@ -0,0 +1,108 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent + +name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-47-42_42/checkpoint/state_8000.pt +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-complete-v0 +obs_dim: 60 +action_dim: 9 +denoising_steps: 20 +ft_denoising_steps: 10 +cond_steps: 1 +horizon_steps: 4 +act_steps: 4 + +env: + n_envs: 40 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + 
multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 301 + n_critic_warmup_itr: 0 + n_steps: 70 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # PPO specific + reward_scale_running: True + reward_scale_const: 1.0 + gae_lambda: 0.95 + batch_size: 5600 + update_epochs: 10 + vf_coef: 0.5 + target_kl: 1 + +model: + _target_: model.diffusion.diffusion_ppo.PPODiffusion + # HP to tune + gamma_denoising: 0.99 + clip_ploss_coef: 0.01 + clip_ploss_coef_base: 0.01 + clip_ploss_coef_rate: 3 + randn_clip_value: 3 + min_sampling_denoising_std: 0.1 + min_logprob_denoising_std: 0.1 + # + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [256, 256, 256] + cond_mlp_dims: [128, 32] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObs + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + ft_denoising_steps: ${ft_denoising_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml b/cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml new file mode 100644 index 0000000..d0c62ee --- /dev/null +++ 
b/cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml @@ -0,0 +1,109 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_pre_gaussian_mlp_ta1/2024-10-25_14-48-43_42/checkpoint/state_5000.pt +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-complete-v0 +obs_dim: 60 +action_dim: 9 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: ibrl-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 50000 + val_freq: 5000 + render: + freq: 1 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 5 + buffer_size: 500000 + n_eval_episode: 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.rl.gaussian_ibrl.IBRL_Gaussian + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + 
actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + dropout: 0.5 + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 50 \ No newline at end of file diff --git a/cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml b/cfg/gym/finetune/kitchen-mixed-v0/calql_mlp_online.yaml similarity index 86% rename from cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml rename to cfg/gym/finetune/kitchen-mixed-v0/calql_mlp_online.yaml index 10204ba..cf8da13 100644 --- a/cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml +++ b/cfg/gym/finetune/kitchen-mixed-v0/calql_mlp_online.yaml @@ -7,15 +7,15 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent name: ${env_name}_calql_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} -base_policy_path: +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_calql_mlp_ta1/2024-10-25_21-36-13_42/checkpoint/state_999.pt normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz seed: 42 device: cuda:0 -env_name: hopper-medium-v2 -obs_dim: 11 -action_dim: 3 +env_name: kitchen-mixed-v0 +obs_dim: 60 +action_dim: 9 cond_steps: 1 horizon_steps: 1 act_steps: 1 @@ -23,10 +23,10 @@ act_steps: 1 env: n_envs: 
1 name: ${env_name} - max_episode_steps: 1000 + max_episode_steps: 280 reset_at_iteration: False save_video: False - best_reward_threshold_for_success: 3 + best_reward_threshold_for_success: 4 wrappers: mujoco_locomotion_lowdim: normalization_path: ${normalization_path} @@ -59,7 +59,7 @@ train: warmup_steps: 10 min_lr: 3e-4 save_model_freq: 100 - val_freq: 10 + val_freq: 20 render: freq: 1 num: 0 @@ -67,13 +67,12 @@ train: # CalQL specific train_online: True batch_size: 256 - n_random_actions: 4 + n_random_actions: 10 target_ema_rate: 0.005 scale_reward_factor: 1.0 num_update: 1000 buffer_size: 1000000 - online_utd_ratio: 1 - n_eval_episode: 10 + n_eval_episode: 40 n_explore_steps: 0 target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} init_temperature: 1 @@ -87,17 +86,17 @@ model: network_path: ${base_policy_path} actor: _target_: model.common.mlp_gaussian.Gaussian_MLP - mlp_dims: [256, 256] + mlp_dims: [256, 256, 256] activation_type: ReLU tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: _target_: model.common.critic.CriticObsAct - mlp_dims: [256, 256] + mlp_dims: [256, 256, 256] activation_type: ReLU use_layernorm: True double_q: True diff --git a/cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml new file mode 100644 index 0000000..f90294b --- /dev/null +++ b/cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml @@ -0,0 +1,108 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent + +name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: 
${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-28_42/checkpoint/state_8000.pt +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-mixed-v0 +obs_dim: 60 +action_dim: 9 +denoising_steps: 20 +ft_denoising_steps: 10 +cond_steps: 1 +horizon_steps: 4 +act_steps: 4 + +env: + n_envs: 40 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 301 + n_critic_warmup_itr: 0 + n_steps: 70 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # PPO specific + reward_scale_running: True + reward_scale_const: 1.0 + gae_lambda: 0.95 + batch_size: 5600 + update_epochs: 10 + vf_coef: 0.5 + target_kl: 1 + +model: + _target_: model.diffusion.diffusion_ppo.PPODiffusion + # HP to tune + gamma_denoising: 0.99 + clip_ploss_coef: 0.01 + clip_ploss_coef_base: 0.01 + clip_ploss_coef_rate: 3 + randn_clip_value: 3 + min_sampling_denoising_std: 0.1 + min_logprob_denoising_std: 0.1 + # + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [256, 256, 256] + cond_mlp_dims: [128, 32] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + 
action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObs + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + ft_denoising_steps: ${ft_denoising_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml b/cfg/gym/finetune/kitchen-mixed-v0/ibrl_mlp.yaml similarity index 78% rename from cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml rename to cfg/gym/finetune/kitchen-mixed-v0/ibrl_mlp.yaml index 1737a1e..d98c3bb 100644 --- a/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml +++ b/cfg/gym/finetune/kitchen-mixed-v0/ibrl_mlp.yaml @@ -8,14 +8,14 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent name: ${env_name}_ibrl_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz -base_policy_path: +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_pre_gaussian_mlp_ta1/2024-10-25_01-39-44_42/checkpoint/state_5000.pt offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz seed: 42 device: cuda:0 -env_name: hopper-medium-v2 -obs_dim: 11 -action_dim: 3 +env_name: kitchen-mixed-v0 +obs_dim: 60 +action_dim: 9 cond_steps: 1 horizon_steps: 1 act_steps: 1 @@ -23,10 +23,10 @@ act_steps: 1 env: n_envs: 1 name: ${env_name} - max_episode_steps: 1000 + max_episode_steps: 280 reset_at_iteration: False save_video: False - best_reward_threshold_for_success: 3 + best_reward_threshold_for_success: 4 wrappers: mujoco_locomotion_lowdim: normalization_path: ${normalization_path} @@ -42,7 +42,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 250000 + n_train_itr: 1000000 n_steps: 1 gamma: 0.99 actor_lr: 1e-4 @@ -51,25 +51,25 @@ train: first_cycle_steps: 1000 
warmup_steps: 10 min_lr: 1e-4 - critic_lr: 1e-4 + critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-4 + min_lr: 1e-3 save_model_freq: 50000 - val_freq: 2000 + val_freq: 5000 render: freq: 1 num: 0 log_freq: 200 # IBRL specific batch_size: 256 - target_ema_rate: 0.01 + target_ema_rate: 0.01 scale_reward_factor: 1 critic_num_update: 5 - buffer_size: 1000000 - n_eval_episode: 10 + buffer_size: 500000 + n_eval_episode: 40 n_explore_steps: 0 update_freq: 2 @@ -78,19 +78,19 @@ model: randn_clip_value: 3 n_critics: 5 soft_action_sample: True - soft_action_sample_beta: 0.1 - network_path: ${base_policy_path} + soft_action_sample_beta: 10 actor: _target_: model.common.mlp_gaussian.Gaussian_MLP - mlp_dims: [256, 256, 256] - activation_type: Mish + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + dropout: 0.5 fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct - mlp_dims: [256, 256, 256] + mlp_dims: [1024, 1024, 1024] activation_type: ReLU use_layernorm: True double_q: False # use ensemble @@ -105,4 +105,5 @@ offline_dataset: dataset_path: ${offline_dataset_path} horizon_steps: ${horizon_steps} cond_steps: ${cond_steps} - device: ${device} \ No newline at end of file + device: ${device} + max_n_episodes: 50 \ No newline at end of file diff --git a/cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml b/cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml new file mode 100644 index 0000000..160bf19 --- /dev/null +++ b/cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml @@ -0,0 +1,116 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: 
${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_calql_mlp_ta1/2024-10-25_21-26-51_42/checkpoint/state_980.pt +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-partial-v0 +obs_dim: 60 +action_dim: 9 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 10000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: 
${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml new file mode 100644 index 0000000..946d86b --- /dev/null +++ b/cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml @@ -0,0 +1,108 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent + +name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-29_42/checkpoint/state_8000.pt +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-partial-v0 +obs_dim: 60 +action_dim: 9 +denoising_steps: 20 +ft_denoising_steps: 10 +cond_steps: 1 +horizon_steps: 4 +act_steps: 4 + +env: + n_envs: 40 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + 
multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 301 + n_critic_warmup_itr: 0 + n_steps: 70 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # PPO specific + reward_scale_running: True + reward_scale_const: 1.0 + gae_lambda: 0.95 + batch_size: 5600 + update_epochs: 10 + vf_coef: 0.5 + target_kl: 1 + +model: + _target_: model.diffusion.diffusion_ppo.PPODiffusion + # HP to tune + gamma_denoising: 0.99 + clip_ploss_coef: 0.01 + clip_ploss_coef_base: 0.01 + clip_ploss_coef_rate: 3 + randn_clip_value: 3 + min_sampling_denoising_std: 0.1 + min_logprob_denoising_std: 0.1 + # + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [256, 256, 256] + cond_mlp_dims: [128, 32] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObs + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + ft_denoising_steps: ${ft_denoising_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml b/cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml new file mode 100644 index 0000000..3d15f16 --- /dev/null +++ 
b/cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml @@ -0,0 +1,109 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_pre_gaussian_mlp_ta1/2024-10-25_01-45-52_42/checkpoint/state_5000.pt +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-partial-v0 +obs_dim: 60 +action_dim: 9 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: ibrl-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 50000 + val_freq: 5000 + render: + freq: 1 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 5 + buffer_size: 500000 + n_eval_episode: 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.rl.gaussian_ibrl.IBRL_Gaussian + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + actor: + _target_: 
model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + dropout: 0.5 + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 50 \ No newline at end of file diff --git a/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml index 9158042..de70428 100644 --- a/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml +++ b/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml @@ -68,7 +68,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 5000 + batch_size: 50000 update_epochs: 5 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml deleted file mode 100644 index 42dcdf5..0000000 --- a/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml +++ /dev/null @@ -1,103 +0,0 @@ -defaults: - - _self_ -hydra: - run: - dir: ${logdir} -_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent - -name: ${env_name}_rlpd_mlp_ta${horizon_steps}_td${denoising_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} -normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz -offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz - -seed: 42 -device: cuda:0 -env_name: walker2d-medium-v2 
-obs_dim: 17 -action_dim: 6 -denoising_steps: 20 -cond_steps: 1 -horizon_steps: 1 -act_steps: 1 - -env: - n_envs: 40 - name: ${env_name} - max_episode_steps: 1000 - reset_at_iteration: False - save_video: False - best_reward_threshold_for_success: 3 - wrappers: - mujoco_locomotion_lowdim: - normalization_path: ${normalization_path} - multi_step: - n_obs_steps: ${cond_steps} - n_action_steps: ${act_steps} - max_episode_steps: ${env.max_episode_steps} - reset_within_step: True - -wandb: - entity: ${oc.env:DPPO_WANDB_ENTITY} - project: rlpd-gym-${env_name}-finetune - run: ${now:%H-%M-%S}_${name} - -train: - n_train_itr: 1000 - n_critic_warmup_itr: 5 - n_steps: 2000 - gamma: 0.99 - actor_lr: 1e-4 - actor_weight_decay: 0 - actor_lr_scheduler: - first_cycle_steps: 1000 - warmup_steps: 10 - min_lr: 1e-4 - critic_lr: 1e-3 - critic_weight_decay: 0 - critic_lr_scheduler: - first_cycle_steps: 1000 - warmup_steps: 10 - min_lr: 1e-3 - save_model_freq: 100 - val_freq: 10 - render: - freq: 1 - num: 0 - # RLPD specific - batch_size: 512 - entropy_temperature: 1.0 # alpha in RLPD paper - target_ema_rate: 0.005 # rho in RLPD paper - scale_reward_factor: 1.0 # multiply reward by this amount for more stable value estimation - replay_ratio: 64 # number of batches to sample for each learning update - buffer_size: 1000000 - -model: - _target_: model.rl.gaussian_rlpd.RLPD_Gaussian - randn_clip_value: 3 - actor: - _target_: model.common.mlp_gaussian.Gaussian_MLP - mlp_dims: [512, 512, 512] - activation_type: ReLU - residual_style: True - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - horizon_steps: ${horizon_steps} - action_dim: ${action_dim} - critic: - _target_: model.common.critic.CriticObsAct - action_dim: ${action_dim} - action_steps: ${act_steps} - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} - mlp_dims: [256, 256, 256] - activation_type: Mish - residual_style: True - use_layernorm: True - horizon_steps: ${horizon_steps} - device: ${device} - n_critics: 2 # Ensemble size for 
critic models - -offline_dataset: - _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset - dataset_path: ${offline_dataset_path} - horizon_steps: ${horizon_steps} - cond_steps: ${cond_steps} - device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml b/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml index 7dfd7ed..e73a4d5 100644 --- a/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml +++ b/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml @@ -88,7 +88,7 @@ model: tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: diff --git a/cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml b/cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml new file mode 100644 index 0000000..8a2f462 --- /dev/null +++ b/cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml @@ -0,0 +1,113 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-complete-v0 +obs_dim: 60 +action_dim: 9 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + 
entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 # not used + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml 
new file mode 100644 index 0000000..092fa00 --- /dev/null +++ b/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml @@ -0,0 +1,66 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent + +name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:0 +env: kitchen-complete-v0 +obs_dim: 60 +action_dim: 9 +denoising_steps: 20 +horizon_steps: 4 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 8000 + batch_size: 128 + learning_rate: 1e-3 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 8000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 1000 + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [256, 256, 256] + cond_mlp_dims: [128, 32] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml new file mode 100644 index 0000000..9426b6f --- /dev/null +++ 
b/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml @@ -0,0 +1,60 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:0 +env: kitchen-complete-v0 +obs_dim: 60 +action_dim: 9 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 0 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 100 + min_lr: 1e-4 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + dropout: 0.5 + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml b/cfg/gym/pretrain/kitchen-mixed-v0/calql_mlp_offline.yaml similarity index 85% rename from cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml rename to cfg/gym/pretrain/kitchen-mixed-v0/calql_mlp_offline.yaml index 24f8957..4233314 100644 --- a/cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml +++ b/cfg/gym/pretrain/kitchen-mixed-v0/calql_mlp_offline.yaml @@ -6,15 +6,15 @@ hydra: _target_: 
agent.finetune.train_calql_agent.TrainCalQLAgent name: ${env_name}_calql_mlp_ta${horizon_steps} -logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz seed: 42 device: cuda:0 -env_name: hopper-medium-v2 -obs_dim: 11 -action_dim: 3 +env_name: kitchen-mixed-v0 +obs_dim: 60 +action_dim: 9 cond_steps: 1 horizon_steps: 1 act_steps: 1 @@ -22,10 +22,10 @@ act_steps: 1 env: n_envs: 1 name: ${env_name} - max_episode_steps: 1000 + max_episode_steps: 280 reset_at_iteration: False save_video: False - best_reward_threshold_for_success: 3 + best_reward_threshold_for_success: 4 wrappers: mujoco_locomotion_lowdim: normalization_path: ${normalization_path} @@ -41,7 +41,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 100 + n_train_itr: 1000 n_steps: 1 # not used gamma: 0.99 actor_lr: 1e-4 @@ -50,14 +50,14 @@ train: first_cycle_steps: 1000 warmup_steps: 10 min_lr: 1e-4 - critic_lr: 3e-4 + critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 3e-4 + min_lr: 1e-3 save_model_freq: 10 - val_freq: 10 + val_freq: 20 render: freq: 1 num: 0 @@ -65,12 +65,12 @@ train: # CalQL specific train_online: False batch_size: 256 - n_random_actions: 4 + n_random_actions: 10 target_ema_rate: 0.005 scale_reward_factor: 1.0 num_update: 1000 buffer_size: 1000000 - n_eval_episode: 10 + n_eval_episode: 40 n_explore_steps: 0 target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} init_temperature: 1 @@ -83,17 +83,17 @@ model: tanh_output: True actor: _target_: model.common.mlp_gaussian.Gaussian_MLP - mlp_dims: [256, 256] + mlp_dims: [256, 256, 256] activation_type: ReLU tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * 
${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: _target_: model.common.critic.CriticObsAct - mlp_dims: [256, 256] + mlp_dims: [256, 256, 256] activation_type: ReLU use_layernorm: True double_q: True diff --git a/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml new file mode 100644 index 0000000..becf244 --- /dev/null +++ b/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml @@ -0,0 +1,66 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent + +name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:0 +env: kitchen-mixed-v0 +obs_dim: 60 +action_dim: 9 +denoising_steps: 20 +horizon_steps: 4 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 8000 + batch_size: 256 + learning_rate: 1e-3 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 8000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 1000 + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [256, 256, 256] + cond_mlp_dims: [128, 32] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + 
dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml new file mode 100644 index 0000000..86a6e90 --- /dev/null +++ b/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml @@ -0,0 +1,59 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:0 +env: kitchen-mixed-v0 +obs_dim: 60 +action_dim: 9 +horizon_steps: 4 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 128 + learning_rate: 1e-3 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml b/cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml new file mode 100644 index 0000000..f99edfd --- /dev/null +++ 
b/cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml @@ -0,0 +1,113 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-partial-v0 +obs_dim: 60 +action_dim: 9 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 # not used + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 10 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: 
model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml new file mode 100644 index 0000000..c854707 --- /dev/null +++ b/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml @@ -0,0 +1,66 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent + +name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:0 +env: kitchen-partial-v0 +obs_dim: 60 +action_dim: 9 +denoising_steps: 20 +horizon_steps: 4 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 8000 + batch_size: 128 + learning_rate: 1e-3 + weight_decay: 1e-5 + lr_scheduler: + first_cycle_steps: 8000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 1000 + 
+model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [256, 256, 256] + cond_mlp_dims: [128, 32] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml new file mode 100644 index 0000000..02413a5 --- /dev/null +++ b/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml @@ -0,0 +1,59 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent + +name: ${env}_pre_gaussian_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz + +seed: 42 +device: cuda:0 +env: kitchen-partial-v0 +obs_dim: 60 +action_dim: 9 +horizon_steps: 4 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 5000 + batch_size: 128 + learning_rate: 1e-3 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 5000 + warmup_steps: 1 + min_lr: 1e-4 + epoch_start_ema: 10 + update_ema_freq: 5 + save_model_freq: 1000 + +model: + _target_: model.common.gaussian.GaussianModel + network: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: Mish 
+ fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml similarity index 96% rename from cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml rename to cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml index 9be391c..49f11ed 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml +++ b/cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml @@ -14,8 +14,8 @@ device: cuda:0 env_name: halfcheetah-medium-v2 obs_dim: 17 action_dim: 6 -denoising_steps: 20 -ft_denoising_steps: 20 +denoising_steps: 10 +ft_denoising_steps: 10 cond_steps: 1 horizon_steps: 1 act_steps: 1 @@ -67,7 +67,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 1000 + batch_size: 10000 update_epochs: 10 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml similarity index 97% rename from cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml rename to cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml index f09c664..b0c1241 100644 --- a/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml +++ b/cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml @@ -53,7 +53,7 @@ train: critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: - first_cycle_steps: 10000 + first_cycle_steps: 1000 warmup_steps: 10 min_lr: 1e-3 save_model_freq: 100 diff --git a/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/rlpd_mlp.yaml similarity index 98% 
rename from cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml rename to cfg/gym/scratch/halfcheetah-v2/rlpd_mlp.yaml index 898cf9b..24379c6 100644 --- a/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml +++ b/cfg/gym/scratch/halfcheetah-v2/rlpd_mlp.yaml @@ -86,7 +86,7 @@ model: tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: diff --git a/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/sac_mlp.yaml similarity index 98% rename from cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml rename to cfg/gym/scratch/halfcheetah-v2/sac_mlp.yaml index 8051c73..35182d5 100644 --- a/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml +++ b/cfg/gym/scratch/halfcheetah-v2/sac_mlp.yaml @@ -75,7 +75,7 @@ model: tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: # no layernorm diff --git a/cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml new file mode 100644 index 0000000..2f02c78 --- /dev/null +++ b/cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml @@ -0,0 +1,99 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent + +name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 10 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + 
best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env_name}-scratch + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_critic_warmup_itr: 0 + n_steps: 1000 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # AWR specific + scale_reward_factor: 0.01 + max_adv_weight: 100 + beta: 10 + buffer_size: 100000 # * n_envs + batch_size: 256 + replay_ratio: 128 + critic_update_ratio: 4 + +model: + _target_: model.diffusion.diffusion_awr.AWRDiffusion + # Sampling HPs + min_sampling_denoising_std: 0.10 + randn_clip_value: 3 + # + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + time_dim: 16 + mlp_dims: [512, 512, 512] + activation_type: ReLU + residual_style: True + critic: + _target_: model.common.critic.CriticObs + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml new file mode 100644 index 0000000..9eda16e --- /dev/null +++ b/cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml @@ -0,0 +1,101 @@ +defaults: + - _self_ +hydra: + run: + 
dir: ${logdir} +_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent + +name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 10 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env_name}-scratch + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_critic_warmup_itr: 0 + n_steps: 1000 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # DIPO specific + scale_reward_factor: 0.01 + target_ema_rate: 0.005 + buffer_size: 1000000 + action_lr: 0.0001 + action_gradient_steps: 10 + replay_ratio: 128 + batch_size: 256 + +model: + _target_: model.diffusion.diffusion_dipo.DIPODiffusion + # Sampling HPs + min_sampling_denoising_std: 0.10 + randn_clip_value: 3 + # + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + time_dim: 16 + mlp_dims: [512, 512, 512] + activation_type: ReLU + residual_style: True + critic: + 
_target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml new file mode 100644 index 0000000..9bd4885 --- /dev/null +++ b/cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml @@ -0,0 +1,100 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent + +name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 10 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env_name}-scratch + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_critic_warmup_itr: 0 + n_steps: 1000 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 
1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # DQL specific + scale_reward_factor: 0.01 + target_ema_rate: 0.005 + buffer_size: 1000000 + eta: 1.0 + replay_ratio: 128 + batch_size: 256 + +model: + _target_: model.diffusion.diffusion_dql.DQLDiffusion + # Sampling HPs + min_sampling_denoising_std: 0.10 + randn_clip_value: 3 + # + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + time_dim: 16 + mlp_dims: [512, 512, 512] + activation_type: ReLU + residual_style: True + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml new file mode 100644 index 0000000..935263d --- /dev/null +++ b/cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml @@ -0,0 +1,108 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent + +name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 10 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + 
mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env_name}-scratch + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_critic_warmup_itr: 0 + n_steps: 1000 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # IDQL specific + scale_reward_factor: 0.01 + eval_deterministic: True + eval_sample_num: 10 # how many samples to score during eval + critic_tau: 0.001 # rate of target q network update + use_expectile_exploration: True + buffer_size: 100000 # * n_envs + replay_ratio: 128 + batch_size: 256 + +model: + _target_: model.diffusion.diffusion_idql.IDQLDiffusion + # Sampling HPs + min_sampling_denoising_std: 0.10 + randn_clip_value: 3 + # + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + time_dim: 16 + mlp_dims: [512, 512, 512] + activation_type: ReLU + residual_style: True + critic_q: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + critic_v: + _target_: model.common.critic.CriticObs + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No 
newline at end of file diff --git a/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml similarity index 95% rename from cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml rename to cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml index 3f26654..729a0c6 100644 --- a/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml +++ b/cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml @@ -1,7 +1,7 @@ defaults: - _self_ hydra: - run: + run: dir: ${logdir} _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent @@ -14,8 +14,8 @@ device: cuda:0 env_name: hopper-medium-v2 obs_dim: 11 action_dim: 3 -denoising_steps: 20 -ft_denoising_steps: 20 +denoising_steps: 10 +ft_denoising_steps: 10 cond_steps: 1 horizon_steps: 1 act_steps: 1 @@ -55,7 +55,7 @@ train: critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: - first_cycle_steps: 10000 + first_cycle_steps: 1000 warmup_steps: 10 min_lr: 1e-3 save_model_freq: 100 @@ -67,7 +67,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 1000 + batch_size: 10000 update_epochs: 10 vf_coef: 0.5 target_kl: 1 @@ -94,10 +94,10 @@ model: residual_style: True critic: _target_: model.common.critic.CriticObs - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} ft_denoising_steps: ${ft_denoising_steps} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} diff --git a/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml b/cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml similarity index 97% rename from cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml rename to cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml index 57eafcb..05f5766 100644 --- a/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml +++ b/cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml @@ -53,7 +53,7 @@ train: critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: - 
first_cycle_steps: 10000 + first_cycle_steps: 1000 warmup_steps: 10 min_lr: 1e-3 save_model_freq: 100 diff --git a/cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml new file mode 100644 index 0000000..9fee721 --- /dev/null +++ b/cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml @@ -0,0 +1,100 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent + +name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 10 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env_name}-scratch + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_critic_warmup_itr: 0 + n_steps: 1000 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # QSM specific + scale_reward_factor: 0.01 + q_grad_coeff: 50 + critic_tau: 0.005 + buffer_size: 100000 # * n_envs + replay_ratio: 128 + batch_size: 256 + +model: + _target_: model.diffusion.diffusion_qsm.QSMDiffusion + 
# Sampling HPs + min_sampling_denoising_std: 0.10 + randn_clip_value: 3 + # + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + time_dim: 16 + mlp_dims: [512, 512, 512] + activation_type: ReLU + residual_style: True + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml new file mode 100644 index 0000000..cdd98a2 --- /dev/null +++ b/cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml @@ -0,0 +1,84 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent + +name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: hopper-medium-v2 +obs_dim: 11 +action_dim: 3 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 10 + name: ${env_name} + max_episode_steps: 1000 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 3 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: gym-${env_name}-scratch + run: 
${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_critic_warmup_itr: 0 + n_steps: 1000 + gamma: 0.99 + lr: 1e-4 + weight_decay: 0 + lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # RWR specific + max_reward_weight: 100 + beta: 10 + batch_size: 256 + update_epochs: 128 + +model: + _target_: model.diffusion.diffusion_rwr.RWRDiffusion + # Sampling HPs + min_sampling_denoising_std: 0.1 + randn_clip_value: 3 + # + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + time_dim: 16 + mlp_dims: [512, 512, 512] + activation_type: ReLU + residual_style: True + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml b/cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml new file mode 100644 index 0000000..b80a9a8 --- /dev/null +++ b/cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml @@ -0,0 +1,109 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-complete-v0 +obs_dim: 60 +action_dim: 9 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + 
n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 50000 + val_freq: 5000 + render: + freq: 1 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 10 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + tanh_output: True # squash after sampling + backup_entropy: True + n_critics: 5 # Ensemble size for critic models + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git 
a/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml b/cfg/gym/scratch/kitchen-mixed-v0/rlpd_mlp.yaml similarity index 83% rename from cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml rename to cfg/gym/scratch/kitchen-mixed-v0/rlpd_mlp.yaml index 7a33bde..e006e25 100644 --- a/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml +++ b/cfg/gym/scratch/kitchen-mixed-v0/rlpd_mlp.yaml @@ -12,9 +12,9 @@ offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz seed: 42 device: cuda:0 -env_name: hopper-medium-v2 -obs_dim: 11 -action_dim: 3 +env_name: kitchen-mixed-v0 +obs_dim: 60 +action_dim: 9 cond_steps: 1 horizon_steps: 1 act_steps: 1 @@ -22,10 +22,10 @@ act_steps: 1 env: n_envs: 1 name: ${env_name} - max_episode_steps: 1000 + max_episode_steps: 280 reset_at_iteration: False save_video: False - best_reward_threshold_for_success: 3 + best_reward_threshold_for_success: 4 wrappers: mujoco_locomotion_lowdim: normalization_path: ${normalization_path} @@ -41,7 +41,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 250000 + n_train_itr: 1000000 n_steps: 1 gamma: 0.99 actor_lr: 3e-4 @@ -50,12 +50,12 @@ train: first_cycle_steps: 1000 warmup_steps: 10 min_lr: 3e-4 - critic_lr: 3e-4 + critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 3e-4 + min_lr: 1e-3 save_model_freq: 50000 val_freq: 5000 render: @@ -64,12 +64,12 @@ train: log_freq: 200 # RLPD specific batch_size: 256 - target_ema_rate: 0.005 + target_ema_rate: 0.01 scale_reward_factor: 1 - critic_num_update: 20 - buffer_size: 1000000 - n_eval_episode: 10 - n_explore_steps: 5000 + critic_num_update: 10 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} init_temperature: 1 @@ -78,20 +78,20 @@ model: randn_clip_value: 10 tanh_output: True # squash after sampling backup_entropy: True - n_critics: 10 # Ensemble size for critic models + n_critics: 5 # Ensemble size for critic models actor: _target_: 
model.common.mlp_gaussian.Gaussian_MLP - mlp_dims: [256, 256] + mlp_dims: [256, 256, 256] activation_type: ReLU tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: _target_: model.common.critic.CriticObsAct - mlp_dims: [256, 256] + mlp_dims: [256, 256, 256] activation_type: ReLU use_layernorm: True double_q: False # use ensemble diff --git a/cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml b/cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml new file mode 100644 index 0000000..a9b7781 --- /dev/null +++ b/cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml @@ -0,0 +1,109 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: kitchen-partial-v0 +obs_dim: 60 +action_dim: 9 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 280 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 4 + wrappers: + mujoco_locomotion_lowdim: + normalization_path: ${normalization_path} + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 3e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: 
+ first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 50000 + val_freq: 5000 + render: + freq: 1 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 10 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + tanh_output: True # squash after sampling + backup_entropy: True + n_critics: 5 # Ensemble size for critic models + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [256, 256, 256] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml b/cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml similarity index 96% rename from cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml rename to cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml index 6530d49..2c1769f 100644 --- a/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml +++ b/cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml @@ -14,8 +14,8 @@ device: cuda:0 env_name: walker2d-medium-v2 obs_dim: 17 action_dim: 6 -denoising_steps: 20 
-ft_denoising_steps: 20 +denoising_steps: 10 +ft_denoising_steps: 10 cond_steps: 1 horizon_steps: 1 act_steps: 1 @@ -67,7 +67,7 @@ train: reward_scale_running: True reward_scale_const: 1.0 gae_lambda: 0.95 - batch_size: 1000 + batch_size: 10000 update_epochs: 10 vf_coef: 0.5 target_kl: 1 diff --git a/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml b/cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml similarity index 97% rename from cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml rename to cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml index dff57a3..70b6267 100644 --- a/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml +++ b/cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml @@ -53,7 +53,7 @@ train: critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: - first_cycle_steps: 10000 + first_cycle_steps: 1000 warmup_steps: 10 min_lr: 1e-3 save_model_freq: 100 diff --git a/cfg/robomimic/finetune/can/calql_mlp_online.yaml b/cfg/robomimic/finetune/can/calql_mlp_online.yaml index 8fc1a3c..9fd5db1 100644 --- a/cfg/robomimic/finetune/can/calql_mlp_online.yaml +++ b/cfg/robomimic/finetune/can/calql_mlp_online.yaml @@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent name: ${env_name}_calql_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} -base_policy_path: +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_calql_mlp_ta1/2024-10-25_22-30-16_42/checkpoint/state_999.pt robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz @@ -97,7 +97,7 @@ model: tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: diff --git 
a/cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml b/cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml new file mode 100644 index 0000000..cfb4b81 --- /dev/null +++ b/cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml @@ -0,0 +1,122 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 
+ num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml index 2a8343a..ab384f1 100644 --- a/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml @@ -26,7 +26,7 @@ env: name: ${env_name} best_reward_threshold_for_success: 1 max_episode_steps: 300 - save_video: false + save_video: False wrappers: robomimic_lowdim: normalization_path: ${normalization_path} @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - 
n_train_itr: 300 + n_train_itr: 151 n_critic_warmup_itr: 2 n_steps: 300 gamma: 0.999 diff --git a/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml index ed9c90f..59cb0a2 100644 --- a/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml @@ -26,7 +26,7 @@ env: name: ${env_name} best_reward_threshold_for_success: 1 max_episode_steps: 300 - save_video: false + save_video: False wrappers: robomimic_lowdim: normalization_path: ${normalization_path} diff --git a/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml index 24bb53a..12e33b0 100644 --- a/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 151 n_critic_warmup_itr: 5 n_steps: 300 gamma: 0.999 diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml index ba1fa16..8256876 100644 --- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml @@ -47,16 +47,16 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 151 n_critic_warmup_itr: 2 n_steps: 300 gamma: 0.999 - actor_lr: 1e-5 + actor_lr: 1e-4 actor_weight_decay: 0 actor_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-5 + min_lr: 1e-4 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml index 0873cb4..54a4ab1 100644 --- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml +++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml @@ -1,7 +1,7 @@ defaults: - _self_ hydra: - run: + run: dir: ${logdir} _target_: 
agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent @@ -60,22 +60,22 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 200 + n_train_itr: 151 n_critic_warmup_itr: 2 n_steps: 300 gamma: 0.999 augment: True grad_accumulate: 15 - actor_lr: 1e-5 + actor_lr: 1e-4 actor_weight_decay: 0 actor_lr_scheduler: - first_cycle_steps: 200 + first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-5 + min_lr: 1e-4 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: - first_cycle_steps: 200 + first_cycle_steps: 1000 warmup_steps: 10 min_lr: 1e-3 save_model_freq: 100 @@ -96,7 +96,7 @@ train: model: _target_: model.diffusion.diffusion_ppo.PPODiffusion # HP to tune - gamma_denoising: 0.9 + gamma_denoising: 0.99 clip_ploss_coef: 0.01 clip_ploss_coef_base: 0.001 clip_ploss_coef_rate: 3 @@ -158,10 +158,10 @@ model: embed_style: embed2 embed_norm: 0 img_cond_steps: ${img_cond_steps} - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} ft_denoising_steps: ${ft_denoising_steps} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml new file mode 100644 index 0000000..86d28df --- /dev/null +++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml @@ -0,0 +1,111 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent + +name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_diffusion_mlp_ta1_td20/2024-09-29_15-43-07_42/checkpoint/state_8000.pt +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: 
${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +ft_denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 50 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 301 + n_critic_warmup_itr: 2 + n_steps: 300 + gamma: 0.999 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # PPO specific + reward_scale_running: True + reward_scale_const: 1.0 + gae_lambda: 0.95 + batch_size: 15000 + update_epochs: 10 + vf_coef: 0.5 + target_kl: 1 + +model: + _target_: model.diffusion.diffusion_ppo.PPODiffusion + # HP to tune + gamma_denoising: 0.99 + clip_ploss_coef: 0.01 + clip_ploss_coef_base: 0.001 + clip_ploss_coef_rate: 3 + randn_clip_value: 3 + min_sampling_denoising_std: 0.1 + min_logprob_denoising_std: 0.1 + # + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObs + 
mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + ft_denoising_steps: ${ft_denoising_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml new file mode 100644 index 0000000..3367556 --- /dev/null +++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml @@ -0,0 +1,111 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent + +name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +ft_denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 40 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env_name}-finetune + run: 
${now:%H-%M-%S}_${name} + +train: + n_train_itr: 301 + n_critic_warmup_itr: 2 + n_steps: 300 + gamma: 0.999 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # PPO specific + reward_scale_running: True + reward_scale_const: 1.0 + gae_lambda: 0.95 + batch_size: 6000 + update_epochs: 10 + vf_coef: 0.5 + target_kl: 1 + +model: + _target_: model.diffusion.diffusion_ppo.PPODiffusion + # HP to tune + gamma_denoising: 0.9 + clip_ploss_coef: 0.01 + clip_ploss_coef_base: 0.001 + clip_ploss_coef_rate: 3 + randn_clip_value: 3 + min_sampling_denoising_std: 0.1 + min_logprob_denoising_std: 0.1 + # + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObs + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + ft_denoising_steps: ${ft_denoising_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml index 591f3a9..bbd8bd6 100644 --- a/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 151 n_critic_warmup_itr: 5 n_steps: 300 gamma: 0.999 diff --git 
a/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml index 5037605..fa451a3 100644 --- a/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml @@ -26,7 +26,7 @@ env: name: ${env_name} best_reward_threshold_for_success: 1 max_episode_steps: 300 - save_video: false + save_video: False wrappers: robomimic_lowdim: normalization_path: ${normalization_path} @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 151 n_critic_warmup_itr: 2 n_steps: 300 gamma: 0.999 diff --git a/cfg/robomimic/finetune/can/ibrl_mlp.yaml b/cfg/robomimic/finetune/can/ibrl_mlp.yaml index 7aa8d24..c3ba56e 100644 --- a/cfg/robomimic/finetune/can/ibrl_mlp.yaml +++ b/cfg/robomimic/finetune/can/ibrl_mlp.yaml @@ -7,7 +7,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent name: ${env_name}_ibrl_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} -base_policy_path: +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_gaussian_mlp_ta1/2024-09-28_13-43-59_42/checkpoint/state_5000.pt robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz @@ -93,7 +93,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct mlp_dims: [1024, 1024, 1024] diff --git a/cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml b/cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml new file mode 100644 index 0000000..8940658 --- /dev/null +++ b/cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml @@ -0,0 +1,115 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: 
agent.finetune.train_ibrl_agent.TrainIBRLAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_gaussian_mlp_ta1/2024-10-08_20-52-04_0/checkpoint/state_5000.pt +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 250 # IBRL uses 200 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: ibrl-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 100000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.rl.gaussian_ibrl.IBRL_Gaussian + randn_clip_value: 3 + n_critics: 5 
+ soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + dropout: 0.5 + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git a/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml index bddd57c..6b276bc 100644 --- a/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 81 n_critic_warmup_itr: 2 n_steps: 300 gamma: 0.999 diff --git a/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml index e0353b6..75e7c68 100644 --- a/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 81 n_critic_warmup_itr: 5 n_steps: 300 gamma: 0.999 diff --git a/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml index a0e2567..4bf3a2a 100644 --- a/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml +++ 
b/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 81 n_critic_warmup_itr: 5 n_steps: 300 gamma: 0.999 diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml index b505b81..16b9485 100644 --- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml @@ -27,7 +27,7 @@ env: name: ${env_name} best_reward_threshold_for_success: 1 max_episode_steps: 300 - save_video: false + save_video: False wrappers: robomimic_lowdim: normalization_path: ${normalization_path} @@ -47,16 +47,16 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 81 n_critic_warmup_itr: 2 n_steps: 300 gamma: 0.999 - actor_lr: 1e-5 + actor_lr: 1e-4 actor_weight_decay: 0 actor_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-5 + min_lr: 1e-4 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: @@ -99,10 +99,10 @@ model: action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} ft_denoising_steps: ${ft_denoising_steps} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml index d46c44b..72207d6 100644 --- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml +++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml @@ -1,7 +1,7 @@ defaults: - _self_ hydra: - run: + run: dir: ${logdir} _target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent @@ -60,22 +60,22 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 200 + n_train_itr: 151 n_critic_warmup_itr: 2 n_steps: 300 gamma: 0.999 
augment: True grad_accumulate: 15 - actor_lr: 1e-5 + actor_lr: 1e-4 actor_weight_decay: 0 actor_lr_scheduler: - first_cycle_steps: 200 + first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-5 + min_lr: 1e-4 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: - first_cycle_steps: 200 + first_cycle_steps: 1000 warmup_steps: 10 min_lr: 1e-3 save_model_freq: 100 @@ -96,7 +96,7 @@ train: model: _target_: model.diffusion.diffusion_ppo.PPODiffusion # HP to tune - gamma_denoising: 0.9 + gamma_denoising: 0.99 clip_ploss_coef: 0.01 clip_ploss_coef_base: 0.001 clip_ploss_coef_rate: 3 @@ -158,10 +158,10 @@ model: embed_style: embed2 embed_norm: 0 img_cond_steps: ${img_cond_steps} - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} ft_denoising_steps: ${ft_denoising_steps} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} diff --git a/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml index 8262daa..4c550ea 100644 --- a/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 81 n_critic_warmup_itr: 5 n_steps: 300 gamma: 0.999 diff --git a/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml index fa6b4ca..f32ef8d 100644 --- a/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 81 n_critic_warmup_itr: 2 n_steps: 300 gamma: 0.999 diff --git a/cfg/robomimic/finetune/square/calql_mlp_online.yaml b/cfg/robomimic/finetune/square/calql_mlp_online.yaml index 22ebae4..de333e6 100644 --- a/cfg/robomimic/finetune/square/calql_mlp_online.yaml +++ 
b/cfg/robomimic/finetune/square/calql_mlp_online.yaml @@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent name: ${env_name}_calql_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} -base_policy_path: +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_calql_mlp_ta1/2024-10-25_22-44-12_42/checkpoint/state_999.pt robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz @@ -97,7 +97,7 @@ model: tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: diff --git a/cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml b/cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml new file mode 100644 index 0000000..3332780 --- /dev/null +++ b/cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml @@ -0,0 +1,122 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 400 + 
reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 10000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + 
action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml index c5b2e39..13dfbb4 100644 --- a/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 201 n_critic_warmup_itr: 2 n_steps: 400 gamma: 0.999 diff --git a/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml index 350bfe6..e143e5e 100644 --- a/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 201 n_critic_warmup_itr: 5 n_steps: 400 gamma: 0.999 diff --git a/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml index 87f1e5b..0c5fee8 100644 --- a/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 201 n_critic_warmup_itr: 5 n_steps: 400 gamma: 0.999 diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml index 47c539e..edbe296 100644 --- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml @@ -47,16 +47,16 @@ wandb: run: 
${now:%H-%M-%S}_${name} train: - n_train_itr: 500 + n_train_itr: 201 n_critic_warmup_itr: 2 n_steps: 400 gamma: 0.999 - actor_lr: 1e-5 + actor_lr: 1e-4 actor_weight_decay: 0 actor_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-5 + min_lr: 1e-4 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: @@ -100,10 +100,10 @@ model: action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} ft_denoising_steps: ${ft_denoising_steps} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml index 51d3e3a..84355d6 100644 --- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml +++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml @@ -1,7 +1,7 @@ defaults: - _self_ hydra: - run: + run: dir: ${logdir} _target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent @@ -60,7 +60,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 500 + n_train_itr: 301 n_critic_warmup_itr: 2 n_steps: 400 gamma: 0.999 @@ -69,13 +69,13 @@ train: actor_lr: 1e-5 actor_weight_decay: 0 actor_lr_scheduler: - first_cycle_steps: 500 + first_cycle_steps: 1000 warmup_steps: 10 min_lr: 1e-5 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: - first_cycle_steps: 500 + first_cycle_steps: 1000 warmup_steps: 10 min_lr: 1e-3 save_model_freq: 100 @@ -96,7 +96,7 @@ train: model: _target_: model.diffusion.diffusion_ppo.PPODiffusion # HP to tune - gamma_denoising: 0.9 + gamma_denoising: 0.99 clip_ploss_coef: 0.01 clip_ploss_coef_base: 0.001 clip_ploss_coef_rate: 3 @@ -158,10 +158,10 @@ model: embed_style: embed2 embed_norm: 0 img_cond_steps: ${img_cond_steps} - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 
256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} ft_denoising_steps: ${ft_denoising_steps} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml new file mode 100644 index 0000000..156154c --- /dev/null +++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml @@ -0,0 +1,112 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent + +name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_diffusion_mlp_ta1_td20/2024-09-29_02-14-14_42/checkpoint/state_8000.pt +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +ft_denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 50 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 400 + save_video: false + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 301 + n_critic_warmup_itr: 2 + n_steps: 400 + gamma: 0.999 + actor_lr: 1e-4 + actor_weight_decay: 0 + 
actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # PPO specific + reward_scale_running: True + reward_scale_const: 1.0 + gae_lambda: 0.95 + batch_size: 20000 + update_epochs: 10 + vf_coef: 0.5 + target_kl: 1 + +model: + _target_: model.diffusion.diffusion_ppo.PPODiffusion + # HP to tune + gamma_denoising: 0.99 + clip_ploss_coef: 0.01 + clip_ploss_coef_base: 0.001 + clip_ploss_coef_rate: 3 + randn_clip_value: 3 + min_sampling_denoising_std: 0.1 + min_logprob_denoising_std: 0.1 + # + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObs + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + ft_denoising_steps: ${ft_denoising_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml new file mode 100644 index 0000000..c0d8d37 --- /dev/null +++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml @@ -0,0 +1,112 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent + +name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} +logdir: 
${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +ft_denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 40 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 400 + save_video: false + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 301 + n_critic_warmup_itr: 2 + n_steps: 400 + gamma: 0.999 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # PPO specific + reward_scale_running: True + reward_scale_const: 1.0 + gae_lambda: 0.95 + batch_size: 8000 + update_epochs: 10 + vf_coef: 0.5 + target_kl: 1 + +model: + _target_: model.diffusion.diffusion_ppo.PPODiffusion + # HP to tune + gamma_denoising: 0.9 + clip_ploss_coef: 0.01 + clip_ploss_coef_base: 0.001 + clip_ploss_coef_rate: 3 + randn_clip_value: 3 + min_sampling_denoising_std: 0.1 + min_logprob_denoising_std: 0.1 + # + 
network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObs + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + ft_denoising_steps: ${ft_denoising_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml index 1ad16d7..6b17bc5 100644 --- a/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 201 n_critic_warmup_itr: 5 n_steps: 400 gamma: 0.999 diff --git a/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml index 2d34101..c27381f 100644 --- a/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 300 + n_train_itr: 201 n_critic_warmup_itr: 2 n_steps: 400 gamma: 0.999 diff --git a/cfg/robomimic/finetune/square/ibrl_mlp.yaml b/cfg/robomimic/finetune/square/ibrl_mlp.yaml index 6e34653..fba5969 100644 --- a/cfg/robomimic/finetune/square/ibrl_mlp.yaml +++ b/cfg/robomimic/finetune/square/ibrl_mlp.yaml @@ -7,7 +7,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent name: ${env_name}_ibrl_mlp_ta${horizon_steps} logdir: 
${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} -base_policy_path: +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_gaussian_mlp_ta1/2024-09-28_13-42-43_42/checkpoint/state_5000.pt robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz @@ -93,7 +93,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct mlp_dims: [1024, 1024, 1024] diff --git a/cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml b/cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml new file mode 100644 index 0000000..f65c1dd --- /dev/null +++ b/cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml @@ -0,0 +1,115 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_gaussian_mlp_ta1/2024-10-08_20-52-42_0/checkpoint/state_5000.pt +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 350 # IBRL uses 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 
'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: ibrl-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 100000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.rl.gaussian_ibrl.IBRL_Gaussian + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + dropout: 0.5 + fixed_std: 0.1 + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git 
a/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml index 61d7dec..8ab3b3b 100644 --- a/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml @@ -26,7 +26,7 @@ env: name: ${env_name} best_reward_threshold_for_success: 1 max_episode_steps: 800 - save_video: false + save_video: False wrappers: robomimic_lowdim: normalization_path: ${normalization_path} @@ -49,7 +49,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 1000 + n_train_itr: 201 n_critic_warmup_itr: 2 n_steps: 400 gamma: 0.999 @@ -58,7 +58,7 @@ train: actor_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-6 + min_lr: 1e-5 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: @@ -82,7 +82,7 @@ train: model: _target_: model.diffusion.diffusion_awr.AWRDiffusion # Sampling HPs - min_sampling_denoising_std: 0.08 + min_sampling_denoising_std: 0.1 randn_clip_value: 3 # network_path: ${base_policy_path} diff --git a/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml index ec30a80..1a99f3d 100644 --- a/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml @@ -49,7 +49,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 1000 + n_train_itr: 201 n_critic_warmup_itr: 2 n_steps: 400 gamma: 0.999 @@ -58,7 +58,7 @@ train: actor_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-6 + min_lr: 1e-5 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: @@ -82,7 +82,7 @@ train: model: _target_: model.diffusion.diffusion_dipo.DIPODiffusion # HP to tune - min_sampling_denoising_std: 0.08 + min_sampling_denoising_std: 0.1 randn_clip_value: 3 # network_path: ${base_policy_path} @@ -96,12 +96,12 @@ model: action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObsAct - 
action_dim: ${action_dim} - action_steps: ${act_steps} - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} action_dim: ${action_dim} diff --git a/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml index 825e9d6..21a760e 100644 --- a/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml @@ -26,7 +26,7 @@ env: name: ${env_name} best_reward_threshold_for_success: 1 max_episode_steps: 800 - save_video: false + save_video: False wrappers: robomimic_lowdim: normalization_path: ${normalization_path} @@ -49,8 +49,8 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 1000 - n_critic_warmup_itr: 2 + n_train_itr: 201 + n_critic_warmup_itr: 5 n_steps: 400 gamma: 0.999 actor_lr: 1e-5 @@ -58,7 +58,7 @@ train: actor_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-6 + min_lr: 1e-5 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: @@ -81,7 +81,7 @@ train: model: _target_: model.diffusion.diffusion_dql.DQLDiffusion # Sampling HPs - min_sampling_denoising_std: 0.08 + min_sampling_denoising_std: 0.1 randn_clip_value: 3 # network_path: ${base_policy_path} diff --git a/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml index db690f9..140a39f 100644 --- a/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml @@ -49,7 +49,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 1000 + n_train_itr: 201 n_critic_warmup_itr: 5 n_steps: 400 gamma: 0.999 @@ -58,7 +58,7 @@ train: actor_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-6 
+ min_lr: 1e-5 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: @@ -83,7 +83,7 @@ train: model: _target_: model.diffusion.diffusion_idql.IDQLDiffusion # Sampling HPs - min_sampling_denoising_std: 0.08 + min_sampling_denoising_std: 0.1 randn_clip_value: 3 # network_path: ${base_policy_path} diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml index f0418c9..198855b 100644 --- a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml @@ -50,16 +50,16 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 1000 + n_train_itr: 201 n_critic_warmup_itr: 2 n_steps: 400 gamma: 0.999 - actor_lr: 1e-5 + actor_lr: 1e-4 actor_weight_decay: 0 actor_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-6 + min_lr: 1e-4 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: @@ -76,7 +76,7 @@ train: reward_scale_const: 1.0 gae_lambda: 0.95 batch_size: 10000 - update_epochs: 8 + update_epochs: 5 vf_coef: 0.5 target_kl: 1 @@ -88,7 +88,7 @@ model: clip_ploss_coef_base: 0.001 clip_ploss_coef_rate: 3 randn_clip_value: 3 - min_sampling_denoising_std: 0.08 + min_sampling_denoising_std: 0.1 min_logprob_denoising_std: 0.1 # network_path: ${base_policy_path} @@ -102,10 +102,10 @@ model: action_dim: ${action_dim} critic: _target_: model.common.critic.CriticObs - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} ft_denoising_steps: ${ft_denoising_steps} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml index ad22b83..b826e06 100644 --- a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml +++ 
b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml @@ -1,7 +1,7 @@ defaults: - _self_ hydra: - run: + run: dir: ${logdir} _target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent @@ -64,7 +64,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 500 + n_train_itr: 201 n_critic_warmup_itr: 2 n_steps: 400 gamma: 0.999 @@ -73,13 +73,13 @@ train: actor_lr: 1e-5 actor_weight_decay: 0 actor_lr_scheduler: - first_cycle_steps: 500 + first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-6 + min_lr: 1e-5 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: - first_cycle_steps: 500 + first_cycle_steps: 1000 warmup_steps: 10 min_lr: 1e-3 save_model_freq: 100 @@ -93,19 +93,19 @@ train: gae_lambda: 0.95 batch_size: 500 logprob_batch_size: 1000 - update_epochs: 8 + update_epochs: 10 vf_coef: 0.5 target_kl: 1 model: _target_: model.diffusion.diffusion_ppo.PPODiffusion # HP to tune - gamma_denoising: 0.9 + gamma_denoising: 0.99 clip_ploss_coef: 0.01 clip_ploss_coef_base: 0.001 clip_ploss_coef_rate: 3 randn_clip_value: 3 - min_sampling_denoising_std: 0.08 + min_sampling_denoising_std: 0.1 min_logprob_denoising_std: 0.1 # use_ddim: ${use_ddim} @@ -164,10 +164,10 @@ model: embed_style: embed2 embed_norm: 0 img_cond_steps: ${img_cond_steps} - cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} mlp_dims: [256, 256, 256] activation_type: Mish residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} ft_denoising_steps: ${ft_denoising_steps} horizon_steps: ${horizon_steps} obs_dim: ${obs_dim} diff --git a/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml index 4072238..f116ef5 100644 --- a/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml @@ -49,7 +49,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 1000 + n_train_itr: 201 n_critic_warmup_itr: 5 n_steps: 400 gamma: 0.999 @@ 
-58,7 +58,7 @@ train: actor_lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-6 + min_lr: 1e-5 critic_lr: 1e-3 critic_weight_decay: 0 critic_lr_scheduler: @@ -81,7 +81,7 @@ train: model: _target_: model.diffusion.diffusion_qsm.QSMDiffusion # Sampling HPs - min_sampling_denoising_std: 0.08 + min_sampling_denoising_std: 0.1 randn_clip_value: 3 # network_path: ${base_policy_path} diff --git a/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml index af9e9cb..40cd186 100644 --- a/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml @@ -49,7 +49,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 1000 + n_train_itr: 201 n_critic_warmup_itr: 2 n_steps: 400 gamma: 0.999 @@ -58,7 +58,7 @@ train: lr_scheduler: first_cycle_steps: 1000 warmup_steps: 10 - min_lr: 1e-6 + min_lr: 1e-5 save_model_freq: 100 val_freq: 10 render: @@ -73,7 +73,7 @@ train: model: _target_: model.diffusion.diffusion_rwr.RWRDiffusion # Sampling HPs - min_sampling_denoising_std: 0.08 + min_sampling_denoising_std: 0.1 randn_clip_value: 3 # network_path: ${base_policy_path} diff --git a/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml b/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml index 3c610a1..0fd05ae 100644 --- a/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml +++ b/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 100 + n_train_itr: 1000 n_steps: 1 gamma: 0.99 actor_lr: 1e-4 @@ -61,8 +61,8 @@ train: first_cycle_steps: 1000 warmup_steps: 10 min_lr: 3e-4 - save_model_freq: 10 - val_freq: 10 + save_model_freq: 100 + val_freq: 20 render: freq: 1 num: 0 @@ -70,7 +70,7 @@ train: # CalQL specific train_online: False batch_size: 256 - n_random_actions: 4 + n_random_actions: 10 target_ema_rate: 0.005 scale_reward_factor: 1.0 num_update: 1000 @@ -93,7 +93,7 @@ 
model: tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: diff --git a/cfg/robomimic/pretrain/can/calql_mlp_offline_ph.yaml b/cfg/robomimic/pretrain/can/calql_mlp_offline_ph.yaml new file mode 100644 index 0000000..a70d4aa --- /dev/null +++ b/cfg/robomimic/pretrain/can/calql_mlp_offline_ph.yaml @@ -0,0 +1,118 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 
3e-4 + save_model_freq: 10 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1.yaml new file mode 100644 index 0000000..62a09e8 --- /dev/null +++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1.yaml @@ -0,0 +1,65 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent + +name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: 
${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 8000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 10000 + warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1_ph.yaml new file mode 100644 index 0000000..46593c6 --- /dev/null +++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1_ph.yaml @@ -0,0 +1,65 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent + +name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: 
${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz + +seed: 42 +device: cuda:0 +env: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 8000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 10000 + warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ta1_ph.yaml similarity index 85% rename from cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml rename to cfg/robomimic/pretrain/can/pre_gaussian_mlp_ta1_ph.yaml index c7e0d9c..1bb170d 100644 --- a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml +++ b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ta1_ph.yaml @@ -7,13 +7,13 @@ _target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent name: ${env}_pre_gaussian_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} -train_dataset_path: 
${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz seed: 42 device: cuda:0 -env: transport -obs_dim: 59 -action_dim: 14 +env: can +obs_dim: 23 +action_dim: 7 horizon_steps: 1 cond_steps: 1 @@ -26,11 +26,11 @@ train: n_epochs: 5000 batch_size: 256 learning_rate: 1e-4 - weight_decay: 0 + weight_decay: 1e-6 lr_scheduler: first_cycle_steps: 5000 warmup_steps: 100 - min_lr: 1e-4 + min_lr: 1e-5 epoch_start_ema: 20 update_ema_freq: 10 save_model_freq: 1000 @@ -45,7 +45,7 @@ model: fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - action_dim: ${action_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} diff --git a/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml b/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml index 1cf5527..cb52740 100644 --- a/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml +++ b/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml @@ -46,7 +46,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 100 + n_train_itr: 1000 n_steps: 1 gamma: 0.99 actor_lr: 1e-4 @@ -61,8 +61,8 @@ train: first_cycle_steps: 1000 warmup_steps: 10 min_lr: 3e-4 - save_model_freq: 10 - val_freq: 10 + save_model_freq: 100 + val_freq: 20 render: freq: 1 num: 0 @@ -70,7 +70,7 @@ train: # CalQL specific train_online: False batch_size: 256 - n_random_actions: 4 + n_random_actions: 10 target_ema_rate: 0.005 scale_reward_factor: 1.0 num_update: 1000 @@ -93,7 +93,7 @@ model: tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: diff --git a/cfg/robomimic/pretrain/square/calql_mlp_offline_ph.yaml b/cfg/robomimic/pretrain/square/calql_mlp_offline_ph.yaml new file mode 100644 index 0000000..5e541a4 --- /dev/null +++ b/cfg/robomimic/pretrain/square/calql_mlp_offline_ph.yaml 
@@ -0,0 +1,118 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_agent.TrainCalQLAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 400 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: calql-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 10 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 10 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + automatic_entropy_tuning: True + +model: + _target_: 
model.rl.gaussian_calql.CalQL_Gaussian + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1.yaml new file mode 100644 index 0000000..53e572e --- /dev/null +++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1.yaml @@ -0,0 +1,66 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent + +name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz + +seed: 42 +device: cuda:0 +env: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 8000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + 
first_cycle_steps: 10000 + warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 + save_model_freq: 1000 + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1_ph.yaml new file mode 100644 index 0000000..7bffecd --- /dev/null +++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1_ph.yaml @@ -0,0 +1,66 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent + +name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz + +seed: 42 +device: cuda:0 +env: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +horizon_steps: 1 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 8000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 10000 + warmup_steps: 100 + min_lr: 1e-5 + epoch_start_ema: 20 + update_ema_freq: 10 
+ save_model_freq: 1000 + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ta1_ph.yaml similarity index 84% rename from cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml rename to cfg/robomimic/pretrain/square/pre_gaussian_mlp_ta1_ph.yaml index 11d3f08..0cc2204 100644 --- a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml +++ b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ta1_ph.yaml @@ -7,12 +7,12 @@ _target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent name: ${env}_pre_gaussian_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} -train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz seed: 42 device: cuda:0 -env: lift -obs_dim: 19 +env: square +obs_dim: 23 action_dim: 7 horizon_steps: 1 cond_steps: 1 @@ -40,14 +40,15 @@ model: network: _target_: model.common.mlp_gaussian.Gaussian_MLP mlp_dims: [1024, 1024, 1024] - residual_style: False + activation_type: ReLU + dropout: 0.5 + fixed_std: 0.1 cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: 
${horizon_steps} - action_dim: ${action_dim} + action_dim: ${action_dim} horizon_steps: ${horizon_steps} device: ${device} - ema: decay: 0.995 diff --git a/cfg/robomimic/finetune/can/rlpd_mlp.yaml b/cfg/robomimic/scratch/can/rlpd_mlp.yaml similarity index 98% rename from cfg/robomimic/finetune/can/rlpd_mlp.yaml rename to cfg/robomimic/scratch/can/rlpd_mlp.yaml index 4f5a948..8b66075 100644 --- a/cfg/robomimic/finetune/can/rlpd_mlp.yaml +++ b/cfg/robomimic/scratch/can/rlpd_mlp.yaml @@ -91,7 +91,7 @@ model: tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: diff --git a/cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml b/cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml new file mode 100644 index 0000000..d574d5a --- /dev/null +++ b/cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml @@ -0,0 +1,114 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + 
max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 100000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + backup_entropy: True + n_critics: 5 + tanh_output: True + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/rlpd_mlp.yaml b/cfg/robomimic/scratch/square/rlpd_mlp.yaml similarity index 98% rename from 
cfg/robomimic/finetune/square/rlpd_mlp.yaml rename to cfg/robomimic/scratch/square/rlpd_mlp.yaml index d62a41d..46730a7 100644 --- a/cfg/robomimic/finetune/square/rlpd_mlp.yaml +++ b/cfg/robomimic/scratch/square/rlpd_mlp.yaml @@ -91,7 +91,7 @@ model: tanh_output: False # squash after sampling instead cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} horizon_steps: ${horizon_steps} - + action_dim: ${action_dim} std_max: 7.3891 std_min: 0.0067 critic: diff --git a/cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml b/cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml new file mode 100644 index 0000000..cb8a8b4 --- /dev/null +++ b/cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml @@ -0,0 +1,114 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 400 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: rlpd-${env_name} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + 
actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 100000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 1 + +model: + _target_: model.rl.gaussian_rlpd.RLPD_Gaussian + randn_clip_value: 10 + backup_entropy: True + n_critics: 5 + tanh_output: True + actor: + _target_: model.common.mlp_gaussian.Gaussian_MLP + mlp_dims: [512, 512, 512] + activation_type: ReLU + tanh_output: False # squash after sampling instead + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + std_max: 7.3891 + std_min: 0.0067 + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/env/gym_utils/__init__.py b/env/gym_utils/__init__.py index cea639c..e70870a 100644 --- a/env/gym_utils/__init__.py +++ b/env/gym_utils/__init__.py @@ -165,7 +165,9 @@ def make_async( # https://github.com/ARISE-Initiative/robosuite/blob/92abf5595eddb3a845cd1093703e5a3ccd01e77e/robosuite/environments/base.py#L247-L248 env.env.hard_reset = False else: # d3il, gym - env = make_(id, render=render, 
**kwargs) + if "kitchen" not in id: # d4rl kitchen does not support rendering! + kwargs["render"] = render + env = make_(id, **kwargs) # add wrappers if wrappers is not None: diff --git a/env/gym_utils/wrapper/furniture.py b/env/gym_utils/wrapper/furniture.py index 3c02895..3ca67e1 100644 --- a/env/gym_utils/wrapper/furniture.py +++ b/env/gym_utils/wrapper/furniture.py @@ -132,9 +132,10 @@ class FurnitureRLSimEnvMultiStepWrapper(gym.Wrapper): nobs: np.ndarray = self.process_obs(obs) truncated: np.ndarray = truncated.squeeze().cpu().numpy() - terminated: np.ndarray = np.zeros_like(truncated, dtype=bool) + # terminated: np.ndarray = np.zeros_like(truncated, dtype=bool) - return {"state": nobs}, reward, terminated, truncated, info + # since we only assign reward at the timestep where one stage is finished, and reward does not accumulate, we consider the final step of the episode as terminal + return {"state": nobs}, reward, truncated, truncated, info def _inner_step(self, action_chunk: torch.Tensor): dense_reward = torch.zeros(action_chunk.shape[0], device=action_chunk.device) diff --git a/model/common/mlp.py b/model/common/mlp.py index 3322af9..4ab137c 100644 --- a/model/common/mlp.py +++ b/model/common/mlp.py @@ -96,6 +96,7 @@ class ResidualMLP(nn.Module): out_activation_type="Identity", use_layernorm=False, use_layernorm_final=False, + dropout=0, ): super(ResidualMLP, self).__init__() hidden_dim = dim_list[1] @@ -108,6 +109,7 @@ class ResidualMLP(nn.Module): hidden_dim=hidden_dim, activation_type=activation_type, use_layernorm=use_layernorm, + dropout=dropout, ) for _ in range(1, num_hidden_layers, 2) ] @@ -129,6 +131,7 @@ class TwoLayerPreActivationResNetLinear(nn.Module): hidden_dim, activation_type="Mish", use_layernorm=False, + dropout=0, ): super().__init__() self.l1 = nn.Linear(hidden_dim, hidden_dim) @@ -137,6 +140,8 @@ class TwoLayerPreActivationResNetLinear(nn.Module): if use_layernorm: self.norm1 = nn.LayerNorm(hidden_dim, eps=1e-06) self.norm2 = 
nn.LayerNorm(hidden_dim, eps=1e-06) + if dropout > 0: + raise NotImplementedError("Dropout not implemented for residual MLP!") def forward(self, x): x_input = x diff --git a/model/common/mlp_gaussian.py b/model/common/mlp_gaussian.py index e05dbed..dbd10cf 100644 --- a/model/common/mlp_gaussian.py +++ b/model/common/mlp_gaussian.py @@ -212,6 +212,7 @@ class Gaussian_MLP(nn.Module): out_activation_type=activation_type, use_layernorm=use_layernorm, use_layernorm_final=use_layernorm, + dropout=dropout, ) self.mlp_mean = MLP( mlp_dims[-1:] + [output_dim], @@ -233,9 +234,7 @@ class Gaussian_MLP(nn.Module): if learn_fixed_std: # initialize to fixed_std self.logvar = torch.nn.Parameter( - torch.log( - torch.tensor([fixed_std**2 for _ in range(action_dim)]) - ), + torch.log(torch.tensor([fixed_std**2 for _ in range(action_dim)])), requires_grad=True, ) self.logvar_min = torch.nn.Parameter( diff --git a/model/diffusion/diffusion_ppo.py b/model/diffusion/diffusion_ppo.py index 9c13863..1c574a3 100644 --- a/model/diffusion/diffusion_ppo.py +++ b/model/diffusion/diffusion_ppo.py @@ -22,7 +22,6 @@ from model.diffusion.diffusion_vpg import VPGDiffusion class PPODiffusion(VPGDiffusion): - def __init__( self, gamma_denoising: float, @@ -58,7 +57,9 @@ class PPODiffusion(VPGDiffusion): def loss( self, obs, - chains, + chains_prev, + chains_next, + denoising_inds, returns, oldvalues, advantages, @@ -81,9 +82,11 @@ class PPODiffusion(VPGDiffusion): reward_horizon: action horizon that backpropagates gradient """ # Get new logprobs for denoising steps from T-1 to 0 - entropy is fixed fod diffusion - newlogprobs, eta = self.get_logprobs( + newlogprobs, eta = self.get_logprobs_subsample( obs, - chains, + chains_prev, + chains_next, + denoising_inds, get_ent=True, ) entropy_loss = -eta.mean() @@ -92,7 +95,7 @@ class PPODiffusion(VPGDiffusion): # only backpropagate through the earlier steps (e.g., ones actually executed in the environment) newlogprobs = newlogprobs[:, :reward_horizon, :] - 
oldlogprobs = oldlogprobs[:, :, :reward_horizon, :] + oldlogprobs = oldlogprobs[:, :reward_horizon, :] # Get the logprobs - batch over B and denoising steps newlogprobs = newlogprobs.mean(dim=(-1, -2)).view(-1) @@ -106,9 +109,7 @@ class PPODiffusion(VPGDiffusion): # Get counterfactual teacher actions samples = self.forward( - cond=obs.float() - .unsqueeze(1) - .to(self.device), # B x horizon=1 x obs_dim + cond=obs, deterministic=False, return_chain=True, use_base_policy=True, @@ -116,7 +117,7 @@ class PPODiffusion(VPGDiffusion): # Get logprobs of teacher actions under this policy bc_logprobs = self.get_logprobs( obs, - samples.chains, # n_env x denoising x horizon x act + samples.chains, get_ent=False, use_base_policy=False, ) @@ -133,14 +134,13 @@ class PPODiffusion(VPGDiffusion): advantage_max = torch.quantile(advantages, self.clip_advantage_upper_quantile) advantages = advantages.clamp(min=advantage_min, max=advantage_max) - # repeat advantages for denoising steps and horizon steps - advantages = advantages.repeat_interleave(self.ft_denoising_steps) - # denoising discount discount = torch.tensor( - [self.gamma_denoising**i for i in reversed(range(self.ft_denoising_steps))] + [ + self.gamma_denoising ** (self.ft_denoising_steps - i - 1) + for i in denoising_inds + ] ).to(self.device) - discount = discount.repeat(len(advantages) // self.ft_denoising_steps) advantages *= discount # get ratio @@ -148,9 +148,7 @@ class PPODiffusion(VPGDiffusion): ratio = logratio.exp() # exponentially interpolate between the base and the current clipping value over denoising steps and repeat - t = torch.arange(self.ft_denoising_steps).float().to(self.device) / ( - self.ft_denoising_steps - 1 - ) # 0 to 1 + t = (denoising_inds.float() / (self.ft_denoising_steps - 1)).to(self.device) if self.ft_denoising_steps > 1: clip_ploss_coef = self.clip_ploss_coef_base + ( self.clip_ploss_coef - self.clip_ploss_coef_base @@ -158,10 +156,7 @@ class PPODiffusion(VPGDiffusion): 
math.exp(self.clip_ploss_coef_rate) - 1 ) else: - clip_ploss_coef = torch.tensor([self.clip_ploss_coef]).to(self.device) - clip_ploss_coef = clip_ploss_coef.repeat( - len(advantages) // self.ft_denoising_steps - ) + clip_ploss_coef = t # get kl difference and whether value clipped with torch.no_grad(): diff --git a/model/diffusion/diffusion_vpg.py b/model/diffusion/diffusion_vpg.py index e61b716..cfa9a5a 100644 --- a/model/diffusion/diffusion_vpg.py +++ b/model/diffusion/diffusion_vpg.py @@ -395,6 +395,71 @@ class VPGDiffusion(DiffusionModel): return log_prob, eta return log_prob + def get_logprobs_subsample( + self, + cond, + chains_prev, + chains_next, + denoising_inds, + get_ent: bool = False, + use_base_policy: bool = False, + ): + """ + Calculating the logprobs of random samples of denoised chains. + + Args: + cond: dict with key state/rgb; more recent obs at the end + state: (B, To, Do) + rgb: (B, To, C, H, W) + chains: (B, K+1, Ta, Da) + get_ent: flag for returning entropy + use_base_policy: flag for using base policy + + Returns: + logprobs: (B, Ta, Da) + entropy (if get_ent=True): (B, Ta) + denoising_indices: (B, ) + """ + # Sample t for batch dim, keep it 1-dim + if self.use_ddim: + t_single = self.ddim_t[-self.ft_denoising_steps :] + else: + t_single = torch.arange( + start=self.ft_denoising_steps - 1, + end=-1, + step=-1, + device=self.device, + ) + # 4,3,2,1,0,4,3,2,1,0,...,4,3,2,1,0 + t_all = t_single[denoising_inds] + if self.use_ddim: + ddim_indices_single = torch.arange( + start=self.ddim_steps - self.ft_denoising_steps, + end=self.ddim_steps, + device=self.device, + ) # only used for DDIM + ddim_indices = ddim_indices_single[denoising_inds] + else: + ddim_indices = None + + # Forward pass with previous chains + next_mean, logvar, eta = self.p_mean_var( + chains_prev, + t_all, + cond=cond, + index=ddim_indices, + use_base_policy=use_base_policy, + ) + std = torch.exp(0.5 * logvar) + std = torch.clip(std, min=self.min_logprob_denoising_std) + dist = 
Normal(next_mean, std) + + # Get logprobs with gaussian + log_prob = dist.log_prob(chains_next) + if get_ent: + return log_prob, eta + return log_prob + def loss(self, cond, chains, reward): """ REINFORCE loss. Not used right now. diff --git a/model/rl/gaussian_calql.py b/model/rl/gaussian_calql.py index 14d87f0..0ea9ddb 100644 --- a/model/rl/gaussian_calql.py +++ b/model/rl/gaussian_calql.py @@ -63,7 +63,6 @@ class CalQL_Gaussian(GaussianModel): returns, terminated, gamma, - alpha, ): B = len(actions) @@ -71,17 +70,17 @@ class CalQL_Gaussian(GaussianModel): q_data1, q_data2 = self.critic(obs, actions) with torch.no_grad(): # repeat for action samples - next_obs["state"] = next_obs["state"].repeat_interleave( + next_obs_repeated = {"state": next_obs["state"].repeat_interleave( self.cql_n_actions, dim=0 - ) + )} # Get the next actions and logprobs next_actions, next_logprobs = self.forward( - next_obs, + next_obs_repeated, deterministic=False, get_logprob=True, ) - next_q1, next_q2 = self.target_critic(next_obs, next_actions) + next_q1, next_q2 = self.target_critic(next_obs_repeated, next_actions) next_q = torch.min(next_q1, next_q2) # Reshape the next_q to match the number of samples @@ -96,9 +95,6 @@ class CalQL_Gaussian(GaussianModel): # Get the target Q values target_q = rewards + gamma * (1 - terminated) * next_q - # Subtract the entropy bonus - target_q = target_q - alpha * next_logprobs - # TD loss td_loss_1 = nn.functional.mse_loss(q_data1, target_q) td_loss_2 = nn.functional.mse_loss(q_data2, target_q) @@ -111,6 +107,12 @@ class CalQL_Gaussian(GaussianModel): reparameterize=False, get_logprob=True, ) # no gradient + pi_next_actions, log_pi_next = self.forward( + next_obs, + deterministic=False, + reparameterize=False, + get_logprob=True, + ) # no gradient # Random action Q values n_random_actions = random_actions.shape[1] @@ -130,17 +132,26 @@ class CalQL_Gaussian(GaussianModel): # Policy action Q values q_pi_1, q_pi_2 = self.critic(obs, pi_actions) - 
q_pi_1 = q_pi_1 - log_pi - q_pi_2 = q_pi_2 - log_pi + q_pi_next_1, q_pi_next_2 = self.critic(next_obs, pi_next_actions) # Ensure calibration w.r.t. value function estimate q_pi_1 = torch.max(q_pi_1, returns)[:, None] # (B, 1) q_pi_2 = torch.max(q_pi_2, returns)[:, None] # (B, 1) - cat_q_1 = torch.cat([q_rand_1, q_pi_1], dim=-1) # (B, num_samples+1) + q_pi_next_1 = torch.max(q_pi_next_1, returns)[:, None] # (B, 1) + q_pi_next_2 = torch.max(q_pi_next_2, returns)[:, None] # (B, 1) + + # cql_importance_sample + q_pi_1 = q_pi_1 - log_pi + q_pi_2 = q_pi_2 - log_pi + q_pi_next_1 = q_pi_next_1 - log_pi_next + q_pi_next_2 = q_pi_next_2 - log_pi_next + cat_q_1 = torch.cat([q_rand_1, q_pi_1, q_pi_next_1], dim=-1) # (B, num_samples+1) cql_qf1_ood = torch.logsumexp(cat_q_1, dim=-1) # max over num_samples - cat_q_2 = torch.cat([q_rand_2, q_pi_2], dim=-1) # (B, num_samples+1) + cat_q_2 = torch.cat([q_rand_2, q_pi_2, q_pi_next_2], dim=-1) # (B, num_samples+1) cql_qf2_ood = torch.logsumexp(cat_q_2, dim=-1) # sum over num_samples + # skip cal_lagrange since the paper shows cql_target_action_gap not used in kitchen + # Subtract the log likelihood of the data cql_qf1_diff = torch.clamp( cql_qf1_ood - q_data1, diff --git a/model/rl/gaussian_ibrl.py b/model/rl/gaussian_ibrl.py index ce96232..4a87f2d 100644 --- a/model/rl/gaussian_ibrl.py +++ b/model/rl/gaussian_ibrl.py @@ -20,7 +20,7 @@ class IBRL_Gaussian(GaussianModel): critic, n_critics, soft_action_sample=False, - soft_action_sample_beta=0.1, + soft_action_sample_beta=10, **kwargs, ): super().__init__(network=actor, **kwargs) diff --git a/model/rl/gaussian_ppo.py b/model/rl/gaussian_ppo.py index a7e3be8..05c047d 100644 --- a/model/rl/gaussian_ppo.py +++ b/model/rl/gaussian_ppo.py @@ -63,6 +63,23 @@ class PPO_Gaussian(VPG_Gaussian): oldlogprobs = oldlogprobs.clamp(min=-5, max=2) entropy_loss = -entropy + bc_loss = 0.0 + if use_bc_loss: + # See Eqn. 
2 of https://arxiv.org/pdf/2403.03949.pdf + # Give a reward for maximizing probability of teacher policy's action with current policy. + # Actions are chosen along trajectory induced by current policy. + + # Get counterfactual teacher actions + samples = self.forward( + cond=obs, + deterministic=False, + use_base_policy=True, + ) + # Get logprobs of teacher actions under this policy + bc_logprobs, _, _ = self.get_logprobs(obs, samples, use_base_policy=False) + bc_logprobs = bc_logprobs.clamp(min=-5, max=2) + bc_loss = -bc_logprobs.mean() + # get ratio logratio = newlogprobs - oldlogprobs ratio = logratio.exp() @@ -99,25 +116,6 @@ class PPO_Gaussian(VPG_Gaussian): v_loss = 0.5 * v_loss_max.mean() else: v_loss = 0.5 * ((newvalues - returns) ** 2).mean() - - bc_loss = 0.0 - if use_bc_loss: - # See Eqn. 2 of https://arxiv.org/pdf/2403.03949.pdf - # Give a reward for maximizing probability of teacher policy's action with current policy. - # Actions are chosen along trajectory induced by current policy. - - # Get counterfactual teacher actions - samples = self.forward( - cond=obs.float() - .unsqueeze(1) - .to(self.device), # B x horizon=1 x obs_dim - deterministic=False, - use_base_policy=True, - ) - # Get logprobs of teacher actions under this policy - bc_logprobs, _, _ = self.get_logprobs(obs, samples, use_base_policy=False) - bc_logprobs = bc_logprobs.clamp(min=-5, max=2) - bc_loss = -bc_logprobs.mean() return ( pg_loss, entropy_loss, diff --git a/pyproject.toml b/pyproject.toml index 0191c91..b1dbffe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "dppo" -version = "0.5.0" +version = "0.6.0" description = "Fine-tuning diffusion policies with PPO." 
readme = "README.md" requires-python = ">=3.8" @@ -32,6 +32,13 @@ gym = [ "d4rl", "patchelf", ] +kitchen = [ + "cython<3", + "d4rl", + "dm_control==1.0.16", + "mujoco==3.1.6", + "patchelf", +] robomimic = [ "cython<3", "d4rl", diff --git a/script/download_url.py b/script/download_url.py index 9838088..1fa2069 100644 --- a/script/download_url.py +++ b/script/download_url.py @@ -7,6 +7,12 @@ def get_dataset_download_url(cfg): return "https://drive.google.com/drive/u/1/folders/1BJu8NklriunDHsDrLT6fEpcro3_2IPFf" elif env == "halfcheetah-medium-v2": return "https://drive.google.com/drive/u/1/folders/1Drel26tiuQ9oD3YNf1eyy0UVaf5SQj-U" + elif env == "kitchen-complete-v0": + return "https://drive.google.com/drive/u/1/folders/18aqg7KIv-YNXohTsRR7Zmg-RyDtdhkLc" + elif env == "kitchen-partial-v0": + return "https://drive.google.com/drive/u/1/folders/1zLOx1q4FbJK1ZWLui_vhM2x1fMEkBC2D" + elif env == "kitchen-mixed-v0": + return "https://drive.google.com/drive/u/1/folders/1HRMM16UC10A00oBqjYOL1E8hS5icwtvo" # D3IL elif env == "avoid" and cfg.mode == "d56_r12": # M1 return "https://drive.google.com/drive/u/1/folders/1ZAPvLQwv2y4Q98UDVKXFT4fvGF5yhD_o" @@ -14,7 +20,20 @@ def get_dataset_download_url(cfg): return "https://drive.google.com/drive/u/1/folders/1wyJi1Zbnd6JNy4WGszHBH40A0bbl-vkd" elif env == "avoid" and cfg.mode == "d58_r12": # M3 return "https://drive.google.com/drive/u/1/folders/1mNXCIPnCO_FDBlEj95InA9eWJM2XcEEj" - # Robomimic + # Robomimic-PH + elif ( + env == "can" + and "ph" in cfg.train_dataset_path + and "img" not in cfg.train_dataset_path + ): + return "https://drive.google.com/drive/folders/1rpVsdpqWPygL89E-t4SLQmZgwQ3mpNnY?usp=drive_link" + elif ( + env == "square" + and "ph" in cfg.train_dataset_path + and "img" not in cfg.train_dataset_path + ): + return "https://drive.google.com/drive/folders/1wqqjT9JZ9LX11l2Sz_vGxfcT3BfcNrGk?usp=drive_link" + # Robomimic-MH elif env == "lift" and "img" not in cfg.train_dataset_path: # state return 
"https://drive.google.com/drive/u/1/folders/1lbXgMKBTAiFdJqPZqWXpwjEyrVW16MBu" elif env == "lift" and "img" in cfg.train_dataset_path: # img @@ -58,6 +77,12 @@ def get_normalization_download_url(cfg): return "https://drive.google.com/file/d/1NSX7t3DFKaBj5HNpv91Oo5h6oXTk0zoo/view?usp=drive_link" elif env == "halfcheetah-medium-v2": return "https://drive.google.com/file/d/1LlwCMfy1b5e8jSx99CV3lWhcrQWrI2Jm/view?usp=drive_link" + elif env == "kitchen-complete-v0": + return "https://drive.google.com/file/d/1tBATWLoP1E5s08vr5fiUZBzn8EEsjEZh/view?usp=drive_link" + elif env == "kitchen-partial-v0": + return "https://drive.google.com/file/d/1Ptt0cwQwmb5_HGNM-zggRaDKfkqqNO5e/view?usp=drive_link" + elif env == "kitchen-mixed-v0": + return "https://drive.google.com/file/d/11gj846QTYFPeV14nhcL5Z9OA5RHIGVt1/view?usp=drive_link" # D3IL elif env == "avoiding-m5" and cfg.mode == "d56_r12": # M1 return "https://drive.google.com/file/d/1PubKaPabbiSdWYpGmouDhYfXp4QwNHFG/view?usp=drive_link" @@ -65,7 +90,20 @@ def get_normalization_download_url(cfg): return "https://drive.google.com/file/d/1Hoohw8buhsLzXoqivMA6IzKS5Izlj07_/view?usp=drive_link" elif env == "avoiding-m5" and cfg.mode == "d58_r12": # M3 return "https://drive.google.com/file/d/1qt7apV52C9Tflsc-A55J6uDMHzaFa1wN/view?usp=drive_link" - # Robomimic + # Robomimic-PH + elif ( + env == "can" + and "ph" in cfg.normalization_path + and "img" not in cfg.normalization_path + ): + return "https://drive.google.com/file/d/1y04FAEXgK6UlZuDiQzTumS9lz-Ufn47B/view?usp=drive_link" + elif ( + env == "square" + and "ph" in cfg.normalization_path + and "img" not in cfg.normalization_path + ): + return "https://drive.google.com/file/d/1_75UM0frCZVtcROgfWsdJ0FstToZd1b5/view?usp=drive_link" + # Robomimic-MH elif env == "lift" and "img" not in cfg.normalization_path: # state return "https://drive.google.com/file/d/1d3WjwRds-7I5bBFpZuY27OT9ycb8r_QM/view?usp=drive_link" elif env == "lift" and "img" in cfg.normalization_path: # img @@ -120,6 +158,71 
@@ def get_checkpoint_download_url(cfg): in path ): return "https://drive.google.com/file/d/1o9ryyeZQAsaB4ffUTCJkIaGCi0frL3G4/view?usp=drive_link" + # Demo-RL + elif ( + "halfcheetah-medium-v2_pre_diffusion_mlp_ta1_td20/2024-09-29_02-13-10_42/checkpoint/state_1000.pt" + in path + ): + return "https://drive.google.com/file/d/1Oi5JhsU45ScHdlrtn5AX8Ji7InLBVj4D/view?usp=drive_link" + elif ( + "halfcheetah-medium-v2_pre_gaussian_mlp_ta1/2024-09-28_18-48-54_42/checkpoint/state_500.pt" + in path + ): + return "https://drive.google.com/file/d/14rbYGaCxvj1PtELKVfdXNHJ1Od2G6FLw/view?usp=drive_link" + elif ( + "halfcheetah-medium-v2_calql_mlp_ta1/2024-09-29_22-59-08_42/checkpoint/state_49.pt" + in path + ): + return "https://drive.google.com/file/d/1Xf758xzsAqpFwV955OVUNL6Za90XPo1K/view?usp=drive_link" + + elif ( + "kitchen-complete-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-47-42_42/checkpoint/state_8000.pt" + in path + ): + return "https://drive.google.com/file/d/1YBwyNd30a4_inu2sZzNSNLJQsj8fN3ZX/view?usp=drive_link" + elif ( + "kitchen-complete-v0_calql_mlp_ta1/2024-10-26_01-01-33_42/checkpoint/state_999.pt" + in path + ): + return "https://drive.google.com/file/d/1K4V59iXNQbpOvu3u5y6C9R5piMU9idYm/view?usp=drive_link" + elif ( + "kitchen-complete-v0_pre_gaussian_mlp_ta1/2024-10-25_14-48-43_42/checkpoint/state_5000.pt" + in path + ): + return "https://drive.google.com/file/d/1tQYgnkdhR5wnuXC4Ha_mKHuIdg6J627s/view?usp=drive_link" + + elif ( + "kitchen-partial-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-29_42/checkpoint/state_8000.pt" + in path + ): + return "https://drive.google.com/file/d/1oSupKkUjCFQVWBIJV5Seh-CclWhgpopS/view?usp=drive_link" + elif ( + "kitchen-partial-v0_calql_mlp_ta1/2024-10-25_21-26-51_42/checkpoint/state_980.pt" + in path + ): + return "https://drive.google.com/file/d/17HUDp3l8mJsMIW-DRraKPhUkH44KGTbA/view?usp=drive_link" + elif ( + "kitchen-partial-v0_pre_gaussian_mlp_ta1/2024-10-25_01-45-52_42/checkpoint/state_5000.pt" + in path + ): + return 
"https://drive.google.com/file/d/1-ZmGRPi4jMS7HfqHPvWrSPxNSoTwih6q/view?usp=drive_link" + + elif ( + "kitchen-mixed-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-28_42/checkpoint/state_8000.pt" + in path + ): + return "https://drive.google.com/file/d/1X24Hqbn4b4xyLK_1A3D6zhSgsN7frVCG/view?usp=drive_link" + elif ( + "kitchen-mixed-v0_calql_mlp_ta1/2024-10-25_21-36-13_42/checkpoint/state_999.pt" + in path + ): + return "https://drive.google.com/file/d/1AP7bbzAwwfuSLmV1HkQLfmd76MXQn2Za/view?usp=drive_link" + elif ( + "kitchen-mixed-v0_pre_gaussian_mlp_ta1/2024-10-25_01-39-44_42/checkpoint/state_5000.pt" + in path + ): + return "https://drive.google.com/file/d/1LEzGhMOqL3YZFXMGn1mTcOh-tm4Lh1SH/view?usp=drive_link" + ###################################### #### D3IL ###################################### @@ -246,6 +349,32 @@ def get_checkpoint_download_url(cfg): in path ): return "https://drive.google.com/file/d/1xSgwGG40zdoO2DDSM79l0rMHeNmaifnq/view?usp=drive_link" + # demo-PH + elif ( + "can_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt" + in path + ): + return "https://drive.google.com/file/d/1Ze86hw2E0jJinn3Vx683JQ10Gq5FIJad/view?usp=drive_link" + elif ( + "can_pre_gaussian_mlp_ta1/2024-10-08_20-52-04_0/checkpoint/state_5000.pt" + in path + ): + return "https://drive.google.com/file/d/1jP3mEOhZojWiTXCMZ0ajFRMkAAmonGxV/view?usp=drive_link" + elif "can_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt" in path: + return "https://drive.google.com/file/d/1ERaZKTXmL-vdyU8PZ2X9GjFIMVKJjA2N/view?usp=drive_link" + # demo-MH + elif ( + "can_pre_diffusion_mlp_ta1_td20/2024-09-29_15-43-07_42/checkpoint/state_8000.pt" + in path + ): + return "https://drive.google.com/file/d/1pEs1cK1x5obAtJA9pFSN1CWG79gNhH24/view?usp=drive_link" + elif ( + "can_pre_gaussian_mlp_ta1/2024-09-28_13-43-59_42/checkpoint/state_5000.pt" + in path + ): + return "https://drive.google.com/file/d/1Fa3yflkvYSAy6PKT646U1VAqUJ0YHqsj/view?usp=drive_link" + elif 
"can_calql_mlp_ta1/2024-10-25_22-30-16_42/checkpoint/state_999.pt" in path: + return "https://drive.google.com/file/d/1AA94uEaK_SzG2mTpaKqZIwNMh6omL_g0/view?usp=drive_link" ###################################### #### Robomimic-Square ###################################### @@ -286,6 +415,32 @@ def get_checkpoint_download_url(cfg): in path ): return "https://drive.google.com/file/d/1LczXhgeNtQfqySsfGNbbviPrlLwyh-E3/view?usp=drive_link" + # demo-PH + elif ( + "square_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt" + in path + ): + return "https://drive.google.com/file/d/1_Jnz14ySxqbtZa9IIEWkXqy5_-EwJLBw/view?usp=drive_link" + elif ( + "square_pre_gaussian_mlp_ta1/2024-10-08_20-52-42_0/checkpoint/state_5000.pt" + in path + ): + return "https://drive.google.com/file/d/1ZPWKUoZ93OqqVX3ephQMkpeBZoYrceM5/view?usp=drive_link" + elif "square_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt" in path: + return "https://drive.google.com/file/d/1_7YtUwRd_U5tuOvhHogJDhkEsE-4D24V/view?usp=drive_link" + # demo-MH + elif ( + "square_pre_diffusion_mlp_ta1_td20/2024-09-29_02-14-14_42/checkpoint/state_8000.pt" + in path + ): + return "https://drive.google.com/file/d/1ks1PnUBvFVWPnpGnYL8_eIfLNeGZbv1p/view?usp=drive_link" + elif ( + "square_pre_gaussian_mlp_ta1/2024-09-28_13-42-43_42/checkpoint/state_5000.pt" + in path + ): + return "https://drive.google.com/file/d/1uIOn8QUkGRbhZLkQ9ziOkP7yGQnpYdk7/view?usp=drive_link" + elif "square_calql_mlp_ta1/2024-10-25_22-44-12_42/checkpoint/state_999.pt" in path: + return "https://drive.google.com/file/d/1zgzG6bx6ugAEaq72z9WpXX6iewClcKTV/view?usp=drive_link" ###################################### #### Robomimic-Transport ######################################