From dc8e0c9edce7ac2b2ff112abe460e1c21b0b3bdc Mon Sep 17 00:00:00 2001
From: "Allen Z. Ren" <allen.ren@princeton.edu>
Date: Wed, 30 Oct 2024 19:58:06 -0400
Subject: [PATCH] v0.6 (#18)

* Sampling over both env and denoising steps in DPPO updates (#13)

* sample one from each chain

* full random sampling

* Add Proficient Human (PH) Configs and Pipeline (#16)

* fix missing cfg

* add ph config

* fix how terminated flags are added to buffer in ibrl

* add ph config

* offline calql for 1M gradient updates

* bug fix: number of calql online gradient steps is the number of new transitions collected

* add sample config for DPPO with ta=1

* Sampling over both env and denoising steps in DPPO updates (#13)

* sample one from each chain

* full random sampling

* fix diffusion loss when predicting initial noise

* fix dppo inds

* fix typo

* remove print statement

---------

Co-authored-by: Justin M. Lidard <jlidard@neuronic.cs.princeton.edu>
Co-authored-by: allenzren <allen.ren@princeton.edu>

* update robomimic configs

* better calql formulation

* optimize calql and ibrl training

* optimize data transfer in ppo agents

* add kitchen configs

* re-organize config folders, rerun calql and rlpd

* add scratch gym locomotion configs

* add kitchen installation dependencies

* use truncated for termination in furniture env

* update furniture and gym configs

* update README and dependencies with kitchen

* add url for new data and checkpoints

* update demo RL configs

* update batch sizes for furniture unet configs

* raise error about dropout in residual mlp

* fix observation bug in bc loss

---------

Co-authored-by: Justin Lidard <60638575+jlidard@users.noreply.github.com>
Co-authored-by: Justin M. Lidard <jlidard@neuronic.cs.princeton.edu>
---
 README.md                                     |   8 +-
 agent/finetune/train_calql_agent.py           |  16 +-
 agent/finetune/train_ibrl_agent.py            |  14 +-
 agent/finetune/train_ppo_diffusion_agent.py   |  52 +++---
 .../finetune/train_ppo_diffusion_img_agent.py |  36 ++--
 .../train_ppo_exact_diffusion_agent.py        |  13 +-
 agent/finetune/train_ppo_gaussian_agent.py    |  17 +-
 .../finetune/train_ppo_gaussian_img_agent.py  |  15 +-
 .../lamp_low/ft_ppo_diffusion_unet.yaml       |   2 +-
 .../lamp_med/ft_ppo_diffusion_unet.yaml       |   2 +-
 .../one_leg_low/ft_ppo_diffusion_mlp.yaml     |   2 +-
 .../one_leg_low/ft_ppo_diffusion_unet.yaml    |   2 +-
 .../one_leg_med/ft_ppo_diffusion_unet.yaml    |   2 +-
 .../ft_ppo_diffusion_unet.yaml                |   2 +-
 .../ft_ppo_diffusion_unet.yaml                |   2 +-
 .../eval/kitchen-v0/eval_diffusion_mlp.yaml   |  61 +++++++
 .../halfcheetah-v2/calql_mlp_online.yaml      |   4 +-
 .../halfcheetah-v2/ft_ppo_diffusion_mlp.yaml  |   2 +-
 .../ft_ppo_diffusion_mlp_ta1.yaml             | 108 ++++++++++++
 cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml |   4 +-
 .../hopper-v2/ft_ppo_diffusion_mlp.yaml       |   2 +-
 cfg/gym/finetune/hopper-v2/sac_mlp.yaml       |  89 ----------
 .../kitchen-complete-v0/calql_mlp_online.yaml | 116 +++++++++++++
 .../ft_ppo_diffusion_mlp.yaml                 | 108 ++++++++++++
 .../kitchen-complete-v0/ibrl_mlp.yaml         | 109 ++++++++++++
 .../calql_mlp_online.yaml                     |  25 ++-
 .../ft_ppo_diffusion_mlp.yaml                 | 108 ++++++++++++
 .../ibrl_mlp.yaml                             |  41 ++---
 .../kitchen-partial-v0/calql_mlp_online.yaml  | 116 +++++++++++++
 .../ft_ppo_diffusion_mlp.yaml                 | 108 ++++++++++++
 .../finetune/kitchen-partial-v0/ibrl_mlp.yaml | 109 ++++++++++++
 .../walker2d-v2/ft_ppo_diffusion_mlp.yaml     |   2 +-
 cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml | 103 ------------
 .../calql_mlp_offline.yaml                    |   2 +-
 .../calql_mlp_offline.yaml                    | 113 +++++++++++++
 .../pre_diffusion_mlp.yaml                    |  66 ++++++++
 .../kitchen-complete-v0/pre_gaussian_mlp.yaml |  60 +++++++
 .../calql_mlp_offline.yaml                    |  30 ++--
 .../kitchen-mixed-v0/pre_diffusion_mlp.yaml   |  66 ++++++++
 .../kitchen-mixed-v0/pre_gaussian_mlp.yaml    |  59 +++++++
 .../kitchen-partial-v0/calql_mlp_offline.yaml | 113 +++++++++++++
 .../kitchen-partial-v0/pre_diffusion_mlp.yaml |  66 ++++++++
 .../kitchen-partial-v0/pre_gaussian_mlp.yaml  |  59 +++++++
 .../halfcheetah-v2/ppo_diffusion_mlp.yaml     |   6 +-
 .../halfcheetah-v2/ppo_gaussian_mlp.yaml      |   2 +-
 .../halfcheetah-v2/rlpd_mlp.yaml              |   2 +-
 .../halfcheetah-v2/sac_mlp.yaml               |   2 +-
 .../scratch/hopper-v2/awr_diffusion_mlp.yaml  |  99 +++++++++++
 .../scratch/hopper-v2/dipo_diffusion_mlp.yaml | 101 +++++++++++
 .../scratch/hopper-v2/dql_diffusion_mlp.yaml  | 100 +++++++++++
 .../scratch/hopper-v2/idql_diffusion_mlp.yaml | 108 ++++++++++++
 .../hopper-v2/ppo_diffusion_mlp.yaml          |  12 +-
 .../hopper-v2/ppo_gaussian_mlp.yaml           |   2 +-
 .../scratch/hopper-v2/qsm_diffusion_mlp.yaml  | 100 +++++++++++
 .../scratch/hopper-v2/rwr_diffusion_mlp.yaml  |  84 +++++++++
 .../scratch/kitchen-complete-v0/rlpd_mlp.yaml | 109 ++++++++++++
 .../kitchen-mixed-v0}/rlpd_mlp.yaml           |  34 ++--
 .../scratch/kitchen-partial-v0/rlpd_mlp.yaml  | 109 ++++++++++++
 .../walker2d-v2/ppo_diffusion_mlp.yaml        |   6 +-
 .../walker2d-v2/ppo_gaussian_mlp.yaml         |   2 +-
 .../finetune/can/calql_mlp_online.yaml        |   4 +-
 .../finetune/can/calql_mlp_online_ph.yaml     | 122 ++++++++++++++
 .../finetune/can/ft_awr_diffusion_mlp.yaml    |   4 +-
 .../finetune/can/ft_dql_diffusion_mlp.yaml    |   2 +-
 .../finetune/can/ft_idql_diffusion_mlp.yaml   |   2 +-
 .../finetune/can/ft_ppo_diffusion_mlp.yaml    |   6 +-
 .../can/ft_ppo_diffusion_mlp_img.yaml         |  16 +-
 .../can/ft_ppo_diffusion_mlp_ta1.yaml         | 111 ++++++++++++
 .../can/ft_ppo_diffusion_mlp_ta1_ph.yaml      | 111 ++++++++++++
 .../finetune/can/ft_qsm_diffusion_mlp.yaml    |   2 +-
 .../finetune/can/ft_rwr_diffusion_mlp.yaml    |   4 +-
 cfg/robomimic/finetune/can/ibrl_mlp.yaml      |   4 +-
 cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml   | 115 +++++++++++++
 .../finetune/lift/ft_awr_diffusion_mlp.yaml   |   2 +-
 .../finetune/lift/ft_dql_diffusion_mlp.yaml   |   2 +-
 .../finetune/lift/ft_idql_diffusion_mlp.yaml  |   2 +-
 .../finetune/lift/ft_ppo_diffusion_mlp.yaml   |  10 +-
 .../lift/ft_ppo_diffusion_mlp_img.yaml        |  16 +-
 .../finetune/lift/ft_qsm_diffusion_mlp.yaml   |   2 +-
 .../finetune/lift/ft_rwr_diffusion_mlp.yaml   |   2 +-
 .../finetune/square/calql_mlp_online.yaml     |   4 +-
 .../finetune/square/calql_mlp_online_ph.yaml  | 122 ++++++++++++++
 .../finetune/square/ft_awr_diffusion_mlp.yaml |   2 +-
 .../finetune/square/ft_dql_diffusion_mlp.yaml |   2 +-
 .../square/ft_idql_diffusion_mlp.yaml         |   2 +-
 .../finetune/square/ft_ppo_diffusion_mlp.yaml |   8 +-
 .../square/ft_ppo_diffusion_mlp_img.yaml      |  12 +-
 .../square/ft_ppo_diffusion_mlp_ta1.yaml      | 112 ++++++++++++
 .../square/ft_ppo_diffusion_mlp_ta1_ph.yaml   | 112 ++++++++++++
 .../finetune/square/ft_qsm_diffusion_mlp.yaml |   2 +-
 .../finetune/square/ft_rwr_diffusion_mlp.yaml |   2 +-
 cfg/robomimic/finetune/square/ibrl_mlp.yaml   |   4 +-
 .../finetune/square/ibrl_mlp_ph.yaml          | 115 +++++++++++++
 .../transport/ft_awr_diffusion_mlp.yaml       |   8 +-
 .../transport/ft_dipo_diffusion_mlp.yaml      |  12 +-
 .../transport/ft_dql_diffusion_mlp.yaml       |  10 +-
 .../transport/ft_idql_diffusion_mlp.yaml      |   6 +-
 .../transport/ft_ppo_diffusion_mlp.yaml       |  12 +-
 .../transport/ft_ppo_diffusion_mlp_img.yaml   |  18 +-
 .../transport/ft_qsm_diffusion_mlp.yaml       |   6 +-
 .../transport/ft_rwr_diffusion_mlp.yaml       |   6 +-
 .../pretrain/can/calql_mlp_offline.yaml       |  10 +-
 .../pretrain/can/calql_mlp_offline_ph.yaml    | 118 +++++++++++++
 .../pretrain/can/pre_diffusion_mlp_ta1.yaml   |  65 +++++++
 .../can/pre_diffusion_mlp_ta1_ph.yaml         |  65 +++++++
 .../pre_gaussian_mlp_ta1_ph.yaml}             |  14 +-
 .../pretrain/square/calql_mlp_offline.yaml    |  10 +-
 .../pretrain/square/calql_mlp_offline_ph.yaml | 118 +++++++++++++
 .../square/pre_diffusion_mlp_ta1.yaml         |  66 ++++++++
 .../square/pre_diffusion_mlp_ta1_ph.yaml      |  66 ++++++++
 .../pre_gaussian_mlp_ta1_ph.yaml}             |  13 +-
 .../{finetune => scratch}/can/rlpd_mlp.yaml   |   2 +-
 cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml    | 114 +++++++++++++
 .../square/rlpd_mlp.yaml                      |   2 +-
 cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml | 114 +++++++++++++
 env/gym_utils/__init__.py                     |   4 +-
 env/gym_utils/wrapper/furniture.py            |   5 +-
 model/common/mlp.py                           |   5 +
 model/common/mlp_gaussian.py                  |   5 +-
 model/diffusion/diffusion_ppo.py              |  37 ++--
 model/diffusion/diffusion_vpg.py              |  65 +++++++
 model/rl/gaussian_calql.py                    |  35 ++--
 model/rl/gaussian_ibrl.py                     |   2 +-
 model/rl/gaussian_ppo.py                      |  36 ++--
 pyproject.toml                                |   9 +-
 script/download_url.py                        | 159 +++++++++++++++++-
 126 files changed, 4614 insertions(+), 553 deletions(-)
 create mode 100644 cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml
 create mode 100644 cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml
 delete mode 100644 cfg/gym/finetune/hopper-v2/sac_mlp.yaml
 create mode 100644 cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml
 create mode 100644 cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml
 create mode 100644 cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml
 rename cfg/gym/finetune/{hopper-v2 => kitchen-mixed-v0}/calql_mlp_online.yaml (86%)
 create mode 100644 cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml
 rename cfg/gym/finetune/{hopper-v2 => kitchen-mixed-v0}/ibrl_mlp.yaml (78%)
 create mode 100644 cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml
 create mode 100644 cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml
 create mode 100644 cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml
 delete mode 100644 cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml
 create mode 100644 cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml
 create mode 100644 cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml
 create mode 100644 cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml
 rename cfg/gym/pretrain/{hopper-medium-v2 => kitchen-mixed-v0}/calql_mlp_offline.yaml (85%)
 create mode 100644 cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml
 create mode 100644 cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml
 create mode 100644 cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml
 create mode 100644 cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml
 create mode 100644 cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml
 rename cfg/gym/{finetune => scratch}/halfcheetah-v2/ppo_diffusion_mlp.yaml (96%)
 rename cfg/gym/{finetune => scratch}/halfcheetah-v2/ppo_gaussian_mlp.yaml (97%)
 rename cfg/gym/{finetune => scratch}/halfcheetah-v2/rlpd_mlp.yaml (98%)
 rename cfg/gym/{finetune => scratch}/halfcheetah-v2/sac_mlp.yaml (98%)
 create mode 100644 cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml
 create mode 100644 cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml
 create mode 100644 cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml
 create mode 100644 cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml
 rename cfg/gym/{finetune => scratch}/hopper-v2/ppo_diffusion_mlp.yaml (95%)
 rename cfg/gym/{finetune => scratch}/hopper-v2/ppo_gaussian_mlp.yaml (97%)
 create mode 100644 cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml
 create mode 100644 cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml
 create mode 100644 cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml
 rename cfg/gym/{finetune/hopper-v2 => scratch/kitchen-mixed-v0}/rlpd_mlp.yaml (83%)
 create mode 100644 cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml
 rename cfg/gym/{finetune => scratch}/walker2d-v2/ppo_diffusion_mlp.yaml (96%)
 rename cfg/gym/{finetune => scratch}/walker2d-v2/ppo_gaussian_mlp.yaml (97%)
 create mode 100644 cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml
 create mode 100644 cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml
 create mode 100644 cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml
 create mode 100644 cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml
 create mode 100644 cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml
 create mode 100644 cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml
 create mode 100644 cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml
 create mode 100644 cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml
 create mode 100644 cfg/robomimic/pretrain/can/calql_mlp_offline_ph.yaml
 create mode 100644 cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1.yaml
 create mode 100644 cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1_ph.yaml
 rename cfg/robomimic/pretrain/{transport/pre_gaussian_mlp_ibrl.yaml => can/pre_gaussian_mlp_ta1_ph.yaml} (85%)
 create mode 100644 cfg/robomimic/pretrain/square/calql_mlp_offline_ph.yaml
 create mode 100644 cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1.yaml
 create mode 100644 cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1_ph.yaml
 rename cfg/robomimic/pretrain/{lift/pre_gaussian_mlp_ibrl.yaml => square/pre_gaussian_mlp_ta1_ph.yaml} (84%)
 rename cfg/robomimic/{finetune => scratch}/can/rlpd_mlp.yaml (98%)
 create mode 100644 cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml
 rename cfg/robomimic/{finetune => scratch}/square/rlpd_mlp.yaml (98%)
 create mode 100644 cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml

diff --git a/README.md b/README.md
index f4f3428..2d64563 100644
--- a/README.md
+++ b/README.md
@@ -31,12 +31,11 @@ conda activate dppo
 pip install -e .
 ```
 
-3. Install specific environment dependencies (Gym / Robomimic / D3IL / Furniture-Bench) or all dependencies
+3. Install specific environment dependencies (Gym / Kitchen / Robomimic / D3IL / Furniture-Bench) or all dependencies (except for Kitchen, which has dependency conflicts with other tasks).
 ```console
-pip install -e .[gym] # or [robomimic], [d3il], [furniture]
-pip install -e .[all]
+pip install -e .[gym] # or [kitchen], [robomimic], [d3il], [furniture]
+pip install -e .[all] # except for Kitchen
 ```
-<!-- **Note**: Please do not set macros for robomimic and robosuite that the warnings suggest --- we will use some different global variables than the ones defined in macro.py  -->
 
 4. [Install MuJoCo for Gym and/or Robomimic](installation/install_mujoco.md). [Install D3IL](installation/install_d3il.md). [Install IsaacGym and Furniture-Bench](installation/install_furniture.md)
 
@@ -161,6 +160,7 @@ Our diffusion implementation is mostly based on [Diffuser](https://github.com/ja
 * `model.min_sampling_denoising_std`: <img src="https://latex.codecogs.com/gif.latex?\epsilon^\text{exp}_\text{min} "/>, minimum amount of noise when sampling at a denoising step
 * `model.min_logprob_denoising_std`: <img src="https://latex.codecogs.com/gif.latex?\epsilon^\text{prob}_\text{min} "/>, minimum standard deviation when evaluating likelihood at a denoising step
 * `model.clip_ploss_coef`: PPO clipping ratio
+* `train.batch_size`: the batch size may look rather large --- this is because the PPO update is taken in expectation over both environment steps and denoising steps (new in v0.6).
 
 ### DDIM fine-tuning
 
diff --git a/agent/finetune/train_calql_agent.py b/agent/finetune/train_calql_agent.py
index cd96d0b..c71f488 100644
--- a/agent/finetune/train_calql_agent.py
+++ b/agent/finetune/train_calql_agent.py
@@ -82,8 +82,6 @@ class TrainCalQLAgent(TrainAgent):
         if self.train_online:
             # number of episode to colect per epoch for training
             self.n_episode_per_epoch = cfg.train.n_episode_per_epoch
-            # UTD ratio
-            self.online_utd_ratio = cfg.train.online_utd_ratio
 
         # Eval episodes
         self.n_eval_episode = cfg.train.n_eval_episode
@@ -204,9 +202,13 @@ class TrainCalQLAgent(TrainAgent):
                     action_venv = samples[:, : self.act_steps]
 
                 # Apply multi-step action
-                obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
-                    self.venv.step(action_venv)
-                )
+                (
+                    obs_venv,
+                    reward_venv,
+                    terminated_venv,
+                    truncated_venv,
+                    info_venv,
+                ) = self.venv.step(action_venv)
                 done_venv = terminated_venv | truncated_venv
                 reward_trajs[step] = reward_venv
                 firsts_trajs[step + 1] = done_venv
@@ -308,7 +310,8 @@ class TrainCalQLAgent(TrainAgent):
 
                 # override num_update
                 if self.train_online:
-                    num_update = len(reward_trajs)  # assume one env!
+                    # number of new transitions collected (single env)
+                    num_update = len(reward_trajs_split[0])
                 else:
                     num_update = self.num_update
                 for _ in range(num_update):
@@ -413,7 +416,6 @@ class TrainCalQLAgent(TrainAgent):
                         reward_to_go_b,
                         terminated_b,
                         self.gamma,
-                        alpha,
                     )
                     self.critic_optimizer.zero_grad()
                     loss_critic.backward()
diff --git a/agent/finetune/train_ibrl_agent.py b/agent/finetune/train_ibrl_agent.py
index 0f9a06d..6de7d77 100644
--- a/agent/finetune/train_ibrl_agent.py
+++ b/agent/finetune/train_ibrl_agent.py
@@ -145,7 +145,6 @@ class TrainIBRLAgent(TrainAgent):
             # Collect a set of trajectories from env
             cnt_episode = 0
             for step in range(n_steps):
-
                 # Select action
                 with torch.no_grad():
                     cond = {
@@ -164,9 +163,13 @@ class TrainIBRLAgent(TrainAgent):
                 action_venv = samples[:, : self.act_steps]
 
                 # Apply multi-step action
-                obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
-                    self.venv.step(action_venv)
-                )
+                (
+                    obs_venv,
+                    reward_venv,
+                    terminated_venv,
+                    truncated_venv,
+                    info_venv,
+                ) = self.venv.step(action_venv)
                 done_venv = terminated_venv | truncated_venv
                 reward_trajs[step] = reward_venv
                 firsts_trajs[step + 1] = done_venv
@@ -177,14 +180,13 @@ class TrainIBRLAgent(TrainAgent):
                         obs_buffer.append(prev_obs_venv["state"][i])
                         if "final_obs" in info_venv[i]:  # truncated
                             next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
-                            terminated_venv[i] = False
                         else:  # first obs in new episode
                             next_obs_buffer.append(obs_venv["state"][i])
                         action_buffer.append(action_venv[i])
                     reward_buffer.extend(
                         (reward_venv * self.scale_reward_factor).tolist()
                     )
-                    terminated_buffer.append(terminated_venv.tolist())
+                    terminated_buffer.extend(terminated_venv.tolist())
 
                 # update for next step
                 prev_obs_venv = obs_venv
diff --git a/agent/finetune/train_ppo_diffusion_agent.py b/agent/finetune/train_ppo_diffusion_agent.py
index ee073b3..998c638 100644
--- a/agent/finetune/train_ppo_diffusion_agent.py
+++ b/agent/finetune/train_ppo_diffusion_agent.py
@@ -19,7 +19,6 @@ from util.scheduler import CosineAnnealingWarmupRestarts
 
 
 class TrainPPODiffusionAgent(TrainPPOAgent):
-
     def __init__(self, cfg):
         super().__init__(cfg)
 
@@ -46,7 +45,6 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
             )
 
     def run(self):
-
         # Start training loop
         timer = Timer()
         run_results = []
@@ -54,7 +52,6 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
         last_itr_eval = False
         done_venv = np.zeros((1, self.n_envs))
         while self.itr < self.n_train_itr:
-
             # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
             options_venv = [{} for _ in range(self.n_envs)]
             if self.itr % self.render_freq == 0 and self.render_video:
@@ -126,9 +123,13 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
                 action_venv = output_venv[:, : self.act_steps]
 
                 # Apply multi-step action
-                obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
-                    self.venv.step(action_venv)
-                )
+                (
+                    obs_venv,
+                    reward_venv,
+                    terminated_venv,
+                    truncated_venv,
+                    info_venv,
+                ) = self.venv.step(action_venv)
                 done_venv = terminated_venv | truncated_venv
                 if self.save_full_observations:  # state-only
                     obs_full_venv = np.array(
@@ -285,40 +286,45 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
                     )
                 }
                 chains_k = einops.rearrange(
-                    torch.tensor(chains_trajs).float().to(self.device),
+                    torch.tensor(chains_trajs, device=self.device).float(),
                     "s e t h d -> (s e) t h d",
                 )
                 returns_k = (
-                    torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
                 )
                 values_k = (
-                    torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(values_trajs, device=self.device).float().reshape(-1)
                 )
                 advantages_k = (
-                    torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(advantages_trajs, device=self.device)
+                    .float()
+                    .reshape(-1)
                 )
-                logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+                logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
 
                 # Update policy and critic
-                total_steps = self.n_steps * self.n_envs
-                inds_k = np.arange(total_steps)
+                total_steps = self.n_steps * self.n_envs * self.model.ft_denoising_steps
                 clipfracs = []
                 for update_epoch in range(self.update_epochs):
-
                     # for each epoch, go through all data in batches
                     flag_break = False
-                    np.random.shuffle(inds_k)
+                    inds_k = torch.randperm(total_steps, device=self.device)
                     num_batch = max(1, total_steps // self.batch_size)  # skip last ones
                     for batch in range(num_batch):
                         start = batch * self.batch_size
                         end = start + self.batch_size
                         inds_b = inds_k[start:end]  # b for batch
-                        obs_b = {"state": obs_k["state"][inds_b]}
-                        chains_b = chains_k[inds_b]
-                        returns_b = returns_k[inds_b]
-                        values_b = values_k[inds_b]
-                        advantages_b = advantages_k[inds_b]
-                        logprobs_b = logprobs_k[inds_b]
+                        batch_inds_b, denoising_inds_b = torch.unravel_index(
+                            inds_b,
+                            (self.n_steps * self.n_envs, self.model.ft_denoising_steps),
+                        )
+                        obs_b = {"state": obs_k["state"][batch_inds_b]}
+                        chains_prev_b = chains_k[batch_inds_b, denoising_inds_b]
+                        chains_next_b = chains_k[batch_inds_b, denoising_inds_b + 1]
+                        returns_b = returns_k[batch_inds_b]
+                        values_b = values_k[batch_inds_b]
+                        advantages_b = advantages_k[batch_inds_b]
+                        logprobs_b = logprobs_k[batch_inds_b, denoising_inds_b]
 
                         # get loss
                         (
@@ -332,7 +338,9 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
                             eta,
                         ) = self.model.loss(
                             obs_b,
-                            chains_b,
+                            chains_prev_b,
+                            chains_next_b,
+                            denoising_inds_b,
                             returns_b,
                             values_b,
                             advantages_b,
diff --git a/agent/finetune/train_ppo_diffusion_img_agent.py b/agent/finetune/train_ppo_diffusion_img_agent.py
index 9d47b0d..9eb6696 100644
--- a/agent/finetune/train_ppo_diffusion_img_agent.py
+++ b/agent/finetune/train_ppo_diffusion_img_agent.py
@@ -283,40 +283,44 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
                     for k in obs_trajs
                 }
                 chains_k = einops.rearrange(
-                    torch.tensor(chains_trajs).float().to(self.device),
+                    torch.tensor(chains_trajs, device=self.device).float(),
                     "s e t h d -> (s e) t h d",
                 )
                 returns_k = (
-                    torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
                 )
                 values_k = (
-                    torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(values_trajs, device=self.device).float().reshape(-1)
                 )
                 advantages_k = (
-                    torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
                 )
-                logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+                logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
 
                 # Update policy and critic
-                total_steps = self.n_steps * self.n_envs
-                inds_k = np.arange(total_steps)
+                total_steps = self.n_steps * self.n_envs * self.model.ft_denoising_steps
                 clipfracs = []
                 for update_epoch in range(self.update_epochs):
 
                     # for each epoch, go through all data in batches
                     flag_break = False
-                    np.random.shuffle(inds_k)
+                    inds_k = torch.randperm(total_steps, device=self.device)
                     num_batch = max(1, total_steps // self.batch_size)  # skip last ones
                     for batch in range(num_batch):
                         start = batch * self.batch_size
                         end = start + self.batch_size
                         inds_b = inds_k[start:end]  # b for batch
-                        obs_b = {k: obs_k[k][inds_b] for k in obs_k}
-                        chains_b = chains_k[inds_b]
-                        returns_b = returns_k[inds_b]
-                        values_b = values_k[inds_b]
-                        advantages_b = advantages_k[inds_b]
-                        logprobs_b = logprobs_k[inds_b]
+                        batch_inds_b, denoising_inds_b = torch.unravel_index(
+                            inds_b,
+                            (self.n_steps * self.n_envs, self.model.ft_denoising_steps),
+                        )
+                        obs_b = {k: obs_k[k][batch_inds_b] for k in obs_k}
+                        chains_prev_b = chains_k[batch_inds_b, denoising_inds_b]
+                        chains_next_b = chains_k[batch_inds_b, denoising_inds_b + 1]
+                        returns_b = returns_k[batch_inds_b]
+                        values_b = values_k[batch_inds_b]
+                        advantages_b = advantages_k[batch_inds_b]
+                        logprobs_b = logprobs_k[batch_inds_b, denoising_inds_b]
 
                         # get loss
                         (
@@ -330,7 +334,9 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
                             eta,
                         ) = self.model.loss(
                             obs_b,
-                            chains_b,
+                            chains_prev_b,
+                            chains_next_b,
+                            denoising_inds_b,
                             returns_b,
                             values_b,
                             advantages_b,
diff --git a/agent/finetune/train_ppo_exact_diffusion_agent.py b/agent/finetune/train_ppo_exact_diffusion_agent.py
index 920b03f..6fa425f 100644
--- a/agent/finetune/train_ppo_exact_diffusion_agent.py
+++ b/agent/finetune/train_ppo_exact_diffusion_agent.py
@@ -249,29 +249,28 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
                     )
                 }
                 samples_k = einops.rearrange(
-                    torch.tensor(samples_trajs).float().to(self.device),
+                    torch.tensor(samples_trajs, device=self.device).float(),
                     "s e h d -> (s e) h d",
                 )
                 returns_k = (
-                    torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
                 )
                 values_k = (
-                    torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(values_trajs, device=self.device).float().reshape(-1)
                 )
                 advantages_k = (
-                    torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
                 )
-                logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+                logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
 
                 # Update policy and critic
                 total_steps = self.n_steps * self.n_envs
-                inds_k = np.arange(total_steps)
                 clipfracs = []
                 for update_epoch in range(self.update_epochs):
 
                     # for each epoch, go through all data in batches
                     flag_break = False
-                    np.random.shuffle(inds_k)
+                    inds_k = torch.randperm(total_steps, device=self.device)
                     num_batch = max(1, total_steps // self.batch_size)  # skip last ones
                     for batch in range(num_batch):
                         start = batch * self.batch_size
diff --git a/agent/finetune/train_ppo_gaussian_agent.py b/agent/finetune/train_ppo_gaussian_agent.py
index 2ad38bd..be1d754 100644
--- a/agent/finetune/train_ppo_gaussian_agent.py
+++ b/agent/finetune/train_ppo_gaussian_agent.py
@@ -210,7 +210,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
                         )
                         reward_trajs = reward_trajs_transpose.T
 
-                    # bootstrap value with GAE if not done - apply reward scaling with constant if specified
+                    # bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
                     obs_venv_ts = {
                         "state": torch.from_numpy(obs_venv["state"])
                         .float()
@@ -250,31 +250,28 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
                     )
                 }
                 samples_k = einops.rearrange(
-                    torch.tensor(samples_trajs).float().to(self.device),
+                    torch.tensor(samples_trajs, device=self.device).float(),
                     "s e h d -> (s e) h d",
                 )
                 returns_k = (
-                    torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
                 )
                 values_k = (
-                    torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(values_trajs, device=self.device).float().reshape(-1)
                 )
                 advantages_k = (
-                    torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
-                )
-                logprobs_k = (
-                    torch.tensor(logprobs_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
                 )
+                logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
 
                 # Update policy and critic
                 total_steps = self.n_steps * self.n_envs
-                inds_k = np.arange(total_steps)
                 clipfracs = []
                 for update_epoch in range(self.update_epochs):
 
                     # for each epoch, go through all data in batches
                     flag_break = False
-                    np.random.shuffle(inds_k)
+                    inds_k = torch.randperm(total_steps, device=self.device)
                     num_batch = max(1, total_steps // self.batch_size)  # skip last ones
                     for batch in range(num_batch):
                         start = batch * self.batch_size
diff --git a/agent/finetune/train_ppo_gaussian_img_agent.py b/agent/finetune/train_ppo_gaussian_img_agent.py
index de1dbb9..3c404ed 100644
--- a/agent/finetune/train_ppo_gaussian_img_agent.py
+++ b/agent/finetune/train_ppo_gaussian_img_agent.py
@@ -231,7 +231,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
                         )
                         reward_trajs = reward_trajs_transpose.T
 
-                    # bootstrap value with GAE if not done - apply reward scaling with constant if specified
+                    # bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
                     obs_venv_ts = {
                         key: torch.from_numpy(obs_venv[key]).float().to(self.device)
                         for key in self.obs_dims
@@ -271,29 +271,28 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
                     for k in obs_trajs
                 }
                 samples_k = einops.rearrange(
-                    torch.tensor(samples_trajs).float().to(self.device),
+                    torch.tensor(samples_trajs, device=self.device).float(),
                     "s e h d -> (s e) h d",
                 )
                 returns_k = (
-                    torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
                 )
                 values_k = (
-                    torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(values_trajs, device=self.device).float().reshape(-1)
                 )
                 advantages_k = (
-                    torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+                    torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
                 )
-                logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+                logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
 
                 # Update policy and critic
                 total_steps = self.n_steps * self.n_envs
-                inds_k = np.arange(total_steps)
                 clipfracs = []
                 for update_epoch in range(self.update_epochs):
 
                     # for each epoch, go through all data in batches
                     flag_break = False
-                    np.random.shuffle(inds_k)
+                    inds_k = torch.randperm(total_steps, device=self.device)
                     num_batch = max(1, total_steps // self.batch_size)  # skip last ones
                     for batch in range(num_batch):
                         start = batch * self.batch_size
diff --git a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml
index 69011ec..bb58881 100644
--- a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 8800
+  batch_size: 40000
   update_epochs: 5
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml
index ce73c44..183c2c9 100644
--- a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 8800
+  batch_size: 40000
   update_epochs: 5
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml
index 44cd23a..92be054 100644
--- a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml
@@ -68,7 +68,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 8800
+  batch_size: 17600
   update_epochs: 5
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml
index 54082c5..79d7a6e 100644
--- a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 8800
+  batch_size: 30000
   update_epochs: 5
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml
index 9484c40..86d1406 100644
--- a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 8800
+  batch_size: 30000
   update_epochs: 5
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml
index 519d59f..52753d7 100644
--- a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 8800
+  batch_size: 40000
   update_epochs: 5
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml
index 659fd30..a5f3f96 100644
--- a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 8800
+  batch_size: 40000
   update_epochs: 5
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml b/cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml
new file mode 100644
index 0000000..f74e0fc
--- /dev/null
+++ b/cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml
@@ -0,0 +1,61 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+n_steps: 70
+render_num: 0
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [256, 256, 256]
+    cond_mlp_dims: [128, 32]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml b/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml
index cef9f0f..311d4a1 100644
--- a/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml
+++ b/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml
@@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
 
 name: ${env_name}_calql_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_calql_mlp_ta1/2024-09-29_22-59-08_42/checkpoint/state_49.pt
 normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
 offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
 
@@ -92,7 +92,7 @@ model:
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic:
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml
index fbcea35..8e395ff 100644
--- a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml
@@ -68,7 +68,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 5000
+  batch_size: 50000
   update_epochs: 5
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml
new file mode 100644
index 0000000..cba7754
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml
@@ -0,0 +1,108 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta1_td20/2024-09-29_02-13-10_42/checkpoint/state_1000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 20
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3  # success rate not relevant for gym tasks
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 501
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.01
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml
index adfec91..7ab10bd 100644
--- a/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml
+++ b/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml
@@ -8,7 +8,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
 name: ${env_name}_ibrl_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_gaussian_mlp_ta1/2024-09-28_18-48-54_42/checkpoint/state_500.pt
 offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
 
 seed: 42
@@ -87,7 +87,7 @@ model:
     fixed_std: 0.1
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
   critic:
     _target_: model.common.critic.CriticObsAct
     mlp_dims: [256, 256, 256]
diff --git a/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml
index d4b9597..5cea98a 100644
--- a/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml
@@ -68,7 +68,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 5000
+  batch_size: 50000
   update_epochs: 5
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/gym/finetune/hopper-v2/sac_mlp.yaml b/cfg/gym/finetune/hopper-v2/sac_mlp.yaml
deleted file mode 100644
index 6d44909..0000000
--- a/cfg/gym/finetune/hopper-v2/sac_mlp.yaml
+++ /dev/null
@@ -1,89 +0,0 @@
-defaults:
-  - _self_
-hydra:
-  run:
-    dir: ${logdir}
-_target_: agent.finetune.train_sac_agent.TrainSACAgent
-
-name: ${env_name}_sac_mlp_ta${horizon_steps}
-logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
-offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
-
-seed: 42
-device: cuda:0
-env_name: hopper-medium-v2
-obs_dim: 11
-action_dim: 3
-cond_steps: 1
-horizon_steps: 1
-act_steps: 1
-
-env:
-  n_envs: 1
-  name: ${env_name}
-  max_episode_steps: 1000
-  reset_at_iteration: False
-  save_video: False
-  best_reward_threshold_for_success: 3
-  wrappers:
-    mujoco_locomotion_lowdim:
-      normalization_path: ${normalization_path}
-    multi_step:
-      n_obs_steps: ${cond_steps}
-      n_action_steps: ${act_steps}
-      max_episode_steps: ${env.max_episode_steps}
-      reset_within_step: True
-
-wandb:
-  entity: ${oc.env:DPPO_WANDB_ENTITY}
-  project: sac-gym-${env_name}
-  run: ${now:%H-%M-%S}_${name}
-
-train:
-  n_train_itr: 1000000
-  n_steps: 1
-  gamma: 0.99
-  actor_lr: 3e-4
-  critic_lr: 1e-3
-  save_model_freq: 100000
-  val_freq: 10000
-  render:
-    freq: 1
-    num: 0
-  log_freq: 200
-  # SAC specific
-  batch_size: 256
-  target_ema_rate: 0.005
-  scale_reward_factor: 1
-  critic_replay_ratio: 256
-  actor_replay_ratio: 128
-  buffer_size: 1000000
-  n_eval_episode: 10
-  n_explore_steps: 5000
-  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
-  init_temperature: 1
-
-model:
-  _target_: model.rl.gaussian_sac.SAC_Gaussian
-  randn_clip_value: 10
-  tanh_output: True # squash after sampling
-  actor:
-    _target_: model.common.mlp_gaussian.Gaussian_MLP
-    mlp_dims: [256, 256]
-    activation_type: ReLU
-    tanh_output: False  # squash after sampling instead
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
-    horizon_steps: ${horizon_steps}
-    
-    std_max: 7.3891
-    std_min: 0.0067
-  critic: # no layernorm
-    _target_: model.common.critic.CriticObsAct
-    mlp_dims: [256, 256]
-    activation_type: ReLU
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
-    action_dim: ${action_dim}
-    action_steps: ${act_steps}
-  horizon_steps: ${horizon_steps}
-  device: ${device}
diff --git a/cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml b/cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml
new file mode 100644
index 0000000..1d2eb3a
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml
@@ -0,0 +1,116 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_calql_mlp_ta1/2024-10-26_01-01-33_42/checkpoint/state_999.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: calql-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 10000
+  n_steps: 1  # not used
+  n_episode_per_epoch: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 3e-4
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 3e-4
+  save_model_freq: 100
+  val_freq: 20
+  render:
+    freq: 1
+    num: 0
+  log_freq: 1
+  # CalQL specific
+  train_online: True
+  batch_size: 256
+  n_random_actions: 10
+  target_ema_rate: 0.005
+  scale_reward_factor: 1.0
+  num_update: 1000
+  buffer_size: 1000000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+  automatic_entropy_tuning: True
+
+model:
+  _target_: model.rl.gaussian_calql.CalQL_Gaussian
+  randn_clip_value: 3
+  cql_min_q_weight: 5.0
+  tanh_output: True
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  discount_factor: ${train.gamma}
+  get_mc_return: True
\ No newline at end of file
diff --git a/cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..c73997a
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-47-42_42/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 301
+  n_critic_warmup_itr: 0
+  n_steps: 70
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 5600
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.01
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [256, 256, 256]
+    cond_mlp_dims: [128, 32]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  ft_denoising_steps: ${ft_denoising_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml b/cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml
new file mode 100644
index 0000000..d0c62ee
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
+
+name: ${env_name}_ibrl_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_pre_gaussian_mlp_ta1/2024-10-25_14-48-43_42/checkpoint/state_5000.pt
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: ibrl-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000000
+  n_steps: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50000
+  val_freq: 5000
+  render:
+    freq: 1
+    num: 0
+  log_freq: 200
+  # IBRL specific
+  batch_size: 256
+  target_ema_rate: 0.01
+  scale_reward_factor: 1
+  critic_num_update: 5
+  buffer_size: 500000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  update_freq: 2
+
+model:
+  _target_: model.rl.gaussian_ibrl.IBRL_Gaussian
+  randn_clip_value: 3
+  n_critics: 5
+  soft_action_sample: True
+  soft_action_sample_beta: 10
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024]
+    activation_type: ReLU
+    dropout: 0.5
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [1024, 1024, 1024]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: False # use ensemble
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  max_n_episodes: 50
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml b/cfg/gym/finetune/kitchen-mixed-v0/calql_mlp_online.yaml
similarity index 86%
rename from cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml
rename to cfg/gym/finetune/kitchen-mixed-v0/calql_mlp_online.yaml
index 10204ba..cf8da13 100644
--- a/cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml
+++ b/cfg/gym/finetune/kitchen-mixed-v0/calql_mlp_online.yaml
@@ -7,15 +7,15 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
 
 name: ${env_name}_calql_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_calql_mlp_ta1/2024-10-25_21-36-13_42/checkpoint/state_999.pt
 normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
 offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
 
 seed: 42
 device: cuda:0
-env_name: hopper-medium-v2
-obs_dim: 11
-action_dim: 3
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
 cond_steps: 1
 horizon_steps: 1
 act_steps: 1
@@ -23,10 +23,10 @@ act_steps: 1
 env:
   n_envs: 1
   name: ${env_name}
-  max_episode_steps: 1000
+  max_episode_steps: 280
   reset_at_iteration: False
   save_video: False
-  best_reward_threshold_for_success: 3
+  best_reward_threshold_for_success: 4
   wrappers:
     mujoco_locomotion_lowdim:
       normalization_path: ${normalization_path}
@@ -59,7 +59,7 @@ train:
     warmup_steps: 10
     min_lr: 3e-4
   save_model_freq: 100
-  val_freq: 10
+  val_freq: 20
   render:
     freq: 1
     num: 0
@@ -67,13 +67,12 @@ train:
   # CalQL specific
   train_online: True
   batch_size: 256
-  n_random_actions: 4
+  n_random_actions: 10
   target_ema_rate: 0.005
   scale_reward_factor: 1.0
   num_update: 1000
   buffer_size: 1000000
-  online_utd_ratio: 1
-  n_eval_episode: 10
+  n_eval_episode: 40
   n_explore_steps: 0
   target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
   init_temperature: 1
@@ -87,17 +86,17 @@ model:
   network_path: ${base_policy_path}
   actor:
     _target_: model.common.mlp_gaussian.Gaussian_MLP
-    mlp_dims: [256, 256]
+    mlp_dims: [256, 256, 256]
     activation_type: ReLU
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic:
     _target_: model.common.critic.CriticObsAct
-    mlp_dims: [256, 256]
+    mlp_dims: [256, 256, 256]
     activation_type: ReLU
     use_layernorm: True
     double_q: True
diff --git a/cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..f90294b
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-28_42/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 301
+  n_critic_warmup_itr: 0
+  n_steps: 70
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 5600
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.01
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  # end of HP to tune
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [256, 256, 256]
+    cond_mlp_dims: [128, 32]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  ft_denoising_steps: ${ft_denoising_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml b/cfg/gym/finetune/kitchen-mixed-v0/ibrl_mlp.yaml
similarity index 78%
rename from cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml
rename to cfg/gym/finetune/kitchen-mixed-v0/ibrl_mlp.yaml
index 1737a1e..d98c3bb 100644
--- a/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml
+++ b/cfg/gym/finetune/kitchen-mixed-v0/ibrl_mlp.yaml
@@ -8,14 +8,14 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
 name: ${env_name}_ibrl_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_pre_gaussian_mlp_ta1/2024-10-25_01-39-44_42/checkpoint/state_5000.pt
 offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
 
 seed: 42
 device: cuda:0
-env_name: hopper-medium-v2
-obs_dim: 11
-action_dim: 3
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
 cond_steps: 1
 horizon_steps: 1
 act_steps: 1
@@ -23,10 +23,10 @@ act_steps: 1
 env:
   n_envs: 1
   name: ${env_name}
-  max_episode_steps: 1000
+  max_episode_steps: 280
   reset_at_iteration: False
   save_video: False
-  best_reward_threshold_for_success: 3
+  best_reward_threshold_for_success: 4
   wrappers:
     mujoco_locomotion_lowdim:
       normalization_path: ${normalization_path}
@@ -42,7 +42,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 250000
+  n_train_itr: 1000000
   n_steps: 1
   gamma: 0.99
   actor_lr: 1e-4
@@ -51,25 +51,25 @@ train:
     first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 1e-4
-  critic_lr: 1e-4
+  critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-4
+    min_lr: 1e-3
   save_model_freq: 50000
-  val_freq: 2000
+  val_freq: 5000
   render:
     freq: 1
     num: 0
   log_freq: 200
   # IBRL specific
   batch_size: 256
-  target_ema_rate: 0.01        
+  target_ema_rate: 0.01
   scale_reward_factor: 1
   critic_num_update: 5
-  buffer_size: 1000000
-  n_eval_episode: 10
+  buffer_size: 500000
+  n_eval_episode: 40
   n_explore_steps: 0
   update_freq: 2
 
@@ -78,19 +78,19 @@ model:
   randn_clip_value: 3
   n_critics: 5
   soft_action_sample: True
-  soft_action_sample_beta: 0.1
-  network_path: ${base_policy_path}
+  soft_action_sample_beta: 10
   actor:
     _target_: model.common.mlp_gaussian.Gaussian_MLP
-    mlp_dims: [256, 256, 256]
-    activation_type: Mish
+    mlp_dims: [1024, 1024, 1024]
+    activation_type: ReLU
+    dropout: 0.5
     fixed_std: 0.1
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
   critic:
     _target_: model.common.critic.CriticObsAct
-    mlp_dims: [256, 256, 256]
+    mlp_dims: [1024, 1024, 1024]
     activation_type: ReLU
     use_layernorm: True
     double_q: False # use ensemble
@@ -105,4 +105,5 @@ offline_dataset:
   dataset_path: ${offline_dataset_path}
   horizon_steps: ${horizon_steps}
   cond_steps: ${cond_steps}
-  device: ${device}
\ No newline at end of file
+  device: ${device}
+  max_n_episodes: 50
\ No newline at end of file
diff --git a/cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml b/cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml
new file mode 100644
index 0000000..160bf19
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml
@@ -0,0 +1,116 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_calql_mlp_ta1/2024-10-25_21-26-51_42/checkpoint/state_980.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: calql-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 10000
+  n_steps: 1  # not used
+  n_episode_per_epoch: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 3e-4
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 3e-4
+  save_model_freq: 100
+  val_freq: 20
+  render:
+    freq: 1
+    num: 0
+  log_freq: 1
+  # CalQL specific
+  train_online: True
+  batch_size: 256
+  n_random_actions: 10
+  target_ema_rate: 0.005
+  scale_reward_factor: 1.0
+  num_update: 1000
+  buffer_size: 1000000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+  automatic_entropy_tuning: True
+
+model:
+  _target_: model.rl.gaussian_calql.CalQL_Gaussian
+  randn_clip_value: 3
+  cql_min_q_weight: 5.0
+  tanh_output: True
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  discount_factor: ${train.gamma}
+  get_mc_return: True
\ No newline at end of file
diff --git a/cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..946d86b
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-29_42/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 301
+  n_critic_warmup_itr: 0
+  n_steps: 70
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 5600
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.01
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  # end of HP to tune
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [256, 256, 256]
+    cond_mlp_dims: [128, 32]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  ft_denoising_steps: ${ft_denoising_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml b/cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml
new file mode 100644
index 0000000..3d15f16
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
+
+name: ${env_name}_ibrl_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_pre_gaussian_mlp_ta1/2024-10-25_01-45-52_42/checkpoint/state_5000.pt
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: ibrl-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000000
+  n_steps: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50000
+  val_freq: 5000
+  render:
+    freq: 1
+    num: 0
+  log_freq: 200
+  # IBRL specific
+  batch_size: 256
+  target_ema_rate: 0.01
+  scale_reward_factor: 1
+  critic_num_update: 5
+  buffer_size: 500000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  update_freq: 2
+
+model:
+  _target_: model.rl.gaussian_ibrl.IBRL_Gaussian
+  randn_clip_value: 3
+  n_critics: 5
+  soft_action_sample: True
+  soft_action_sample_beta: 10
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024]
+    activation_type: ReLU
+    dropout: 0.5
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [1024, 1024, 1024]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: False # use ensemble
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  max_n_episodes: 50
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml
index 9158042..de70428 100644
--- a/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml
@@ -68,7 +68,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 5000
+  batch_size: 50000
   update_epochs: 5
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml
deleted file mode 100644
index 42dcdf5..0000000
--- a/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml
+++ /dev/null
@@ -1,103 +0,0 @@
-defaults:
-  - _self_
-hydra:
-  run:
-    dir: ${logdir}
-_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
-
-name: ${env_name}_rlpd_mlp_ta${horizon_steps}_td${denoising_steps}
-logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
-offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
-
-seed: 42
-device: cuda:0
-env_name: walker2d-medium-v2
-obs_dim: 17
-action_dim: 6
-denoising_steps: 20
-cond_steps: 1
-horizon_steps: 1
-act_steps: 1
-
-env:
-  n_envs: 40
-  name: ${env_name}
-  max_episode_steps: 1000
-  reset_at_iteration: False
-  save_video: False
-  best_reward_threshold_for_success: 3
-  wrappers:
-    mujoco_locomotion_lowdim:
-      normalization_path: ${normalization_path}
-    multi_step: 
-      n_obs_steps: ${cond_steps}
-      n_action_steps: ${act_steps}
-      max_episode_steps: ${env.max_episode_steps}
-      reset_within_step: True
-
-wandb:
-  entity: ${oc.env:DPPO_WANDB_ENTITY}
-  project: rlpd-gym-${env_name}-finetune
-  run: ${now:%H-%M-%S}_${name}
-
-train:
-  n_train_itr: 1000
-  n_critic_warmup_itr: 5
-  n_steps: 2000
-  gamma: 0.99
-  actor_lr: 1e-4
-  actor_weight_decay: 0
-  actor_lr_scheduler:
-    first_cycle_steps: 1000
-    warmup_steps: 10
-    min_lr: 1e-4
-  critic_lr: 1e-3
-  critic_weight_decay: 0
-  critic_lr_scheduler:
-    first_cycle_steps: 1000
-    warmup_steps: 10
-    min_lr: 1e-3
-  save_model_freq: 100
-  val_freq: 10
-  render:
-    freq: 1
-    num: 0
-  # RLPD specific
-  batch_size: 512
-  entropy_temperature: 1.0    # alpha in RLPD paper
-  target_ema_rate: 0.005         # rho in RLPD paper
-  scale_reward_factor: 1.0    # multiply reward by this amount for more stable value estimation
-  replay_ratio: 64          # number of batches to sample for each learning update
-  buffer_size: 1000000
-
-model:
-  _target_: model.rl.gaussian_rlpd.RLPD_Gaussian
-  randn_clip_value: 3
-  actor:
-    _target_: model.common.mlp_gaussian.Gaussian_MLP
-    mlp_dims: [512, 512, 512]
-    activation_type: ReLU
-    residual_style: True
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
-    horizon_steps: ${horizon_steps}
-    action_dim: ${action_dim}
-  critic:
-    _target_: model.common.critic.CriticObsAct
-    action_dim: ${action_dim}
-    action_steps: ${act_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
-    mlp_dims: [256, 256, 256]
-    activation_type: Mish
-    residual_style: True
-    use_layernorm: True
-  horizon_steps: ${horizon_steps}
-  device: ${device}
-  n_critics: 2            # Ensemble size for critic models
-
-offline_dataset:
-  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
-  dataset_path: ${offline_dataset_path}
-  horizon_steps: ${horizon_steps}
-  cond_steps: ${cond_steps}
-  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml b/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml
index 7dfd7ed..e73a4d5 100644
--- a/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml
+++ b/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml
@@ -88,7 +88,7 @@ model:
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic:
diff --git a/cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml b/cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml
new file mode 100644
index 0000000..8a2f462
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml
@@ -0,0 +1,113 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: calql-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_steps: 1  # not used
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 20
+  render:
+    freq: 1
+    num: 0
+  log_freq: 1
+  # CalQL specific
+  train_online: False
+  batch_size: 256
+  n_random_actions: 10
+  target_ema_rate: 0.005
+  scale_reward_factor: 1.0
+  num_update: 1000
+  buffer_size: 1000000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+  automatic_entropy_tuning: True
+
+model:
+  _target_: model.rl.gaussian_calql.CalQL_Gaussian
+  randn_clip_value: 3
+  cql_min_q_weight: 5.0
+  tanh_output: True
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  discount_factor: ${train.gamma}
+  get_mc_return: True
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..092fa00
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml
@@ -0,0 +1,66 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 128
+  learning_rate: 1e-3
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 8000
+    warmup_steps: 1
+    min_lr: 1e-4
+  epoch_start_ema: 10
+  update_ema_freq: 5
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [256, 256, 256]
+    cond_mlp_dims: [128, 32]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..9426b6f
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml
@@ -0,0 +1,60 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+horizon_steps: 1
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 0
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-4
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024]
+    activation_type: ReLU
+    dropout: 0.5
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml b/cfg/gym/pretrain/kitchen-mixed-v0/calql_mlp_offline.yaml
similarity index 85%
rename from cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml
rename to cfg/gym/pretrain/kitchen-mixed-v0/calql_mlp_offline.yaml
index 24f8957..4233314 100644
--- a/cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml
+++ b/cfg/gym/pretrain/kitchen-mixed-v0/calql_mlp_offline.yaml
@@ -6,15 +6,15 @@ hydra:
 _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
 
 name: ${env_name}_calql_mlp_ta${horizon_steps}
-logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
 offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
 
 seed: 42
 device: cuda:0
-env_name: hopper-medium-v2
-obs_dim: 11
-action_dim: 3
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
 cond_steps: 1
 horizon_steps: 1
 act_steps: 1
@@ -22,10 +22,10 @@ act_steps: 1
 env:
   n_envs: 1
   name: ${env_name}
-  max_episode_steps: 1000
+  max_episode_steps: 280
   reset_at_iteration: False
   save_video: False
-  best_reward_threshold_for_success: 3
+  best_reward_threshold_for_success: 4
   wrappers:
     mujoco_locomotion_lowdim:
       normalization_path: ${normalization_path}
@@ -41,7 +41,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 100
+  n_train_itr: 1000
   n_steps: 1  # not used
   gamma: 0.99
   actor_lr: 1e-4
@@ -50,14 +50,14 @@ train:
     first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 1e-4
-  critic_lr: 3e-4
+  critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 3e-4
+    min_lr: 1e-3
   save_model_freq: 10
-  val_freq: 10
+  val_freq: 20
   render:
     freq: 1
     num: 0
@@ -65,12 +65,12 @@ train:
   # CalQL specific
   train_online: False
   batch_size: 256
-  n_random_actions: 4
+  n_random_actions: 10
   target_ema_rate: 0.005
   scale_reward_factor: 1.0
   num_update: 1000
   buffer_size: 1000000
-  n_eval_episode: 10
+  n_eval_episode: 40
   n_explore_steps: 0
   target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
   init_temperature: 1
@@ -83,17 +83,17 @@ model:
   tanh_output: True
   actor:
     _target_: model.common.mlp_gaussian.Gaussian_MLP
-    mlp_dims: [256, 256]
+    mlp_dims: [256, 256, 256]
     activation_type: ReLU
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic:
     _target_: model.common.critic.CriticObsAct
-    mlp_dims: [256, 256]
+    mlp_dims: [256, 256, 256]
     activation_type: ReLU
     use_layernorm: True
     double_q: True
diff --git a/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..becf244
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml
@@ -0,0 +1,66 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-3
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 8000
+    warmup_steps: 1
+    min_lr: 1e-4
+  epoch_start_ema: 10
+  update_ema_freq: 5
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [256, 256, 256]
+    cond_mlp_dims: [128, 32]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..86a6e90
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml
@@ -0,0 +1,59 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 128
+  learning_rate: 1e-3
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 1
+    min_lr: 1e-4
+  epoch_start_ema: 10
+  update_ema_freq: 5
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml b/cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml
new file mode 100644
index 0000000..f99edfd
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml
@@ -0,0 +1,113 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: calql-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_steps: 1  # not used
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 10
+  val_freq: 20
+  render:
+    freq: 1
+    num: 0
+  log_freq: 1
+  # CalQL specific
+  train_online: False
+  batch_size: 256
+  n_random_actions: 10
+  target_ema_rate: 0.005
+  scale_reward_factor: 1.0
+  num_update: 1000
+  buffer_size: 1000000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+  automatic_entropy_tuning: True
+
+model:
+  _target_: model.rl.gaussian_calql.CalQL_Gaussian
+  randn_clip_value: 3
+  cql_min_q_weight: 5.0
+  tanh_output: True
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  discount_factor: ${train.gamma}
+  get_mc_return: True
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..c854707
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml
@@ -0,0 +1,66 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 128
+  learning_rate: 1e-3
+  weight_decay: 1e-5
+  lr_scheduler:
+    first_cycle_steps: 8000
+    warmup_steps: 1
+    min_lr: 1e-4
+  epoch_start_ema: 10
+  update_ema_freq: 5
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [256, 256, 256]
+    cond_mlp_dims: [128, 32]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..02413a5
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml
@@ -0,0 +1,59 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 128
+  learning_rate: 1e-3
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 1
+    min_lr: 1e-4
+  epoch_start_ema: 10
+  update_ema_freq: 5
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml
similarity index 96%
rename from cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml
rename to cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml
index 9be391c..49f11ed 100644
--- a/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml
@@ -14,8 +14,8 @@ device: cuda:0
 env_name: halfcheetah-medium-v2
 obs_dim: 17
 action_dim: 6
-denoising_steps: 20
-ft_denoising_steps: 20
+denoising_steps: 10
+ft_denoising_steps: 10
 cond_steps: 1
 horizon_steps: 1
 act_steps: 1
@@ -67,7 +67,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 1000
+  batch_size: 10000
   update_epochs: 10
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml
similarity index 97%
rename from cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml
rename to cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml
index f09c664..b0c1241 100644
--- a/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml
@@ -53,7 +53,7 @@ train:
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 1e-3
   save_model_freq: 100
diff --git a/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/rlpd_mlp.yaml
similarity index 98%
rename from cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml
rename to cfg/gym/scratch/halfcheetah-v2/rlpd_mlp.yaml
index 898cf9b..24379c6 100644
--- a/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/rlpd_mlp.yaml
@@ -86,7 +86,7 @@ model:
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic:
diff --git a/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/sac_mlp.yaml
similarity index 98%
rename from cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml
rename to cfg/gym/scratch/halfcheetah-v2/sac_mlp.yaml
index 8051c73..35182d5 100644
--- a/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/sac_mlp.yaml
@@ -75,7 +75,7 @@ model:
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic: # no layernorm
diff --git a/cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml
new file mode 100644
index 0000000..2f02c78
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml
@@ -0,0 +1,99 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
+
+name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # AWR specific
+  scale_reward_factor: 0.01
+  max_adv_weight: 100
+  beta: 10
+  buffer_size: 100000 # effective capacity is buffer_size * n_envs
+  batch_size: 256
+  replay_ratio: 128
+  critic_update_ratio: 4
+
+model:
+  _target_: model.diffusion.diffusion_awr.AWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObs
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml
new file mode 100644
index 0000000..9eda16e
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml
@@ -0,0 +1,101 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
+
+name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DIPO specific
+  scale_reward_factor: 0.01
+  target_ema_rate: 0.005
+  buffer_size: 1000000
+  action_lr: 0.0001
+  action_gradient_steps: 10
+  replay_ratio: 128
+  batch_size: 256
+
+model:
+  _target_: model.diffusion.diffusion_dipo.DIPODiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml
new file mode 100644
index 0000000..9bd4885
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml
@@ -0,0 +1,100 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
+
+name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DQL specific
+  scale_reward_factor: 0.01
+  target_ema_rate: 0.005
+  buffer_size: 1000000
+  eta: 1.0
+  replay_ratio: 128
+  batch_size: 256
+
+model:
+  _target_: model.diffusion.diffusion_dql.DQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml
new file mode 100644
index 0000000..935263d
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
+
+name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # IDQL specific
+  scale_reward_factor: 0.01
+  eval_deterministic: True
+  eval_sample_num: 10 # how many samples to score during eval
+  critic_tau: 0.001  # rate of target q network update
+  use_expectile_exploration: True
+  buffer_size: 100000 # effective capacity is buffer_size * n_envs
+  replay_ratio: 128
+  batch_size: 256
+
+model:
+  _target_: model.diffusion.diffusion_idql.IDQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic_q:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  critic_v:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml
similarity index 95%
rename from cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml
rename to cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml
index 3f26654..729a0c6 100644
--- a/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml
@@ -1,7 +1,7 @@
 defaults:
   - _self_
 hydra:
-  run:  
+  run:
     dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
 
@@ -14,8 +14,8 @@ device: cuda:0
 env_name: hopper-medium-v2
 obs_dim: 11
 action_dim: 3
-denoising_steps: 20
-ft_denoising_steps: 20
+denoising_steps: 10
+ft_denoising_steps: 10
 cond_steps: 1
 horizon_steps: 1
 act_steps: 1
@@ -55,7 +55,7 @@ train:
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 1e-3
   save_model_freq: 100
@@ -67,7 +67,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 1000
+  batch_size: 10000
   update_epochs: 10
   vf_coef: 0.5
   target_kl: 1
@@ -94,10 +94,10 @@ model:
     residual_style: True
   critic:
     _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     mlp_dims: [256, 256, 256]
     activation_type: Mish
     residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
   ft_denoising_steps: ${ft_denoising_steps}
   horizon_steps: ${horizon_steps}
   obs_dim: ${obs_dim}
diff --git a/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml b/cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml
similarity index 97%
rename from cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml
rename to cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml
index 57eafcb..05f5766 100644
--- a/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml
@@ -53,7 +53,7 @@ train:
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 1e-3
   save_model_freq: 100
diff --git a/cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml
new file mode 100644
index 0000000..9fee721
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml
@@ -0,0 +1,100 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
+
+name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # QSM specific
+  scale_reward_factor: 0.01
+  q_grad_coeff: 50
+  critic_tau: 0.005
+  buffer_size: 100000 # effective capacity is buffer_size * n_envs
+  replay_ratio: 128
+  batch_size: 256
+
+model:
+  _target_: model.diffusion.diffusion_qsm.QSMDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml
new file mode 100644
index 0000000..cdd98a2
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml
@@ -0,0 +1,84 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
+
+name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  lr: 1e-4
+  weight_decay: 0
+  lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # RWR specific
+  max_reward_weight: 100
+  beta: 10
+  batch_size: 256
+  update_epochs: 128
+
+model:
+  _target_: model.diffusion.diffusion_rwr.RWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml b/cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml
new file mode 100644
index 0000000..b80a9a8
--- /dev/null
+++ b/cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
+
+name: ${env_name}_rlpd_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: rlpd-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000000
+  n_steps: 1
+  gamma: 0.99
+  actor_lr: 3e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 3e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50000
+  val_freq: 5000
+  render:
+    freq: 1
+    num: 0
+  log_freq: 200
+  # RLPD specific
+  batch_size: 256
+  target_ema_rate: 0.01
+  scale_reward_factor: 1
+  critic_num_update: 10
+  buffer_size: 400000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+
+model:
+  _target_: model.rl.gaussian_rlpd.RLPD_Gaussian
+  randn_clip_value: 10
+  tanh_output: True # squash after sampling
+  backup_entropy: True
+  n_critics: 5 # Ensemble size for critic models
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: False # use ensemble
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml b/cfg/gym/scratch/kitchen-mixed-v0/rlpd_mlp.yaml
similarity index 83%
rename from cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml
rename to cfg/gym/scratch/kitchen-mixed-v0/rlpd_mlp.yaml
index 7a33bde..e006e25 100644
--- a/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml
+++ b/cfg/gym/scratch/kitchen-mixed-v0/rlpd_mlp.yaml
@@ -12,9 +12,9 @@ offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
 
 seed: 42
 device: cuda:0
-env_name: hopper-medium-v2
-obs_dim: 11
-action_dim: 3
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
 cond_steps: 1
 horizon_steps: 1
 act_steps: 1
@@ -22,10 +22,10 @@ act_steps: 1
 env:
   n_envs: 1
   name: ${env_name}
-  max_episode_steps: 1000
+  max_episode_steps: 280
   reset_at_iteration: False
   save_video: False
-  best_reward_threshold_for_success: 3
+  best_reward_threshold_for_success: 4
   wrappers:
     mujoco_locomotion_lowdim:
       normalization_path: ${normalization_path}
@@ -41,7 +41,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 250000
+  n_train_itr: 1000000
   n_steps: 1
   gamma: 0.99
   actor_lr: 3e-4
@@ -50,12 +50,12 @@ train:
     first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 3e-4
-  critic_lr: 3e-4
+  critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 3e-4
+    min_lr: 1e-3
   save_model_freq: 50000
   val_freq: 5000
   render:
@@ -64,12 +64,12 @@ train:
   log_freq: 200
   # RLPD specific
   batch_size: 256
-  target_ema_rate: 0.005
+  target_ema_rate: 0.01
   scale_reward_factor: 1
-  critic_num_update: 20
-  buffer_size: 1000000
-  n_eval_episode: 10
-  n_explore_steps: 5000
+  critic_num_update: 10
+  buffer_size: 400000
+  n_eval_episode: 40
+  n_explore_steps: 0
   target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
   init_temperature: 1
 
@@ -78,20 +78,20 @@ model:
   randn_clip_value: 10
   tanh_output: True # squash after sampling
   backup_entropy: True
-  n_critics: 10 # Ensemble size for critic models
+  n_critics: 5 # Ensemble size for critic models
   actor:
     _target_: model.common.mlp_gaussian.Gaussian_MLP
-    mlp_dims: [256, 256]
+    mlp_dims: [256, 256, 256]
     activation_type: ReLU
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic:
     _target_: model.common.critic.CriticObsAct
-    mlp_dims: [256, 256]
+    mlp_dims: [256, 256, 256]
     activation_type: ReLU
     use_layernorm: True
     double_q: False # use ensemble
diff --git a/cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml b/cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml
new file mode 100644
index 0000000..a9b7781
--- /dev/null
+++ b/cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
+
+name: ${env_name}_rlpd_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 280
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 4
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: rlpd-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000000
+  n_steps: 1
+  gamma: 0.99
+  actor_lr: 3e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 3e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50000
+  val_freq: 5000
+  render:
+    freq: 1
+    num: 0
+  log_freq: 200
+  # RLPD specific
+  batch_size: 256
+  target_ema_rate: 0.01
+  scale_reward_factor: 1
+  critic_num_update: 10
+  buffer_size: 400000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+
+model:
+  _target_: model.rl.gaussian_rlpd.RLPD_Gaussian
+  randn_clip_value: 10
+  tanh_output: True # squash after sampling
+  backup_entropy: True
+  n_critics: 5 # Ensemble size for critic models
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: False # use ensemble
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml b/cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml
similarity index 96%
rename from cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml
rename to cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml
index 6530d49..2c1769f 100644
--- a/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml
@@ -14,8 +14,8 @@ device: cuda:0
 env_name: walker2d-medium-v2
 obs_dim: 17
 action_dim: 6
-denoising_steps: 20
-ft_denoising_steps: 20
+denoising_steps: 10
+ft_denoising_steps: 10
 cond_steps: 1
 horizon_steps: 1
 act_steps: 1
@@ -67,7 +67,7 @@ train:
   reward_scale_running: True
   reward_scale_const: 1.0
   gae_lambda: 0.95
-  batch_size: 1000
+  batch_size: 10000
   update_epochs: 10
   vf_coef: 0.5
   target_kl: 1
diff --git a/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml b/cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml
similarity index 97%
rename from cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml
rename to cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml
index dff57a3..70b6267 100644
--- a/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml
@@ -53,7 +53,7 @@ train:
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 1e-3
   save_model_freq: 100
diff --git a/cfg/robomimic/finetune/can/calql_mlp_online.yaml b/cfg/robomimic/finetune/can/calql_mlp_online.yaml
index 8fc1a3c..9fd5db1 100644
--- a/cfg/robomimic/finetune/can/calql_mlp_online.yaml
+++ b/cfg/robomimic/finetune/can/calql_mlp_online.yaml
@@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
 
 name: ${env_name}_calql_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_calql_mlp_ta1/2024-10-25_22-30-16_42/checkpoint/state_999.pt
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
 offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
@@ -97,7 +97,7 @@ model:
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic:
diff --git a/cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml b/cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml
new file mode 100644
index 0000000..cfb4b81
--- /dev/null
+++ b/cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml
@@ -0,0 +1,122 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  reset_at_iteration: False
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: calql-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_steps: 1  # not used
+  n_episode_per_epoch: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 3e-4
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 3e-4
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  log_freq: 1
+  # CalQL specific
+  train_online: True
+  batch_size: 256
+  n_random_actions: 4
+  target_ema_rate: 0.005
+  scale_reward_factor: 1.0
+  num_update: 1000
+  buffer_size: 1000000
+  online_utd_ratio: 1
+  n_eval_episode: 40
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+  automatic_entropy_tuning: True
+
+model:
+  _target_: model.rl.gaussian_calql.CalQL_Gaussian
+  randn_clip_value: 3
+  cql_min_q_weight: 5.0
+  tanh_output: True
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  discount_factor: ${train.gamma}
+  get_mc_return: True
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml
index 2a8343a..ab384f1 100644
--- a/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml
@@ -26,7 +26,7 @@ env:
   name: ${env_name}
   best_reward_threshold_for_success: 1
   max_episode_steps: 300
-  save_video: false
+  save_video: False
   wrappers:
     robomimic_lowdim:
       normalization_path: ${normalization_path}
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 151
   n_critic_warmup_itr: 2
   n_steps: 300
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml
index ed9c90f..59cb0a2 100644
--- a/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml
@@ -26,7 +26,7 @@ env:
   name: ${env_name}
   best_reward_threshold_for_success: 1
   max_episode_steps: 300
-  save_video: false
+  save_video: False
   wrappers:
     robomimic_lowdim:
       normalization_path: ${normalization_path}
diff --git a/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml
index 24bb53a..12e33b0 100644
--- a/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 151
   n_critic_warmup_itr: 5
   n_steps: 300
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
index ba1fa16..8256876 100644
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
@@ -47,16 +47,16 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 151
   n_critic_warmup_itr: 2
   n_steps: 300
   gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
   actor_weight_decay: 0
   actor_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
index 0873cb4..54a4ab1 100644
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
@@ -1,7 +1,7 @@
 defaults:
   - _self_
 hydra:
-  run:  
+  run:
     dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
 
@@ -60,22 +60,22 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 200
+  n_train_itr: 151
   n_critic_warmup_itr: 2
   n_steps: 300
   gamma: 0.999
   augment: True
   grad_accumulate: 15
-  actor_lr: 1e-5
+  actor_lr: 1e-4
   actor_weight_decay: 0
   actor_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 1e-3
   save_model_freq: 100
@@ -96,7 +96,7 @@ train:
 model:
   _target_: model.diffusion.diffusion_ppo.PPODiffusion
   # HP to tune
-  gamma_denoising: 0.9
+  gamma_denoising: 0.99
   clip_ploss_coef: 0.01
   clip_ploss_coef_base: 0.001
   clip_ploss_coef_rate: 3
@@ -158,10 +158,10 @@ model:
         embed_style: embed2
         embed_norm: 0
     img_cond_steps: ${img_cond_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     mlp_dims: [256, 256, 256]
     activation_type: Mish
     residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
   ft_denoising_steps: ${ft_denoising_steps}
   horizon_steps: ${horizon_steps}
   obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml
new file mode 100644
index 0000000..86d28df
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml
@@ -0,0 +1,111 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_diffusion_mlp_ta1_td20/2024-09-29_15-43-07_42/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 301
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 15000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  ft_denoising_steps: ${ft_denoising_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml
new file mode 100644
index 0000000..3367556
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml
@@ -0,0 +1,111 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 301
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 6000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  ft_denoising_steps: ${ft_denoising_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml
index 591f3a9..bbd8bd6 100644
--- a/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 151
   n_critic_warmup_itr: 5
   n_steps: 300
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml
index 5037605..fa451a3 100644
--- a/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml
@@ -26,7 +26,7 @@ env:
   name: ${env_name}
   best_reward_threshold_for_success: 1
   max_episode_steps: 300
-  save_video: false
+  save_video: False
   wrappers:
     robomimic_lowdim:
       normalization_path: ${normalization_path}
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 151
   n_critic_warmup_itr: 2
   n_steps: 300
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/can/ibrl_mlp.yaml b/cfg/robomimic/finetune/can/ibrl_mlp.yaml
index 7aa8d24..c3ba56e 100644
--- a/cfg/robomimic/finetune/can/ibrl_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ibrl_mlp.yaml
@@ -7,7 +7,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
 
 name: ${env_name}_ibrl_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_gaussian_mlp_ta1/2024-09-28_13-43-59_42/checkpoint/state_5000.pt
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
 offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
@@ -93,7 +93,7 @@ model:
     fixed_std: 0.1
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
   critic:
     _target_: model.common.critic.CriticObsAct
     mlp_dims: [1024, 1024, 1024]
diff --git a/cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml b/cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml
new file mode 100644
index 0000000..8940658
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml
@@ -0,0 +1,115 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
+
+name: ${env_name}_ibrl_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_gaussian_mlp_ta1/2024-10-08_20-52-04_0/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 250  # IBRL uses 200
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 1
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: ibrl-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000000
+  n_steps: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-4
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  save_model_freq: 100000
+  val_freq: 10000
+  render:
+    freq: 10000
+    num: 0
+  log_freq: 200
+  # IBRL specific
+  batch_size: 256
+  target_ema_rate: 0.01
+  scale_reward_factor: 1
+  critic_num_update: 3
+  buffer_size: 400000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  update_freq: 2
+
+model:
+  _target_: model.rl.gaussian_ibrl.IBRL_Gaussian
+  randn_clip_value: 3
+  n_critics: 5
+  soft_action_sample: True
+  soft_action_sample_beta: 10
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024]
+    activation_type: ReLU
+    dropout: 0.5
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [1024, 1024, 1024]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: False # use ensemble
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  max_n_episodes: 100
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml
index bddd57c..6b276bc 100644
--- a/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 81
   n_critic_warmup_itr: 2
   n_steps: 300
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml
index e0353b6..75e7c68 100644
--- a/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 81
   n_critic_warmup_itr: 5
   n_steps: 300
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml
index a0e2567..4bf3a2a 100644
--- a/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 81
   n_critic_warmup_itr: 5
   n_steps: 300
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
index b505b81..16b9485 100644
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
@@ -27,7 +27,7 @@ env:
   name: ${env_name}
   best_reward_threshold_for_success: 1
   max_episode_steps: 300
-  save_video: false
+  save_video: False
   wrappers:
     robomimic_lowdim:
       normalization_path: ${normalization_path}
@@ -47,16 +47,16 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 81
   n_critic_warmup_itr: 2
   n_steps: 300
   gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
   actor_weight_decay: 0
   actor_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
@@ -99,10 +99,10 @@ model:
     action_dim: ${action_dim}
   critic:
     _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     mlp_dims: [256, 256, 256]
     activation_type: Mish
     residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
   ft_denoising_steps: ${ft_denoising_steps}
   horizon_steps: ${horizon_steps}
   obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
index d46c44b..72207d6 100644
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
@@ -1,7 +1,7 @@
 defaults:
   - _self_
 hydra:
-  run:  
+  run:
     dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
 
@@ -60,22 +60,22 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 200
+  n_train_itr: 151
   n_critic_warmup_itr: 2
   n_steps: 300
   gamma: 0.999
   augment: True
   grad_accumulate: 15
-  actor_lr: 1e-5
+  actor_lr: 1e-4
   actor_weight_decay: 0
   actor_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 1e-3
   save_model_freq: 100
@@ -96,7 +96,7 @@ train:
 model:
   _target_: model.diffusion.diffusion_ppo.PPODiffusion
   # HP to tune
-  gamma_denoising: 0.9
+  gamma_denoising: 0.99
   clip_ploss_coef: 0.01
   clip_ploss_coef_base: 0.001
   clip_ploss_coef_rate: 3
@@ -158,10 +158,10 @@ model:
         embed_style: embed2
         embed_norm: 0
     img_cond_steps: ${img_cond_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     mlp_dims: [256, 256, 256]
     activation_type: Mish
     residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
   ft_denoising_steps: ${ft_denoising_steps}
   horizon_steps: ${horizon_steps}
   obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml
index 8262daa..4c550ea 100644
--- a/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 81
   n_critic_warmup_itr: 5
   n_steps: 300
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml
index fa6b4ca..f32ef8d 100644
--- a/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 81
   n_critic_warmup_itr: 2
   n_steps: 300
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/calql_mlp_online.yaml b/cfg/robomimic/finetune/square/calql_mlp_online.yaml
index 22ebae4..de333e6 100644
--- a/cfg/robomimic/finetune/square/calql_mlp_online.yaml
+++ b/cfg/robomimic/finetune/square/calql_mlp_online.yaml
@@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
 
 name: ${env_name}_calql_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_calql_mlp_ta1/2024-10-25_22-44-12_42/checkpoint/state_999.pt
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
 offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
@@ -97,7 +97,7 @@ model:
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}    
     std_max: 7.3891
     std_min: 0.0067
   critic:
diff --git a/cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml b/cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml
new file mode 100644
index 0000000..3332780
--- /dev/null
+++ b/cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml
@@ -0,0 +1,122 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  reset_at_iteration: False
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order as the preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: calql-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 10000
+  n_steps: 1  # not used
+  n_episode_per_epoch: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 3e-4
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 3e-4
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  log_freq: 1
+  # CalQL specific
+  train_online: True
+  batch_size: 256
+  n_random_actions: 4
+  target_ema_rate: 0.005
+  scale_reward_factor: 1.0
+  num_update: 1000
+  buffer_size: 1000000
+  online_utd_ratio: 1
+  n_eval_episode: 40
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+  automatic_entropy_tuning: True
+
+model:
+  _target_: model.rl.gaussian_calql.CalQL_Gaussian
+  randn_clip_value: 3
+  cql_min_q_weight: 5.0
+  tanh_output: True
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}    
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  discount_factor: ${train.gamma}
+  get_mc_return: True
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml
index c5b2e39..13dfbb4 100644
--- a/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 201
   n_critic_warmup_itr: 2
   n_steps: 400
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml
index 350bfe6..e143e5e 100644
--- a/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 201
   n_critic_warmup_itr: 5
   n_steps: 400
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml
index 87f1e5b..0c5fee8 100644
--- a/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 201
   n_critic_warmup_itr: 5
   n_steps: 400
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
index 47c539e..edbe296 100644
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
@@ -47,16 +47,16 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 500
+  n_train_itr: 201
   n_critic_warmup_itr: 2
   n_steps: 400
   gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
   actor_weight_decay: 0
   actor_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
@@ -100,10 +100,10 @@ model:
     action_dim: ${action_dim}
   critic:
     _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     mlp_dims: [256, 256, 256]
     activation_type: Mish
     residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
   ft_denoising_steps: ${ft_denoising_steps}
   horizon_steps: ${horizon_steps}
   obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
index 51d3e3a..84355d6 100644
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
@@ -1,7 +1,7 @@
 defaults:
   - _self_
 hydra:
-  run:  
+  run:
     dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
 
@@ -60,7 +60,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 500
+  n_train_itr: 301
   n_critic_warmup_itr: 2
   n_steps: 400
   gamma: 0.999
@@ -69,13 +69,13 @@ train:
   actor_lr: 1e-5
   actor_weight_decay: 0
   actor_lr_scheduler:
-    first_cycle_steps: 500
+    first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 1e-5
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
-    first_cycle_steps: 500
+    first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 1e-3
   save_model_freq: 100
@@ -96,7 +96,7 @@ train:
 model:
   _target_: model.diffusion.diffusion_ppo.PPODiffusion
   # HP to tune
-  gamma_denoising: 0.9
+  gamma_denoising: 0.99
   clip_ploss_coef: 0.01
   clip_ploss_coef_base: 0.001
   clip_ploss_coef_rate: 3
@@ -158,10 +158,10 @@ model:
         embed_style: embed2
         embed_norm: 0
     img_cond_steps: ${img_cond_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     mlp_dims: [256, 256, 256]
     activation_type: Mish
     residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
   ft_denoising_steps: ${ft_denoising_steps}
   horizon_steps: ${horizon_steps}
   obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml
new file mode 100644
index 0000000..156154c
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml
@@ -0,0 +1,112 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_diffusion_mlp_ta1_td20/2024-09-29_02-14-14_42/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order as the preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 301
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 20000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml
new file mode 100644
index 0000000..c0d8d37
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml
@@ -0,0 +1,112 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order as the preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 301
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml
index 1ad16d7..6b17bc5 100644
--- a/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 201
   n_critic_warmup_itr: 5
   n_steps: 400
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml
index 2d34101..c27381f 100644
--- a/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 300
+  n_train_itr: 201
   n_critic_warmup_itr: 2
   n_steps: 400
   gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/ibrl_mlp.yaml b/cfg/robomimic/finetune/square/ibrl_mlp.yaml
index 6e34653..fba5969 100644
--- a/cfg/robomimic/finetune/square/ibrl_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ibrl_mlp.yaml
@@ -7,7 +7,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
 
 name: ${env_name}_ibrl_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_gaussian_mlp_ta1/2024-09-28_13-42-43_42/checkpoint/state_5000.pt
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
 offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
@@ -93,7 +93,7 @@ model:
     fixed_std: 0.1
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}    
   critic:
     _target_: model.common.critic.CriticObsAct
     mlp_dims: [1024, 1024, 1024]
diff --git a/cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml b/cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml
new file mode 100644
index 0000000..f65c1dd
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml
@@ -0,0 +1,115 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
+
+name: ${env_name}_ibrl_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_gaussian_mlp_ta1/2024-10-08_20-52-42_0/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 350  # IBRL uses 300
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 1
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order as the preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: ibrl-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000000
+  n_steps: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-4
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  save_model_freq: 100000
+  val_freq: 10000
+  render:
+    freq: 10000
+    num: 0
+  log_freq: 200
+  # IBRL specific
+  batch_size: 256
+  target_ema_rate: 0.01
+  scale_reward_factor: 1
+  critic_num_update: 3
+  buffer_size: 400000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  update_freq: 2
+
+model:
+  _target_: model.rl.gaussian_ibrl.IBRL_Gaussian
+  randn_clip_value: 3
+  n_critics: 5
+  soft_action_sample: True
+  soft_action_sample_beta: 10
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024]
+    activation_type: ReLU
+    dropout: 0.5
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}    
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [1024, 1024, 1024]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: False # use ensemble
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  max_n_episodes: 100
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml
index 61d7dec..8ab3b3b 100644
--- a/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml
@@ -26,7 +26,7 @@ env:
   name: ${env_name}
   best_reward_threshold_for_success: 1
   max_episode_steps: 800
-  save_video: false
+  save_video: False
   wrappers:
     robomimic_lowdim:
       normalization_path: ${normalization_path}
@@ -49,7 +49,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 1000
+  n_train_itr: 201
   n_critic_warmup_itr: 2
   n_steps: 400
   gamma: 0.999
@@ -58,7 +58,7 @@ train:
   actor_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-6
+    min_lr: 1e-5
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
@@ -82,7 +82,7 @@ train:
 model:
   _target_: model.diffusion.diffusion_awr.AWRDiffusion
   # Sampling HPs
-  min_sampling_denoising_std: 0.08
+  min_sampling_denoising_std: 0.1
   randn_clip_value: 3
   #
   network_path: ${base_policy_path}
diff --git a/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml
index ec30a80..1a99f3d 100644
--- a/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml
@@ -49,7 +49,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 1000
+  n_train_itr: 201
   n_critic_warmup_itr: 2
   n_steps: 400
   gamma: 0.999
@@ -58,7 +58,7 @@ train:
   actor_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-6
+    min_lr: 1e-5
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
@@ -82,7 +82,7 @@ train:
 model:
   _target_: model.diffusion.diffusion_dipo.DIPODiffusion
   # HP to tune
-  min_sampling_denoising_std: 0.08
+  min_sampling_denoising_std: 0.1
   randn_clip_value: 3
   #
   network_path: ${base_policy_path}
@@ -96,12 +96,12 @@ model:
     action_dim: ${action_dim}
   critic:
     _target_: model.common.critic.CriticObsAct
-    action_dim: ${action_dim}
-    action_steps: ${act_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     mlp_dims: [256, 256, 256]
     activation_type: Mish
     residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
   horizon_steps: ${horizon_steps}
   obs_dim: ${obs_dim}
   action_dim: ${action_dim}
diff --git a/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml
index 825e9d6..21a760e 100644
--- a/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml
@@ -26,7 +26,7 @@ env:
   name: ${env_name}
   best_reward_threshold_for_success: 1
   max_episode_steps: 800
-  save_video: false
+  save_video: False
   wrappers:
     robomimic_lowdim:
       normalization_path: ${normalization_path}
@@ -49,8 +49,8 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 1000
-  n_critic_warmup_itr: 2
+  n_train_itr: 201
+  n_critic_warmup_itr: 5
   n_steps: 400
   gamma: 0.999
   actor_lr: 1e-5
@@ -58,7 +58,7 @@ train:
   actor_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-6
+    min_lr: 1e-5
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
@@ -81,7 +81,7 @@ train:
 model:
   _target_: model.diffusion.diffusion_dql.DQLDiffusion
   # Sampling HPs
-  min_sampling_denoising_std: 0.08
+  min_sampling_denoising_std: 0.1
   randn_clip_value: 3
   #
   network_path: ${base_policy_path}
diff --git a/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml
index db690f9..140a39f 100644
--- a/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml
@@ -49,7 +49,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 1000
+  n_train_itr: 201
   n_critic_warmup_itr: 5
   n_steps: 400
   gamma: 0.999
@@ -58,7 +58,7 @@ train:
   actor_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-6
+    min_lr: 1e-5
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
@@ -83,7 +83,7 @@ train:
 model:
   _target_: model.diffusion.diffusion_idql.IDQLDiffusion
   # Sampling HPs
-  min_sampling_denoising_std: 0.08
+  min_sampling_denoising_std: 0.1
   randn_clip_value: 3
   #
   network_path: ${base_policy_path}
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml
index f0418c9..198855b 100644
--- a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml
@@ -50,16 +50,16 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 1000
+  n_train_itr: 201
   n_critic_warmup_itr: 2
   n_steps: 400
   gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
   actor_weight_decay: 0
   actor_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-6
+    min_lr: 1e-4
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
@@ -76,7 +76,7 @@ train:
   reward_scale_const: 1.0
   gae_lambda: 0.95
   batch_size: 10000
-  update_epochs: 8
+  update_epochs: 5
   vf_coef: 0.5
   target_kl: 1
 
@@ -88,7 +88,7 @@ model:
   clip_ploss_coef_base: 0.001
   clip_ploss_coef_rate: 3
   randn_clip_value: 3
-  min_sampling_denoising_std: 0.08
+  min_sampling_denoising_std: 0.1
   min_logprob_denoising_std: 0.1
   #
   network_path: ${base_policy_path}
@@ -102,10 +102,10 @@ model:
     action_dim: ${action_dim}
   critic:
     _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     mlp_dims: [256, 256, 256]
     activation_type: Mish
     residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
   ft_denoising_steps: ${ft_denoising_steps}
   horizon_steps: ${horizon_steps}
   obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml
index ad22b83..b826e06 100644
--- a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml
@@ -1,7 +1,7 @@
 defaults:
   - _self_
 hydra:
-  run:  
+  run:
     dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
 
@@ -64,7 +64,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 500
+  n_train_itr: 201
   n_critic_warmup_itr: 2
   n_steps: 400
   gamma: 0.999
@@ -73,13 +73,13 @@ train:
   actor_lr: 1e-5
   actor_weight_decay: 0
   actor_lr_scheduler:
-    first_cycle_steps: 500
+    first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-6
+    min_lr: 1e-5
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
-    first_cycle_steps: 500
+    first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 1e-3
   save_model_freq: 100
@@ -93,19 +93,19 @@ train:
   gae_lambda: 0.95
   batch_size: 500
   logprob_batch_size: 1000
-  update_epochs: 8
+  update_epochs: 10
   vf_coef: 0.5
   target_kl: 1
 
 model:
   _target_: model.diffusion.diffusion_ppo.PPODiffusion
   # HP to tune
-  gamma_denoising: 0.9
+  gamma_denoising: 0.99
   clip_ploss_coef: 0.01
   clip_ploss_coef_base: 0.001
   clip_ploss_coef_rate: 3
   randn_clip_value: 3
-  min_sampling_denoising_std: 0.08
+  min_sampling_denoising_std: 0.1
   min_logprob_denoising_std: 0.1
   #
   use_ddim: ${use_ddim}
@@ -164,10 +164,10 @@ model:
         embed_style: embed2
         embed_norm: 0
     img_cond_steps: ${img_cond_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     mlp_dims: [256, 256, 256]
     activation_type: Mish
     residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
   ft_denoising_steps: ${ft_denoising_steps}
   horizon_steps: ${horizon_steps}
   obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml
index 4072238..f116ef5 100644
--- a/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml
@@ -49,7 +49,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 1000
+  n_train_itr: 201
   n_critic_warmup_itr: 5
   n_steps: 400
   gamma: 0.999
@@ -58,7 +58,7 @@ train:
   actor_lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-6
+    min_lr: 1e-5
   critic_lr: 1e-3
   critic_weight_decay: 0
   critic_lr_scheduler:
@@ -81,7 +81,7 @@ train:
 model:
   _target_: model.diffusion.diffusion_qsm.QSMDiffusion
   # Sampling HPs
-  min_sampling_denoising_std: 0.08
+  min_sampling_denoising_std: 0.1
   randn_clip_value: 3
   #
   network_path: ${base_policy_path}
diff --git a/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml
index af9e9cb..40cd186 100644
--- a/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml
@@ -49,7 +49,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 1000
+  n_train_itr: 201
   n_critic_warmup_itr: 2
   n_steps: 400
   gamma: 0.999
@@ -58,7 +58,7 @@ train:
   lr_scheduler:
     first_cycle_steps: 1000
     warmup_steps: 10
-    min_lr: 1e-6
+    min_lr: 1e-5
   save_model_freq: 100
   val_freq: 10
   render:
@@ -73,7 +73,7 @@ train:
 model:
   _target_: model.diffusion.diffusion_rwr.RWRDiffusion
   # Sampling HPs
-  min_sampling_denoising_std: 0.08
+  min_sampling_denoising_std: 0.1
   randn_clip_value: 3
   #
   network_path: ${base_policy_path}
diff --git a/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml b/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml
index 3c610a1..0fd05ae 100644
--- a/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml
+++ b/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 100
+  n_train_itr: 1000
   n_steps: 1
   gamma: 0.99
   actor_lr: 1e-4
@@ -61,8 +61,8 @@ train:
     first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 3e-4
-  save_model_freq: 10
-  val_freq: 10
+  save_model_freq: 100
+  val_freq: 20
   render:
     freq: 1
     num: 0
@@ -70,7 +70,7 @@ train:
   # CalQL specific
   train_online: False
   batch_size: 256
-  n_random_actions: 4
+  n_random_actions: 10
   target_ema_rate: 0.005
   scale_reward_factor: 1.0
   num_update: 1000
@@ -93,7 +93,7 @@ model:
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic:
diff --git a/cfg/robomimic/pretrain/can/calql_mlp_offline_ph.yaml b/cfg/robomimic/pretrain/can/calql_mlp_offline_ph.yaml
new file mode 100644
index 0000000..a70d4aa
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/calql_mlp_offline_ph.yaml
@@ -0,0 +1,118 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  reset_at_iteration: False
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: calql-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_steps: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 3e-4
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 3e-4
+  save_model_freq: 10
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  log_freq: 1
+  # CalQL specific
+  train_online: False
+  batch_size: 256
+  n_random_actions: 4
+  target_ema_rate: 0.005
+  scale_reward_factor: 1.0
+  num_update: 1000
+  buffer_size: 1000000
+  n_eval_episode: 10
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+  automatic_entropy_tuning: True
+
+model:
+  _target_: model.rl.gaussian_calql.CalQL_Gaussian
+  randn_clip_value: 3
+  cql_min_q_weight: 5.0
+  tanh_output: True
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  discount_factor: ${train.gamma}
+  get_mc_return: True
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1.yaml
new file mode 100644
index 0000000..62a09e8
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1.yaml
@@ -0,0 +1,65 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+horizon_steps: 1
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1_ph.yaml
new file mode 100644
index 0000000..46593c6
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1_ph.yaml
@@ -0,0 +1,65 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+horizon_steps: 1
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ta1_ph.yaml
similarity index 85%
rename from cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml
rename to cfg/robomimic/pretrain/can/pre_gaussian_mlp_ta1_ph.yaml
index c7e0d9c..1bb170d 100644
--- a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml
+++ b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ta1_ph.yaml
@@ -7,13 +7,13 @@ _target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
 
 name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz
 
 seed: 42
 device: cuda:0
-env: transport
-obs_dim: 59
-action_dim: 14
+env: can
+obs_dim: 23
+action_dim: 7
 horizon_steps: 1
 cond_steps: 1
 
@@ -26,11 +26,11 @@ train:
   n_epochs: 5000
   batch_size: 256
   learning_rate: 1e-4
-  weight_decay: 0
+  weight_decay: 1e-6
   lr_scheduler:
     first_cycle_steps: 5000
     warmup_steps: 100
-    min_lr: 1e-4
+    min_lr: 1e-5
   epoch_start_ema: 20
   update_ema_freq: 10
   save_model_freq: 1000
@@ -45,7 +45,7 @@ model:
     fixed_std: 0.1
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    action_dim: ${action_dim}
+    action_dim: ${action_dim}    
   horizon_steps: ${horizon_steps}
   device: ${device}
 
diff --git a/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml b/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml
index 1cf5527..cb52740 100644
--- a/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml
+++ b/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml
@@ -46,7 +46,7 @@ wandb:
   run: ${now:%H-%M-%S}_${name}
 
 train:
-  n_train_itr: 100
+  n_train_itr: 1000
   n_steps: 1
   gamma: 0.99
   actor_lr: 1e-4
@@ -61,8 +61,8 @@ train:
     first_cycle_steps: 1000
     warmup_steps: 10
     min_lr: 3e-4
-  save_model_freq: 10
-  val_freq: 10
+  save_model_freq: 100
+  val_freq: 20
   render:
     freq: 1
     num: 0
@@ -70,7 +70,7 @@ train:
   # CalQL specific
   train_online: False
   batch_size: 256
-  n_random_actions: 4
+  n_random_actions: 10
   target_ema_rate: 0.005
   scale_reward_factor: 1.0
   num_update: 1000
@@ -93,7 +93,7 @@ model:
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic:
diff --git a/cfg/robomimic/pretrain/square/calql_mlp_offline_ph.yaml b/cfg/robomimic/pretrain/square/calql_mlp_offline_ph.yaml
new file mode 100644
index 0000000..5e541a4
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/calql_mlp_offline_ph.yaml
@@ -0,0 +1,118 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  reset_at_iteration: False
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: calql-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_steps: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 3e-4
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 3e-4
+  save_model_freq: 10
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  log_freq: 1
+  # CalQL specific
+  train_online: False
+  batch_size: 256
+  n_random_actions: 4
+  target_ema_rate: 0.005
+  scale_reward_factor: 1.0
+  num_update: 1000
+  buffer_size: 1000000
+  n_eval_episode: 10
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+  automatic_entropy_tuning: True
+
+model:
+  _target_: model.rl.gaussian_calql.CalQL_Gaussian
+  randn_clip_value: 3
+  cql_min_q_weight: 5.0
+  tanh_output: True
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+  discount_factor: ${train.gamma}
+  get_mc_return: True
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1.yaml
new file mode 100644
index 0000000..53e572e
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1.yaml
@@ -0,0 +1,66 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+horizon_steps: 1
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1_ph.yaml
new file mode 100644
index 0000000..7bffecd
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1_ph.yaml
@@ -0,0 +1,66 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+horizon_steps: 1
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ta1_ph.yaml
similarity index 84%
rename from cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml
rename to cfg/robomimic/pretrain/square/pre_gaussian_mlp_ta1_ph.yaml
index 11d3f08..0cc2204 100644
--- a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml
+++ b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ta1_ph.yaml
@@ -7,12 +7,12 @@ _target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
 
 name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz
 
 seed: 42
 device: cuda:0
-env: lift
-obs_dim: 19
+env: square
+obs_dim: 23
 action_dim: 7
 horizon_steps: 1
 cond_steps: 1
@@ -40,14 +40,15 @@ model:
   network:
     _target_: model.common.mlp_gaussian.Gaussian_MLP
     mlp_dims: [1024, 1024, 1024]
-    residual_style: False
+    activation_type: ReLU
+    dropout: 0.5
+    fixed_std: 0.1
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    action_dim: ${action_dim}
+    action_dim: ${action_dim}    
   horizon_steps: ${horizon_steps}
   device: ${device}
 
-
 ema:
   decay: 0.995
 
diff --git a/cfg/robomimic/finetune/can/rlpd_mlp.yaml b/cfg/robomimic/scratch/can/rlpd_mlp.yaml
similarity index 98%
rename from cfg/robomimic/finetune/can/rlpd_mlp.yaml
rename to cfg/robomimic/scratch/can/rlpd_mlp.yaml
index 4f5a948..8b66075 100644
--- a/cfg/robomimic/finetune/can/rlpd_mlp.yaml
+++ b/cfg/robomimic/scratch/can/rlpd_mlp.yaml
@@ -91,7 +91,7 @@ model:
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic:
diff --git a/cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml b/cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml
new file mode 100644
index 0000000..d574d5a
--- /dev/null
+++ b/cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml
@@ -0,0 +1,114 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
+
+name: ${env_name}_rlpd_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 300
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 1
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: rlpd-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000000
+  n_steps: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-4
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  save_model_freq: 100000
+  val_freq: 10000
+  render:
+    freq: 10000
+    num: 0
+  log_freq: 200
+  # RLPD specific
+  batch_size: 256
+  target_ema_rate: 0.01
+  scale_reward_factor: 1
+  critic_num_update: 3
+  buffer_size: 400000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+
+model:
+  _target_: model.rl.gaussian_rlpd.RLPD_Gaussian
+  randn_clip_value: 10
+  backup_entropy: True
+  n_critics: 5
+  tanh_output: True
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: False # use ensemble
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/rlpd_mlp.yaml b/cfg/robomimic/scratch/square/rlpd_mlp.yaml
similarity index 98%
rename from cfg/robomimic/finetune/square/rlpd_mlp.yaml
rename to cfg/robomimic/scratch/square/rlpd_mlp.yaml
index d62a41d..46730a7 100644
--- a/cfg/robomimic/finetune/square/rlpd_mlp.yaml
+++ b/cfg/robomimic/scratch/square/rlpd_mlp.yaml
@@ -91,7 +91,7 @@ model:
     tanh_output: False  # squash after sampling instead
     cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
     horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
     std_max: 7.3891
     std_min: 0.0067
   critic:
diff --git a/cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml b/cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml
new file mode 100644
index 0000000..cb8a8b4
--- /dev/null
+++ b/cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml
@@ -0,0 +1,114 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
+
+name: ${env_name}_rlpd_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 1
+  name: ${env_name}
+  max_episode_steps: 400
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 1
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: rlpd-${env_name}
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000000
+  n_steps: 1
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-4
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  save_model_freq: 100000
+  val_freq: 10000
+  render:
+    freq: 10000
+    num: 0
+  log_freq: 200
+  # RLPD specific
+  batch_size: 256
+  target_ema_rate: 0.01
+  scale_reward_factor: 1
+  critic_num_update: 3
+  buffer_size: 400000
+  n_eval_episode: 40
+  n_explore_steps: 0
+  target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+  init_temperature: 1
+
+model:
+  _target_: model.rl.gaussian_rlpd.RLPD_Gaussian
+  randn_clip_value: 10
+  backup_entropy: True
+  n_critics: 5
+  tanh_output: True
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    tanh_output: False  # squash after sampling instead
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+    std_max: 7.3891
+    std_min: 0.0067
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    mlp_dims: [256, 256, 256]
+    activation_type: ReLU
+    use_layernorm: True
+    double_q: False # use ensemble
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+offline_dataset:
+  _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+  dataset_path: ${offline_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/env/gym_utils/__init__.py b/env/gym_utils/__init__.py
index cea639c..e70870a 100644
--- a/env/gym_utils/__init__.py
+++ b/env/gym_utils/__init__.py
@@ -165,7 +165,9 @@ def make_async(
             # https://github.com/ARISE-Initiative/robosuite/blob/92abf5595eddb3a845cd1093703e5a3ccd01e77e/robosuite/environments/base.py#L247-L248
             env.env.hard_reset = False
         else:  # d3il, gym
-            env = make_(id, render=render, **kwargs)
+            if "kitchen" not in id:  # d4rl kitchen does not support rendering!
+                kwargs["render"] = render
+            env = make_(id, **kwargs)
 
         # add wrappers
         if wrappers is not None:
diff --git a/env/gym_utils/wrapper/furniture.py b/env/gym_utils/wrapper/furniture.py
index 3c02895..3ca67e1 100644
--- a/env/gym_utils/wrapper/furniture.py
+++ b/env/gym_utils/wrapper/furniture.py
@@ -132,9 +132,10 @@ class FurnitureRLSimEnvMultiStepWrapper(gym.Wrapper):
 
         nobs: np.ndarray = self.process_obs(obs)
         truncated: np.ndarray = truncated.squeeze().cpu().numpy()
-        terminated: np.ndarray = np.zeros_like(truncated, dtype=bool)
+        # terminated: np.ndarray = np.zeros_like(truncated, dtype=bool)
 
-        return {"state": nobs}, reward, terminated, truncated, info
+        # Reward is assigned only at the timestep where a stage is finished and does not accumulate, so we treat the final step of the episode as terminal: `truncated` is returned for both the terminated and truncated flags.
+        return {"state": nobs}, reward, truncated, truncated, info
 
     def _inner_step(self, action_chunk: torch.Tensor):
         dense_reward = torch.zeros(action_chunk.shape[0], device=action_chunk.device)
diff --git a/model/common/mlp.py b/model/common/mlp.py
index 3322af9..4ab137c 100644
--- a/model/common/mlp.py
+++ b/model/common/mlp.py
@@ -96,6 +96,7 @@ class ResidualMLP(nn.Module):
         out_activation_type="Identity",
         use_layernorm=False,
         use_layernorm_final=False,
+        dropout=0,
     ):
         super(ResidualMLP, self).__init__()
         hidden_dim = dim_list[1]
@@ -108,6 +109,7 @@ class ResidualMLP(nn.Module):
                     hidden_dim=hidden_dim,
                     activation_type=activation_type,
                     use_layernorm=use_layernorm,
+                    dropout=dropout,
                 )
                 for _ in range(1, num_hidden_layers, 2)
             ]
@@ -129,6 +131,7 @@ class TwoLayerPreActivationResNetLinear(nn.Module):
         hidden_dim,
         activation_type="Mish",
         use_layernorm=False,
+        dropout=0,
     ):
         super().__init__()
         self.l1 = nn.Linear(hidden_dim, hidden_dim)
@@ -137,6 +140,8 @@ class TwoLayerPreActivationResNetLinear(nn.Module):
         if use_layernorm:
             self.norm1 = nn.LayerNorm(hidden_dim, eps=1e-06)
             self.norm2 = nn.LayerNorm(hidden_dim, eps=1e-06)
+        if dropout > 0:
+            raise NotImplementedError("Dropout not implemented for residual MLP!")
 
     def forward(self, x):
         x_input = x
diff --git a/model/common/mlp_gaussian.py b/model/common/mlp_gaussian.py
index e05dbed..dbd10cf 100644
--- a/model/common/mlp_gaussian.py
+++ b/model/common/mlp_gaussian.py
@@ -212,6 +212,7 @@ class Gaussian_MLP(nn.Module):
                 out_activation_type=activation_type,
                 use_layernorm=use_layernorm,
                 use_layernorm_final=use_layernorm,
+                dropout=dropout,
             )
             self.mlp_mean = MLP(
                 mlp_dims[-1:] + [output_dim],
@@ -233,9 +234,7 @@ class Gaussian_MLP(nn.Module):
             if learn_fixed_std:
                 # initialize to fixed_std
                 self.logvar = torch.nn.Parameter(
-                    torch.log(
-                        torch.tensor([fixed_std**2 for _ in range(action_dim)])
-                    ),
+                    torch.log(torch.tensor([fixed_std**2 for _ in range(action_dim)])),
                     requires_grad=True,
                 )
         self.logvar_min = torch.nn.Parameter(
diff --git a/model/diffusion/diffusion_ppo.py b/model/diffusion/diffusion_ppo.py
index 9c13863..1c574a3 100644
--- a/model/diffusion/diffusion_ppo.py
+++ b/model/diffusion/diffusion_ppo.py
@@ -22,7 +22,6 @@ from model.diffusion.diffusion_vpg import VPGDiffusion
 
 
 class PPODiffusion(VPGDiffusion):
-
     def __init__(
         self,
         gamma_denoising: float,
@@ -58,7 +57,9 @@ class PPODiffusion(VPGDiffusion):
     def loss(
         self,
         obs,
-        chains,
+        chains_prev,
+        chains_next,
+        denoising_inds,
         returns,
         oldvalues,
         advantages,
@@ -81,9 +82,11 @@ class PPODiffusion(VPGDiffusion):
         reward_horizon: action horizon that backpropagates gradient
         """
         # Get new logprobs for denoising steps from T-1 to 0 - entropy is fixed fod diffusion
-        newlogprobs, eta = self.get_logprobs(
+        newlogprobs, eta = self.get_logprobs_subsample(
             obs,
-            chains,
+            chains_prev,
+            chains_next,
+            denoising_inds,
             get_ent=True,
         )
         entropy_loss = -eta.mean()
@@ -92,7 +95,7 @@ class PPODiffusion(VPGDiffusion):
 
         # only backpropagate through the earlier steps (e.g., ones actually executed in the environment)
         newlogprobs = newlogprobs[:, :reward_horizon, :]
-        oldlogprobs = oldlogprobs[:, :, :reward_horizon, :]
+        oldlogprobs = oldlogprobs[:, :reward_horizon, :]
 
         # Get the logprobs - batch over B and denoising steps
         newlogprobs = newlogprobs.mean(dim=(-1, -2)).view(-1)
@@ -106,9 +109,7 @@ class PPODiffusion(VPGDiffusion):
 
             # Get counterfactual teacher actions
             samples = self.forward(
-                cond=obs.float()
-                .unsqueeze(1)
-                .to(self.device),  # B x horizon=1 x obs_dim
+                cond=obs,
                 deterministic=False,
                 return_chain=True,
                 use_base_policy=True,
@@ -116,7 +117,7 @@ class PPODiffusion(VPGDiffusion):
             # Get logprobs of teacher actions under this policy
             bc_logprobs = self.get_logprobs(
                 obs,
-                samples.chains,  # n_env x denoising x horizon x act
+                samples.chains,
                 get_ent=False,
                 use_base_policy=False,
             )
@@ -133,14 +134,13 @@ class PPODiffusion(VPGDiffusion):
         advantage_max = torch.quantile(advantages, self.clip_advantage_upper_quantile)
         advantages = advantages.clamp(min=advantage_min, max=advantage_max)
 
-        # repeat advantages for denoising steps and horizon steps
-        advantages = advantages.repeat_interleave(self.ft_denoising_steps)
-
         # denoising discount
         discount = torch.tensor(
-            [self.gamma_denoising**i for i in reversed(range(self.ft_denoising_steps))]
+            [
+                self.gamma_denoising ** (self.ft_denoising_steps - i - 1)
+                for i in denoising_inds
+            ]
         ).to(self.device)
-        discount = discount.repeat(len(advantages) // self.ft_denoising_steps)
         advantages *= discount
 
         # get ratio
@@ -148,9 +148,7 @@ class PPODiffusion(VPGDiffusion):
         ratio = logratio.exp()
 
         # exponentially interpolate between the base and the current clipping value over denoising steps and repeat
-        t = torch.arange(self.ft_denoising_steps).float().to(self.device) / (
-            self.ft_denoising_steps - 1
-        )  # 0 to 1
+        t = (denoising_inds.float() / (self.ft_denoising_steps - 1)).to(self.device)
         if self.ft_denoising_steps > 1:
             clip_ploss_coef = self.clip_ploss_coef_base + (
                 self.clip_ploss_coef - self.clip_ploss_coef_base
@@ -158,10 +156,7 @@ class PPODiffusion(VPGDiffusion):
                 math.exp(self.clip_ploss_coef_rate) - 1
             )
         else:
-            clip_ploss_coef = torch.tensor([self.clip_ploss_coef]).to(self.device)
-        clip_ploss_coef = clip_ploss_coef.repeat(
-            len(advantages) // self.ft_denoising_steps
-        )
+            clip_ploss_coef = torch.full_like(t, self.clip_ploss_coef)  # t is 0/0 = NaN with a single denoising step; use the constant coefficient per sample
 
         # get kl difference and whether value clipped
         with torch.no_grad():
diff --git a/model/diffusion/diffusion_vpg.py b/model/diffusion/diffusion_vpg.py
index e61b716..cfa9a5a 100644
--- a/model/diffusion/diffusion_vpg.py
+++ b/model/diffusion/diffusion_vpg.py
@@ -395,6 +395,71 @@ class VPGDiffusion(DiffusionModel):
             return log_prob, eta
         return log_prob
 
+    def get_logprobs_subsample(
+        self,
+        cond,
+        chains_prev,
+        chains_next,
+        denoising_inds,
+        get_ent: bool = False,
+        use_base_policy: bool = False,
+    ):
+        """
+        Log-probabilities of individually sampled denoising transitions (prev -> next).
+
+        Args:
+            cond: dict with key state/rgb; more recent obs at the end
+                state: (B, To, Do)
+                rgb: (B, To, C, H, W)
+            chains_prev: chain samples passed to p_mean_var; presumably (B, Ta, Da) - TODO confirm
+            chains_next: samples scored under the predicted Gaussian; presumably same shape as chains_prev
+            denoising_inds: index into the fine-tuned denoising steps for each sample; presumably (B,)
+            get_ent: flag for returning entropy
+            use_base_policy: flag for using base policy
+        Returns:
+            logprobs: (B, Ta, Da)
+            entropy (if get_ent=True): eta returned by p_mean_var; (B, Ta)
+        """
+        # Timesteps of the fine-tuned denoising steps (1-dim); indexed per sample below
+        if self.use_ddim:
+            t_single = self.ddim_t[-self.ft_denoising_steps :]
+        else:
+            t_single = torch.arange(
+                start=self.ft_denoising_steps - 1,
+                end=-1,
+                step=-1,
+                device=self.device,
+            )
+            # e.g., 4,3,2,1,0 when ft_denoising_steps = 5
+        t_all = t_single[denoising_inds]
+        if self.use_ddim:
+            ddim_indices_single = torch.arange(
+                start=self.ddim_steps - self.ft_denoising_steps,
+                end=self.ddim_steps,
+                device=self.device,
+            )  # only used for DDIM
+            ddim_indices = ddim_indices_single[denoising_inds]
+        else:
+            ddim_indices = None
+
+        # Forward pass with previous chains to get the mean/logvar of the next step
+        next_mean, logvar, eta = self.p_mean_var(
+            chains_prev,
+            t_all,
+            cond=cond,
+            index=ddim_indices,
+            use_base_policy=use_base_policy,
+        )
+        std = torch.exp(0.5 * logvar)
+        std = torch.clip(std, min=self.min_logprob_denoising_std)  # floor the std used for logprobs
+        dist = Normal(next_mean, std)
+
+        # Get logprobs with gaussian
+        log_prob = dist.log_prob(chains_next)
+        if get_ent:
+            return log_prob, eta
+        return log_prob
+
     def loss(self, cond, chains, reward):
         """
         REINFORCE loss. Not used right now.
diff --git a/model/rl/gaussian_calql.py b/model/rl/gaussian_calql.py
index 14d87f0..0ea9ddb 100644
--- a/model/rl/gaussian_calql.py
+++ b/model/rl/gaussian_calql.py
@@ -63,7 +63,6 @@ class CalQL_Gaussian(GaussianModel):
         returns,
         terminated,
         gamma,
-        alpha,
     ):
         B = len(actions)
 
@@ -71,17 +70,17 @@ class CalQL_Gaussian(GaussianModel):
         q_data1, q_data2 = self.critic(obs, actions)
         with torch.no_grad():
             # repeat for action samples
-            next_obs["state"] = next_obs["state"].repeat_interleave(
+            next_obs_repeated = {"state": next_obs["state"].repeat_interleave(
                 self.cql_n_actions, dim=0
-            )
+            )}
 
             # Get the next actions and logprobs
             next_actions, next_logprobs = self.forward(
-                next_obs,
+                next_obs_repeated,
                 deterministic=False,
                 get_logprob=True,
             )
-            next_q1, next_q2 = self.target_critic(next_obs, next_actions)
+            next_q1, next_q2 = self.target_critic(next_obs_repeated, next_actions)
             next_q = torch.min(next_q1, next_q2)
 
             # Reshape the next_q to match the number of samples
@@ -96,9 +95,6 @@ class CalQL_Gaussian(GaussianModel):
             # Get the target Q values
             target_q = rewards + gamma * (1 - terminated) * next_q
 
-            # Subtract the entropy bonus
-            target_q = target_q - alpha * next_logprobs
-
         # TD loss
         td_loss_1 = nn.functional.mse_loss(q_data1, target_q)
         td_loss_2 = nn.functional.mse_loss(q_data2, target_q)
@@ -111,6 +107,12 @@ class CalQL_Gaussian(GaussianModel):
             reparameterize=False,
             get_logprob=True,
         )  # no gradient
+        pi_next_actions, log_pi_next = self.forward(
+            next_obs,
+            deterministic=False,
+            reparameterize=False,
+            get_logprob=True,
+        )  # no gradient
 
         # Random action Q values
         n_random_actions = random_actions.shape[1]
@@ -130,17 +132,26 @@ class CalQL_Gaussian(GaussianModel):
 
         # Policy action Q values
         q_pi_1, q_pi_2 = self.critic(obs, pi_actions)
-        q_pi_1 = q_pi_1 - log_pi
-        q_pi_2 = q_pi_2 - log_pi
+        q_pi_next_1, q_pi_next_2 = self.critic(next_obs, pi_next_actions)
 
         # Ensure calibration w.r.t. value function estimate
         q_pi_1 = torch.max(q_pi_1, returns)[:, None]  # (B, 1)
         q_pi_2 = torch.max(q_pi_2, returns)[:, None]  # (B, 1)
-        cat_q_1 = torch.cat([q_rand_1, q_pi_1], dim=-1)  # (B, num_samples+1)
+        q_pi_next_1 = torch.max(q_pi_next_1, returns)[:, None]  # (B, 1)
+        q_pi_next_2 = torch.max(q_pi_next_2, returns)[:, None]  # (B, 1)
+
+        # cql_importance_sample
+        q_pi_1 = q_pi_1 - log_pi
+        q_pi_2 = q_pi_2 - log_pi
+        q_pi_next_1 = q_pi_next_1 - log_pi_next
+        q_pi_next_2 = q_pi_next_2 - log_pi_next
+        cat_q_1 = torch.cat([q_rand_1, q_pi_1, q_pi_next_1], dim=-1)  # (B, num_samples+2), assuming the policy terms stay (B, 1) - TODO confirm log_pi shape
         cql_qf1_ood = torch.logsumexp(cat_q_1, dim=-1)  # max over num_samples
-        cat_q_2 = torch.cat([q_rand_2, q_pi_2], dim=-1)  # (B, num_samples+1)
+        cat_q_2 = torch.cat([q_rand_2, q_pi_2, q_pi_next_2], dim=-1)  # (B, num_samples+2), assuming the policy terms stay (B, 1) - TODO confirm log_pi shape
         cql_qf2_ood = torch.logsumexp(cat_q_2, dim=-1)  # sum over num_samples
 
+        # skip cal_lagrange since the paper shows cql_target_action_gap not used in kitchen
+
         # Subtract the log likelihood of the data
         cql_qf1_diff = torch.clamp(
             cql_qf1_ood - q_data1,
diff --git a/model/rl/gaussian_ibrl.py b/model/rl/gaussian_ibrl.py
index ce96232..4a87f2d 100644
--- a/model/rl/gaussian_ibrl.py
+++ b/model/rl/gaussian_ibrl.py
@@ -20,7 +20,7 @@ class IBRL_Gaussian(GaussianModel):
         critic,
         n_critics,
         soft_action_sample=False,
-        soft_action_sample_beta=0.1,
+        soft_action_sample_beta=10,
         **kwargs,
     ):
         super().__init__(network=actor, **kwargs)
diff --git a/model/rl/gaussian_ppo.py b/model/rl/gaussian_ppo.py
index a7e3be8..05c047d 100644
--- a/model/rl/gaussian_ppo.py
+++ b/model/rl/gaussian_ppo.py
@@ -63,6 +63,23 @@ class PPO_Gaussian(VPG_Gaussian):
         oldlogprobs = oldlogprobs.clamp(min=-5, max=2)
         entropy_loss = -entropy
 
+        bc_loss = 0.0
+        if use_bc_loss:
+            # See Eqn. 2 of https://arxiv.org/pdf/2403.03949.pdf
+            # Give a reward for maximizing probability of teacher policy's action with current policy.
+            # Actions are chosen along trajectory induced by current policy.
+
+            # Get counterfactual teacher actions
+            samples = self.forward(
+                cond=obs,
+                deterministic=False,
+                use_base_policy=True,
+            )
+            # Get logprobs of teacher actions under this policy
+            bc_logprobs, _, _ = self.get_logprobs(obs, samples, use_base_policy=False)
+            bc_logprobs = bc_logprobs.clamp(min=-5, max=2)
+            bc_loss = -bc_logprobs.mean()
+
         # get ratio
         logratio = newlogprobs - oldlogprobs
         ratio = logratio.exp()
@@ -99,25 +116,6 @@ class PPO_Gaussian(VPG_Gaussian):
             v_loss = 0.5 * v_loss_max.mean()
         else:
             v_loss = 0.5 * ((newvalues - returns) ** 2).mean()
-
-        bc_loss = 0.0
-        if use_bc_loss:
-            # See Eqn. 2 of https://arxiv.org/pdf/2403.03949.pdf
-            # Give a reward for maximizing probability of teacher policy's action with current policy.
-            # Actions are chosen along trajectory induced by current policy.
-
-            # Get counterfactual teacher actions
-            samples = self.forward(
-                cond=obs.float()
-                .unsqueeze(1)
-                .to(self.device),  # B x horizon=1 x obs_dim
-                deterministic=False,
-                use_base_policy=True,
-            )
-            # Get logprobs of teacher actions under this policy
-            bc_logprobs, _, _ = self.get_logprobs(obs, samples, use_base_policy=False)
-            bc_logprobs = bc_logprobs.clamp(min=-5, max=2)
-            bc_loss = -bc_logprobs.mean()
         return (
             pg_loss,
             entropy_loss,
diff --git a/pyproject.toml b/pyproject.toml
index 0191c91..b1dbffe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "dppo"
-version = "0.5.0"
+version = "0.6.0"
 description = "Fine-tuning diffusion policies with PPO."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -32,6 +32,13 @@ gym = [
     "d4rl",
     "patchelf",
 ]
+kitchen = [
+    "cython<3",
+    "d4rl",
+    "dm_control==1.0.16",
+    "mujoco==3.1.6",
+    "patchelf",
+]
 robomimic = [
     "cython<3",
     "d4rl",
diff --git a/script/download_url.py b/script/download_url.py
index 9838088..1fa2069 100644
--- a/script/download_url.py
+++ b/script/download_url.py
@@ -7,6 +7,12 @@ def get_dataset_download_url(cfg):
         return "https://drive.google.com/drive/u/1/folders/1BJu8NklriunDHsDrLT6fEpcro3_2IPFf"
     elif env == "halfcheetah-medium-v2":
         return "https://drive.google.com/drive/u/1/folders/1Drel26tiuQ9oD3YNf1eyy0UVaf5SQj-U"
+    elif env == "kitchen-complete-v0":
+        return "https://drive.google.com/drive/u/1/folders/18aqg7KIv-YNXohTsRR7Zmg-RyDtdhkLc"
+    elif env == "kitchen-partial-v0":
+        return "https://drive.google.com/drive/u/1/folders/1zLOx1q4FbJK1ZWLui_vhM2x1fMEkBC2D"
+    elif env == "kitchen-mixed-v0":
+        return "https://drive.google.com/drive/u/1/folders/1HRMM16UC10A00oBqjYOL1E8hS5icwtvo"
     # D3IL
     elif env == "avoid" and cfg.mode == "d56_r12":  # M1
         return "https://drive.google.com/drive/u/1/folders/1ZAPvLQwv2y4Q98UDVKXFT4fvGF5yhD_o"
@@ -14,7 +20,20 @@ def get_dataset_download_url(cfg):
         return "https://drive.google.com/drive/u/1/folders/1wyJi1Zbnd6JNy4WGszHBH40A0bbl-vkd"
     elif env == "avoid" and cfg.mode == "d58_r12":  # M3
         return "https://drive.google.com/drive/u/1/folders/1mNXCIPnCO_FDBlEj95InA9eWJM2XcEEj"
-    # Robomimic
+    # Robomimic-PH
+    elif (
+        env == "can"
+        and "ph" in cfg.train_dataset_path
+        and "img" not in cfg.train_dataset_path
+    ):
+        return "https://drive.google.com/drive/folders/1rpVsdpqWPygL89E-t4SLQmZgwQ3mpNnY?usp=drive_link"
+    elif (
+        env == "square"
+        and "ph" in cfg.train_dataset_path
+        and "img" not in cfg.train_dataset_path
+    ):
+        return "https://drive.google.com/drive/folders/1wqqjT9JZ9LX11l2Sz_vGxfcT3BfcNrGk?usp=drive_link"
+    # Robomimic-MH
     elif env == "lift" and "img" not in cfg.train_dataset_path:  # state
         return "https://drive.google.com/drive/u/1/folders/1lbXgMKBTAiFdJqPZqWXpwjEyrVW16MBu"
     elif env == "lift" and "img" in cfg.train_dataset_path:  # img
@@ -58,6 +77,12 @@ def get_normalization_download_url(cfg):
         return "https://drive.google.com/file/d/1NSX7t3DFKaBj5HNpv91Oo5h6oXTk0zoo/view?usp=drive_link"
     elif env == "halfcheetah-medium-v2":
         return "https://drive.google.com/file/d/1LlwCMfy1b5e8jSx99CV3lWhcrQWrI2Jm/view?usp=drive_link"
+    elif env == "kitchen-complete-v0":
+        return "https://drive.google.com/file/d/1tBATWLoP1E5s08vr5fiUZBzn8EEsjEZh/view?usp=drive_link"
+    elif env == "kitchen-partial-v0":
+        return "https://drive.google.com/file/d/1Ptt0cwQwmb5_HGNM-zggRaDKfkqqNO5e/view?usp=drive_link"
+    elif env == "kitchen-mixed-v0":
+        return "https://drive.google.com/file/d/11gj846QTYFPeV14nhcL5Z9OA5RHIGVt1/view?usp=drive_link"
     # D3IL
     elif env == "avoiding-m5" and cfg.mode == "d56_r12":  # M1
         return "https://drive.google.com/file/d/1PubKaPabbiSdWYpGmouDhYfXp4QwNHFG/view?usp=drive_link"
@@ -65,7 +90,20 @@ def get_normalization_download_url(cfg):
         return "https://drive.google.com/file/d/1Hoohw8buhsLzXoqivMA6IzKS5Izlj07_/view?usp=drive_link"
     elif env == "avoiding-m5" and cfg.mode == "d58_r12":  # M3
         return "https://drive.google.com/file/d/1qt7apV52C9Tflsc-A55J6uDMHzaFa1wN/view?usp=drive_link"
-    # Robomimic
+    # Robomimic-PH
+    elif (
+        env == "can"
+        and "ph" in cfg.normalization_path
+        and "img" not in cfg.normalization_path
+    ):
+        return "https://drive.google.com/file/d/1y04FAEXgK6UlZuDiQzTumS9lz-Ufn47B/view?usp=drive_link"
+    elif (
+        env == "square"
+        and "ph" in cfg.normalization_path
+        and "img" not in cfg.normalization_path
+    ):
+        return "https://drive.google.com/file/d/1_75UM0frCZVtcROgfWsdJ0FstToZd1b5/view?usp=drive_link"
+    # Robomimic-MH
     elif env == "lift" and "img" not in cfg.normalization_path:  # state
         return "https://drive.google.com/file/d/1d3WjwRds-7I5bBFpZuY27OT9ycb8r_QM/view?usp=drive_link"
     elif env == "lift" and "img" in cfg.normalization_path:  # img
@@ -120,6 +158,71 @@ def get_checkpoint_download_url(cfg):
         in path
     ):
         return "https://drive.google.com/file/d/1o9ryyeZQAsaB4ffUTCJkIaGCi0frL3G4/view?usp=drive_link"
+    # Demo-RL
+    elif (
+        "halfcheetah-medium-v2_pre_diffusion_mlp_ta1_td20/2024-09-29_02-13-10_42/checkpoint/state_1000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1Oi5JhsU45ScHdlrtn5AX8Ji7InLBVj4D/view?usp=drive_link"
+    elif (
+        "halfcheetah-medium-v2_pre_gaussian_mlp_ta1/2024-09-28_18-48-54_42/checkpoint/state_500.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/14rbYGaCxvj1PtELKVfdXNHJ1Od2G6FLw/view?usp=drive_link"
+    elif (
+        "halfcheetah-medium-v2_calql_mlp_ta1/2024-09-29_22-59-08_42/checkpoint/state_49.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1Xf758xzsAqpFwV955OVUNL6Za90XPo1K/view?usp=drive_link"
+
+    elif (
+        "kitchen-complete-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-47-42_42/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1YBwyNd30a4_inu2sZzNSNLJQsj8fN3ZX/view?usp=drive_link"
+    elif (
+        "kitchen-complete-v0_calql_mlp_ta1/2024-10-26_01-01-33_42/checkpoint/state_999.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1K4V59iXNQbpOvu3u5y6C9R5piMU9idYm/view?usp=drive_link"
+    elif (
+        "kitchen-complete-v0_pre_gaussian_mlp_ta1/2024-10-25_14-48-43_42/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1tQYgnkdhR5wnuXC4Ha_mKHuIdg6J627s/view?usp=drive_link"
+
+    elif (
+        "kitchen-partial-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-29_42/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1oSupKkUjCFQVWBIJV5Seh-CclWhgpopS/view?usp=drive_link"
+    elif (
+        "kitchen-partial-v0_calql_mlp_ta1/2024-10-25_21-26-51_42/checkpoint/state_980.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/17HUDp3l8mJsMIW-DRraKPhUkH44KGTbA/view?usp=drive_link"
+    elif (
+        "kitchen-partial-v0_pre_gaussian_mlp_ta1/2024-10-25_01-45-52_42/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1-ZmGRPi4jMS7HfqHPvWrSPxNSoTwih6q/view?usp=drive_link"
+
+    elif (
+        "kitchen-mixed-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-28_42/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1X24Hqbn4b4xyLK_1A3D6zhSgsN7frVCG/view?usp=drive_link"
+    elif (
+        "kitchen-mixed-v0_calql_mlp_ta1/2024-10-25_21-36-13_42/checkpoint/state_999.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1AP7bbzAwwfuSLmV1HkQLfmd76MXQn2Za/view?usp=drive_link"
+    elif (
+        "kitchen-mixed-v0_pre_gaussian_mlp_ta1/2024-10-25_01-39-44_42/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1LEzGhMOqL3YZFXMGn1mTcOh-tm4Lh1SH/view?usp=drive_link"
+
     ######################################
     ####             D3IL
     ######################################
@@ -246,6 +349,32 @@ def get_checkpoint_download_url(cfg):
         in path
     ):
         return "https://drive.google.com/file/d/1xSgwGG40zdoO2DDSM79l0rMHeNmaifnq/view?usp=drive_link"
+    # demo-PH
+    elif (
+        "can_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1Ze86hw2E0jJinn3Vx683JQ10Gq5FIJad/view?usp=drive_link"
+    elif (
+        "can_pre_gaussian_mlp_ta1/2024-10-08_20-52-04_0/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1jP3mEOhZojWiTXCMZ0ajFRMkAAmonGxV/view?usp=drive_link"
+    elif "can_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt" in path:
+        return "https://drive.google.com/file/d/1ERaZKTXmL-vdyU8PZ2X9GjFIMVKJjA2N/view?usp=drive_link"
+    # demo-MH
+    elif (
+        "can_pre_diffusion_mlp_ta1_td20/2024-09-29_15-43-07_42/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1pEs1cK1x5obAtJA9pFSN1CWG79gNhH24/view?usp=drive_link"
+    elif (
+        "can_pre_gaussian_mlp_ta1/2024-09-28_13-43-59_42/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1Fa3yflkvYSAy6PKT646U1VAqUJ0YHqsj/view?usp=drive_link"
+    elif "can_calql_mlp_ta1/2024-10-25_22-30-16_42/checkpoint/state_999.pt" in path:
+        return "https://drive.google.com/file/d/1AA94uEaK_SzG2mTpaKqZIwNMh6omL_g0/view?usp=drive_link"
     ######################################
     ####             Robomimic-Square
     ######################################
@@ -286,6 +415,32 @@ def get_checkpoint_download_url(cfg):
         in path
     ):
         return "https://drive.google.com/file/d/1LczXhgeNtQfqySsfGNbbviPrlLwyh-E3/view?usp=drive_link"
+    # demo-PH
+    elif (
+        "square_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1_Jnz14ySxqbtZa9IIEWkXqy5_-EwJLBw/view?usp=drive_link"
+    elif (
+        "square_pre_gaussian_mlp_ta1/2024-10-08_20-52-42_0/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1ZPWKUoZ93OqqVX3ephQMkpeBZoYrceM5/view?usp=drive_link"
+    elif "square_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt" in path:
+        return "https://drive.google.com/file/d/1_7YtUwRd_U5tuOvhHogJDhkEsE-4D24V/view?usp=drive_link"
+    # demo-MH
+    elif (
+        "square_pre_diffusion_mlp_ta1_td20/2024-09-29_02-14-14_42/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1ks1PnUBvFVWPnpGnYL8_eIfLNeGZbv1p/view?usp=drive_link"
+    elif (
+        "square_pre_gaussian_mlp_ta1/2024-09-28_13-42-43_42/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1uIOn8QUkGRbhZLkQ9ziOkP7yGQnpYdk7/view?usp=drive_link"
+    elif "square_calql_mlp_ta1/2024-10-25_22-44-12_42/checkpoint/state_999.pt" in path:
+        return "https://drive.google.com/file/d/1zgzG6bx6ugAEaq72z9WpXX6iewClcKTV/view?usp=drive_link"
     ######################################
     ####             Robomimic-Transport
     ######################################