commit 8293b0936b62a63ffe75f993f61d32177be8f470
Author: allenzren <allen.ren@princeton.edu>
Date:   Tue Sep 3 21:03:27 2024 -0400

    release

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f145a12
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,146 @@
+# Custom
+result/
+wandb/
+.vscode/
+data/
+log/
+logs/
+checkpoints/
+.github/
+out/
+err/
+*.pkl
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+# env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+cfg/robomimic/*.sh
+
+*.out
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..68bcaa9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Intelligent Robot Motion Lab
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..227697c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,189 @@
+# Diffusion Policy Policy Optimization (DPPO)
+
+[[Paper](https://arxiv.org/abs/2409.00588)]&nbsp;&nbsp;[[Website](https://diffusion-ppo.github.io/)]
+
+[Allen Z. Ren](https://allenzren.github.io/)<sup>1</sup>, [Justin Lidard](https://jlidard.github.io/)<sup>1</sup>, [Lars L. Ankile](https://ankile.com/)<sup>2,3</sup>, [Anthony Simeonov](https://anthonysimeonov.github.io/)<sup>3</sup><br>
+[Pulkit Agrawal](https://people.csail.mit.edu/pulkitag/)<sup>3</sup>, [Anirudha Majumdar](https://mae.princeton.edu/people/faculty/majumdar)<sup>1</sup>, [Benjamin Burchfiel](http://www.benburchfiel.com/)<sup>4</sup>, [Hongkai Dai](https://hongkai-dai.github.io/)<sup>4</sup>, [Max Simchowitz](https://msimchowitz.github.io/)<sup>3,5</sup>
+
+<sup>1</sup>Princeton University, <sup>2</sup>Harvard University, <sup>3</sup>Masschusetts Institute of Technology<br>
+<sup>4</sup>Toyota Research Institute, <sup>5</sup>Carnegie Mellon University
+
+<img src="https://github.com/diffusion-ppo/diffusion-ppo.github.io/blob/main/img/overview-full.png" alt="drawing" width="100%"/>
+
+> DPPO is an algorithmic framework and set of best practices for fine-tuning diffusion-based policies in continuous control and robot learning tasks.
+
+<!-- ## Release
+
+* [08/30/24] [DPPO](https://diffusion-ppo.github.io/) codebase and technical whitepaper are released.  -->
+
+## Installation 
+
+1. Clone the repository
+```console
+git clone git@github.com:allenzren/dppo.git
+cd dppo
+```
+
+2. Install core dependencies with a conda environment (if you do not plan to use Furniture-Bench, a higher Python version such as 3.10 can be installed instead) on a Linux machine with a Nvidia GPU.
+```console
+conda create -n dppo python=3.8 -y
+conda activate dppo
+pip install -e .
+```
+
+3. Install specific environment dependencies (Gym / Robomimic / D3IL / Furniture-Bench) or all dependencies
+```console
+pip install -e .[gym] # or [robomimic], [d3il], [furniture]
+pip install -e .[all]
+```
+<!-- **Note**: Please do not set macros for robomimic and robosuite that the warnings suggest --- we will use some different global variables than the ones defined in macro.py  -->
+
+4. [Install MuJoCo for Gym and/or Robomimic](installation/install_mujoco.md). [Install D3IL](installation/install_d3il.md). [Install IsaacGym and Furniture-Bench](installation/install_furniture.md)
+
+5. Set environment variables for data and logging directory (default is `data/` and `log/`), and set WandB entity (username or team name)
+```
+source script/set_path.sh
+```
+
+## Usage - Pre-training
+
+**Note**: You may skip pre-training if you would like to use the default checkpoint (available for download) for fine-tuning.
+
+<!-- ### Prepare pre-training data
+
+First create a directory as the parent directory of the pre-training data and set the environment variable for it.
+```console
+export DPPO_DATA_DIR=/path/to/data -->
+<!-- ``` -->
+
+Pre-training data for all tasks are pre-processed and can be found at [here](https://drive.google.com/drive/folders/1AXZvNQEKOrp0_jk1VLepKh_oHCg_9e3r?usp=drive_link). Pre-training script will download the data (including normalization statistics) automatically to the data directory.
+<!-- The data path follows `${DPPO_DATA_DIR}/<benchmark>/<task>/train.npz`, e.g., `${DPPO_DATA_DIR}/gym/hopper-medium-v2/train.pkl`. -->
+
+### Run pre-training with data
+All the configs can be found under `cfg/<env>/pretrain/`. A new WandB project may be created based on `wandb.project` in the config file; set `wandb=null` in the command line to test without WandB logging.
+<!-- To run pre-training, first set your WandB entity (username or team name) and the parent directory for logging as environment variables. -->
+<!-- ```console
+export DPPO_WANDB_ENTITY=<your_wandb_entity>
+export DPPO_LOG_DIR=<your_prefered_logging_directory>
+``` -->
+```console
+# Gym - hopper/walker2d/halfcheetah
+python script/train.py --config-name=pre_diffusion_mlp \
+    --config-dir=cfg/gym/pretrain/hopper-medium-v2
+# Robomimic - lift/can/square/transport
+python script/train.py --config-name=pre_diffusion_mlp \
+    --config-dir=cfg/robomimic/pretrain/can
+# D3IL - avoid_m1/m2/m3
+python script/train.py --config-name=pre_diffusion_mlp \
+    --config-dir=cfg/d3il/pretrain/avoid_m1
+# Furniture-Bench - one_leg/lamp/round_table_low/med
+python script/train.py --config-name=pre_diffusion_mlp \
+    --config-dir=cfg/furniture/pretrain/one_leg_low
+```
+
+See [here](cfg/pretraining.md) for details of the experiments in the paper.
+
+
+## Usage - Fine-tuning
+
+<!-- ### Set up pre-trained policy -->
+
+<!-- If you did not set the environment variables for pre-training, we need to set them here for fine-tuning. 
+```console
+export DPPO_WANDB_ENTITY=<your_wandb_entity>
+export DPPO_LOG_DIR=<your_prefered_logging_directory>
+``` -->
+<!-- First create a directory as the parent directory of the downloaded checkpoints and set the environment variable for it.
+```console
+export DPPO_LOG_DIR=/path/to/checkpoint
+``` -->
+
+Pre-trained policies used in the paper can be found [here](https://drive.google.com/drive/folders/1ZlFqmhxC4S8Xh1pzZ-fXYzS5-P8sfpiP?usp=drive_link). Fine-tuning script will download the default checkpoint automatically to the logging directory.
+ <!-- or you may manually download other ones (different epochs) or use your own pre-trained policy if you like. -->
+
+ <!-- e.g., `${DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-08-26_22-31-03_42/checkpoint/state_0.pt`. -->
+
+<!-- The checkpoint path follows `${DPPO_LOG_DIR}/<benchmark>/<task>/.../<run>/checkpoint/state_<epoch>.pt`. -->
+
+### Fine-tuning pre-trained policy
+
+All the configs can be found under `cfg/<env>/finetune/`. A new WandB project may be created based on `wandb.project` in the config file; set `wandb=null` in the command line to test without WandB logging.
+<!-- Running them will download the default pre-trained policy. -->
+<!-- Running the script will download the default pre-trained policy checkpoint specified in the config (`base_policy_path`) automatically, as well as the normalization statistics, to `DPPO_LOG_DIR`.  -->
+```console
+# Gym - hopper/walker2d/halfcheetah
+python script/train.py --config-name=ft_ppo_diffusion_mlp \
+    --config-dir=cfg/gym/finetune/hopper-v2
+# Robomimic - lift/can/square/transport
+python script/train.py --config-name=ft_ppo_diffusion_mlp \
+    --config-dir=cfg/robomimic/finetune/can
+# D3IL - avoid_m1/m2/m3
+python script/train.py --config-name=ft_ppo_diffusion_mlp \
+    --config-dir=cfg/d3il/finetune/avoid_m1
+# Furniture-Bench - one_leg/lamp/round_table_low/med
+python script/train.py --config-name=ft_ppo_diffusion_mlp \
+    --config-dir=cfg/furniture/finetune/one_leg_low
+```
+
+**Note**: In Gym, Robomimic, and D3IL tasks, we run 40, 50, and 50 parallelized MuJoCo environments on CPU, respectively. If you would like to use fewer environments (given limited CPU threads, or GPU memory for rendering), you can, e.g., set `env.n_envs` half and `train.n_steps` double, thus the total number of steps collected in each iteration remains the same. Furniture-Bench tasks run IsaacGym on a single GPU.
+
+To fine-tune your own pre-trained policy instead, override `base_policy_path` to your own checkpoint, which is saved under `checkpoint/` of the pre-training directory. You can set `base_policy_path=<path>` in the command line when launching fine-tuning.
+
+<!-- **Note**: If you did not download the pre-training [data](https://drive.google.com/drive/folders/1AXZvNQEKOrp0_jk1VLepKh_oHCg_9e3r?usp=drive_link), you need to download the normalization statistics from it for fine-tuning, e.g., `${DPPO_DATA_DIR}/furniture/round_table_low/normalization.pkl`. -->
+
+See [here](cfg/finetuning.md) for details of the experiments in the paper.
+
+
+### Visualization
+* Furniture-Bench tasks can be visualized in GUI by specifying `env.specific.headless=False` and `env.n_envs=1` in fine-tuning configs.
+* D3IL environment can be visualized in GUI by `+env.render=True`, `env.n_envs=1`, and `train.render.num=1`. There is a basic script at `script/test_d3il_render.py`.
+* Videos of trials in Robomimic tasks can be recorded by specifying `env.save_video=True`, `train.render.freq=<iterations>`, and `train.render.num=<num_video>` in fine-tuning configs.
+
+## DPPO implementation
+
+Our diffusion implementation is mostly based on [Diffuser](https://github.com/jannerm/diffuser) and at [`model/diffusion/diffusion.py`](model/diffusion/diffusion.py) and [`model/diffusion/diffusion_vpg.py`](model/diffusion/diffusion_vpg.py). PPO specifics are implemented at [`model/diffusion/diffusion_ppo.py`](model/diffusion/diffusion_ppo.py). The main training script is at [`agent/finetune/train_ppo_diffusion_agent.py`](agent/finetune/train_ppo_diffusion_agent.py) that follows [CleanRL](https://github.com/vwxyzjn/cleanrl).
+
+### Key configurations
+* `denoising_steps`: number of denoising steps (should always be the same for pre-training and fine-tuning regardless the fine-tuning scheme)
+* `ft_denoising_steps`: number of fine-tuned denoising steps
+* `horizon_steps`: predicted action chunk size (should be the same as `act_steps`, executed action chunk size, with MLP. Can be different with UNet, e.g., `horizon_steps=16` and `act_steps=8`)
+* `model.gamma_denoising`: denoising discount factor
+* `model.min_sampling_denoising_std`: <img src="https://latex.codecogs.com/gif.latex?\epsilon^\text{exp}_\text{min} "/>, minimum amount of noise when sampling at a denoising step
+* `model.min_logprob_denoising_std`: <img src="https://latex.codecogs.com/gif.latex?\epsilon^\text{prob}_\text{min} "/>, minimum standard deviation when evaluating likelihood at a denoising step
+* `model.clip_ploss_coef`: PPO clipping ratio
+
+### DDIM fine-tuning
+
+To use DDIM fine-tuning, set `denoising_steps=100` in pre-training and set `model.use_ddim=True`, `model.ddim_steps` to the desired number of total DDIM steps, and `ft_denoising_steps` to the desired number of fine-tuned DDIM steps. In our Furniture-Bench experiments we use `denoising_steps=100`, `model_ddim_steps=5`, and `ft_denoising_steps=5`.
+
+## Adding your own dataset/environment
+
+### Pre-training data
+Pre-training script is at [`agent/pretrain/train_diffusion_agent.py`](agent/pretrain/train_diffusion_agent.py). The pre-training dataset [loader](agent/dataset/sequence.py) assumes a pickle file containing a dictionary of `observations`, `actions`, and `traj_length`, where `observations` and `actions` have the shape of num_episode x max_episode_length x obs_dim/act_dim, and `traj_length` is a 1-D array. One pre-processing example can be found at [`script/process_robomimic_dataset.py`](script/process_robomimic_dataset.py).
+
+**Note:** The current implementation does not support loading history observations (only using observation at the current timestep). If needed, you can modify [here](agent/dataset/sequence.py#L130-L131).
+
+### Fine-tuning environment
+We follow the Gym format for interacting with the environments. The vectorized environments are initialized at [make_async](env/gym_utils/__init__.py#L10) (called in the parent fine-tuning agent class [here](agent/finetune/train_agent.py#L38-L39)). The current implementation is not the cleanest as we tried to make it compatible with Gym, Robomimic, Furniture-Bench, and D3IL environments, but it should be easy to modify and allow using other environments. We use [multi_step](env/gym_utils/wrapper/multi_step.py) wrapper for history observations (not used currently) and multi-environment-step action execution. We also use environment-specific wrappers such as [robomimic_lowdim](env/gym_utils/wrapper/robomimic_lowdim.py) and [furniture](env/gym_utils/wrapper/furniture.py) for observation/action normalization, etc. You can implement a new environment wrapper if needed.
+
+## Known issues
+* IsaacGym simulation can become unstable at times and lead to NaN observations in Furniture-Bench. The current env wrapper does not handle NaN observations.
+
+## License
+This repository is released under the MIT license. See [LICENSE](LICENSE).
+
+## Acknowledgement
+* [Diffuser, Janner et al.](https://github.com/jannerm/diffuser): general code base and DDPM implementation
+* [Diffusion Policy, Chi et al.](https://github.com/real-stanford/diffusion_policy): general code base especially the env wrappers
+* [CleanRL, Huang et al.](https://github.com/vwxyzjn/cleanrl): PPO implementation
+* [IBRL, Hu et al.](https://github.com/hengyuan-hu/ibrl): ViT implementation
+* [D3IL, Jia et al.](https://github.com/ALRhub/d3il): D3IL benchmark
+* [Robomimic, Mandlekar et al.](https://github.com/ARISE-Initiative/robomimic): Robomimic benchmark
+* [Furniture-Bench, Heo et al.](https://github.com/clvrai/furniture-bench): Furniture-Bench benchmark
+* [AWR, Peng et al.](https://github.com/xbpeng/awr): DAWR baseline (modified from AWR)
+* [DIPO, Yang et al.](https://github.com/BellmanTimeHut/DIPO): DIPO baseline
+* [IDQL, Hansen-Estruch et al.](https://github.com/philippe-eecs/IDQL): IDQL baseline
+* [DQL, Wang et al.](https://github.com/Zhendong-Wang/Diffusion-Policies-for-Offline-RL): DQL baseline
+* [QSM, Psenka et al.](https://www.michaelpsenka.io/qsm/): QSM baseline
+* [Score SDE, Song et al.](https://github.com/yang-song/score_sde_pytorch/): diffusion exact likelihood
+
diff --git a/agent/dataset/__init__.py b/agent/dataset/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/agent/dataset/buffer.py b/agent/dataset/buffer.py
new file mode 100644
index 0000000..251c505
--- /dev/null
+++ b/agent/dataset/buffer.py
@@ -0,0 +1,110 @@
+"""
+Pre-training data loader. Modified from https://github.com/jannerm/diffuser/blob/main/diffuser/datasets/buffer.py
+
+"""
+
+import numpy as np
+import torch
+
+
+def atleast_2d(x):
+    if isinstance(x, torch.Tensor):
+        while x.dim() < 2:
+            x = x.unsqueeze(-1)
+        return x
+    else:
+        while x.ndim < 2:
+            x = np.expand_dims(x, axis=-1)
+    return x
+
+
+class StitchedBuffer:
+
+    def __init__(
+        self,
+        sum_of_path_lengths,
+        device="cpu",
+    ):
+        self.sum_of_path_lengths = sum_of_path_lengths
+        if device == "cpu":
+            self._dict = {
+                "path_lengths": np.zeros(sum_of_path_lengths, dtype=int),
+            }
+        else:
+            self._dict = {
+                "path_lengths": torch.zeros(sum_of_path_lengths, dtype=int).to(device),
+            }
+        self._count = 0
+        self.sum_of_path_lengths = sum_of_path_lengths
+        self.device = device
+
+    def __repr__(self):
+        return "Fields:\n" + "\n".join(
+            f"    {key}: {val.shape}" for key, val in self.items()
+        )
+
+    def __getitem__(self, key):
+        return self._dict[key]
+
+    def __setitem__(self, key, val):
+        self._dict[key] = val
+        self._add_attributes()
+
+    @property
+    def n_episodes(self):
+        return self._count
+
+    @property
+    def n_steps(self):
+        return sum(self["path_lengths"])
+
+    def _add_keys(self, path):
+        if hasattr(self, "keys"):
+            return
+        self.keys = list(path.keys())
+
+    def _add_attributes(self):
+        """
+        can access fields with `buffer.observations`
+        instead of `buffer['observations']`
+        """
+        for key, val in self._dict.items():
+            setattr(self, key, val)
+
+    def items(self):
+        return {k: v for k, v in self._dict.items() if k != "path_lengths"}.items()
+
+    def _allocate(self, key, array):
+        assert key not in self._dict
+        dim = array.shape[1:]  # skip batch dimension
+        shape = (self.sum_of_path_lengths, *dim)
+        if self.device == "cpu":
+            self._dict[key] = np.zeros(shape, dtype=np.float32)
+        else:
+            self._dict[key] = torch.zeros(shape, dtype=torch.float32).to(self.device)
+        # print(f'[ utils/mujoco ] Allocated {key} with size {shape}')
+
+    def add_path(self, path):
+        path_length = len(path["observations"])
+        # assert path_length <= self.sum_of_path_lengths
+
+        ## if first path added, set keys based on contents
+        self._add_keys(path)
+
+        ## add tracked keys in path
+        for key in self.keys:
+            array = atleast_2d(path[key])
+            if key not in self._dict:
+                self._allocate(key, array)
+            self._dict[key][self._count : self._count + path_length] = array
+
+        ## record path length
+        self._dict["path_lengths"][
+            self._count : self._count + path_length
+        ] = path_length
+
+        ## increment path counter
+        self._count += path_length
+
+    def finalize(self):
+        self._add_attributes()
diff --git a/agent/dataset/d3il_dataset/__init__.py b/agent/dataset/d3il_dataset/__init__.py
new file mode 100644
index 0000000..68c6d21
--- /dev/null
+++ b/agent/dataset/d3il_dataset/__init__.py
@@ -0,0 +1,18 @@
+import os
+
+FRAMEWORK_DIR = os.path.dirname(__file__)
+
+
+def sim_framework_path(*args) -> str:
+    """
+    Abstraction from os.path.join()
+    Builds absolute paths from relative path strings with SIM_FRAMEWORK/ as root.
+    If args already contains an absolute path, it is used as root for the subsequent joins
+    Args:
+        *args:
+
+    Returns:
+        absolute path
+
+    """
+    return os.path.abspath(os.path.join(FRAMEWORK_DIR, *args))
diff --git a/agent/dataset/d3il_dataset/aligning_dataset.py b/agent/dataset/d3il_dataset/aligning_dataset.py
new file mode 100644
index 0000000..1b5a5f3
--- /dev/null
+++ b/agent/dataset/d3il_dataset/aligning_dataset.py
@@ -0,0 +1,342 @@
+import random
+from typing import Optional, Callable, Any
+import logging
+
+import os
+import glob
+
+try:
+    import cv2  # not included in pyproject.toml
+except:
+    print("Installing cv2")
+    os.system("pip install opencv-python")
+import torch
+import pickle
+import numpy as np
+from tqdm import tqdm
+
+from agent.dataset.d3il_dataset.base_dataset import TrajectoryDataset
+from agent.dataset.d3il_dataset import sim_framework_path
+
+
+class Aligning_Dataset(TrajectoryDataset):
+    def __init__(
+        self,
+        data_directory: os.PathLike,
+        device="cpu",
+        obs_dim: int = 20,
+        action_dim: int = 2,
+        max_len_data: int = 256,
+        window_size: int = 1,
+    ):
+
+        super().__init__(
+            data_directory=data_directory,
+            device=device,
+            obs_dim=obs_dim,
+            action_dim=action_dim,
+            max_len_data=max_len_data,
+            window_size=window_size,
+        )
+
+        logging.info("Loading Robot Push Dataset")
+
+        inputs = []
+        actions = []
+        masks = []
+
+        # data_dir = sim_framework_path(data_directory)
+        # state_files = glob.glob(data_dir + "/env*")
+
+        rp_data_dir = sim_framework_path("data/aligning/all_data/state")
+        state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
+
+        for file in state_files:
+
+            with open(os.path.join(rp_data_dir, file), "rb") as f:
+                env_state = pickle.load(f)
+
+            # lengths.append(len(env_state['robot']['des_c_pos']))
+
+            zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
+            zero_action = np.zeros(
+                (1, self.max_len_data, self.action_dim), dtype=np.float32
+            )
+            zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
+
+            # robot and box positions
+            robot_des_pos = env_state["robot"]["des_c_pos"]
+
+            robot_c_pos = env_state["robot"]["c_pos"]
+
+            push_box_pos = env_state["push-box"]["pos"]
+            push_box_quat = env_state["push-box"]["quat"]
+
+            target_box_pos = env_state["target-box"]["pos"]
+            target_box_quat = env_state["target-box"]["quat"]
+
+            # target_box_pos = np.zeros(push_box_pos.shape)
+            # target_box_quat = np.zeros(push_box_quat.shape)
+            # target_box_pos[:] = push_box_pos[-1:]
+            # target_box_quat[:] = push_box_quat[-1:]
+
+            input_state = np.concatenate(
+                (
+                    robot_des_pos,
+                    robot_c_pos,
+                    push_box_pos,
+                    push_box_quat,
+                    target_box_pos,
+                    target_box_quat,
+                ),
+                axis=-1,
+            )
+
+            vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
+
+            valid_len = len(input_state) - 1
+
+            zero_obs[0, :valid_len, :] = input_state[:-1]
+            zero_action[0, :valid_len, :] = vel_state
+            zero_mask[0, :valid_len] = 1
+
+            inputs.append(zero_obs)
+            actions.append(zero_action)
+            masks.append(zero_mask)
+
+        # shape: B, T, n
+        self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
+        self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
+        self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
+
+        self.num_data = len(self.observations)
+
+        self.slices = self.get_slices()
+
+    def get_slices(self):
+        slices = []
+
+        min_seq_length = np.inf
+        for i in range(self.num_data):
+            T = self.get_seq_length(i)
+            min_seq_length = min(T, min_seq_length)
+
+            if T - self.window_size < 0:
+                print(
+                    f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
+                )
+            else:
+                slices += [
+                    (i, start, start + self.window_size)
+                    for start in range(T - self.window_size + 1)
+                ]  # slice indices follow convention [start, end)
+
+        return slices
+
+    def get_seq_length(self, idx):
+        return int(self.masks[idx].sum().item())
+
+    def get_all_actions(self):
+        result = []
+        # mask out invalid actions
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.actions[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def get_all_observations(self):
+        result = []
+        # mask out invalid observations
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.observations[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def __len__(self):
+        return len(self.slices)
+
+    def __getitem__(self, idx):
+
+        i, start, end = self.slices[idx]
+
+        obs = self.observations[i, start:end]
+        act = self.actions[i, start:end]
+        mask = self.masks[i, start:end]
+
+        return obs, act, mask
+
+
+class Aligning_Img_Dataset(TrajectoryDataset):
+    def __init__(
+        self,
+        data_directory: os.PathLike,
+        device="cpu",
+        obs_dim: int = 20,
+        action_dim: int = 2,
+        max_len_data: int = 256,
+        window_size: int = 1,
+    ):
+
+        super().__init__(
+            data_directory=data_directory,
+            device=device,
+            obs_dim=obs_dim,
+            action_dim=action_dim,
+            max_len_data=max_len_data,
+            window_size=window_size,
+        )
+
+        logging.info("Loading Robot Push Dataset")
+
+        inputs = []
+        actions = []
+        masks = []
+
+        data_dir = sim_framework_path("environments/dataset/data/aligning/all_data")
+
+        state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
+
+        bp_cam_imgs = []
+        inhand_cam_imgs = []
+
+        for file in tqdm(state_files[:3]):
+
+            with open(os.path.join(data_dir, "state", file), "rb") as f:
+                env_state = pickle.load(f)
+
+            # lengths.append(len(env_state['robot']['des_c_pos']))
+
+            zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
+            zero_action = np.zeros(
+                (1, self.max_len_data, self.action_dim), dtype=np.float32
+            )
+            zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
+
+            # robot and box positions
+            robot_des_pos = env_state["robot"]["des_c_pos"]
+            robot_c_pos = env_state["robot"]["c_pos"]
+
+            file_name = os.path.basename(file).split(".")[0]
+
+            ###############################################################
+            bp_images = []
+            bp_imgs = glob.glob(data_dir + "/images/bp-cam/" + file_name + "/*")
+            bp_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
+
+            for img in bp_imgs:
+                image = cv2.imread(img).astype(np.float32)
+                image = image.transpose((2, 0, 1)) / 255.0
+
+                image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
+
+                bp_images.append(image)
+
+            bp_images = torch.concatenate(bp_images, dim=0)
+            ################################################################
+            inhand_imgs = glob.glob(data_dir + "/images/inhand-cam/" + file_name + "/*")
+            inhand_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
+            inhand_images = []
+            for img in inhand_imgs:
+                image = cv2.imread(img).astype(np.float32)
+                image = image.transpose((2, 0, 1)) / 255.0
+
+                image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
+
+                inhand_images.append(image)
+            inhand_images = torch.concatenate(inhand_images, dim=0)
+            ##################################################################
+
+            # push_box_pos = env_state['push-box']['pos']
+            # push_box_quat = env_state['push-box']['quat']
+            #
+            # target_box_pos = env_state['target-box']['pos']
+            # target_box_quat = env_state['target-box']['quat']
+
+            # target_box_pos = np.zeros(push_box_pos.shape)
+            # target_box_quat = np.zeros(push_box_quat.shape)
+            # target_box_pos[:] = push_box_pos[-1:]
+            # target_box_quat[:] = push_box_quat[-1:]
+
+            # input_state = np.concatenate((robot_des_pos), axis=-1)
+
+            vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
+
+            valid_len = len(vel_state)
+
+            zero_obs[0, :valid_len, :] = robot_des_pos[:-1]
+            zero_action[0, :valid_len, :] = vel_state
+            zero_mask[0, :valid_len] = 1
+
+            bp_cam_imgs.append(bp_images)
+            inhand_cam_imgs.append(inhand_images)
+
+            inputs.append(zero_obs)
+            actions.append(zero_action)
+            masks.append(zero_mask)
+
+        self.bp_cam_imgs = bp_cam_imgs
+        self.inhand_cam_imgs = inhand_cam_imgs
+
+        # shape: B, T, n
+        self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
+        self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
+        self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
+
+        self.num_data = len(self.observations)
+
+        self.slices = self.get_slices()
+
+    def get_slices(self):
+        slices = []
+
+        min_seq_length = np.inf
+        for i in range(self.num_data):
+            T = self.get_seq_length(i)
+            min_seq_length = min(T, min_seq_length)
+
+            if T - self.window_size < 0:
+                print(
+                    f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
+                )
+            else:
+                slices += [
+                    (i, start, start + self.window_size)
+                    for start in range(T - self.window_size + 1)
+                ]  # slice indices follow convention [start, end)
+
+        return slices
+
+    def get_seq_length(self, idx):
+        return int(self.masks[idx].sum().item())
+
+    def get_all_actions(self):
+        result = []
+        # mask out invalid actions
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.actions[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def get_all_observations(self):
+        result = []
+        # mask out invalid observations
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.observations[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def __len__(self):
+        return len(self.slices)
+
+    def __getitem__(self, idx):
+
+        i, start, end = self.slices[idx]
+
+        obs = self.observations[i, start:end]
+        act = self.actions[i, start:end]
+        mask = self.masks[i, start:end]
+
+        bp_imgs = self.bp_cam_imgs[i][start:end]
+        inhand_imgs = self.inhand_cam_imgs[i][start:end]
+
+        return bp_imgs, inhand_imgs, obs, act, mask
diff --git a/agent/dataset/d3il_dataset/avoiding_dataset.py b/agent/dataset/d3il_dataset/avoiding_dataset.py
new file mode 100644
index 0000000..eb32c81
--- /dev/null
+++ b/agent/dataset/d3il_dataset/avoiding_dataset.py
@@ -0,0 +1,126 @@
+import logging
+
+import os
+import torch
+import pickle
+import numpy as np
+
+from agent.dataset.d3il_dataset.base_dataset import TrajectoryDataset
+
+
+class Avoiding_Dataset(TrajectoryDataset):
+    def __init__(
+        self,
+        data_directory: os.PathLike,
+        device="cpu",
+        obs_dim: int = 20,
+        action_dim: int = 2,
+        max_len_data: int = 256,
+        window_size: int = 1,
+    ):
+
+        super().__init__(
+            data_directory=data_directory,
+            device=device,
+            obs_dim=obs_dim,
+            action_dim=action_dim,
+            max_len_data=max_len_data,
+            window_size=window_size,
+        )
+
+        logging.info("Loading Sorting Dataset")
+
+        inputs = []
+        actions = []
+        masks = []
+
+        data_dir = data_directory
+        state_files = os.listdir(data_dir)
+
+        for file in state_files:
+            with open(os.path.join(data_dir, file), "rb") as f:
+                env_state = pickle.load(f)
+
+            zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
+            zero_action = np.zeros(
+                (1, self.max_len_data, self.action_dim), dtype=np.float32
+            )
+            zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
+
+            # robot and box posistion
+            robot_des_pos = env_state["robot"]["des_c_pos"][:, :2]
+            robot_c_pos = env_state["robot"]["c_pos"][:, :2]
+
+            input_state = np.concatenate((robot_des_pos, robot_c_pos), axis=-1)
+
+            vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
+            valid_len = len(vel_state)
+
+            zero_obs[0, :valid_len, :] = input_state[:-1]
+            zero_action[0, :valid_len, :] = vel_state
+            zero_mask[0, :valid_len] = 1
+
+            inputs.append(zero_obs)
+            actions.append(zero_action)
+            masks.append(zero_mask)
+
+        # shape: B, T, n
+        self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
+        self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
+        self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
+
+        self.num_data = len(self.observations)
+
+        self.slices = self.get_slices()
+
+    def get_slices(self):
+        slices = []
+
+        min_seq_length = np.inf
+        for i in range(self.num_data):
+            T = self.get_seq_length(i)
+            min_seq_length = min(T, min_seq_length)
+
+            if T - self.window_size < 0:
+                print(
+                    f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
+                )
+            else:
+                slices += [
+                    (i, start, start + self.window_size)
+                    for start in range(T - self.window_size + 1)
+                ]  # slice indices follow convention [start, end)
+
+        return slices
+
+    def get_seq_length(self, idx):
+        return int(self.masks[idx].sum().item())
+
+    def get_all_actions(self):
+        result = []
+        # mask out invalid actions
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.actions[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def get_all_observations(self):
+        result = []
+        # mask out invalid observations
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.observations[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def __len__(self):
+        return len(self.slices)
+
+    def __getitem__(self, idx):
+
+        i, start, end = self.slices[idx]
+
+        obs = self.observations[i, start:end]
+        act = self.actions[i, start:end]
+        mask = self.masks[i, start:end]
+
+        return obs, act, mask
diff --git a/agent/dataset/d3il_dataset/base_dataset.py b/agent/dataset/d3il_dataset/base_dataset.py
new file mode 100644
index 0000000..fc57b06
--- /dev/null
+++ b/agent/dataset/d3il_dataset/base_dataset.py
@@ -0,0 +1,54 @@
+import abc
+import os
+
+from torch.utils.data import Dataset
+
+
+class TrajectoryDataset(Dataset, abc.ABC):
+    """
+    A dataset containing trajectories.
+    TrajectoryDataset[i] returns: (observations, actions, mask)
+        observations: Tensor[T, ...], T frames of observations
+        actions: Tensor[T, ...], T frames of actions
+        mask: Tensor[T]: 0: invalid; 1: valid
+    """
+
+    def __init__(
+        self,
+        data_directory: os.PathLike,
+        device="cpu",
+        obs_dim: int = 20,
+        action_dim: int = 2,
+        max_len_data: int = 256,
+        window_size: int = 1,
+    ):
+
+        self.data_directory = data_directory
+        self.device = device
+
+        self.max_len_data = max_len_data
+        self.action_dim = action_dim
+        self.obs_dim = obs_dim
+
+        self.window_size = window_size
+
+    @abc.abstractmethod
+    def get_seq_length(self, idx):
+        """
+        Returns the length of the idx-th trajectory.
+        """
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def get_all_actions(self):
+        """
+        Returns all actions from all trajectories, concatenated on dim 0 (time).
+        """
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def get_all_observations(self):
+        """
+        Returns all actions from all trajectories, concatenated on dim 0 (time).
+        """
+        raise NotImplementedError
diff --git a/agent/dataset/d3il_dataset/geo_transform.py b/agent/dataset/d3il_dataset/geo_transform.py
new file mode 100644
index 0000000..0386022
--- /dev/null
+++ b/agent/dataset/d3il_dataset/geo_transform.py
@@ -0,0 +1,350 @@
+import itertools
+
+import numpy as np
+
+"""
+From OpenAIGym Please see there under mujoco/Robots
+"""
+
+# For testing whether a number is close to zero
+_FLOAT_EPS = np.finfo(np.float64).eps
+_EPS4 = _FLOAT_EPS * 4.0
+
+
+def get_quaternion_error(curr_quat, des_quat):
+    """
+    Calculates the difference between the current quaternion and the desired quaternion.
+    See Siciliano textbook page 140 Eq 3.91
+
+    :param curr_quat: current quaternion
+    :param des_quat: desired quaternion
+    :return: difference between current quaternion and desired quaternion
+    """
+    quatError = np.zeros((3,))
+
+    quatError[0] = (
+        curr_quat[0] * des_quat[1]
+        - des_quat[0] * curr_quat[1]
+        - curr_quat[3] * des_quat[2]
+        + curr_quat[2] * des_quat[3]
+    )
+
+    quatError[1] = (
+        curr_quat[0] * des_quat[2]
+        - des_quat[0] * curr_quat[2]
+        + curr_quat[3] * des_quat[1]
+        - curr_quat[1] * des_quat[3]
+    )
+
+    quatError[2] = (
+        curr_quat[0] * des_quat[3]
+        - des_quat[0] * curr_quat[3]
+        - curr_quat[2] * des_quat[1]
+        + curr_quat[1] * des_quat[2]
+    )
+
+    return quatError
+
+
+def euler2mat(euler):
+    """Convert Euler Angles to Rotation Matrix.  See rotation.py for notes"""
+    euler = np.asarray(euler, dtype=np.float64)
+    assert euler.shape[-1] == 3, "Invalid shaped euler {}".format(euler)
+
+    ai, aj, ak = -euler[..., 2], -euler[..., 1], -euler[..., 0]
+    si, sj, sk = np.sin(ai), np.sin(aj), np.sin(ak)
+    ci, cj, ck = np.cos(ai), np.cos(aj), np.cos(ak)
+    cc, cs = ci * ck, ci * sk
+    sc, ss = si * ck, si * sk
+
+    mat = np.empty(euler.shape[:-1] + (3, 3), dtype=np.float64)
+    mat[..., 2, 2] = cj * ck
+    mat[..., 2, 1] = sj * sc - cs
+    mat[..., 2, 0] = sj * cc + ss
+    mat[..., 1, 2] = cj * sk
+    mat[..., 1, 1] = sj * ss + cc
+    mat[..., 1, 0] = sj * cs - sc
+    mat[..., 0, 2] = -sj
+    mat[..., 0, 1] = cj * si
+    mat[..., 0, 0] = cj * ci
+    return mat
+
+
+def euler2quat(euler):
+    """Convert Euler Angles to Quaternions.  See rotation.py for notes"""
+    euler = np.asarray(euler, dtype=np.float64)
+    assert euler.shape[-1] == 3, "Invalid shape euler {}".format(euler)
+
+    ai, aj, ak = euler[..., 2] / 2, -euler[..., 1] / 2, euler[..., 0] / 2
+    si, sj, sk = np.sin(ai), np.sin(aj), np.sin(ak)
+    ci, cj, ck = np.cos(ai), np.cos(aj), np.cos(ak)
+    cc, cs = ci * ck, ci * sk
+    sc, ss = si * ck, si * sk
+
+    quat = np.empty(euler.shape[:-1] + (4,), dtype=np.float64)
+    quat[..., 0] = cj * cc + sj * ss
+    quat[..., 3] = cj * sc - sj * cs
+    quat[..., 2] = -(cj * ss + sj * cc)
+    quat[..., 1] = cj * cs - sj * sc
+    return quat
+
+
+def mat2euler(mat):
+    """Convert Rotation Matrix to Euler Angles.  See rotation.py for notes"""
+    mat = np.asarray(mat, dtype=np.float64)
+    assert mat.shape[-2:] == (3, 3), "Invalid shape matrix {}".format(mat)
+
+    cy = np.sqrt(mat[..., 2, 2] * mat[..., 2, 2] + mat[..., 1, 2] * mat[..., 1, 2])
+    condition = cy > _EPS4
+    euler = np.empty(mat.shape[:-1], dtype=np.float64)
+    euler[..., 2] = np.where(
+        condition,
+        -np.arctan2(mat[..., 0, 1], mat[..., 0, 0]),
+        -np.arctan2(-mat[..., 1, 0], mat[..., 1, 1]),
+    )
+    euler[..., 1] = np.where(
+        condition, -np.arctan2(-mat[..., 0, 2], cy), -np.arctan2(-mat[..., 0, 2], cy)
+    )
+    euler[..., 0] = np.where(
+        condition, -np.arctan2(mat[..., 1, 2], mat[..., 2, 2]), 0.0
+    )
+    return euler
+
+
+def mat2quat(mat):
+    """Convert Rotation Matrix to Quaternion.  See rotation.py for notes"""
+    mat = np.asarray(mat, dtype=np.float64)
+    assert mat.shape[-2:] == (3, 3), "Invalid shape matrix {}".format(mat)
+
+    Qxx, Qyx, Qzx = mat[..., 0, 0], mat[..., 0, 1], mat[..., 0, 2]
+    Qxy, Qyy, Qzy = mat[..., 1, 0], mat[..., 1, 1], mat[..., 1, 2]
+    Qxz, Qyz, Qzz = mat[..., 2, 0], mat[..., 2, 1], mat[..., 2, 2]
+    # Fill only lower half of symmetric matrix
+    K = np.zeros(mat.shape[:-2] + (4, 4), dtype=np.float64)
+    K[..., 0, 0] = Qxx - Qyy - Qzz
+    K[..., 1, 0] = Qyx + Qxy
+    K[..., 1, 1] = Qyy - Qxx - Qzz
+    K[..., 2, 0] = Qzx + Qxz
+    K[..., 2, 1] = Qzy + Qyz
+    K[..., 2, 2] = Qzz - Qxx - Qyy
+    K[..., 3, 0] = Qyz - Qzy
+    K[..., 3, 1] = Qzx - Qxz
+    K[..., 3, 2] = Qxy - Qyx
+    K[..., 3, 3] = Qxx + Qyy + Qzz
+    K /= 3.0
+    # TODO: vectorize this -- probably could be made faster
+    q = np.empty(K.shape[:-2] + (4,))
+    it = np.nditer(q[..., 0], flags=["multi_index"])
+    while not it.finished:
+        # Use Hermitian eigenvectors, values for speed
+        vals, vecs = np.linalg.eigh(K[it.multi_index])
+        # Select largest eigenvector, reorder to w,x,y,z quaternion
+        q[it.multi_index] = vecs[[3, 0, 1, 2], np.argmax(vals)]
+        # Prefer quaternion with positive w
+        # (q * -1 corresponds to same rotation as q)
+        if q[it.multi_index][0] < 0:
+            q[it.multi_index] *= -1
+        it.iternext()
+    return q
+
+
+def quat2euler(quat):
+    """Convert Quaternion to Euler Angles.  See rotation.py for notes"""
+    return mat2euler(quat2mat(quat))
+
+
+def subtract_euler(e1, e2):
+    assert e1.shape == e2.shape
+    assert e1.shape[-1] == 3
+    q1 = euler2quat(e1)
+    q2 = euler2quat(e2)
+    q_diff = quat_mul(q1, quat_conjugate(q2))
+    return quat2euler(q_diff)
+
+
+def quat2mat(quat):
+    """Convert Quaternion to Euler Angles.  See rotation.py for notes"""
+    quat = np.asarray(quat, dtype=np.float64)
+    assert quat.shape[-1] == 4, "Invalid shape quat {}".format(quat)
+
+    w, x, y, z = quat[..., 0], quat[..., 1], quat[..., 2], quat[..., 3]
+    Nq = np.sum(quat * quat, axis=-1)
+    s = 2.0 / Nq
+    X, Y, Z = x * s, y * s, z * s
+    wX, wY, wZ = w * X, w * Y, w * Z
+    xX, xY, xZ = x * X, x * Y, x * Z
+    yY, yZ, zZ = y * Y, y * Z, z * Z
+
+    mat = np.empty(quat.shape[:-1] + (3, 3), dtype=np.float64)
+    mat[..., 0, 0] = 1.0 - (yY + zZ)
+    mat[..., 0, 1] = xY - wZ
+    mat[..., 0, 2] = xZ + wY
+    mat[..., 1, 0] = xY + wZ
+    mat[..., 1, 1] = 1.0 - (xX + zZ)
+    mat[..., 1, 2] = yZ - wX
+    mat[..., 2, 0] = xZ - wY
+    mat[..., 2, 1] = yZ + wX
+    mat[..., 2, 2] = 1.0 - (xX + yY)
+    return np.where((Nq > _FLOAT_EPS)[..., np.newaxis, np.newaxis], mat, np.eye(3))
+
+
+def quat_conjugate(q):
+    inv_q = -q
+    inv_q[..., 0] *= -1
+    return inv_q
+
+
+def quat_mul(q0, q1):
+    assert q0.shape == q1.shape
+    assert q0.shape[-1] == 4
+    assert q1.shape[-1] == 4
+
+    w0 = q0[..., 0]
+    x0 = q0[..., 1]
+    y0 = q0[..., 2]
+    z0 = q0[..., 3]
+
+    w1 = q1[..., 0]
+    x1 = q1[..., 1]
+    y1 = q1[..., 2]
+    z1 = q1[..., 3]
+
+    w = w0 * w1 - x0 * x1 - y0 * y1 - z0 * z1
+    x = w0 * x1 + x0 * w1 + y0 * z1 - z0 * y1
+    y = w0 * y1 + y0 * w1 + z0 * x1 - x0 * z1
+    z = w0 * z1 + z0 * w1 + x0 * y1 - y0 * x1
+    q = np.array([w, x, y, z])
+    if q.ndim == 2:
+        q = q.swapaxes(0, 1)
+    assert q.shape == q0.shape
+    return q
+
+
+def quat_rot_vec(q, v0):
+    q_v0 = np.array([0, v0[0], v0[1], v0[2]])
+    q_v = quat_mul(q, quat_mul(q_v0, quat_conjugate(q)))
+    v = q_v[1:]
+    return v
+
+
+def quat_identity():
+    return np.array([1, 0, 0, 0])
+
+
+def quat2axisangle(quat):
+    theta = 0
+    axis = np.array([0, 0, 1])
+    sin_theta = np.linalg.norm(quat[1:])
+
+    if sin_theta > 0.0001:
+        theta = 2 * np.arcsin(sin_theta)
+        theta *= 1 if quat[0] >= 0 else -1
+        axis = quat[1:] / sin_theta
+
+    return axis, theta
+
+
+def euler2point_euler(euler):
+    _euler = euler.copy()
+    if len(_euler.shape) < 2:
+        _euler = np.expand_dims(_euler, 0)
+    assert _euler.shape[1] == 3
+    _euler_sin = np.sin(_euler)
+    _euler_cos = np.cos(_euler)
+    return np.concatenate([_euler_sin, _euler_cos], axis=-1)
+
+
+def point_euler2euler(euler):
+    _euler = euler.copy()
+    if len(_euler.shape) < 2:
+        _euler = np.expand_dims(_euler, 0)
+    assert _euler.shape[1] == 6
+    angle = np.arctan(_euler[..., :3] / _euler[..., 3:])
+    angle[_euler[..., 3:] < 0] += np.pi
+    return angle
+
+
+def quat2point_quat(quat):
+    # Should be in qw, qx, qy, qz
+    _quat = quat.copy()
+    if len(_quat.shape) < 2:
+        _quat = np.expand_dims(_quat, 0)
+    assert _quat.shape[1] == 4
+    angle = np.arccos(_quat[:, [0]]) * 2
+    xyz = _quat[:, 1:]
+    xyz[np.squeeze(np.abs(np.sin(angle / 2))) >= 1e-5] = (xyz / np.sin(angle / 2))[
+        np.squeeze(np.abs(np.sin(angle / 2))) >= 1e-5
+    ]
+    return np.concatenate([np.sin(angle), np.cos(angle), xyz], axis=-1)
+
+
+def point_quat2quat(quat):
+    _quat = quat.copy()
+    if len(_quat.shape) < 2:
+        _quat = np.expand_dims(_quat, 0)
+    assert _quat.shape[1] == 5
+    angle = np.arctan(_quat[:, [0]] / _quat[:, [1]])
+    qw = np.cos(angle / 2)
+
+    qxyz = _quat[:, 2:]
+    qxyz[np.squeeze(np.abs(np.sin(angle / 2))) >= 1e-5] = (qxyz * np.sin(angle / 2))[
+        np.squeeze(np.abs(np.sin(angle / 2))) >= 1e-5
+    ]
+    return np.concatenate([qw, qxyz], axis=-1)
+
+
+def normalize_angles(angles):
+    """Puts angles in [-pi, pi] range."""
+    angles = angles.copy()
+    if angles.size > 0:
+        angles = (angles + np.pi) % (2 * np.pi) - np.pi
+        assert -np.pi - 1e-6 <= angles.min() and angles.max() <= np.pi + 1e-6
+    return angles
+
+
+def round_to_straight_angles(angles):
+    """Returns closest angle modulo 90 degrees"""
+    angles = np.round(angles / (np.pi / 2)) * (np.pi / 2)
+    return normalize_angles(angles)
+
+
+def get_parallel_rotations():
+    mult90 = [0, np.pi / 2, -np.pi / 2, np.pi]
+    parallel_rotations = []
+    for euler in itertools.product(mult90, repeat=3):
+        canonical = mat2euler(euler2mat(euler))
+        canonical = np.round(canonical / (np.pi / 2))
+        if canonical[0] == -2:
+            canonical[0] = 2
+        if canonical[2] == -2:
+            canonical[2] = 2
+        canonical *= np.pi / 2
+        if all([(canonical != rot).any() for rot in parallel_rotations]):
+            parallel_rotations += [canonical]
+    assert len(parallel_rotations) == 24
+    return parallel_rotations
+
+
+def posRotMat2TFMat(pos, rot_mat):
+    """Converts a position and a 3x3 rotation matrix to a 4x4 transformation matrix"""
+    t_mat = np.eye(4)
+    t_mat[:3, :3] = rot_mat
+    t_mat[:3, 3] = np.array(pos)
+    return t_mat
+
+
+def mat2posQuat(mat):
+    """Converts a 4x4 rotation matrix to a position and a quaternion"""
+    pos = mat[:3, 3]
+    quat = mat2quat(mat[:3, :3])
+    return pos, quat
+
+
+def wxyz_to_xyzw(quat):
+    """Converts WXYZ Quaternions to XYZW Quaternions"""
+    return np.roll(quat, -1)
+
+
+def xyzw_to_wxyz(quat):
+    """Converts XYZW Quaternions to WXYZ Quaternions"""
+    return np.roll(quat, 1)
diff --git a/agent/dataset/d3il_dataset/pushing_dataset.py b/agent/dataset/d3il_dataset/pushing_dataset.py
new file mode 100644
index 0000000..d81a7bc
--- /dev/null
+++ b/agent/dataset/d3il_dataset/pushing_dataset.py
@@ -0,0 +1,161 @@
+import logging
+
+import os
+import torch
+import pickle
+import numpy as np
+
+from agent.dataset.d3il_dataset.base_dataset import TrajectoryDataset
+from agent.dataset.d3il_dataset import sim_framework_path
+from .geo_transform import quat2euler
+
+
+class Pushing_Dataset(TrajectoryDataset):
+    def __init__(
+        self,
+        data_directory: os.PathLike,
+        device="cpu",
+        obs_dim: int = 20,
+        action_dim: int = 2,
+        max_len_data: int = 256,
+        window_size: int = 1,
+    ):
+
+        super().__init__(
+            data_directory=data_directory,
+            device=device,
+            obs_dim=obs_dim,
+            action_dim=action_dim,
+            max_len_data=max_len_data,
+            window_size=window_size,
+        )
+
+        logging.info("Loading Block Push Dataset")
+
+        inputs = []
+        actions = []
+        masks = []
+
+        # for root, dirs, files in os.walk(self.data_directory):
+        #
+        #     for mode_dir in dirs:
+
+        # state_files = glob.glob(os.path.join(root, mode_dir) + "/env*")
+        # data_dir = os.path.join(sim_framework_path(data_directory), "local")
+        # data_dir = sim_framework_path(data_directory)
+        # state_files = glob.glob(data_dir + "/env*")
+
+        bp_data_dir = sim_framework_path("data/pushing/all_data")
+
+        state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
+
+        for file in state_files:
+            with open(os.path.join(bp_data_dir, file), "rb") as f:
+                env_state = pickle.load(f)
+
+            # lengths.append(len(env_state['robot']['des_c_pos']))
+
+            zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
+            zero_action = np.zeros(
+                (1, self.max_len_data, self.action_dim), dtype=np.float32
+            )
+            zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
+
+            # robot and box positions
+            robot_des_pos = env_state["robot"]["des_c_pos"][:, :2]
+
+            robot_c_pos = env_state["robot"]["c_pos"][:, :2]
+
+            red_box_pos = env_state["red-box"]["pos"][:, :2]
+            red_box_quat = np.tan(quat2euler(env_state["red-box"]["quat"])[:, -1:])
+
+            green_box_pos = env_state["green-box"]["pos"][:, :2]
+            green_box_quat = np.tan(quat2euler(env_state["green-box"]["quat"])[:, -1:])
+
+            red_target_pos = env_state["red-target"]["pos"][:, :2]
+            green_target_pos = env_state["green-target"]["pos"][:, :2]
+
+            input_state = np.concatenate(
+                (
+                    robot_des_pos,
+                    robot_c_pos,
+                    red_box_pos,
+                    red_box_quat,
+                    green_box_pos,
+                    green_box_quat,
+                ),
+                axis=-1,
+            )
+
+            vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
+
+            valid_len = len(input_state) - 1
+
+            zero_obs[0, :valid_len, :] = input_state[:-1]
+            zero_action[0, :valid_len, :] = vel_state
+            zero_mask[0, :valid_len] = 1
+
+            inputs.append(zero_obs)
+            actions.append(zero_action)
+            masks.append(zero_mask)
+
+        # shape: B, T, n
+        self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
+        self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
+        self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
+
+        self.num_data = len(self.observations)
+
+        self.slices = self.get_slices()
+
+    def get_slices(self):
+        slices = []
+
+        min_seq_length = np.inf
+        for i in range(self.num_data):
+            T = self.get_seq_length(i)
+            min_seq_length = min(T, min_seq_length)
+
+            if T - self.window_size < 0:
+                print(
+                    f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
+                )
+            else:
+                slices += [
+                    (i, start, start + self.window_size)
+                    for start in range(T - self.window_size + 1)
+                ]  # slice indices follow convention [start, end)
+
+        return slices
+
+    def get_seq_length(self, idx):
+        return int(self.masks[idx].sum().item())
+
+    def get_all_actions(self):
+        result = []
+        # mask out invalid actions
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.actions[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def get_all_observations(self):
+        result = []
+        # mask out invalid observations
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.observations[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def __len__(self):
+        return len(self.slices)
+
+    def __getitem__(self, idx):
+
+        i, start, end = self.slices[idx]
+
+        obs = self.observations[i, start:end]
+        act = self.actions[i, start:end]
+        mask = self.masks[i, start:end]
+
+        return obs, act, mask
diff --git a/agent/dataset/d3il_dataset/sorting_dataset.py b/agent/dataset/d3il_dataset/sorting_dataset.py
new file mode 100644
index 0000000..e7445a7
--- /dev/null
+++ b/agent/dataset/d3il_dataset/sorting_dataset.py
@@ -0,0 +1,491 @@
+import logging
+
+import os
+import glob
+import torch
+import pickle
+import numpy as np
+
+try:
+    import cv2  # not included in pyproject.toml
+except:
+    print("Installing cv2")
+    os.system("pip install opencv-python")
+from tqdm import tqdm
+
+from agent.dataset.d3il_dataset.base_dataset import TrajectoryDataset
+from agent.dataset.d3il_dataset import sim_framework_path
+from .geo_transform import quat2euler
+
+
+class Sorting_Dataset(TrajectoryDataset):
+    def __init__(
+        self,
+        data_directory: os.PathLike,
+        device="cpu",
+        obs_dim: int = 20,
+        action_dim: int = 2,
+        max_len_data: int = 256,
+        window_size: int = 1,
+        num_boxes: int = 2,
+    ):
+
+        super().__init__(
+            data_directory=data_directory,
+            device=device,
+            obs_dim=obs_dim,
+            action_dim=action_dim,
+            max_len_data=max_len_data,
+            window_size=window_size,
+        )
+
+        logging.info("Loading Sorting Dataset")
+
+        inputs = []
+        actions = []
+        masks = []
+
+        # for root, dirs, files in os.walk(self.data_directory):
+        #
+        #     for mode_dir in dirs:
+
+        # state_files = glob.glob(os.path.join(root, mode_dir) + "/env*")
+        # data_dir = os.path.join(sim_framework_path(data_directory), "local")
+        # data_dir = sim_framework_path(data_directory)
+        # state_files = glob.glob(data_dir + "/env*")
+
+        # random.seed(0)
+
+        # data_dir = sim_framework_path(data_directory)
+        # state_files = os.listdir(data_dir)
+        #
+        # random.shuffle(state_files)
+        #
+        # if data == "train":
+        #     env_state_files = state_files[50:]
+        # elif data == "eval":
+        #     env_state_files = state_files[:50]
+        # else:
+        #     assert False, "wrong data type"
+
+        if num_boxes == 2:
+            data_dir = sim_framework_path("data/sorting/2_boxes/state")
+        elif num_boxes == 4:
+            data_dir = sim_framework_path("data/sorting/4_boxes/state")
+        elif num_boxes == 6:
+            data_dir = sim_framework_path("data/sorting/6_boxes/state")
+        else:
+            assert False, "check num boxes"
+
+        state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
+
+        for file in state_files:
+            with open(os.path.join(data_dir, file), "rb") as f:
+                env_state = pickle.load(f)
+
+            # lengths.append(len(env_state['robot']['des_c_pos']))
+
+            zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
+            zero_action = np.zeros(
+                (1, self.max_len_data, self.action_dim), dtype=np.float32
+            )
+            zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
+
+            # robot and box posistion
+            robot_des_pos = env_state["robot"]["des_c_pos"][:, :2]
+            robot_c_pos = env_state["robot"]["c_pos"][:, :2]
+
+            if num_boxes == 2:
+                red_box1_pos = env_state["red-box1"]["pos"][:, :2]
+                red_box1_quat = np.tan(
+                    quat2euler(env_state["red-box1"]["quat"])[:, -1:]
+                )
+
+                blue_box1_pos = env_state["blue-box1"]["pos"][:, :2]
+                blue_box1_quat = np.tan(
+                    quat2euler(env_state["blue-box1"]["quat"])[:, -1:]
+                )
+
+                input_state = np.concatenate(
+                    (
+                        robot_des_pos,
+                        robot_c_pos,
+                        red_box1_pos,
+                        red_box1_quat,
+                        blue_box1_pos,
+                        blue_box1_quat,
+                    ),
+                    axis=-1,
+                )
+
+            elif num_boxes == 4:
+
+                red_box1_pos = env_state["red-box1"]["pos"][:, :2]
+                red_box1_quat = np.tan(
+                    quat2euler(env_state["red-box1"]["quat"])[:, -1:]
+                )
+
+                red_box2_pos = env_state["red-box2"]["pos"][:, :2]
+                red_box2_quat = np.tan(
+                    quat2euler(env_state["red-box2"]["quat"])[:, -1:]
+                )
+
+                blue_box1_pos = env_state["blue-box1"]["pos"][:, :2]
+                blue_box1_quat = np.tan(
+                    quat2euler(env_state["blue-box1"]["quat"])[:, -1:]
+                )
+
+                blue_box2_pos = env_state["blue-box2"]["pos"][:, :2]
+                blue_box2_quat = np.tan(
+                    quat2euler(env_state["blue-box2"]["quat"])[:, -1:]
+                )
+
+                input_state = np.concatenate(
+                    (
+                        robot_des_pos,
+                        robot_c_pos,
+                        red_box1_pos,
+                        red_box1_quat,
+                        red_box2_pos,
+                        red_box2_quat,
+                        blue_box1_pos,
+                        blue_box1_quat,
+                        blue_box2_pos,
+                        blue_box2_quat,
+                    ),
+                    axis=-1,
+                )
+
+            elif num_boxes == 6:
+
+                red_box1_pos = env_state["red-box1"]["pos"][:, :2]
+                red_box1_quat = np.tan(
+                    quat2euler(env_state["red-box1"]["quat"])[:, -1:]
+                )
+
+                red_box2_pos = env_state["red-box2"]["pos"][:, :2]
+                red_box2_quat = np.tan(
+                    quat2euler(env_state["red-box2"]["quat"])[:, -1:]
+                )
+
+                red_box3_pos = env_state["red-box3"]["pos"][:, :2]
+                red_box3_quat = np.tan(
+                    quat2euler(env_state["red-box3"]["quat"])[:, -1:]
+                )
+
+                blue_box1_pos = env_state["blue-box1"]["pos"][:, :2]
+                blue_box1_quat = np.tan(
+                    quat2euler(env_state["blue-box1"]["quat"])[:, -1:]
+                )
+
+                blue_box2_pos = env_state["blue-box2"]["pos"][:, :2]
+                blue_box2_quat = np.tan(
+                    quat2euler(env_state["blue-box2"]["quat"])[:, -1:]
+                )
+
+                blue_box3_pos = env_state["blue-box3"]["pos"][:, :2]
+                blue_box3_quat = np.tan(
+                    quat2euler(env_state["blue-box3"]["quat"])[:, -1:]
+                )
+
+                input_state = np.concatenate(
+                    (
+                        robot_des_pos,
+                        robot_c_pos,
+                        red_box1_pos,
+                        red_box1_quat,
+                        red_box2_pos,
+                        red_box2_quat,
+                        red_box3_pos,
+                        red_box3_quat,
+                        blue_box1_pos,
+                        blue_box1_quat,
+                        blue_box2_pos,
+                        blue_box2_quat,
+                        blue_box3_pos,
+                        blue_box3_quat,
+                    ),
+                    axis=-1,
+                )
+
+            else:
+                assert False, "check num boxes"
+
+            vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
+            valid_len = len(vel_state)
+
+            zero_obs[0, :valid_len, :] = input_state[:-1]
+            zero_action[0, :valid_len, :] = vel_state
+            zero_mask[0, :valid_len] = 1
+
+            inputs.append(zero_obs)
+            actions.append(zero_action)
+            masks.append(zero_mask)
+
+        # shape: B, T, n
+        self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
+        self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
+        self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
+
+        self.num_data = len(self.observations)
+
+        self.slices = self.get_slices()
+
+    def get_slices(self):
+        slices = []
+
+        min_seq_length = np.inf
+        for i in range(self.num_data):
+            T = self.get_seq_length(i)
+            min_seq_length = min(T, min_seq_length)
+
+            if T - self.window_size < 0:
+                print(
+                    f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
+                )
+            else:
+                slices += [
+                    (i, start, start + self.window_size)
+                    for start in range(T - self.window_size + 1)
+                ]  # slice indices follow convention [start, end)
+
+        return slices
+
+    def get_seq_length(self, idx):
+        return int(self.masks[idx].sum().item())
+
+    def get_all_actions(self):
+        result = []
+        # mask out invalid actions
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.actions[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def get_all_observations(self):
+        result = []
+        # mask out invalid observations
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.observations[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def __len__(self):
+        return len(self.slices)
+
+    def __getitem__(self, idx):
+
+        i, start, end = self.slices[idx]
+
+        obs = self.observations[i, start:end]
+        act = self.actions[i, start:end]
+        mask = self.masks[i, start:end]
+
+        return obs, act, mask
+
+
+class Sorting_Img_Dataset(TrajectoryDataset):
+    def __init__(
+        self,
+        data_directory: os.PathLike,
+        device="cpu",
+        obs_dim: int = 20,
+        action_dim: int = 2,
+        max_len_data: int = 256,
+        window_size: int = 1,
+        num_boxes: int = 2,
+    ):
+
+        super().__init__(
+            data_directory=data_directory,
+            device=device,
+            obs_dim=obs_dim,
+            action_dim=action_dim,
+            max_len_data=max_len_data,
+            window_size=window_size,
+        )
+
+        logging.info("Loading Sorting Dataset")
+
+        inputs = []
+        actions = []
+        masks = []
+
+        # for root, dirs, files in os.walk(self.data_directory):
+        #
+        #     for mode_dir in dirs:
+
+        # state_files = glob.glob(os.path.join(root, mode_dir) + "/env*")
+        # data_dir = os.path.join(sim_framework_path(data_directory), "local")
+        # data_dir = sim_framework_path(data_directory)
+        # state_files = glob.glob(data_dir + "/env*")
+
+        # random.seed(0)
+        #
+        # data_dir = sim_framework_path(data_directory)
+        # state_files = glob.glob(data_dir + '/state/*')
+        #
+        # random.shuffle(state_files)
+        #
+        # if data == "train":
+        #     env_state_files = state_files[30:]
+        # elif data == "eval":
+        #     env_state_files = state_files[:30]
+        # else:
+        #     assert False, "wrong data type"
+
+        if num_boxes == 2:
+            data_dir = sim_framework_path("environments/dataset/data/sorting/2_boxes/")
+        elif num_boxes == 4:
+            data_dir = sim_framework_path("environments/dataset/data/sorting/4_boxes/")
+        elif num_boxes == 6:
+            data_dir = sim_framework_path("environments/dataset/data/sorting/6_boxes/")
+        else:
+            assert False, "check num boxes"
+
+        state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
+
+        bp_cam_imgs = []
+        inhand_cam_imgs = []
+
+        for file in tqdm(state_files[:100]):
+            with open(os.path.join(data_dir, "state", file), "rb") as f:
+                env_state = pickle.load(f)
+
+            # lengths.append(len(env_state['robot']['des_c_pos']))
+            zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
+            zero_action = np.zeros(
+                (1, self.max_len_data, self.action_dim), dtype=np.float32
+            )
+            zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
+
+            # robot and box posistion
+            robot_des_pos = env_state["robot"]["des_c_pos"][:, :2]
+            robot_c_pos = env_state["robot"]["c_pos"][:, :2]
+
+            file_name = os.path.basename(file).split(".")[0]
+
+            ###############################################################
+            bp_images = []
+            bp_imgs = glob.glob(data_dir + "/images/bp-cam/" + file_name + "/*")
+            bp_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
+
+            for img in bp_imgs:
+                image = cv2.imread(img).astype(np.float32)
+                image = image.transpose((2, 0, 1)) / 255.0
+
+                image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
+
+                bp_images.append(image)
+
+            bp_images = torch.concatenate(bp_images, dim=0)
+            ################################################################
+            inhand_imgs = glob.glob(data_dir + "/images/inhand-cam/" + file_name + "/*")
+            inhand_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
+            inhand_images = []
+            for img in inhand_imgs:
+                image = cv2.imread(img).astype(np.float32)
+                image = image.transpose((2, 0, 1)) / 255.0
+
+                image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
+
+                inhand_images.append(image)
+            inhand_images = torch.concatenate(inhand_images, dim=0)
+            ##################################################################
+            # input_state = np.concatenate((robot_des_pos, robot_c_pos), axis=-1)
+
+            vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
+
+            valid_len = len(vel_state)
+
+            zero_obs[0, :valid_len, :] = robot_des_pos[:-1]
+            zero_action[0, :valid_len, :] = vel_state
+            zero_mask[0, :valid_len] = 1
+
+            bp_cam_imgs.append(bp_images)
+            inhand_cam_imgs.append(inhand_images)
+
+            inputs.append(zero_obs)
+            actions.append(zero_action)
+            masks.append(zero_mask)
+
+        self.bp_cam_imgs = bp_cam_imgs
+        self.inhand_cam_imgs = inhand_cam_imgs
+
+        # shape: B, T, n
+        self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
+        self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
+        self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
+
+        self.num_data = len(self.actions)
+
+        self.slices = self.get_slices()
+
+    def get_slices(self):
+        slices = []
+
+        min_seq_length = np.inf
+        for i in range(self.num_data):
+            T = self.get_seq_length(i)
+            min_seq_length = min(T, min_seq_length)
+
+            if T - self.window_size < 0:
+                print(
+                    f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
+                )
+            else:
+                slices += [
+                    (i, start, start + self.window_size)
+                    for start in range(T - self.window_size + 1)
+                ]  # slice indices follow convention [start, end)
+
+        return slices
+
+    def get_seq_length(self, idx):
+        return int(self.masks[idx].sum().item())
+
+    def get_all_actions(self):
+        result = []
+        # mask out invalid actions
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.actions[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def get_all_observations(self):
+        result = []
+        # mask out invalid observations
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.observations[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def __len__(self):
+        return len(self.slices)
+
+    def __getitem__(self, idx):
+
+        i, start, end = self.slices[idx]
+
+        obs = self.observations[i, start:end]
+        act = self.actions[i, start:end]
+        mask = self.masks[i, start:end]
+
+        bp_imgs = self.bp_cam_imgs[i][start:end]
+        inhand_imgs = self.inhand_cam_imgs[i][start:end]
+
+        # bp_imgs = np.zeros((self.window_size, 3, 96, 96), dtype=np.float32)
+        # inhand_imgs = np.zeros((self.window_size, 3, 96, 96), dtype=np.float32)
+        #
+        # for num_frame, img_file in enumerate(bp_img_files):
+        #     image = cv2.imread(img_file).astype(np.float32)
+        #     bp_imgs[num_frame] = image.transpose((2, 0, 1)) / 255.
+        #
+        # for num_frame, img_file in enumerate(inhand_img_files):
+        #     image = cv2.imread(img_file).astype(np.float32)
+        #     inhand_imgs[num_frame] = image.transpose((2, 0, 1)) / 255.
+        #
+        # bp_imgs = torch.from_numpy(bp_imgs).to(self.device).float()
+        # inhand_imgs = torch.from_numpy(inhand_imgs).to(self.device).float()
+
+        return bp_imgs, inhand_imgs, obs, act, mask
diff --git a/agent/dataset/d3il_dataset/stacking_dataset.py b/agent/dataset/d3il_dataset/stacking_dataset.py
new file mode 100644
index 0000000..1c6ca54
--- /dev/null
+++ b/agent/dataset/d3il_dataset/stacking_dataset.py
@@ -0,0 +1,398 @@
+import logging
+
+import os
+import glob
+
+try:
+    import cv2  # not included in pyproject.toml
+except:
+    print("Installing cv2")
+    os.system("pip install opencv-python")
+import torch
+import pickle
+import numpy as np
+from tqdm import tqdm
+
+from agent.dataset.d3il_dataset.base_dataset import TrajectoryDataset
+from agent.dataset.d3il_dataset import sim_framework_path
+
+from .geo_transform import quat2euler
+
+
+class Stacking_Dataset(TrajectoryDataset):
+    def __init__(
+        self,
+        data_directory: os.PathLike,
+        # data='train',
+        device="cpu",
+        obs_dim: int = 20,
+        action_dim: int = 2,
+        max_len_data: int = 256,
+        window_size: int = 1,
+    ):
+
+        super().__init__(
+            data_directory=data_directory,
+            device=device,
+            obs_dim=obs_dim,
+            action_dim=action_dim,
+            max_len_data=max_len_data,
+            window_size=window_size,
+        )
+
+        logging.info("Loading CubeStacking Dataset")
+
+        inputs = []
+        actions = []
+        masks = []
+
+        # for root, dirs, files in os.walk(self.data_directory):
+        #
+        #     for mode_dir in dirs:
+
+        # state_files = glob.glob(os.path.join(root, mode_dir) + "/env*")
+        # data_dir = os.path.join(sim_framework_path(data_directory), "local")
+        # data_dir = sim_framework_path(data_directory)
+        # state_files = glob.glob(data_dir + "/env*")
+
+        # bp_data_dir = sim_framework_path("environments/dataset/data/stacking/all_data_new")
+        # state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
+
+        # bp_data_dir = sim_framework_path("environments/dataset/data/stacking/single_test")
+        # state_files = os.listdir(bp_data_dir)
+
+        # random.seed(0)
+        #
+        # data_dir = sim_framework_path(data_directory)
+        # state_files = os.listdir(data_dir)
+        #
+        # random.shuffle(state_files)
+        #
+        # if data == "train":
+        #     env_state_files = state_files[20:]
+        # elif data == "eval":
+        #     env_state_files = state_files[:20]
+        # else:
+        #     assert False, "wrong data type"
+
+        data_dir = sim_framework_path("data/stacking/all_data")
+        state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
+
+        for file in state_files:
+            with open(os.path.join(data_dir, file), "rb") as f:
+                env_state = pickle.load(f)
+
+            # lengths.append(len(env_state['robot']['des_c_pos']))
+
+            zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
+            zero_action = np.zeros(
+                (1, self.max_len_data, self.action_dim), dtype=np.float32
+            )
+            zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
+
+            # robot and box positions
+            robot_des_j_pos = env_state["robot"]["des_j_pos"]
+            robot_des_j_vel = env_state["robot"]["des_j_vel"]
+
+            robot_des_c_pos = env_state["robot"]["des_c_pos"]
+            robot_des_quat = env_state["robot"]["des_c_quat"]
+
+            robot_c_pos = env_state["robot"]["c_pos"]
+            robot_c_quat = env_state["robot"]["c_quat"]
+
+            robot_j_pos = env_state["robot"]["j_pos"]
+            robot_j_vel = env_state["robot"]["j_vel"]
+
+            robot_gripper = np.expand_dims(env_state["robot"]["gripper_width"], -1)
+            # pred_gripper = np.zeros(robot_gripper.shape, dtype=np.float32)
+            # pred_gripper[robot_gripper > 0.075] = 1
+
+            sim_steps = np.expand_dims(np.arange(len(robot_des_j_pos)), -1)
+
+            red_box_pos = env_state["red-box"]["pos"]
+            red_box_quat = np.tan(quat2euler(env_state["red-box"]["quat"])[:, -1:])
+            # red_box_quat = np.concatenate((np.sin(red_box_quat), np.cos(red_box_quat)), axis=-1)
+
+            green_box_pos = env_state["green-box"]["pos"]
+            green_box_quat = np.tan(quat2euler(env_state["green-box"]["quat"])[:, -1:])
+            # green_box_quat = np.concatenate((np.sin(green_box_quat), np.cos(green_box_quat)), axis=-1)
+
+            blue_box_pos = env_state["blue-box"]["pos"]
+            blue_box_quat = np.tan(quat2euler(env_state["blue-box"]["quat"])[:, -1:])
+            # blue_box_quat = np.concatenate((np.sin(blue_box_quat), np.cos(blue_box_quat)), axis=-1)
+
+            # target_box_pos = env_state['target-box']['pos'] #- robot_c_pos
+
+            # input_state = np.concatenate((robot_des_c_pos, robot_des_quat, pred_gripper, red_box_pos, red_box_quat), axis=-1)
+
+            # input_state = np.concatenate((robot_des_j_pos, robot_gripper, blue_box_pos, blue_box_quat), axis=-1)
+
+            input_state = np.concatenate(
+                (
+                    robot_des_j_pos,
+                    robot_gripper,
+                    red_box_pos,
+                    red_box_quat,
+                    green_box_pos,
+                    green_box_quat,
+                    blue_box_pos,
+                    blue_box_quat,
+                ),
+                axis=-1,
+            )
+
+            # input_state = np.concatenate((robot_des_j_pos, robot_des_j_vel, robot_c_pos, robot_c_quat, green_box_pos, green_box_quat,
+            #                               target_box_pos), axis=-1)
+
+            vel_state = robot_des_j_pos[1:] - robot_des_j_pos[:-1]
+
+            valid_len = len(input_state) - 1
+
+            zero_obs[0, :valid_len, :] = input_state[:-1]
+            zero_action[0, :valid_len, :] = np.concatenate(
+                (vel_state, robot_gripper[1:]), axis=-1
+            )
+            zero_mask[0, :valid_len] = 1
+
+            inputs.append(zero_obs)
+            actions.append(zero_action)
+            masks.append(zero_mask)
+
+        # shape: B, T, n
+        self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
+        self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
+        self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
+
+        self.num_data = len(self.observations)
+
+        self.slices = self.get_slices()
+
+    def get_slices(self):
+        slices = []
+
+        min_seq_length = np.inf
+        for i in range(self.num_data):
+            T = self.get_seq_length(i)
+            min_seq_length = min(T, min_seq_length)
+
+            if T - self.window_size < 0:
+                print(
+                    f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
+                )
+            else:
+                slices += [
+                    (i, start, start + self.window_size)
+                    for start in range(T - self.window_size + 1)
+                ]  # slice indices follow convention [start, end)
+
+        return slices
+
+    def get_seq_length(self, idx):
+        return int(self.masks[idx].sum().item())
+
+    def get_all_actions(self):
+        result = []
+        # mask out invalid actions
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.actions[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def get_all_observations(self):
+        result = []
+        # mask out invalid observations
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.observations[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def __len__(self):
+        return len(self.slices)
+
+    def __getitem__(self, idx):
+
+        i, start, end = self.slices[idx]
+
+        obs = self.observations[i, start:end]
+        act = self.actions[i, start:end]
+        mask = self.masks[i, start:end]
+
+        return obs, act, mask
+
+
+class Stacking_Img_Dataset(TrajectoryDataset):
+    def __init__(
+        self,
+        data_directory: os.PathLike,
+        # data='train',
+        device="cpu",
+        obs_dim: int = 20,
+        action_dim: int = 2,
+        max_len_data: int = 256,
+        window_size: int = 1,
+    ):
+
+        super().__init__(
+            data_directory=data_directory,
+            device=device,
+            obs_dim=obs_dim,
+            action_dim=action_dim,
+            max_len_data=max_len_data,
+            window_size=window_size,
+        )
+
+        logging.info("Loading CubeStacking Dataset")
+
+        inputs = []
+        actions = []
+        masks = []
+
+        # TODO: insert data_dir here
+        state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
+
+        bp_cam_imgs = []
+        inhand_cam_imgs = []
+
+        for file in tqdm(state_files):
+            with open(os.path.join(data_dir, "state", file), "rb") as f:
+                env_state = pickle.load(f)
+
+            # lengths.append(len(env_state['robot']['des_c_pos']))
+
+            zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
+            zero_action = np.zeros(
+                (1, self.max_len_data, self.action_dim), dtype=np.float32
+            )
+            zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
+
+            # robot and box positions
+            robot_des_j_pos = env_state["robot"]["des_j_pos"]
+            robot_des_j_vel = env_state["robot"]["des_j_vel"]
+
+            robot_des_c_pos = env_state["robot"]["des_c_pos"]
+            robot_des_quat = env_state["robot"]["des_c_quat"]
+
+            robot_c_pos = env_state["robot"]["c_pos"]
+            robot_c_quat = env_state["robot"]["c_quat"]
+
+            robot_j_pos = env_state["robot"]["j_pos"]
+            robot_j_vel = env_state["robot"]["j_vel"]
+
+            robot_gripper = np.expand_dims(env_state["robot"]["gripper_width"], -1)
+            # pred_gripper = np.zeros(robot_gripper.shape, dtype=np.float32)
+            # pred_gripper[robot_gripper > 0.075] = 1
+
+            file_name = os.path.basename(file).split(".")[0]
+
+            ###############################################################
+            bp_images = []
+            bp_imgs = glob.glob(data_dir + "/images/bp-cam/" + file_name + "/*")
+            bp_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
+
+            for img in bp_imgs:
+                image = cv2.imread(img).astype(np.float32)
+                image = image.transpose((2, 0, 1)) / 255.0
+
+                image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
+
+                bp_images.append(image)
+
+            bp_images = torch.concatenate(bp_images, dim=0)
+            ################################################################
+            inhand_imgs = glob.glob(data_dir + "/images/inhand-cam/" + file_name + "/*")
+            inhand_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
+            inhand_images = []
+            for img in inhand_imgs:
+                image = cv2.imread(img).astype(np.float32)
+                image = image.transpose((2, 0, 1)) / 255.0
+
+                image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
+
+                inhand_images.append(image)
+            inhand_images = torch.concatenate(inhand_images, dim=0)
+            ##################################################################
+
+            input_state = np.concatenate((robot_des_j_pos, robot_gripper), axis=-1)
+            vel_state = robot_des_j_pos[1:] - robot_des_j_pos[:-1]
+
+            valid_len = len(input_state) - 1
+
+            zero_obs[0, :valid_len, :] = input_state[:-1]
+            zero_action[0, :valid_len, :] = np.concatenate(
+                (vel_state, robot_gripper[1:]), axis=-1
+            )
+            zero_mask[0, :valid_len] = 1
+
+            bp_cam_imgs.append(bp_images)
+            inhand_cam_imgs.append(inhand_images)
+
+            inputs.append(zero_obs)
+            actions.append(zero_action)
+            masks.append(zero_mask)
+
+        self.bp_cam_imgs = bp_cam_imgs
+        self.inhand_cam_imgs = inhand_cam_imgs
+
+        # shape: B, T, n
+        self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
+        self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
+        self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
+
+        self.num_data = len(self.observations)
+
+        self.slices = self.get_slices()
+
+    def get_slices(self):
+        slices = []
+
+        min_seq_length = np.inf
+        for i in range(self.num_data):
+            T = self.get_seq_length(i)
+            min_seq_length = min(T, min_seq_length)
+
+            if T - self.window_size < 0:
+                print(
+                    f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
+                )
+            else:
+                slices += [
+                    (i, start, start + self.window_size)
+                    for start in range(T - self.window_size + 1)
+                ]  # slice indices follow convention [start, end)
+
+        return slices
+
+    def get_seq_length(self, idx):
+        return int(self.masks[idx].sum().item())
+
+    def get_all_actions(self):
+        result = []
+        # mask out invalid actions
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.actions[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def get_all_observations(self):
+        result = []
+        # mask out invalid observations
+        for i in range(len(self.masks)):
+            T = int(self.masks[i].sum().item())
+            result.append(self.observations[i, :T, :])
+        return torch.cat(result, dim=0)
+
+    def __len__(self):
+        return len(self.slices)
+
+    def __getitem__(self, idx):
+
+        i, start, end = self.slices[idx]
+
+        obs = self.observations[i, start:end]
+        act = self.actions[i, start:end]
+        mask = self.masks[i, start:end]
+
+        bp_imgs = self.bp_cam_imgs[i][start:end]
+        inhand_imgs = self.inhand_cam_imgs[i][start:end]
+
+        return bp_imgs, inhand_imgs, obs, act, mask
diff --git a/agent/dataset/sequence.py b/agent/dataset/sequence.py
new file mode 100644
index 0000000..48b54dc
--- /dev/null
+++ b/agent/dataset/sequence.py
@@ -0,0 +1,162 @@
+"""
+Pre-training data loader. Modified from https://github.com/jannerm/diffuser/blob/main/diffuser/datasets/sequence.py
+
+TODO: implement history observation
+
+No normalization is applied here --- we always normalize the data when pre-processing it with a different script, and the normalization info is also used in RL fine-tuning.
+
+"""
+
+from collections import namedtuple
+from tqdm import tqdm
+import numpy as np
+import torch
+import logging
+import pickle
+import random
+
+log = logging.getLogger(__name__)
+
+from .buffer import StitchedBuffer
+
+
+Batch = namedtuple("Batch", "trajectories conditions")
+ValueBatch = namedtuple("ValueBatch", "trajectories conditions values")
+
+
+class StitchedSequenceDataset(torch.utils.data.Dataset):
+    """
+    Dataset to efficiently load and sample trajectories. Stitches episodes together in the time dimension to avoid excessive zero padding. Episode ID's are used to index unique trajectories.
+
+    Returns a dictionary with values of shape: [sum_e(T_e), *D] where T_e is traj length of episode e and D is
+    (tuple of) dimension of observation, action, images, etc.
+
+    Example:
+        Observations: [----------traj 1----------][---------traj 2----------] ... [---------traj N----------]
+        Episode IDs:  [----------   1  ----------][----------   2  ---------] ... [----------   N  ---------]
+    """
+
+    def __init__(
+        self,
+        dataset_path,
+        horizon_steps=64,
+        cond_steps=1,
+        max_n_episodes=10000,
+        use_img=False,
+        device="cpu",
+    ):
+        self.horizon_steps = horizon_steps
+        self.cond_steps = cond_steps
+        self.device = device
+
+        # Load dataset to device specified
+        if dataset_path.endswith(".npz"):
+            dataset = np.load(dataset_path, allow_pickle=True)
+        else:
+            with open(dataset_path, "rb") as f:
+                dataset = pickle.load(f)
+        num_episodes = dataset["observations"].shape[0]
+
+        # Get the sum total of the valid trajectories' lengths
+        traj_lengths = dataset["traj_length"]
+        sum_of_path_lengths = np.sum(traj_lengths)
+        self.sum_of_path_lengths = sum_of_path_lengths
+
+        fields = StitchedBuffer(sum_of_path_lengths, device)
+        for i in tqdm(
+            range(min(max_n_episodes, num_episodes)), desc="Loading trajectories"
+        ):
+            traj_length = traj_lengths[i]
+            episode = {
+                "observations": dataset["observations"][i][:traj_length],
+                "actions": dataset["actions"][i][:traj_length],
+                "episode_ids": i * np.ones(traj_length),
+            }
+            if use_img:
+                episode["images"] = dataset["images"][i][:traj_length]
+            for key, val in episode.items():
+                if device == "cpu":
+                    episode[key] = val
+                else:
+                    # if None array, save as empty tensor
+                    if np.all(np.equal(episode[key], None)):
+                        episode[key] = torch.empty(episode[key].shape).to(device)
+                    else:
+                        if key == "images":
+                            episode[key] = torch.tensor(val, dtype=torch.uint8).to(
+                                device
+                            )
+                            # (, H, W, C) -> (, C, H, W)
+                            episode[key] = episode[key].permute(0, 3, 1, 2)
+                        else:
+                            episode[key] = torch.tensor(val, dtype=torch.float32).to(
+                                device
+                            )
+            fields.add_path(episode)
+        fields.finalize()
+
+        self.indices = self.make_indices(traj_lengths, horizon_steps)
+        self.obs_dim = fields.observations.shape[-1]
+        self.action_dim = fields.actions.shape[-1]
+        self.fields = fields
+        self.n_episodes = fields.n_episodes
+        self.path_lengths = fields.path_lengths
+        self.traj_lengths = traj_lengths
+        self.use_img = use_img
+        log.info(fields)
+
+    def make_indices(self, traj_lengths, horizon_steps):
+        """
+        makes indices for sampling from dataset;
+        each index maps to a datapoint
+        """
+        indices = []
+        cur_traj_index = 0
+        for traj_length in traj_lengths:
+            max_start = cur_traj_index + traj_length - horizon_steps + 1
+            indices += list(range(cur_traj_index, max_start))
+            cur_traj_index += traj_length
+        return indices
+
+    def set_train_val_split(self, train_split):
+        num_train = int(len(self.indices) * train_split)
+        train_indices = random.sample(self.indices, num_train)
+        val_indices = [i for i in range(len(self.indices)) if i not in train_indices]
+        self.indices = train_indices
+        return val_indices
+
+    def set_indices(self, indices):
+        self.indices = indices
+
+    def get_conditions(self, observations, images=None):
+        """
+        condition on current observation for planning. Take into account the number of conditioning steps.
+        """
+        if images is not None:
+            return {
+                1 - self.cond_steps: {"state": observations[0], "rgb": images[0]}
+            }  # TODO: allow obs history, -1, -2, ...
+        else:
+            return {1 - self.cond_steps: observations[0]}
+
+    def __len__(self):
+        return len(self.indices)
+
+    def __getitem__(self, idx, eps=1e-4):
+        raise NotImplementedError("Get item defined in subclass.")
+
+
+class StitchedActionSequenceDataset(StitchedSequenceDataset):
+    """Only use action trajectory, and then obs_cond for current observation"""
+
+    def __getitem__(self, idx):
+        start = self.indices[idx]
+        end = start + self.horizon_steps
+        observations = self.fields.observations[start:end]
+        actions = self.fields.actions[start:end]
+        images = None
+        if self.use_img:
+            images = self.fields.images[start:end]
+        conditions = self.get_conditions(observations, images)
+        batch = Batch(actions, conditions)
+        return batch
diff --git a/agent/finetune/train_agent.py b/agent/finetune/train_agent.py
new file mode 100644
index 0000000..4c68b51
--- /dev/null
+++ b/agent/finetune/train_agent.py
@@ -0,0 +1,170 @@
+"""
+Parent fine-tuning agent class.
+
+"""
+
+import os
+import numpy as np
+from omegaconf import OmegaConf
+import torch
+import hydra
+import logging
+import wandb
+import random
+
+log = logging.getLogger(__name__)
+from env.gym_utils import make_async
+
+
+class TrainAgent:
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.device = cfg.device
+        self.seed = cfg.get("seed", 42)
+        random.seed(self.seed)
+        np.random.seed(self.seed)
+        torch.manual_seed(self.seed)
+
+        # Wandb
+        self.use_wandb = cfg.wandb is not None
+        if cfg.wandb is not None:
+            wandb.init(
+                entity=cfg.wandb.entity,
+                project=cfg.wandb.project,
+                name=cfg.wandb.run,
+                config=OmegaConf.to_container(cfg, resolve=True),
+            )
+
+        # Make vectorized env
+        self.env_name = cfg.env.name
+        env_type = cfg.env.get("env_type", None)
+        self.venv = make_async(
+            cfg.env.name,
+            env_type=env_type,
+            num_envs=cfg.env.n_envs,
+            asynchronous=True,
+            max_episode_steps=cfg.env.max_episode_steps,
+            wrappers=cfg.env.get("wrappers", None),
+            robomimic_env_cfg_path=cfg.get("robomimic_env_cfg_path", None),
+            shape_meta=cfg.get("shape_meta", None),
+            use_image_obs=cfg.env.get("use_image_obs", False),
+            render=cfg.env.get("render", False),
+            render_offscreen=cfg.env.get("save_video", False),
+            obs_dim=cfg.obs_dim,
+            action_dim=cfg.action_dim,
+            **cfg.env.specific if "specific" in cfg.env else {},
+        )
+        if not env_type == "furniture":
+            self.venv.seed(
+                [self.seed + i for i in range(cfg.env.n_envs)]
+            )  # otherwise parallel envs might have the same initial states!
+            # isaacgym environments do not need seeding
+        self.n_envs = cfg.env.n_envs
+        self.n_cond_step = cfg.cond_steps
+        self.obs_dim = cfg.obs_dim
+        self.action_dim = cfg.action_dim
+        self.act_steps = cfg.act_steps
+        self.horizon_steps = cfg.horizon_steps
+        self.max_episode_steps = cfg.env.max_episode_steps
+        self.reset_at_iteration = cfg.env.get("reset_at_iteration", True)
+        self.save_full_observations = cfg.env.get("save_full_observations", False)
+        self.furniture_sparse_reward = (
+            cfg.env.specific.get("sparse_reward", False)
+            if "specific" in cfg.env
+            else False
+        )  # furniture specific, for best reward calculation
+
+        # Batch size for gradient update
+        self.batch_size: int = cfg.train.batch_size
+
+        # Build model and load checkpoint
+        self.model = hydra.utils.instantiate(cfg.model)
+
+        # Training params
+        self.itr = 0
+        self.n_train_itr = cfg.train.n_train_itr
+        self.val_freq = cfg.train.val_freq
+        self.force_train = cfg.train.get("force_train", False)
+        self.n_steps = cfg.train.n_steps
+        self.best_reward_threshold_for_success = (
+            len(self.venv.pairs_to_assemble)
+            if env_type == "furniture"
+            else cfg.env.best_reward_threshold_for_success
+        )
+        self.max_grad_norm = cfg.train.get("max_grad_norm", None)
+
+        # Logging, rendering, checkpoints
+        self.logdir = cfg.logdir
+        self.render_dir = os.path.join(self.logdir, "render")
+        self.checkpoint_dir = os.path.join(self.logdir, "checkpoint")
+        self.result_path = os.path.join(self.logdir, "result.pkl")
+        os.makedirs(self.render_dir, exist_ok=True)
+        os.makedirs(self.checkpoint_dir, exist_ok=True)
+        self.save_trajs = cfg.train.get("save_trajs", False)
+        self.log_freq = cfg.train.get("log_freq", 1)
+        self.save_model_freq = cfg.train.save_model_freq
+        self.render_freq = cfg.train.render.freq
+        self.n_render = cfg.train.render.num
+        self.render_video = cfg.env.get("save_video", False)
+        assert self.n_render <= self.n_envs, "n_render must be <= n_envs"
+        assert not (
+            self.n_render <= 0 and self.render_video
+        ), "Need to set n_render > 0 if saving video"
+        self.traj_plotter = (
+            hydra.utils.instantiate(cfg.train.plotter)
+            if "plotter" in cfg.train
+            else None
+        )
+
+    def run(self):
+        pass
+
+    def save_model(self):
+        """
+        saves model to disk; no ema
+        """
+        data = {
+            "itr": self.itr,
+            "model": self.model.state_dict(),
+        }
+        savepath = os.path.join(self.checkpoint_dir, f"state_{self.itr}.pt")
+        torch.save(data, savepath)
+        log.info(f"Saved model to {savepath}")
+
+    def load(self, itr):
+        """
+        loads model from disk
+        """
+        loadpath = os.path.join(self.checkpoint_dir, f"state_{itr}.pt")
+        data = torch.load(loadpath, weights_only=True)
+
+        self.itr = data["itr"]
+        self.model.load_state_dict(data["model"])
+
+    def reset_env_all(self, verbose=False, options_venv=None, **kwargs):
+        if options_venv is None:
+            options_venv = [
+                {k: v for k, v in kwargs.items()} for _ in range(self.n_envs)
+            ]
+        obs_venv = self.venv.reset_arg(options_list=options_venv)
+        # convert to OrderedDict if obs_venv is a list of dict
+        if isinstance(obs_venv, list):
+            obs_venv = {
+                key: np.stack([obs_venv[i][key] for i in range(self.n_envs)])
+                for key in obs_venv[0].keys()
+            }
+        if verbose:
+            for index in range(self.n_envs):
+                logging.info(
+                    f"<-- Reset environment {index} with options {options_venv[index]}"
+                )
+        return obs_venv
+
+    def reset_env(self, env_ind, verbose=False):
+        task = {}
+        obs = self.venv.reset_one_arg(env_ind=env_ind, options=task)
+        if verbose:
+            logging.info(f"<-- Reset environment {env_ind} with task {task}")
+        return obs
diff --git a/agent/finetune/train_awr_diffusion_agent.py b/agent/finetune/train_awr_diffusion_agent.py
new file mode 100644
index 0000000..2c96556
--- /dev/null
+++ b/agent/finetune/train_awr_diffusion_agent.py
@@ -0,0 +1,389 @@
+"""
+Advantage-weighted regression (AWR) for diffusion policy.
+
+Advantage = discounted-reward-to-go - V(s)
+
+"""
+
+import os
+import pickle
+import einops
+import numpy as np
+import torch
+import logging
+import wandb
+from copy import deepcopy
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from collections import deque
+from agent.finetune.train_agent import TrainAgent
+from util.scheduler import CosineAnnealingWarmupRestarts
+
+
+def td_values(
+    states,
+    rewards,
+    dones,
+    state_values,
+    gamma=0.99,
+    alpha=0.95,
+    lam=0.95,
+):
+    """
+    Gives a list of TD estimates for a given list of samples from an RL environment.
+    The TD(λ) estimator is used for this computation.
+
+    :param replay_buffers: The replay buffers filled by exploring the RL environment.
+    Includes: states, rewards, "final state?"s.
+    :param state_values: The currently estimated state values.
+    :return: The TD estimates.
+    """
+    sample_count = len(states)
+    tds = np.zeros_like(state_values, dtype=np.float32)
+    dones[-1] = 1
+    next_value = 1 - dones[-1]
+
+    val = 0.0
+    for i in range(sample_count - 1, -1, -1):
+        # next_value = 0.0 if dones[i] else state_values[i + 1]
+
+        # get next_value for vectorized
+        if i < sample_count - 1:
+            next_value = state_values[i + 1]
+            next_value = next_value * (1 - dones[i])
+
+        state_value = state_values[i]
+        error = rewards[i] + gamma * next_value - state_value
+        val = alpha * error + gamma * lam * (1 - dones[i]) * val
+
+        tds[i] = val + state_value
+    return tds
+
+
+class TrainAWRDiffusionAgent(TrainAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+        self.logprob_batch_size = cfg.train.get("logprob_batch_size", 10000)
+
+        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
+        self.gamma = cfg.train.gamma
+
+        # Wwarm up period for critic before actor updates
+        self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
+
+        # Optimizer
+        self.actor_optimizer = torch.optim.AdamW(
+            self.model.actor.parameters(),
+            lr=cfg.train.actor_lr,
+            weight_decay=cfg.train.actor_weight_decay,
+        )
+        self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.actor_optimizer,
+            first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.actor_lr,
+            min_lr=cfg.train.actor_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+        self.critic_optimizer = torch.optim.AdamW(
+            self.model.critic.parameters(),
+            lr=cfg.train.critic_lr,
+            weight_decay=cfg.train.critic_weight_decay,
+        )
+        self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.critic_optimizer,
+            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.critic_lr,
+            min_lr=cfg.train.critic_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+
+        # Buffer size
+        self.buffer_size = cfg.train.buffer_size
+
+        # Reward exponential
+        self.beta = cfg.train.beta
+
+        # Max weight for AWR
+        self.max_adv_weight = cfg.train.max_adv_weight
+
+        # Scaling reward
+        self.scale_reward_factor = cfg.train.scale_reward_factor
+
+        # Updates
+        self.replay_ratio = cfg.train.replay_ratio
+        self.critic_update_ratio = cfg.train.critic_update_ratio
+
+    def run(self):
+
+        # make a FIFO replay buffer for obs, action, and reward
+        obs_buffer = deque(maxlen=self.buffer_size)
+        action_buffer = deque(maxlen=self.buffer_size)
+        reward_buffer = deque(maxlen=self.buffer_size)
+        done_buffer = deque(maxlen=self.buffer_size)
+        first_buffer = deque(maxlen=self.buffer_size)
+
+        # Start training loop
+        timer = Timer()
+        run_results = []
+        done_venv = np.zeros((1, self.n_envs))
+        while self.itr < self.n_train_itr:
+
+            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
+            options_venv = [{} for _ in range(self.n_envs)]
+            if self.itr % self.render_freq == 0 and self.render_video:
+                for env_ind in range(self.n_render):
+                    options_venv[env_ind]["video_path"] = os.path.join(
+                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
+                    )
+
+            # Define train or eval - all envs restart
+            eval_mode = self.itr % self.val_freq == 0 and not self.force_train
+            self.model.eval() if eval_mode else self.model.train()
+            firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
+
+            # Reset env at the beginning of an iteration
+            if self.reset_at_iteration or eval_mode or last_itr_eval:
+                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
+                firsts_trajs[0] = 1
+            else:
+                firsts_trajs[0] = (
+                    done_venv  # if done at the end of last iteration, then the envs are just reset
+                )
+            last_itr_eval = eval_mode
+            reward_trajs = np.empty((0, self.n_envs))
+
+            # Collect a set of trajectories from env
+            for step in range(self.n_steps):
+                if step % 10 == 0:
+                    print(f"Processed step {step} of {self.n_steps}")
+
+                # Select action
+                with torch.no_grad():
+                    samples = (
+                        self.model(
+                            cond=torch.from_numpy(prev_obs_venv)
+                            .float()
+                            .to(self.device),
+                            deterministic=eval_mode,
+                        )
+                        .cpu()
+                        .numpy()
+                    )  # n_env x horizon x act
+                action_venv = samples[:, : self.act_steps]
+
+                # Apply multi-step action
+                obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
+                    action_venv
+                )
+                reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
+
+                # add to buffer
+                obs_buffer.append(prev_obs_venv)
+                action_buffer.append(action_venv)
+                reward_buffer.append(reward_venv * self.scale_reward_factor)
+                done_buffer.append(done_venv)
+                first_buffer.append(firsts_trajs[step])
+
+                firsts_trajs[step + 1] = done_venv
+                prev_obs_venv = obs_venv
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
+            episodes_start_end = []
+            for env_ind in range(self.n_envs):
+                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
+                for i in range(len(env_steps) - 1):
+                    start = env_steps[i]
+                    end = env_steps[i + 1]
+                    if end - start > 1:
+                        episodes_start_end.append((env_ind, start, end - 1))
+            if len(episodes_start_end) > 0:
+                reward_trajs_split = [
+                    reward_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                num_episode_finished = len(reward_trajs_split)
+                episode_reward = np.array(
+                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
+                )
+                episode_best_reward = np.array(
+                    [
+                        np.max(reward_traj) / self.act_steps
+                        for reward_traj in reward_trajs_split
+                    ]
+                )
+                avg_episode_reward = np.mean(episode_reward)
+                avg_best_reward = np.mean(episode_best_reward)
+                success_rate = np.mean(
+                    episode_best_reward >= self.best_reward_threshold_for_success
+                )
+            else:
+                episode_reward = np.array([])
+                num_episode_finished = 0
+                avg_episode_reward = 0
+                avg_best_reward = 0
+                success_rate = 0
+                log.info("[WARNING] No episode completed within the iteration!")
+
+            # Update
+            if not eval_mode:
+
+                obs_trajs = np.array(deepcopy(obs_buffer))
+                reward_trajs = np.array(deepcopy(reward_buffer))
+                dones_trajs = np.array(deepcopy(done_buffer))
+
+                obs_t = einops.rearrange(
+                    torch.from_numpy(obs_trajs).float().to(self.device),
+                    "s e h d -> (s e) h d",
+                )
+                values_t = np.array(self.model.critic(obs_t).detach().cpu().numpy())
+                values_trajs = values_t.reshape(-1, self.n_envs)
+                td_trajs = td_values(obs_trajs, reward_trajs, dones_trajs, values_trajs)
+
+                # flatten
+                obs_trajs = einops.rearrange(
+                    obs_trajs,
+                    "s e h d -> (s e) h d",
+                )
+                td_trajs = einops.rearrange(
+                    td_trajs,
+                    "s e -> (s e)",
+                )
+
+                # Update policy and critic
+                num_batch = int(
+                    self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
+                )
+                for _ in range(num_batch // self.critic_update_ratio):
+
+                    # Sample batch
+                    inds = np.random.choice(len(obs_trajs), self.batch_size)
+                    obs_b = torch.from_numpy(obs_trajs[inds]).float().to(self.device)
+                    td_b = torch.from_numpy(td_trajs[inds]).float().to(self.device)
+
+                    # Update critic
+                    loss_critic = self.model.loss_critic(obs_b, td_b)
+                    self.critic_optimizer.zero_grad()
+                    loss_critic.backward()
+                    self.critic_optimizer.step()
+
+                obs_trajs = np.array(deepcopy(obs_buffer))
+                samples_trajs = np.array(deepcopy(action_buffer))
+                reward_trajs = np.array(deepcopy(reward_buffer))
+                dones_trajs = np.array(deepcopy(done_buffer))
+
+                obs_t = einops.rearrange(
+                    torch.from_numpy(obs_trajs).float().to(self.device),
+                    "s e h d -> (s e) h d",
+                )
+                values_t = np.array(self.model.critic(obs_t).detach().cpu().numpy())
+                values_trajs = values_t.reshape(-1, self.n_envs)
+                td_trajs = td_values(obs_trajs, reward_trajs, dones_trajs, values_trajs)
+                advantages_trajs = td_trajs - values_trajs
+
+                # flatten
+                obs_trajs = einops.rearrange(
+                    obs_trajs,
+                    "s e h d -> (s e) h d",
+                )
+                samples_trajs = einops.rearrange(
+                    samples_trajs,
+                    "s e h d -> (s e) h d",
+                )
+                advantages_trajs = einops.rearrange(
+                    advantages_trajs,
+                    "s e -> (s e)",
+                )
+
+                for _ in range(num_batch):
+
+                    # Sample batch
+                    inds = np.random.choice(len(obs_trajs), self.batch_size)
+                    obs_b = torch.from_numpy(obs_trajs[inds]).float().to(self.device)
+                    actions_b = (
+                        torch.from_numpy(samples_trajs[inds]).float().to(self.device)
+                    )
+                    advantages_b = (
+                        torch.from_numpy(advantages_trajs[inds]).float().to(self.device)
+                    )
+                    advantages_b = (advantages_b - advantages_b.mean()) / (
+                        advantages_b.std() + 1e-6
+                    )
+                    advantages_b_scaled = torch.exp(self.beta * advantages_b)
+                    advantages_b_scaled.clamp_(max=self.max_adv_weight)
+
+                    # Update policy with collected trajectories
+                    loss = self.model.loss(
+                        actions_b,
+                        obs_b,
+                        advantages_b_scaled.detach(),
+                    )
+                    self.actor_optimizer.zero_grad()
+                    loss.backward()
+                    if self.itr >= self.n_critic_warmup_itr:
+                        if self.max_grad_norm is not None:
+                            torch.nn.utils.clip_grad_norm_(
+                                self.model.actor.parameters(), self.max_grad_norm
+                            )
+                        self.actor_optimizer.step()
+
+            # Update lr
+            self.actor_lr_scheduler.step()
+            self.critic_lr_scheduler.step()
+
+            # Save model
+            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
+                self.save_model()
+
+            # Log loss and save metrics
+            run_results.append(
+                {
+                    "itr": self.itr,
+                }
+            )
+            if self.itr % self.log_freq == 0:
+                if eval_mode:
+                    log.info(
+                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "success rate - eval": success_rate,
+                                "avg episode reward - eval": avg_episode_reward,
+                                "avg best reward - eval": avg_best_reward,
+                                "num episode - eval": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=False,
+                        )
+                    run_results[-1]["eval_success_rate"] = success_rate
+                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
+                    run_results[-1]["eval_best_reward"] = avg_best_reward
+                else:
+                    log.info(
+                        f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "loss": loss,
+                                "loss - critic": loss_critic,
+                                "avg episode reward - train": avg_episode_reward,
+                                "num episode - train": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=True,
+                        )
+                    run_results[-1]["loss"] = loss
+                    run_results[-1]["loss_critic"] = loss_critic
+                    run_results[-1]["train_episode_reward"] = avg_episode_reward
+                run_results[-1]["time"] = timer()
+                with open(self.result_path, "wb") as f:
+                    pickle.dump(run_results, f)
+            self.itr += 1
diff --git a/agent/finetune/train_dipo_diffusion_agent.py b/agent/finetune/train_dipo_diffusion_agent.py
new file mode 100644
index 0000000..875f160
--- /dev/null
+++ b/agent/finetune/train_dipo_diffusion_agent.py
@@ -0,0 +1,358 @@
+"""
+Model-free online RL with DIffusion POlicy (DIPO)
+
+Applies action gradient to perturb actions towards maximizer of Q-function.
+
+a_t <- a_t + \eta * \grad_a Q(s, a)
+"""
+
+import os
+import pickle
+import numpy as np
+import torch
+import logging
+import wandb
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from collections import deque
+from agent.finetune.train_agent import TrainAgent
+from util.scheduler import CosineAnnealingWarmupRestarts
+
+
+class TrainDIPODiffusionAgent(TrainAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
+        self.gamma = cfg.train.gamma
+
+        # Wwarm up period for critic before actor updates
+        self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
+
+        # Optimizer
+        self.actor_optimizer = torch.optim.AdamW(
+            self.model.actor.parameters(),
+            lr=cfg.train.actor_lr,
+            weight_decay=cfg.train.actor_weight_decay,
+        )
+        # use cosine scheduler with linear warmup
+        self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.actor_optimizer,
+            first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.actor_lr,
+            min_lr=cfg.train.actor_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+        self.critic_optimizer = torch.optim.AdamW(
+            self.model.critic.parameters(),
+            lr=cfg.train.critic_lr,
+            weight_decay=cfg.train.critic_weight_decay,
+        )
+        self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.critic_optimizer,
+            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.critic_lr,
+            min_lr=cfg.train.critic_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+
+        # Buffer size
+        self.buffer_size = cfg.train.buffer_size
+
+        # Perturbation scale
+        self.eta = cfg.train.eta
+
+        # Updates
+        self.replay_ratio = cfg.train.replay_ratio
+
+        # Scaling reward
+        self.scale_reward_factor = cfg.train.scale_reward_factor
+
+        # Apply action gradient many steps
+        self.action_gradient_steps = cfg.train.action_gradient_steps
+
+    def run(self):
+
+        # make a FIFO replay buffer for obs, action, and reward
+        obs_buffer = deque(maxlen=self.buffer_size)
+        next_obs_buffer = deque(maxlen=self.buffer_size)
+        action_buffer = deque(maxlen=self.buffer_size)
+        reward_buffer = deque(maxlen=self.buffer_size)
+        done_buffer = deque(maxlen=self.buffer_size)
+        first_buffer = deque(maxlen=self.buffer_size)
+
+        # Start training loop
+        timer = Timer()
+        run_results = []
+        done_venv = np.zeros((1, self.n_envs))
+        while self.itr < self.n_train_itr:
+
+            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
+            options_venv = [{} for _ in range(self.n_envs)]
+            if self.itr % self.render_freq == 0 and self.render_video:
+                for env_ind in range(self.n_render):
+                    options_venv[env_ind]["video_path"] = os.path.join(
+                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
+                    )
+
+            # Define train or eval - all envs restart
+            eval_mode = self.itr % self.val_freq == 0 and not self.force_train
+            self.model.eval() if eval_mode else self.model.train()
+            firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
+
+            # Reset env at the beginning of an iteration
+            if self.reset_at_iteration or eval_mode or last_itr_eval:
+                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
+                firsts_trajs[0] = 1
+            else:
+                firsts_trajs[0] = (
+                    done_venv  # if done at the end of last iteration, then the envs are just reset
+                )
+            last_itr_eval = eval_mode
+            reward_trajs = np.empty((0, self.n_envs))
+
+            # Collect a set of trajectories from env
+            for step in range(self.n_steps):
+                if step % 10 == 0:
+                    print(f"Processed step {step} of {self.n_steps}")
+
+                # Select action
+                with torch.no_grad():
+                    samples = (
+                        self.model(
+                            cond=torch.from_numpy(prev_obs_venv)
+                            .float()
+                            .to(self.device),
+                            deterministic=eval_mode,
+                        )
+                        .cpu()
+                        .numpy()
+                    )  # n_env x horizon x act
+                action_venv = samples[:, : self.act_steps]
+
+                # Apply multi-step action
+                obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
+                    action_venv
+                )
+                reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
+
+                # add to buffer
+                for i in range(self.n_envs):
+                    obs_buffer.append(prev_obs_venv[i])
+                    next_obs_buffer.append(obs_venv[i])
+                    action_buffer.append(action_venv[i])
+                    reward_buffer.append(reward_venv[i] * self.scale_reward_factor)
+                    done_buffer.append(done_venv[i])
+                first_buffer.append(firsts_trajs[step])
+
+                firsts_trajs[step + 1] = done_venv
+                prev_obs_venv = obs_venv
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
+            episodes_start_end = []
+            for env_ind in range(self.n_envs):
+                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
+                for i in range(len(env_steps) - 1):
+                    start = env_steps[i]
+                    end = env_steps[i + 1]
+                    if end - start > 1:
+                        episodes_start_end.append((env_ind, start, end - 1))
+            if len(episodes_start_end) > 0:
+                reward_trajs_split = [
+                    reward_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                num_episode_finished = len(reward_trajs_split)
+                episode_reward = np.array(
+                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
+                )
+                episode_best_reward = np.array(
+                    [
+                        np.max(reward_traj) / self.act_steps
+                        for reward_traj in reward_trajs_split
+                    ]
+                )
+                avg_episode_reward = np.mean(episode_reward)
+                avg_best_reward = np.mean(episode_best_reward)
+                success_rate = np.mean(
+                    episode_best_reward >= self.best_reward_threshold_for_success
+                )
+            else:
+                episode_reward = np.array([])
+                num_episode_finished = 0
+                avg_episode_reward = 0
+                avg_best_reward = 0
+                success_rate = 0
+                log.info("[WARNING] No episode completed within the iteration!")
+
+            if not eval_mode:
+
+                num_batch = self.replay_ratio
+
+                # Critic learning
+                for _ in range(num_batch):
+                    # Sample batch
+                    inds = np.random.choice(len(obs_buffer), self.batch_size)
+                    obs_b = (
+                        torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
+                        .float()
+                        .to(self.device)
+                    )
+                    next_obs_b = (
+                        torch.from_numpy(
+                            np.vstack([next_obs_buffer[i][None] for i in inds])
+                        )
+                        .float()
+                        .to(self.device)
+                    )
+                    actions_b = (
+                        torch.from_numpy(
+                            np.vstack([action_buffer[i][None] for i in inds])
+                        )
+                        .float()
+                        .to(self.device)
+                    )
+                    rewards_b = (
+                        torch.from_numpy(np.vstack([reward_buffer[i] for i in inds]))
+                        .float()
+                        .to(self.device)
+                    )
+                    dones_b = (
+                        torch.from_numpy(np.vstack([done_buffer[i] for i in inds]))
+                        .float()
+                        .to(self.device)
+                    )
+
+                    # Update critic
+                    loss_critic = self.model.loss_critic(
+                        obs_b, next_obs_b, actions_b, rewards_b, dones_b, self.gamma
+                    )
+                    self.critic_optimizer.zero_grad()
+                    loss_critic.backward()
+                    self.critic_optimizer.step()
+
+                # Actor learning
+                for _ in range(num_batch):
+
+                    # Sample batch
+                    inds = np.random.choice(len(obs_buffer), self.batch_size)
+                    obs_b = (
+                        torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
+                        .float()
+                        .to(self.device)
+                    )
+                    actions_b = (
+                        torch.from_numpy(
+                            np.vstack([action_buffer[i][None] for i in inds])
+                        )
+                        .float()
+                        .to(self.device)
+                    )
+
+                    # Replace actions in buffer with guided actions
+                    guided_action_list = []
+
+                    # get Q-perturbed actions by optimizing
+                    actions_flat = actions_b.reshape(actions_b.shape[0], -1)
+                    actions_optim = torch.optim.Adam(
+                        [actions_flat], lr=self.eta, eps=1e-5
+                    )
+                    for _ in range(self.action_gradient_steps):
+                        actions_flat.requires_grad_(True)
+                        q_values_1, q_values_2 = self.model.critic(obs_b, actions_flat)
+                        q_values = torch.min(q_values_1, q_values_2)
+                        action_opt_loss = -q_values.sum()
+
+                        actions_optim.zero_grad()
+                        action_opt_loss.backward(torch.ones_like(action_opt_loss))
+
+                        # get the perturbed action
+                        actions_optim.step()
+
+                        actions_flat.requires_grad_(False)
+                        actions_flat.clamp_(-1.0, 1.0)
+                    guided_action = actions_flat.detach()
+                    guided_action = guided_action.reshape(
+                        guided_action.shape[0], -1, self.action_dim
+                    )
+                    guided_action_list.append(guided_action)
+                    guided_action_stacked = torch.cat(guided_action_list, 0)
+
+                    # Add to buffer (need separate indices since we're working with a limited subset)
+                    for i, i_buf in enumerate(inds):
+                        action_buffer[i_buf] = (
+                            guided_action_stacked[i].detach().cpu().numpy()
+                        )
+
+                    # Update policy with collected trajectories
+                    loss = self.model.loss(guided_action.detach(), {0: obs_b})
+                    self.actor_optimizer.zero_grad()
+                    loss.backward()
+                    if self.itr >= self.n_critic_warmup_itr:
+                        if self.max_grad_norm is not None:
+                            torch.nn.utils.clip_grad_norm_(
+                                self.model.actor.parameters(), self.max_grad_norm
+                            )
+                        self.actor_optimizer.step()
+
+            # Update lr
+            self.actor_lr_scheduler.step()
+            self.critic_lr_scheduler.step()
+
+            # Save model
+            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
+                self.save_model()
+
+            # Log loss and save metrics
+            run_results.append(
+                {
+                    "itr": self.itr,
+                }
+            )
+            if self.itr % self.log_freq == 0:
+                if eval_mode:
+                    log.info(
+                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "success rate - eval": success_rate,
+                                "avg episode reward - eval": avg_episode_reward,
+                                "avg best reward - eval": avg_best_reward,
+                                "num episode - eval": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=False,
+                        )
+                    run_results[-1]["eval_success_rate"] = success_rate
+                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
+                    run_results[-1]["eval_best_reward"] = avg_best_reward
+                else:
+                    log.info(
+                        f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "loss": loss,
+                                "loss - critic": loss_critic,
+                                "avg episode reward - train": avg_episode_reward,
+                                "num episode - train": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=True,
+                        )
+                    run_results[-1]["loss"] = loss
+                    run_results[-1]["loss_critic"] = loss_critic
+                    run_results[-1]["train_episode_reward"] = avg_episode_reward
+                run_results[-1]["time"] = timer()
+                with open(self.result_path, "wb") as f:
+                    pickle.dump(run_results, f)
+            self.itr += 1
diff --git a/agent/finetune/train_dql_diffusion_agent.py b/agent/finetune/train_dql_diffusion_agent.py
new file mode 100644
index 0000000..1a90394
--- /dev/null
+++ b/agent/finetune/train_dql_diffusion_agent.py
@@ -0,0 +1,317 @@
+"""
+Diffusion Q-Learning (DQL)
+
+Learns a critic Q-function and backprops the expected Q-value to train the actor
+
+pi = argmin L_d(\theta) - \alpha * E[Q(s, a)]
+L_d is demonstration loss for regularization
+"""
+
+import os
+import pickle
+import numpy as np
+import torch
+import logging
+import wandb
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from collections import deque
+from agent.finetune.train_agent import TrainAgent
+from util.scheduler import CosineAnnealingWarmupRestarts
+
+
+class TrainDQLDiffusionAgent(TrainAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
+        self.gamma = cfg.train.gamma
+
+        # Wwarm up period for critic before actor updates
+        self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
+
+        # Optimizer
+        self.actor_optimizer = torch.optim.AdamW(
+            self.model.actor.parameters(),
+            lr=cfg.train.actor_lr,
+            weight_decay=cfg.train.actor_weight_decay,
+        )
+        # use cosine scheduler with linear warmup
+        self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.actor_optimizer,
+            first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.actor_lr,
+            min_lr=cfg.train.actor_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+        self.critic_optimizer = torch.optim.AdamW(
+            self.model.critic.parameters(),
+            lr=cfg.train.critic_lr,
+            weight_decay=cfg.train.critic_weight_decay,
+        )
+        self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.critic_optimizer,
+            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.critic_lr,
+            min_lr=cfg.train.critic_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+
+        # Buffer size
+        self.buffer_size = cfg.train.buffer_size
+
+        # Perturbation scale
+        self.eta = cfg.train.eta
+
+        # Reward factor - scale down mujoco reward for better critic training
+        self.scale_reward_factor = cfg.train.scale_reward_factor
+
+        # Updates
+        self.replay_ratio = cfg.train.replay_ratio
+
+    def run(self):
+
+        # make a FIFO replay buffer for obs, action, and reward
+        obs_buffer = deque(maxlen=self.buffer_size)
+        next_obs_buffer = deque(maxlen=self.buffer_size)
+        action_buffer = deque(maxlen=self.buffer_size)
+        reward_buffer = deque(maxlen=self.buffer_size)
+        done_buffer = deque(maxlen=self.buffer_size)
+        first_buffer = deque(maxlen=self.buffer_size)
+
+        # Start training loop
+        timer = Timer()
+        run_results = []
+        done_venv = np.zeros((1, self.n_envs))
+        while self.itr < self.n_train_itr:
+
+            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
+            options_venv = [{} for _ in range(self.n_envs)]
+            if self.itr % self.render_freq == 0 and self.render_video:
+                for env_ind in range(self.n_render):
+                    options_venv[env_ind]["video_path"] = os.path.join(
+                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
+                    )
+
+            # Define train or eval - all envs restart
+            eval_mode = self.itr % self.val_freq == 0 and not self.force_train
+            self.model.eval() if eval_mode else self.model.train()
+            firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
+
+            # Reset env at the beginning of an iteration
+            if self.reset_at_iteration or eval_mode or last_itr_eval:
+                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
+                firsts_trajs[0] = 1
+            else:
+                firsts_trajs[0] = (
+                    done_venv  # if done at the end of last iteration, then the envs are just reset
+                )
+            last_itr_eval = eval_mode
+            reward_trajs = np.empty((0, self.n_envs))
+
+            # Collect a set of trajectories from env
+            for step in range(self.n_steps):
+                if step % 10 == 0:
+                    print(f"Processed step {step} of {self.n_steps}")
+
+                # Select action
+                with torch.no_grad():
+                    samples = (
+                        self.model(
+                            cond=torch.from_numpy(prev_obs_venv)
+                            .float()
+                            .to(self.device),
+                            deterministic=eval_mode,
+                        )
+                        .cpu()
+                        .numpy()
+                    )  # n_env x horizon x act
+                action_venv = samples[:, : self.act_steps]
+
+                # Apply multi-step action
+                obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
+                    action_venv
+                )
+                reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
+
+                # add to buffer
+                for i in range(self.n_envs):
+                    obs_buffer.append(prev_obs_venv[i])
+                    next_obs_buffer.append(obs_venv[i])
+                    action_buffer.append(action_venv[i])
+                    reward_buffer.append(reward_venv[i] * self.scale_reward_factor)
+                    done_buffer.append(done_venv[i])
+                first_buffer.append(firsts_trajs[step])
+
+                firsts_trajs[step + 1] = done_venv
+                prev_obs_venv = obs_venv
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
+            episodes_start_end = []
+            for env_ind in range(self.n_envs):
+                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
+                for i in range(len(env_steps) - 1):
+                    start = env_steps[i]
+                    end = env_steps[i + 1]
+                    if end - start > 1:
+                        episodes_start_end.append((env_ind, start, end - 1))
+            if len(episodes_start_end) > 0:
+                reward_trajs_split = [
+                    reward_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                num_episode_finished = len(reward_trajs_split)
+                episode_reward = np.array(
+                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
+                )
+                episode_best_reward = np.array(
+                    [
+                        np.max(reward_traj) / self.act_steps
+                        for reward_traj in reward_trajs_split
+                    ]
+                )
+                avg_episode_reward = np.mean(episode_reward)
+                avg_best_reward = np.mean(episode_best_reward)
+                success_rate = np.mean(
+                    episode_best_reward >= self.best_reward_threshold_for_success
+                )
+            else:
+                episode_reward = np.array([])
+                num_episode_finished = 0
+                avg_episode_reward = 0
+                avg_best_reward = 0
+                success_rate = 0
+                log.info("[WARNING] No episode completed within the iteration!")
+
+            if not eval_mode:
+
+                num_batch = self.replay_ratio
+
+                # Critic learning
+                for _ in range(num_batch):
+                    # Sample batch
+                    inds = np.random.choice(len(obs_buffer), self.batch_size)
+                    obs_b = (
+                        torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
+                        .float()
+                        .to(self.device)
+                    )
+                    next_obs_b = (
+                        torch.from_numpy(
+                            np.vstack([next_obs_buffer[i][None] for i in inds])
+                        )
+                        .float()
+                        .to(self.device)
+                    )
+                    actions_b = (
+                        torch.from_numpy(
+                            np.vstack([action_buffer[i][None] for i in inds])
+                        )
+                        .float()
+                        .to(self.device)
+                    )
+                    rewards_b = (
+                        torch.from_numpy(np.vstack([reward_buffer[i] for i in inds]))
+                        .float()
+                        .to(self.device)
+                    )
+                    dones_b = (
+                        torch.from_numpy(np.vstack([done_buffer[i] for i in inds]))
+                        .float()
+                        .to(self.device)
+                    )
+
+                    # Update critic
+                    loss_critic = self.model.loss_critic(
+                        obs_b, next_obs_b, actions_b, rewards_b, dones_b, self.gamma
+                    )
+                    self.critic_optimizer.zero_grad()
+                    loss_critic.backward()
+                    self.critic_optimizer.step()
+
+                    # get the new action and q values
+                    samples = self.model.forward_train(
+                        cond=obs_b.to(self.device),
+                        deterministic=eval_mode,
+                    )
+                    output_venv = samples  # n_env x horizon x act
+                    action_venv = output_venv[:, : self.act_steps, : self.action_dim]
+                    actions_flat_b = action_venv.reshape(action_venv.shape[0], -1)
+                    q_values_b = self.model.critic(obs_b, actions_flat_b)
+                    q1_new_action, q2_new_action = q_values_b
+
+                    # Update policy with collected trajectories
+                    self.actor_optimizer.zero_grad()
+                    actor_loss = self.model.loss_actor(
+                        obs_b, actions_b, q1_new_action, q2_new_action, self.eta
+                    )
+                    actor_loss.backward()
+                    if self.itr >= self.n_critic_warmup_itr:
+                        if self.max_grad_norm is not None:
+                            torch.nn.utils.clip_grad_norm_(
+                                self.model.actor.parameters(), self.max_grad_norm
+                            )
+                        self.actor_optimizer.step()
+                    loss = actor_loss
+
+            # Update lr
+            self.actor_lr_scheduler.step()
+            self.critic_lr_scheduler.step()
+
+            # Save model
+            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
+                self.save_model()
+
+            # Log loss and save metrics
+            run_results.append(
+                {
+                    "itr": self.itr,
+                }
+            )
+            if self.itr % self.log_freq == 0:
+                if eval_mode:
+                    log.info(
+                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "success rate - eval": success_rate,
+                                "avg episode reward - eval": avg_episode_reward,
+                                "avg best reward - eval": avg_best_reward,
+                                "num episode - eval": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=False,
+                        )
+                    run_results[-1]["eval_success_rate"] = success_rate
+                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
+                    run_results[-1]["eval_best_reward"] = avg_best_reward
+                else:
+                    log.info(
+                        f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "loss": loss,
+                                "loss - critic": loss_critic,
+                                "avg episode reward - train": avg_episode_reward,
+                                "num episode - train": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=True,
+                        )
+                    run_results[-1]["loss"] = loss
+                    run_results[-1]["loss_critic"] = loss_critic
+                    run_results[-1]["train_episode_reward"] = avg_episode_reward
+                run_results[-1]["time"] = timer()
+                with open(self.result_path, "wb") as f:
+                    pickle.dump(run_results, f)
+            self.itr += 1
diff --git a/agent/finetune/train_idql_diffusion_agent.py b/agent/finetune/train_idql_diffusion_agent.py
new file mode 100644
index 0000000..5b71169
--- /dev/null
+++ b/agent/finetune/train_idql_diffusion_agent.py
@@ -0,0 +1,347 @@
+"""
+Implicit diffusion Q-learning (IDQL) trainer for diffusion policy.
+
+"""
+
+import os
+import pickle
+import einops
+import numpy as np
+import torch
+import logging
+import wandb
+from copy import deepcopy
+import random
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from collections import deque
+from agent.finetune.train_agent import TrainAgent
+from util.scheduler import CosineAnnealingWarmupRestarts
+
+
+class TrainIDQLDiffusionAgent(TrainAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
+        self.gamma = cfg.train.gamma
+
+        # Wwarm up period for critic before actor updates
+        self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
+
+        # Optimizer
+        self.actor_optimizer = torch.optim.AdamW(
+            self.model.actor.parameters(),
+            lr=cfg.train.actor_lr,
+            weight_decay=cfg.train.actor_weight_decay,
+        )
+        self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.actor_optimizer,
+            first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.actor_lr,
+            min_lr=cfg.train.actor_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+        self.critic_q_optimizer = torch.optim.AdamW(
+            self.model.critic_q.parameters(),
+            lr=cfg.train.critic_lr,
+            weight_decay=cfg.train.critic_weight_decay,
+        )
+        self.critic_v_optimizer = torch.optim.AdamW(
+            self.model.critic_v.parameters(),
+            lr=cfg.train.critic_lr,
+            weight_decay=cfg.train.critic_weight_decay,
+        )
+        self.critic_v_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.critic_v_optimizer,
+            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.critic_lr,
+            min_lr=cfg.train.critic_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+        self.critic_q_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.critic_q_optimizer,
+            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.critic_lr,
+            min_lr=cfg.train.critic_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+
+        # Buffer size
+        self.buffer_size = cfg.train.buffer_size
+
+        # Actor params
+        self.use_expectile_exploration = cfg.train.use_expectile_exploration
+
+        # Scaling reward
+        self.scale_reward_factor = cfg.train.scale_reward_factor
+
+        # Updates
+        self.replay_ratio = cfg.train.replay_ratio
+        self.critic_tau = cfg.train.critic_tau
+
+        # Whether to use deterministic mode when sampling at eval
+        self.eval_deterministic = cfg.train.get("eval_deterministic", False)
+
+        # Sampling
+        self.num_sample = cfg.train.eval_sample_num
+
+    def run(self):
+
+        # make a FIFO replay buffer for obs, action, and reward
+        obs_buffer = deque(maxlen=self.buffer_size)
+        action_buffer = deque(maxlen=self.buffer_size)
+        next_obs_buffer = deque(maxlen=self.buffer_size)
+        reward_buffer = deque(maxlen=self.buffer_size)
+        done_buffer = deque(maxlen=self.buffer_size)
+        first_buffer = deque(maxlen=self.buffer_size)
+
+        # Start training loop
+        timer = Timer()
+        run_results = []
+        done_venv = np.zeros((1, self.n_envs))
+        while self.itr < self.n_train_itr:
+
+            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
+            options_venv = [{} for _ in range(self.n_envs)]
+            if self.itr % self.render_freq == 0 and self.render_video:
+                for env_ind in range(self.n_render):
+                    options_venv[env_ind]["video_path"] = os.path.join(
+                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
+                    )
+
+            # Define train or eval - all envs restart
+            eval_mode = self.itr % self.val_freq == 0 and not self.force_train
+            self.model.eval() if eval_mode else self.model.train()
+            firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
+
+            # Reset env at the beginning of an iteration
+            if self.reset_at_iteration or eval_mode or last_itr_eval:
+                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
+                firsts_trajs[0] = 1
+            else:
+                firsts_trajs[0] = (
+                    done_venv  # if done at the end of last iteration, then the envs are just reset
+                )
+            last_itr_eval = eval_mode
+            reward_trajs = np.empty((0, self.n_envs))
+
+            # Collect a set of trajectories from env
+            for step in range(self.n_steps):
+                if step % 10 == 0:
+                    print(f"Processed step {step} of {self.n_steps}")
+
+                # Select action
+                with torch.no_grad():
+                    samples = (
+                        self.model(
+                            cond=torch.from_numpy(prev_obs_venv)
+                            .float()
+                            .to(self.device),
+                            deterministic=eval_mode and self.eval_deterministic,
+                            num_sample=self.num_sample,
+                            use_expectile_exploration=self.use_expectile_exploration,
+                        )
+                        .cpu()
+                        .numpy()
+                    )  # n_env x horizon x act
+                action_venv = samples[:, : self.act_steps]
+
+                # Apply multi-step action
+                obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
+                    action_venv
+                )
+                reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
+
+                # add to buffer
+                obs_buffer.append(prev_obs_venv)
+                action_buffer.append(action_venv)
+                next_obs_buffer.append(obs_venv)
+                reward_buffer.append(reward_venv * self.scale_reward_factor)
+                done_buffer.append(done_venv)
+                first_buffer.append(firsts_trajs[step])
+
+                firsts_trajs[step + 1] = done_venv
+                prev_obs_venv = obs_venv
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
+            episodes_start_end = []
+            for env_ind in range(self.n_envs):
+                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
+                for i in range(len(env_steps) - 1):
+                    start = env_steps[i]
+                    end = env_steps[i + 1]
+                    if end - start > 1:
+                        episodes_start_end.append((env_ind, start, end - 1))
+            if len(episodes_start_end) > 0:
+                reward_trajs_split = [
+                    reward_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                num_episode_finished = len(reward_trajs_split)
+                episode_reward = np.array(
+                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
+                )
+                episode_best_reward = np.array(
+                    [
+                        np.max(reward_traj) / self.act_steps
+                        for reward_traj in reward_trajs_split
+                    ]
+                )
+                avg_episode_reward = np.mean(episode_reward)
+                avg_best_reward = np.mean(episode_best_reward)
+                success_rate = np.mean(
+                    episode_best_reward >= self.best_reward_threshold_for_success
+                )
+            else:
+                episode_reward = np.array([])
+                num_episode_finished = 0
+                avg_episode_reward = 0
+                avg_best_reward = 0
+                success_rate = 0
+                log.info("[WARNING] No episode completed within the iteration!")
+
+            # Update
+            if not eval_mode:
+
+                obs_trajs = np.array(deepcopy(obs_buffer))
+                action_trajs = np.array(deepcopy(action_buffer))
+                next_obs_trajs = np.array(deepcopy(next_obs_buffer))
+                reward_trajs = np.array(deepcopy(reward_buffer))
+                done_trajs = np.array(deepcopy(done_buffer))
+                first_trajs = np.array(deepcopy(first_buffer))
+
+                # flatten
+                obs_trajs = einops.rearrange(
+                    obs_trajs,
+                    "s e h d -> (s e) h d",
+                )
+                next_obs_trajs = einops.rearrange(
+                    next_obs_trajs,
+                    "s e h d -> (s e) h d",
+                )
+                action_trajs = einops.rearrange(
+                    action_trajs,
+                    "s e h d -> (s e) h d",
+                )
+                reward_trajs = reward_trajs.reshape(-1)
+                done_trajs = done_trajs.reshape(-1)
+                first_trajs = first_trajs.reshape(-1)
+
+                num_batch = int(
+                    self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
+                )
+
+                for _ in range(num_batch):
+
+                    # Sample batch
+                    inds = np.random.choice(len(obs_trajs), self.batch_size)
+                    obs_b = torch.from_numpy(obs_trajs[inds]).float().to(self.device)
+                    next_obs_b = (
+                        torch.from_numpy(next_obs_trajs[inds]).float().to(self.device)
+                    )
+                    actions_b = (
+                        torch.from_numpy(action_trajs[inds]).float().to(self.device)
+                    )
+                    reward_b = (
+                        torch.from_numpy(reward_trajs[inds]).float().to(self.device)
+                    )
+                    done_b = torch.from_numpy(done_trajs[inds]).float().to(self.device)
+
+                    # update critic value function
+                    critic_loss_v = self.model.loss_critic_v(obs_b, actions_b)
+                    self.critic_v_optimizer.zero_grad()
+                    critic_loss_v.backward()
+                    self.critic_v_optimizer.step()
+
+                    # update critic q function
+                    critic_loss_q = self.model.loss_critic_q(
+                        obs_b, next_obs_b, actions_b, reward_b, done_b, self.gamma
+                    )
+                    self.critic_q_optimizer.zero_grad()
+                    critic_loss_q.backward()
+                    self.critic_q_optimizer.step()
+
+                    # update target q function
+                    self.model.update_target_critic(self.critic_tau)
+
+                    loss_critic = critic_loss_q.detach() + critic_loss_v.detach()
+
+                    # Update policy with collected trajectories - no weighting
+                    loss = self.model.loss(
+                        actions_b,
+                        obs_b,
+                    )
+                    self.actor_optimizer.zero_grad()
+                    loss.backward()
+                    if self.itr >= self.n_critic_warmup_itr:
+                        if self.max_grad_norm is not None:
+                            torch.nn.utils.clip_grad_norm_(
+                                self.model.actor.parameters(), self.max_grad_norm
+                            )
+                        self.actor_optimizer.step()
+
+            # Update lr
+            self.actor_lr_scheduler.step()
+            self.critic_v_lr_scheduler.step()
+            self.critic_q_lr_scheduler.step()
+
+            # Save model
+            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
+                self.save_model()
+
+            # Log loss and save metrics
+            run_results.append(
+                {
+                    "itr": self.itr,
+                }
+            )
+            if self.itr % self.log_freq == 0:
+                if eval_mode:
+                    log.info(
+                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "success rate - eval": success_rate,
+                                "avg episode reward - eval": avg_episode_reward,
+                                "avg best reward - eval": avg_best_reward,
+                                "num episode - eval": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=False,
+                        )
+                    run_results[-1]["eval_success_rate"] = success_rate
+                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
+                    run_results[-1]["eval_best_reward"] = avg_best_reward
+                else:
+                    log.info(
+                        f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "loss": loss,
+                                "loss - critic": loss_critic,
+                                "avg episode reward - train": avg_episode_reward,
+                                "num episode - train": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=True,
+                        )
+                    run_results[-1]["loss"] = loss
+                    run_results[-1]["loss_critic"] = loss_critic
+                    run_results[-1]["train_episode_reward"] = avg_episode_reward
+                run_results[-1]["time"] = timer()
+                with open(self.result_path, "wb") as f:
+                    pickle.dump(run_results, f)
+            self.itr += 1
diff --git a/agent/finetune/train_ppo_agent.py b/agent/finetune/train_ppo_agent.py
new file mode 100644
index 0000000..ff25b89
--- /dev/null
+++ b/agent/finetune/train_ppo_agent.py
@@ -0,0 +1,112 @@
+"""
+Parent PPO fine-tuning agent class.
+
+"""
+
+from typing import Optional
+import torch
+import logging
+from util.scheduler import CosineAnnealingWarmupRestarts
+
+log = logging.getLogger(__name__)
+from agent.finetune.train_agent import TrainAgent
+from util.reward_scaling import RunningRewardScaler
+
+
+class TrainPPOAgent(TrainAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+        # Batch size for logprobs calculations after an iteration --- prevent out of memory if using a single batch
+        self.logprob_batch_size = cfg.train.get("logprob_batch_size", 10000)
+        assert (
+            self.logprob_batch_size % self.n_envs == 0
+        ), "logprob_batch_size must be divisible by n_envs"
+
+        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
+        self.gamma = cfg.train.gamma
+
+        # Wwarm up period for critic before actor updates
+        self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
+
+        # Optimizer
+        self.actor_optimizer = torch.optim.AdamW(
+            self.model.actor_ft.parameters(),
+            lr=cfg.train.actor_lr,
+            weight_decay=cfg.train.actor_weight_decay,
+        )
+        # use cosine scheduler with linear warmup
+        self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.actor_optimizer,
+            first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.actor_lr,
+            min_lr=cfg.train.actor_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+        self.critic_optimizer = torch.optim.AdamW(
+            self.model.critic.parameters(),
+            lr=cfg.train.critic_lr,
+            weight_decay=cfg.train.critic_weight_decay,
+        )
+        self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.critic_optimizer,
+            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.critic_lr,
+            min_lr=cfg.train.critic_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+
+        # Generalized advantage estimation
+        self.gae_lambda: float = cfg.train.get("gae_lambda", 0.95)
+
+        # If specified, stop gradient update once KL difference reaches it
+        self.target_kl: Optional[float] = cfg.train.target_kl
+
+        # Number of times the collected data is used in gradient update
+        self.update_epochs: int = cfg.train.update_epochs
+
+        # Entropy loss coefficient
+        self.ent_coef: float = cfg.train.get("ent_coef", 0)
+
+        # Value loss coefficient
+        self.vf_coef: float = cfg.train.get("vf_coef", 0)
+
+        # Whether to use running reward scaling
+        self.reward_scale_running: bool = cfg.train.reward_scale_running
+        if self.reward_scale_running:
+            self.running_reward_scaler = RunningRewardScaler(self.n_envs)
+
+        # Scaling reward with constant
+        self.reward_scale_const: float = cfg.train.get("reward_scale_const", 1)
+
+        # Use base policy
+        self.use_bc_loss: bool = cfg.train.get("use_bc_loss", False)
+        self.bc_loss_coeff: float = cfg.train.get("bc_loss_coeff", 0)
+
+    def reset_actor_optimizer(self):
+        """Not used anywhere currently"""
+        new_optimizer = torch.optim.AdamW(
+            self.model.actor_ft.parameters(),
+            lr=self.cfg.train.actor_lr,
+            weight_decay=self.cfg.train.actor_weight_decay,
+        )
+        new_optimizer.load_state_dict(self.actor_optimizer.state_dict())
+        self.actor_optimizer = new_optimizer
+
+        new_scheduler = CosineAnnealingWarmupRestarts(
+            self.actor_optimizer,
+            first_cycle_steps=self.cfg.train.actor_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=self.cfg.train.actor_lr,
+            min_lr=self.cfg.train.actor_lr_scheduler.min_lr,
+            warmup_steps=self.cfg.train.actor_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+        new_scheduler.load_state_dict(self.actor_lr_scheduler.state_dict())
+        self.actor_lr_scheduler = new_scheduler
+        log.info("Reset actor optimizer")
diff --git a/agent/finetune/train_ppo_diffusion_agent.py b/agent/finetune/train_ppo_diffusion_agent.py
new file mode 100644
index 0000000..380c723
--- /dev/null
+++ b/agent/finetune/train_ppo_diffusion_agent.py
@@ -0,0 +1,452 @@
+"""
+DPPO fine-tuning.
+
+"""
+
+import os
+import pickle
+import einops
+import numpy as np
+import torch
+import logging
+import wandb
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from agent.finetune.train_ppo_agent import TrainPPOAgent
+from util.scheduler import CosineAnnealingWarmupRestarts
+
+
+class TrainPPODiffusionAgent(TrainPPOAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+        # Reward horizon --- always set to act_steps for now
+        self.reward_horizon = cfg.get("reward_horizon", self.act_steps)
+
+        # Eta - between DDIM (=0 for eval) and DDPM (=1 for training)
+        self.learn_eta = self.model.learn_eta
+        if self.learn_eta:
+            self.eta_update_interval = cfg.train.eta_update_interval
+            self.eta_optimizer = torch.optim.AdamW(
+                self.model.eta.parameters(),
+                lr=cfg.train.eta_lr,
+                weight_decay=cfg.train.eta_weight_decay,
+            )
+            self.eta_lr_scheduler = CosineAnnealingWarmupRestarts(
+                self.eta_optimizer,
+                first_cycle_steps=cfg.train.eta_lr_scheduler.first_cycle_steps,
+                cycle_mult=1.0,
+                max_lr=cfg.train.eta_lr,
+                min_lr=cfg.train.eta_lr_scheduler.min_lr,
+                warmup_steps=cfg.train.eta_lr_scheduler.warmup_steps,
+                gamma=1.0,
+            )
+
+    def run(self):
+
+        # Start training loop
+        timer = Timer()
+        run_results = []
+        last_itr_eval = False
+        done_venv = np.zeros((1, self.n_envs))
+        while self.itr < self.n_train_itr:
+
+            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
+            options_venv = [{} for _ in range(self.n_envs)]
+            if self.itr % self.render_freq == 0 and self.render_video:
+                for env_ind in range(self.n_render):
+                    options_venv[env_ind]["video_path"] = os.path.join(
+                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
+                    )
+
+            # Define train or eval - all envs restart
+            eval_mode = self.itr % self.val_freq == 0 and not self.force_train
+            self.model.eval() if eval_mode else self.model.train()
+            last_itr_eval = eval_mode
+
+            # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
+            dones_trajs = np.empty((0, self.n_envs))
+            firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
+            if self.reset_at_iteration or eval_mode or last_itr_eval:
+                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
+                firsts_trajs[0] = 1
+            else:
+                firsts_trajs[0] = (
+                    done_venv  # if done at the end of last iteration, then the envs are just reset
+                )
+
+            # Holder
+            obs_trajs = np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
+            chains_trajs = np.empty(
+                (
+                    0,
+                    self.n_envs,
+                    self.model.ft_denoising_steps + 1,
+                    self.horizon_steps,
+                    self.action_dim,
+                )
+            )
+            reward_trajs = np.empty((0, self.n_envs))
+            obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
+            obs_full_trajs = np.vstack(
+                (obs_full_trajs, prev_obs_venv[None].squeeze(2))
+            )  # remove cond_step dim
+
+            # Collect a set of trajectories from env
+            for step in range(self.n_steps):
+                if step % 10 == 0:
+                    print(f"Processed step {step} of {self.n_steps}")
+
+                # Select action
+                with torch.no_grad():
+                    samples = self.model(
+                        cond=torch.from_numpy(prev_obs_venv).float().to(self.device),
+                        deterministic=eval_mode,
+                        return_chain=True,
+                    )
+                    output_venv = (
+                        samples.trajectories.cpu().numpy()
+                    )  # n_env x horizon x act
+                    chains_venv = (
+                        samples.chains.cpu().numpy()
+                    )  # n_env x denoising x horizon x act
+                action_venv = output_venv[:, : self.act_steps]
+
+                # Apply multi-step action
+                obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
+                    action_venv
+                )
+                if self.save_full_observations:
+                    obs_full_venv = np.vstack(
+                        [info["full_obs"][None] for info in info_venv]
+                    )  # n_envs x n_act_steps x obs_dim
+                    obs_full_trajs = np.vstack(
+                        (obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
+                    )
+                obs_trajs = np.vstack((obs_trajs, prev_obs_venv[None]))
+                chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
+                reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
+                dones_trajs = np.vstack((dones_trajs, done_venv[None]))
+                firsts_trajs[step + 1] = done_venv
+                prev_obs_venv = obs_venv
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
+            episodes_start_end = []
+            for env_ind in range(self.n_envs):
+                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
+                for i in range(len(env_steps) - 1):
+                    start = env_steps[i]
+                    end = env_steps[i + 1]
+                    if end - start > 1:
+                        episodes_start_end.append((env_ind, start, end - 1))
+            if len(episodes_start_end) > 0:
+                reward_trajs_split = [
+                    reward_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                num_episode_finished = len(reward_trajs_split)
+                episode_reward = np.array(
+                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
+                )
+                if (
+                    self.furniture_sparse_reward
+                ):  # only for furniture tasks, where reward only occurs in one env step
+                    episode_best_reward = episode_reward
+                else:
+                    episode_best_reward = np.array(
+                        [
+                            np.max(reward_traj) / self.act_steps
+                            for reward_traj in reward_trajs_split
+                        ]
+                    )
+                avg_episode_reward = np.mean(episode_reward)
+                avg_best_reward = np.mean(episode_best_reward)
+                success_rate = np.mean(
+                    episode_best_reward >= self.best_reward_threshold_for_success
+                )
+            else:
+                episode_reward = np.array([])
+                num_episode_finished = 0
+                avg_episode_reward = 0
+                avg_best_reward = 0
+                success_rate = 0
+                log.info("[WARNING] No episode completed within the iteration!")
+
+            # Update models
+            if not eval_mode:
+                with torch.no_grad():
+                    # Calculate value and logprobs - split into batches to prevent out of memory
+                    obs_t = einops.rearrange(
+                        torch.from_numpy(obs_trajs).float().to(self.device),
+                        "s e h d -> (s e) h d",
+                    )
+                    obs_ts = torch.split(obs_t, self.logprob_batch_size, dim=0)
+                    values_trajs = np.empty((0, self.n_envs))
+                    for obs in obs_ts:
+                        values = self.model.critic(obs).cpu().numpy().flatten()
+                        values_trajs = np.vstack(
+                            (values_trajs, values.reshape(-1, self.n_envs))
+                        )
+                    chains_t = einops.rearrange(
+                        torch.from_numpy(chains_trajs).float().to(self.device),
+                        "s e t h d -> (s e) t h d",
+                    )
+                    chains_ts = torch.split(chains_t, self.logprob_batch_size, dim=0)
+                    logprobs_trajs = np.empty(
+                        (
+                            0,
+                            self.model.ft_denoising_steps,
+                            self.horizon_steps,
+                            self.action_dim,
+                        )
+                    )
+                    for obs, chains in zip(obs_ts, chains_ts):
+                        logprobs = self.model.get_logprobs(obs, chains).cpu().numpy()
+                        logprobs_trajs = np.vstack(
+                            (
+                                logprobs_trajs,
+                                logprobs.reshape(-1, *logprobs_trajs.shape[1:]),
+                            )
+                        )
+
+                    # normalize reward with running variance if specified
+                    if self.reward_scale_running:
+                        reward_trajs_transpose = self.running_reward_scaler(
+                            reward=reward_trajs.T, first=firsts_trajs[:-1].T
+                        )
+                        reward_trajs = reward_trajs_transpose.T
+
+                    # bootstrap value with GAE if not done - apply reward scaling with constant if specified
+                    obs_venv_ts = torch.from_numpy(obs_venv).float().to(self.device)
+                    with torch.no_grad():
+                        next_value = (
+                            self.model.critic(obs_venv_ts).reshape(1, -1).cpu().numpy()
+                        )
+                    advantages_trajs = np.zeros_like(reward_trajs)
+                    lastgaelam = 0
+                    for t in reversed(range(self.n_steps)):
+                        if t == self.n_steps - 1:
+                            nextnonterminal = 1.0 - done_venv
+                            nextvalues = next_value
+                        else:
+                            nextnonterminal = 1.0 - dones_trajs[t + 1]
+                            nextvalues = values_trajs[t + 1]
+                        # delta = r + gamma*V(st+1) - V(st)
+                        delta = (
+                            reward_trajs[t] * self.reward_scale_const
+                            + self.gamma * nextvalues * nextnonterminal
+                            - values_trajs[t]
+                        )
+                        # A = delta_t + gamma*lamdba*delta_{t+1} + ...
+                        advantages_trajs[t] = lastgaelam = (
+                            delta
+                            + self.gamma
+                            * self.gae_lambda
+                            * nextnonterminal
+                            * lastgaelam
+                        )
+                    returns_trajs = advantages_trajs + values_trajs
+
+                # k for environment step
+                obs_k = einops.rearrange(
+                    torch.tensor(obs_trajs).float().to(self.device),
+                    "s e h d -> (s e) h d",
+                )
+                chains_k = einops.rearrange(
+                    torch.tensor(chains_trajs).float().to(self.device),
+                    "s e t h d -> (s e) t h d",
+                )
+                returns_k = (
+                    torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+                )
+                values_k = (
+                    torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+                )
+                advantages_k = (
+                    torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+                )
+                logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+
+                # Update policy and critic
+                total_steps = self.n_steps * self.n_envs
+                inds_k = np.arange(total_steps)
+                clipfracs = []
+                for update_epoch in range(self.update_epochs):
+
+                    # for each epoch, go through all data in batches
+                    flag_break = False
+                    np.random.shuffle(inds_k)
+                    num_batch = max(1, total_steps // self.batch_size)  # skip last ones
+                    for batch in range(num_batch):
+                        start = batch * self.batch_size
+                        end = start + self.batch_size
+                        inds_b = inds_k[start:end]  # b for batch
+                        obs_b = obs_k[inds_b]
+                        chains_b = chains_k[inds_b]
+                        returns_b = returns_k[inds_b]
+                        values_b = values_k[inds_b]
+                        advantages_b = advantages_k[inds_b]
+                        logprobs_b = logprobs_k[inds_b]
+
+                        # get loss
+                        (
+                            pg_loss,
+                            entropy_loss,
+                            v_loss,
+                            clipfrac,
+                            approx_kl,
+                            ratio,
+                            bc_loss,
+                            eta,
+                        ) = self.model.loss(
+                            obs_b,
+                            chains_b,
+                            returns_b,
+                            values_b,
+                            advantages_b,
+                            logprobs_b,
+                            use_bc_loss=self.use_bc_loss,
+                            reward_horizon=self.reward_horizon,
+                        )
+                        loss = (
+                            pg_loss
+                            + entropy_loss * self.ent_coef
+                            + v_loss * self.vf_coef
+                            + bc_loss * self.bc_loss_coeff
+                        )
+                        clipfracs += [clipfrac]
+
+                        # update policy and critic
+                        self.actor_optimizer.zero_grad()
+                        self.critic_optimizer.zero_grad()
+                        if self.learn_eta:
+                            self.eta_optimizer.zero_grad()
+                        loss.backward()
+                        if self.itr >= self.n_critic_warmup_itr:
+                            if self.max_grad_norm is not None:
+                                torch.nn.utils.clip_grad_norm_(
+                                    self.model.actor_ft.parameters(), self.max_grad_norm
+                                )
+                            self.actor_optimizer.step()
+                            if self.learn_eta and batch % self.eta_update_interval == 0:
+                                self.eta_optimizer.step()
+                        self.critic_optimizer.step()
+                        log.info(
+                            f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}"
+                        )
+
+                        # Stop gradient update if KL difference reaches target
+                        if self.target_kl is not None and approx_kl > self.target_kl:
+                            flag_break = True
+                            break
+                    if flag_break:
+                        break
+
+                # Explained variation of future rewards using value function
+                y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy()
+                var_y = np.var(y_true)
+                explained_var = (
+                    np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
+                )
+
+            # Plot state trajectories in D3IL
+            if (
+                self.itr % self.render_freq == 0
+                and self.n_render > 0
+                and self.traj_plotter is not None
+            ):
+                self.traj_plotter(
+                    obs_full_trajs=obs_full_trajs,
+                    n_render=self.n_render,
+                    max_episode_steps=self.max_episode_steps,
+                    render_dir=self.render_dir,
+                    itr=self.itr,
+                )
+
+            # Update lr, min_sampling_std
+            if self.itr >= self.n_critic_warmup_itr:
+                self.actor_lr_scheduler.step()
+                if self.learn_eta:
+                    self.eta_lr_scheduler.step()
+            self.critic_lr_scheduler.step()
+            self.model.step()
+            diffusion_min_sampling_std = self.model.get_min_sampling_denoising_std()
+
+            # Save model
+            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
+                self.save_model()
+
+            # Log loss and save metrics
+            run_results.append(
+                {
+                    "itr": self.itr,
+                }
+            )
+            if self.save_trajs:
+                run_results[-1]["obs_full_trajs"] = obs_full_trajs
+                run_results[-1]["obs_trajs"] = obs_trajs
+                run_results[-1]["chains_trajs"] = chains_trajs
+                run_results[-1]["reward_trajs"] = reward_trajs
+            if self.itr % self.log_freq == 0:
+                time = timer()
+                if eval_mode:
+                    log.info(
+                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "success rate - eval": success_rate,
+                                "avg episode reward - eval": avg_episode_reward,
+                                "avg best reward - eval": avg_best_reward,
+                                "num episode - eval": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=False,
+                        )
+                    run_results[-1]["eval_success_rate"] = success_rate
+                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
+                    run_results[-1]["eval_best_reward"] = avg_best_reward
+                else:
+                    log.info(
+                        f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "loss": loss,
+                                "pg loss": pg_loss,
+                                "value loss": v_loss,
+                                "bc loss": bc_loss,
+                                "eta": eta,
+                                "approx kl": approx_kl,
+                                "ratio": ratio,
+                                "clipfrac": np.mean(clipfracs),
+                                "explained variance": explained_var,
+                                "avg episode reward - train": avg_episode_reward,
+                                "num episode - train": num_episode_finished,
+                                "diffusion - min sampling std": diffusion_min_sampling_std,
+                                "actor lr": self.actor_optimizer.param_groups[0]["lr"],
+                                "critic lr": self.critic_optimizer.param_groups[0][
+                                    "lr"
+                                ],
+                            },
+                            step=self.itr,
+                            commit=True,
+                        )
+                    run_results[-1]["loss"] = loss
+                    run_results[-1]["pg_loss"] = pg_loss
+                    run_results[-1]["value_loss"] = v_loss
+                    run_results[-1]["bc_loss"] = bc_loss
+                    run_results[-1]["eta"] = eta
+                    run_results[-1]["approx_kl"] = approx_kl
+                    run_results[-1]["ratio"] = ratio
+                    run_results[-1]["clip_frac"] = np.mean(clipfracs)
+                    run_results[-1]["explained_variance"] = explained_var
+                    run_results[-1]["train_episode_reward"] = avg_episode_reward
+                run_results[-1]["time"] = time
+                with open(self.result_path, "wb") as f:
+                    pickle.dump(run_results, f)
+            self.itr += 1
diff --git a/agent/finetune/train_ppo_diffusion_img_agent.py b/agent/finetune/train_ppo_diffusion_img_agent.py
new file mode 100644
index 0000000..08ef33f
--- /dev/null
+++ b/agent/finetune/train_ppo_diffusion_img_agent.py
@@ -0,0 +1,468 @@
+"""
+DPPO fine-tuning for pixel observations.
+
+"""
+
+import os
+import pickle
+import einops
+import numpy as np
+import torch
+import logging
+import wandb
+import math
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from agent.finetune.train_ppo_diffusion_agent import TrainPPODiffusionAgent
+from model.common.modules import RandomShiftsAug
+
+
+class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+        # Image randomization
+        self.augment = cfg.train.augment
+        if self.augment:
+            self.aug = RandomShiftsAug(pad=4)
+
+        # Set obs dim -  we will save the different obs in batch in a dict
+        shape_meta = cfg.shape_meta
+        self.obs_dims = {k: shape_meta.obs[k]["shape"] for k in shape_meta.obs.keys()}
+
+        # Gradient accumulation to deal with large GPU RAM usage
+        self.grad_accumulate = cfg.train.grad_accumulate
+
+    def run(self):
+
+        # Start training loop
+        timer = Timer()
+        run_results = []
+        last_itr_eval = False
+        done_venv = np.zeros((1, self.n_envs))
+        while self.itr < self.n_train_itr:
+
+            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
+            options_venv = [{} for _ in range(self.n_envs)]
+            if self.itr % self.render_freq == 0 and self.render_video:
+                for env_ind in range(self.n_render):
+                    options_venv[env_ind]["video_path"] = os.path.join(
+                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
+                    )
+
+            # Define train or eval - all envs restart
+            eval_mode = self.itr % self.val_freq == 0 and not self.force_train
+            self.model.eval() if eval_mode else self.model.train()
+            last_itr_eval = eval_mode
+
+            # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
+            dones_trajs = np.empty((0, self.n_envs))
+            firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
+            if self.reset_at_iteration or eval_mode or last_itr_eval:
+                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
+                firsts_trajs[0] = 1
+            else:
+                firsts_trajs[0] = (
+                    done_venv  # if done at the end of last iteration, then the envs are just reset
+                )
+
+            # Holder
+            obs_trajs = {
+                k: np.empty((0, self.n_envs, self.n_cond_step, *self.obs_dims[k]))
+                for k in self.obs_dims
+            }
+            chains_trajs = np.empty(
+                (
+                    0,
+                    self.n_envs,
+                    self.model.ft_denoising_steps + 1,
+                    self.horizon_steps,
+                    self.action_dim,
+                )
+            )
+            reward_trajs = np.empty((0, self.n_envs))
+
+            # Collect a set of trajectories from env
+            for step in range(self.n_steps):
+                if step % 10 == 0:
+                    print(f"Processed step {step} of {self.n_steps}")
+
+                # Select action
+                with torch.no_grad():
+                    cond = {
+                        key: torch.from_numpy(prev_obs_venv[key])
+                        .float()
+                        .to(self.device)
+                        for key in self.obs_dims.keys()
+                    }  # batch each type of obs and put into dict
+                    samples = self.model(
+                        cond=cond,
+                        deterministic=eval_mode,
+                        return_chain=True,
+                    )
+                    output_venv = (
+                        samples.trajectories.cpu().numpy()
+                    )  # n_env x horizon x act
+                    chains_venv = (
+                        samples.chains.cpu().numpy()
+                    )  # n_env x denoising x horizon x act
+                action_venv = output_venv[:, : self.act_steps]
+
+                # Apply multi-step action
+                obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
+                    action_venv
+                )
+                for k in obs_trajs.keys():
+                    obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
+                chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
+                reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
+                dones_trajs = np.vstack((dones_trajs, done_venv[None]))
+                firsts_trajs[step + 1] = done_venv
+                prev_obs_venv = obs_venv
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
+            episodes_start_end = []
+            for env_ind in range(self.n_envs):
+                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
+                for i in range(len(env_steps) - 1):
+                    start = env_steps[i]
+                    end = env_steps[i + 1]
+                    if end - start > 1:
+                        episodes_start_end.append((env_ind, start, end - 1))
+            if len(episodes_start_end) > 0:
+                reward_trajs_split = [
+                    reward_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                num_episode_finished = len(reward_trajs_split)
+                episode_reward = np.array(
+                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
+                )
+                episode_best_reward = np.array(
+                    [
+                        np.max(reward_traj) / self.act_steps
+                        for reward_traj in reward_trajs_split
+                    ]
+                )
+                avg_episode_reward = np.mean(episode_reward)
+                avg_best_reward = np.mean(episode_best_reward)
+                success_rate = np.mean(
+                    episode_best_reward >= self.best_reward_threshold_for_success
+                )
+            else:
+                episode_reward = np.array([])
+                num_episode_finished = 0
+                avg_episode_reward = 0
+                avg_best_reward = 0
+                success_rate = 0
+                log.info("[WARNING] No episode completed within the iteration!")
+
+            # Update
+            if not eval_mode:
+                with torch.no_grad():
+                    # apply image randomization
+                    obs_trajs["rgb"] = (
+                        torch.from_numpy(obs_trajs["rgb"]).float().to(self.device)
+                    )
+                    obs_trajs["state"] = (
+                        torch.from_numpy(obs_trajs["state"]).float().to(self.device)
+                    )
+                    if self.augment:
+                        rgb = einops.rearrange(
+                            obs_trajs["rgb"],
+                            "s e t c h w -> (s e t) c h w",
+                        )
+                        rgb = self.aug(rgb)
+                        obs_trajs["rgb"] = einops.rearrange(
+                            rgb,
+                            "(s e t) c h w -> s e t c h w",
+                            s=self.n_steps,
+                            e=self.n_envs,
+                        )
+
+                    # Calculate value and logprobs - split into batches to prevent out of memory
+                    num_split = math.ceil(
+                        self.n_envs * self.n_steps / self.logprob_batch_size
+                    )
+                    obs_ts = [{} for _ in range(num_split)]
+                    for k in obs_trajs.keys():
+                        obs_k = einops.rearrange(
+                            obs_trajs[k],
+                            "s e ... -> (s e) ...",
+                        )
+                        obs_ts_k = torch.split(obs_k, self.logprob_batch_size, dim=0)
+                        for i, obs_t in enumerate(obs_ts_k):
+                            obs_ts[i][k] = obs_t
+                    values_trajs = np.empty((0, self.n_envs))
+                    for obs in obs_ts:
+                        values = (
+                            self.model.critic(obs, no_augment=True)
+                            .cpu()
+                            .numpy()
+                            .flatten()
+                        )
+                        values_trajs = np.vstack(
+                            (values_trajs, values.reshape(-1, self.n_envs))
+                        )
+                    chains_t = einops.rearrange(
+                        torch.from_numpy(chains_trajs).float().to(self.device),
+                        "s e t h d -> (s e) t h d",
+                    )
+                    chains_ts = torch.split(chains_t, self.logprob_batch_size, dim=0)
+                    logprobs_trajs = np.empty(
+                        (
+                            0,
+                            self.model.ft_denoising_steps,
+                            self.horizon_steps,
+                            self.action_dim,
+                        )
+                    )
+                    for obs, chains in zip(obs_ts, chains_ts):
+                        logprobs = self.model.get_logprobs(obs, chains).cpu().numpy()
+                        logprobs_trajs = np.vstack(
+                            (
+                                logprobs_trajs,
+                                logprobs.reshape(-1, *logprobs_trajs.shape[1:]),
+                            )
+                        )
+
+                    # normalize reward with running variance if specified
+                    if self.reward_scale_running:
+                        reward_trajs_transpose = self.running_reward_scaler(
+                            reward=reward_trajs.T, first=firsts_trajs[:-1].T
+                        )
+                        reward_trajs = reward_trajs_transpose.T
+
+                    # bootstrap value with GAE if not done - apply reward scaling with constant if specified
+                    obs_venv_ts = {
+                        key: torch.from_numpy(obs_venv[key]).float().to(self.device)
+                        for key in self.obs_dims.keys()
+                    }
+                    with torch.no_grad():
+                        next_value = (
+                            self.model.critic(obs_venv_ts, no_augment=True)
+                            .reshape(1, -1)
+                            .cpu()
+                            .numpy()
+                        )
+                    advantages_trajs = np.zeros_like(reward_trajs)
+                    lastgaelam = 0
+                    for t in reversed(range(self.n_steps)):
+                        if t == self.n_steps - 1:
+                            nextnonterminal = 1.0 - done_venv
+                            nextvalues = next_value
+                        else:
+                            nextnonterminal = 1.0 - dones_trajs[t + 1]
+                            nextvalues = values_trajs[t + 1]
+                        # delta = r + gamma*V(st+1) - V(st)
+                        delta = (
+                            reward_trajs[t] * self.reward_scale_const
+                            + self.gamma * nextvalues * nextnonterminal
+                            - values_trajs[t]
+                        )
+                        # A = delta_t + gamma*lamdba*delta_{t+1} + ...
+                        advantages_trajs[t] = lastgaelam = (
+                            delta
+                            + self.gamma
+                            * self.gae_lambda
+                            * nextnonterminal
+                            * lastgaelam
+                        )
+                    returns_trajs = advantages_trajs + values_trajs
+
+                # k for environment step
+                obs_k = {
+                    k: einops.rearrange(
+                        obs_trajs[k],
+                        "s e ... -> (s e) ...",
+                    )
+                    for k in obs_trajs.keys()
+                }
+                chains_k = einops.rearrange(
+                    torch.tensor(chains_trajs).float().to(self.device),
+                    "s e t h d -> (s e) t h d",
+                )
+                returns_k = (
+                    torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+                )
+                values_k = (
+                    torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+                )
+                advantages_k = (
+                    torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+                )
+                logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+
+                # Update policy and critic
+                total_steps = self.n_steps * self.n_envs
+                inds_k = np.arange(total_steps)
+                clipfracs = []
+                for update_epoch in range(self.update_epochs):
+
+                    # for each epoch, go through all data in batches
+                    flag_break = False
+                    np.random.shuffle(inds_k)
+                    num_batch = max(1, total_steps // self.batch_size)  # skip last ones
+                    for batch in range(num_batch):
+                        start = batch * self.batch_size
+                        end = start + self.batch_size
+                        inds_b = inds_k[start:end]  # b for batch
+                        obs_b = {k: obs_k[k][inds_b] for k in obs_k.keys()}
+                        chains_b = chains_k[inds_b]
+                        returns_b = returns_k[inds_b]
+                        values_b = values_k[inds_b]
+                        advantages_b = advantages_k[inds_b]
+                        logprobs_b = logprobs_k[inds_b]
+
+                        # get loss
+                        (
+                            pg_loss,
+                            entropy_loss,
+                            v_loss,
+                            clipfrac,
+                            approx_kl,
+                            ratio,
+                            bc_loss,
+                            eta,
+                        ) = self.model.loss(
+                            obs_b,
+                            chains_b,
+                            returns_b,
+                            values_b,
+                            advantages_b,
+                            logprobs_b,
+                            use_bc_loss=self.use_bc_loss,
+                            reward_horizon=self.reward_horizon,
+                        )
+                        loss = (
+                            pg_loss
+                            + entropy_loss * self.ent_coef
+                            + v_loss * self.vf_coef
+                            + bc_loss * self.bc_loss_coeff
+                        )
+                        clipfracs += [clipfrac]
+
+                        # update policy and critic
+                        loss.backward()
+                        if (batch + 1) % self.grad_accumulate == 0:
+                            if self.itr >= self.n_critic_warmup_itr:
+                                if self.max_grad_norm is not None:
+                                    torch.nn.utils.clip_grad_norm_(
+                                        self.model.actor_ft.parameters(),
+                                        self.max_grad_norm,
+                                    )
+                                self.actor_optimizer.step()
+                                if (
+                                    self.learn_eta
+                                    and batch % self.eta_update_interval == 0
+                                ):
+                                    self.eta_optimizer.step()
+                            self.critic_optimizer.step()
+                            self.actor_optimizer.zero_grad()
+                            self.critic_optimizer.zero_grad()
+                            if self.learn_eta:
+                                self.eta_optimizer.zero_grad()
+                            log.info(f"run grad update at batch {batch}")
+                            log.info(
+                                f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}"
+                            )
+
+                            # Stop gradient update if KL difference reaches target
+                            if (
+                                self.target_kl is not None
+                                and approx_kl > self.target_kl
+                                and self.itr >= self.n_critic_warmup_itr
+                            ):
+                                flag_break = True
+                                break
+                    if flag_break:
+                        break
+
+                # Explained variation of future rewards using value function
+                y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy()
+                var_y = np.var(y_true)
+                explained_var = (
+                    np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
+                )
+
+            # Update lr
+            if self.itr >= self.n_critic_warmup_itr:
+                self.actor_lr_scheduler.step()
+                if self.learn_eta:
+                    self.eta_lr_scheduler.step()
+            self.critic_lr_scheduler.step()
+            self.model.step()
+            diffusion_min_sampling_std = self.model.get_min_sampling_denoising_std()
+
+            # Save model
+            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
+                self.save_model()
+
+            # Log loss and save metrics
+            run_results.append(
+                {
+                    "itr": self.itr,
+                }
+            )
+            if self.itr % self.log_freq == 0:
+                if eval_mode:
+                    log.info(
+                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "success rate - eval": success_rate,
+                                "avg episode reward - eval": avg_episode_reward,
+                                "avg best reward - eval": avg_best_reward,
+                                "num episode - eval": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=False,
+                        )
+                    run_results[-1]["eval_success_rate"] = success_rate
+                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
+                    run_results[-1]["eval_best_reward"] = avg_best_reward
+                else:
+                    log.info(
+                        f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{timer():8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "loss": loss,
+                                "pg loss": pg_loss,
+                                "value loss": v_loss,
+                                "bc loss": bc_loss,
+                                "eta": eta,
+                                "approx kl": approx_kl,
+                                "ratio": ratio,
+                                "clipfrac": np.mean(clipfracs),
+                                "explained variance": explained_var,
+                                "avg episode reward - train": avg_episode_reward,
+                                "num episode - train": num_episode_finished,
+                                "diffusion - min sampling std": diffusion_min_sampling_std,
+                                "actor lr": self.actor_optimizer.param_groups[0]["lr"],
+                                "critic lr": self.critic_optimizer.param_groups[0][
+                                    "lr"
+                                ],
+                            },
+                            step=self.itr,
+                            commit=True,
+                        )
+                    run_results[-1]["loss"] = loss
+                    run_results[-1]["pg_loss"] = pg_loss
+                    run_results[-1]["value_loss"] = v_loss
+                    run_results[-1]["bc_loss"] = bc_loss
+                    run_results[-1]["eta"] = eta
+                    run_results[-1]["approx_kl"] = approx_kl
+                    run_results[-1]["ratio"] = ratio
+                    run_results[-1]["clip_frac"] = np.mean(clipfracs)
+                    run_results[-1]["explained_variance"] = explained_var
+                    run_results[-1]["train_episode_reward"] = avg_episode_reward
+                run_results[-1]["time"] = timer()
+                with open(self.result_path, "wb") as f:
+                    pickle.dump(run_results, f)
+            self.itr += 1
diff --git a/agent/finetune/train_ppo_exact_diffusion_agent.py b/agent/finetune/train_ppo_exact_diffusion_agent.py
new file mode 100644
index 0000000..a70541d
--- /dev/null
+++ b/agent/finetune/train_ppo_exact_diffusion_agent.py
@@ -0,0 +1,405 @@
+"""
+Use diffusion exact likelihood for policy gradient.
+
+"""
+
+import os
+import pickle
+import einops
+import numpy as np
+import torch
+import logging
+import wandb
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from agent.finetune.train_ppo_diffusion_agent import TrainPPODiffusionAgent
+
+
+class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+    def run(self):
+        """
+        For exact likelihood, we do not need to save the chains.
+        """
+
+        # Start training loop
+        timer = Timer()
+        run_results = []
+        last_itr_eval = False
+        done_venv = np.zeros((1, self.n_envs))
+        while self.itr < self.n_train_itr:
+
+            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
+            options_venv = [{} for _ in range(self.n_envs)]
+            if self.itr % self.render_freq == 0 and self.render_video:
+                for env_ind in range(self.n_render):
+                    options_venv[env_ind]["video_path"] = os.path.join(
+                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
+                    )
+
+            # Define train or eval - all envs restart
+            eval_mode = self.itr % self.val_freq == 0 and not self.force_train
+            self.model.eval() if eval_mode else self.model.train()
+            last_itr_eval = eval_mode
+
+            # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
+            dones_trajs = np.empty((0, self.n_envs))
+            firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
+            if self.reset_at_iteration or eval_mode or last_itr_eval:
+                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
+                firsts_trajs[0] = 1
+            else:
+                firsts_trajs[0] = (
+                    done_venv  # if done at the end of last iteration, then the envs are just reset
+                )
+
+            # Holder
+            obs_trajs = np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
+            samples_trajs = np.empty(
+                (
+                    0,
+                    self.n_envs,
+                    self.horizon_steps,
+                    self.action_dim,
+                )
+            )
+            chains_trajs = np.empty(
+                (
+                    0,
+                    self.n_envs,
+                    self.model.ft_denoising_steps + 1,
+                    self.horizon_steps,
+                    self.action_dim,
+                )
+            )
+            reward_trajs = np.empty((0, self.n_envs))
+            obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
+            obs_full_trajs = np.vstack(
+                (obs_full_trajs, prev_obs_venv[None].squeeze(2))
+            )  # remove cond_step dim
+
+            # Collect a set of trajectories from env
+            for step in range(self.n_steps):
+                if step % 10 == 0:
+                    print(f"Processed step {step} of {self.n_steps}")
+
+                # Select action
+                with torch.no_grad():
+                    samples = self.model(
+                        cond=torch.from_numpy(prev_obs_venv).float().to(self.device),
+                        deterministic=eval_mode,
+                        return_chain=True,
+                    )
+                    output_venv = (
+                        samples.trajectories.cpu().numpy()
+                    )  # n_env x horizon x act
+                    chains_venv = (
+                        samples.chains.cpu().numpy()
+                    )  # n_env x denoising x horizon x act
+                action_venv = output_venv[:, : self.act_steps]
+
+                # Apply multi-step action
+                obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
+                    action_venv
+                )
+                if self.save_full_observations:
+                    obs_full_venv = np.vstack(
+                        [info["full_obs"][None] for info in info_venv]
+                    )  # n_envs x n_act_steps x obs_dim
+                    obs_full_trajs = np.vstack(
+                        (obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
+                    )
+                obs_trajs = np.vstack((obs_trajs, prev_obs_venv[None]))
+                chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
+                samples_trajs = np.vstack((samples_trajs, output_venv[None]))
+                reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
+                dones_trajs = np.vstack((dones_trajs, done_venv[None]))
+                firsts_trajs[step + 1] = done_venv
+                prev_obs_venv = obs_venv
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
+            episodes_start_end = []
+            for env_ind in range(self.n_envs):
+                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
+                for i in range(len(env_steps) - 1):
+                    start = env_steps[i]
+                    end = env_steps[i + 1]
+                    if end - start > 1:
+                        episodes_start_end.append((env_ind, start, end - 1))
+            if len(episodes_start_end) > 0:
+                reward_trajs_split = [
+                    reward_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                num_episode_finished = len(reward_trajs_split)
+                episode_reward = np.array(
+                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
+                )
+                episode_best_reward = np.array(
+                    [
+                        np.max(reward_traj) / self.act_steps
+                        for reward_traj in reward_trajs_split
+                    ]
+                )
+                avg_episode_reward = np.mean(episode_reward)
+                avg_best_reward = np.mean(episode_best_reward)
+                success_rate = np.mean(
+                    episode_best_reward >= self.best_reward_threshold_for_success
+                )
+            else:
+                episode_reward = np.array([])
+                num_episode_finished = 0
+                avg_episode_reward = 0
+                avg_best_reward = 0
+                success_rate = 0
+                log.info("[WARNING] No episode completed within the iteration!")
+
+            # Update
+            if not eval_mode:
+                with torch.no_grad():
+                    # Calculate value and logprobs - split into batches to prevent out of memory
+                    obs_t = einops.rearrange(
+                        torch.from_numpy(obs_trajs).float().to(self.device),
+                        "s e h d -> (s e) h d",
+                    )
+                    obs_ts = torch.split(obs_t, self.logprob_batch_size, dim=0)
+                    values_trajs = np.empty((0, self.n_envs))
+                    for obs in obs_ts:
+                        values = self.model.critic(obs).cpu().numpy().flatten()
+                        values_trajs = np.vstack(
+                            (values_trajs, values.reshape(-1, self.n_envs))
+                        )
+                    samples_t = einops.rearrange(
+                        torch.from_numpy(samples_trajs).float().to(self.device),
+                        "s e h d -> (s e) h d",
+                    )
+                    samples_ts = torch.split(samples_t, self.logprob_batch_size, dim=0)
+                    logprobs_trajs = np.empty((0))
+                    for obs, samples in zip(obs_ts, samples_ts):
+                        logprobs = (
+                            self.model.get_exact_logprobs(obs, samples).cpu().numpy()
+                        )
+                        logprobs_trajs = np.concatenate((logprobs_trajs, logprobs))
+
+                    # normalize reward with running variance if specified
+                    if self.reward_scale_running:
+                        reward_trajs_transpose = self.running_reward_scaler(
+                            reward=reward_trajs.T, first=firsts_trajs[:-1].T
+                        )
+                        reward_trajs = reward_trajs_transpose.T
+
+                    # bootstrap value with GAE if not done - apply reward scaling with constant if specified
+                    obs_venv_ts = torch.from_numpy(obs_venv).float().to(self.device)
+                    with torch.no_grad():
+                        next_value = (
+                            self.model.critic(obs_venv_ts).reshape(1, -1).cpu().numpy()
+                        )
+                    advantages_trajs = np.zeros_like(reward_trajs)
+                    lastgaelam = 0
+                    for t in reversed(range(self.n_steps)):
+                        if t == self.n_steps - 1:
+                            nextnonterminal = 1.0 - done_venv
+                            nextvalues = next_value
+                        else:
+                            nextnonterminal = 1.0 - dones_trajs[t + 1]
+                            nextvalues = values_trajs[t + 1]
+                        # delta = r + gamma*V(st+1) - V(st)
+                        delta = (
+                            reward_trajs[t] * self.reward_scale_const
+                            + self.gamma * nextvalues * nextnonterminal
+                            - values_trajs[t]
+                        )
+                        # A = delta_t + gamma*lamdba*delta_{t+1} + ...
+                        advantages_trajs[t] = lastgaelam = (
+                            delta
+                            + self.gamma
+                            * self.gae_lambda
+                            * nextnonterminal
+                            * lastgaelam
+                        )
+                    returns_trajs = advantages_trajs + values_trajs
+
+                # k for environment step
+                obs_k = einops.rearrange(
+                    torch.tensor(obs_trajs).float().to(self.device),
+                    "s e h d -> (s e) h d",
+                )
+                samples_k = einops.rearrange(
+                    torch.tensor(samples_trajs).float().to(self.device),
+                    "s e h d -> (s e) h d",
+                )
+                returns_k = (
+                    torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+                )
+                values_k = (
+                    torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+                )
+                advantages_k = (
+                    torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+                )
+                logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+
+                # Update policy and critic
+                total_steps = self.n_steps * self.n_envs
+                inds_k = np.arange(total_steps)
+                clipfracs = []
+                for update_epoch in range(self.update_epochs):
+
+                    # for each epoch, go through all data in batches
+                    flag_break = False
+                    np.random.shuffle(inds_k)
+                    num_batch = max(1, total_steps // self.batch_size)  # skip last ones
+                    for batch in range(num_batch):
+                        start = batch * self.batch_size
+                        end = start + self.batch_size
+                        inds_b = inds_k[start:end]  # b for batch
+                        obs_b = obs_k[inds_b]
+                        samples_b = samples_k[inds_b]
+                        returns_b = returns_k[inds_b]
+                        values_b = values_k[inds_b]
+                        advantages_b = advantages_k[inds_b]
+                        logprobs_b = logprobs_k[inds_b]
+
+                        # get loss
+                        (
+                            pg_loss,
+                            v_loss,
+                            clipfrac,
+                            approx_kl,
+                            ratio,
+                            bc_loss,
+                        ) = self.model.loss(
+                            obs_b,
+                            samples_b,
+                            returns_b,
+                            values_b,
+                            advantages_b,
+                            logprobs_b,
+                            use_bc_loss=self.use_bc_loss,
+                            reward_horizon=self.reward_horizon,
+                        )
+                        loss = (
+                            pg_loss
+                            + v_loss * self.vf_coef
+                            + bc_loss * self.bc_loss_coeff
+                        )
+                        clipfracs += [clipfrac]
+
+                        # update policy and critic
+                        self.actor_optimizer.zero_grad()
+                        self.critic_optimizer.zero_grad()
+                        loss.backward()
+                        if self.itr >= self.n_critic_warmup_itr:
+                            if self.max_grad_norm is not None:
+                                torch.nn.utils.clip_grad_norm_(
+                                    self.model.actor_ft.parameters(), self.max_grad_norm
+                                )
+                            self.actor_optimizer.step()
+                        self.critic_optimizer.step()
+                        log.info(
+                            f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}"
+                        )
+
+                        # Stop gradient update if KL difference reaches target
+                        if self.target_kl is not None and approx_kl > self.target_kl:
+                            flag_break = True
+                            break
+                    if flag_break:
+                        break
+
+                # Explained variation of future rewards using value function
+                y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy()
+                var_y = np.var(y_true)
+                explained_var = (
+                    np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
+                )
+
+            # Plot state trajectories
+            if (
+                self.itr % self.render_freq == 0
+                and self.n_render > 0
+                and self.traj_plotter is not None
+            ):
+                self.traj_plotter(
+                    obs_full_trajs=obs_full_trajs,
+                    n_render=self.n_render,
+                    max_episode_steps=self.max_episode_steps,
+                    render_dir=self.render_dir,
+                    itr=self.itr,
+                )
+
+            # Update lr
+            if self.itr >= self.n_critic_warmup_itr:
+                self.actor_lr_scheduler.step()
+            self.critic_lr_scheduler.step()
+
+            # Save model
+            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
+                self.save_model()
+
+            # Log loss and save metrics
+            run_results.append(
+                {
+                    "itr": self.itr,
+                }
+            )
+            if self.save_trajs:
+                run_results[-1]["obs_full_trajs"] = obs_full_trajs
+                run_results[-1]["obs_trajs"] = obs_trajs
+                run_results[-1]["action_trajs"] = samples_trajs
+                run_results[-1]["chains_trajs"] = chains_trajs
+                run_results[-1]["reward_trajs"] = reward_trajs
+            if self.itr % self.log_freq == 0:
+                if eval_mode:
+                    log.info(
+                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "success rate - eval": success_rate,
+                                "avg episode reward - eval": avg_episode_reward,
+                                "avg best reward - eval": avg_best_reward,
+                                "num episode - eval": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=False,
+                        )
+                    run_results[-1]["eval_success_rate"] = success_rate
+                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
+                    run_results[-1]["eval_best_reward"] = avg_best_reward
+                else:
+                    log.info(
+                        f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{timer():8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "loss": loss,
+                                "pg loss": pg_loss,
+                                "value loss": v_loss,
+                                "approx kl": approx_kl,
+                                "ratio": ratio,
+                                "clipfrac": np.mean(clipfracs),
+                                "explained variance": explained_var,
+                                "avg episode reward - train": avg_episode_reward,
+                            },
+                            step=self.itr,
+                            commit=True,
+                        )
+                    run_results[-1]["loss"] = loss
+                    run_results[-1]["pg_loss"] = pg_loss
+                    run_results[-1]["value_loss"] = v_loss
+                    run_results[-1]["approx_kl"] = approx_kl
+                    run_results[-1]["ratio"] = ratio
+                    run_results[-1]["clip_frac"] = np.mean(clipfracs)
+                    run_results[-1]["explained_variance"] = explained_var
+                    run_results[-1]["train_episode_reward"] = avg_episode_reward
+                run_results[-1]["time"] = timer()
+                with open(self.result_path, "wb") as f:
+                    pickle.dump(run_results, f)
+            self.itr += 1
diff --git a/agent/finetune/train_ppo_gaussian_agent.py b/agent/finetune/train_ppo_gaussian_agent.py
new file mode 100644
index 0000000..8803e10
--- /dev/null
+++ b/agent/finetune/train_ppo_gaussian_agent.py
@@ -0,0 +1,404 @@
+"""
+PPO training for Gaussian/GMM policy.
+
+"""
+
+import os
+import pickle
+import einops
+import numpy as np
+import torch
+import logging
+import wandb
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from agent.finetune.train_ppo_agent import TrainPPOAgent
+
+
+class TrainPPOGaussianAgent(TrainPPOAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+    def run(self):
+
+        # Start training loop
+        timer = Timer()
+        run_results = []
+        last_itr_eval = False
+        done_venv = np.zeros((1, self.n_envs))
+        while self.itr < self.n_train_itr:
+
+            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
+            options_venv = [{} for _ in range(self.n_envs)]
+            if self.itr % self.render_freq == 0 and self.render_video:
+                for env_ind in range(self.n_render):
+                    options_venv[env_ind]["video_path"] = os.path.join(
+                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
+                    )
+
+            # Define train or eval - all envs restart
+            eval_mode = self.itr % self.val_freq == 0 and not self.force_train
+            self.model.eval() if eval_mode else self.model.train()
+            last_itr_eval = eval_mode
+
+            # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
+            dones_trajs = np.empty((0, self.n_envs))
+            firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
+            if self.reset_at_iteration or eval_mode or last_itr_eval:
+                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
+                firsts_trajs[0] = 1
+            else:
+                firsts_trajs[0] = (
+                    done_venv  # if done at the end of last iteration, then the envs are just reset
+                )
+
+            # Holder
+            obs_trajs = np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
+            samples_trajs = np.empty(
+                (
+                    0,
+                    self.n_envs,
+                    self.horizon_steps,
+                    self.action_dim,
+                )
+            )
+            reward_trajs = np.empty((0, self.n_envs))
+            obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
+            obs_full_trajs = np.vstack(
+                (obs_full_trajs, prev_obs_venv[None].squeeze(2))
+            )  # remove cond_step dim
+
+            # Collect a set of trajectories from env
+            for step in range(self.n_steps):
+                if step % 10 == 0:
+                    print(f"Processed step {step} of {self.n_steps}")
+
+                # Select action
+                with torch.no_grad():
+                    samples = self.model(
+                        cond=torch.from_numpy(prev_obs_venv).float().to(self.device),
+                        deterministic=eval_mode,
+                    )
+                    output_venv = samples.cpu().numpy()
+                action_venv = output_venv[:, : self.act_steps, : self.action_dim]
+                obs_trajs = np.vstack((obs_trajs, prev_obs_venv[None]))
+                samples_trajs = np.vstack((samples_trajs, output_venv[None]))
+
+                # Apply multi-step action
+                obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
+                    action_venv
+                )
+                if self.save_full_observations:
+                    obs_full_venv = np.vstack(
+                        [info["full_obs"][None] for info in info_venv]
+                    )  # n_envs x n_act_steps x obs_dim
+                    obs_full_trajs = np.vstack(
+                        (obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
+                    )
+                reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
+                dones_trajs = np.vstack((dones_trajs, done_venv[None]))
+                firsts_trajs[step + 1] = done_venv
+                prev_obs_venv = obs_venv
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
+            episodes_start_end = []
+            for env_ind in range(self.n_envs):
+                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
+                for i in range(len(env_steps) - 1):
+                    start = env_steps[i]
+                    end = env_steps[i + 1]
+                    if end - start > 1:
+                        episodes_start_end.append((env_ind, start, end - 1))
+            if len(episodes_start_end) > 0:
+                reward_trajs_split = [
+                    reward_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                num_episode_finished = len(reward_trajs_split)
+                episode_reward = np.array(
+                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
+                )
+                if (
+                    self.furniture_sparse_reward
+                ):  # only for furniture tasks, where reward only occurs in one env step
+                    episode_best_reward = episode_reward
+                else:
+                    episode_best_reward = np.array(
+                        [
+                            np.max(reward_traj) / self.act_steps
+                            for reward_traj in reward_trajs_split
+                        ]
+                    )
+                avg_episode_reward = np.mean(episode_reward)
+                avg_best_reward = np.mean(episode_best_reward)
+                success_rate = np.mean(
+                    episode_best_reward >= self.best_reward_threshold_for_success
+                )
+            else:
+                episode_reward = np.array([])
+                num_episode_finished = 0
+                avg_episode_reward = 0
+                avg_best_reward = 0
+                success_rate = 0
+                log.info("[WARNING] No episode completed within the iteration!")
+
+            # Update
+            if not eval_mode:
+                with torch.no_grad():
+                    # Calculate value and logprobs - split into batches to prevent out of memory
+                    obs_t = einops.rearrange(
+                        torch.from_numpy(obs_trajs).float().to(self.device),
+                        "s e h d -> (s e) h d",
+                    )
+                    obs_ts = torch.split(obs_t, self.logprob_batch_size, dim=0)
+                    values_trajs = np.empty((0, self.n_envs))
+                    for obs in obs_ts:
+                        values = self.model.critic(obs).cpu().numpy().flatten()
+                        values_trajs = np.vstack(
+                            (values_trajs, values.reshape(-1, self.n_envs))
+                        )
+                    samples_t = einops.rearrange(
+                        torch.from_numpy(samples_trajs).float().to(self.device),
+                        "s e h d -> (s e) h d",
+                    )
+                    samples_ts = torch.split(samples_t, self.logprob_batch_size, dim=0)
+                    logprobs_trajs = np.empty((0))
+                    for obs_t, samples_t in zip(obs_ts, samples_ts):
+                        logprobs = (
+                            self.model.get_logprobs(obs_t, samples_t)[0].cpu().numpy()
+                        )
+                        logprobs_trajs = np.concatenate(
+                            (
+                                logprobs_trajs,
+                                logprobs.reshape(-1),
+                            )
+                        )
+
+                    # normalize reward with running variance if specified
+                    if self.reward_scale_running:
+                        reward_trajs_transpose = self.running_reward_scaler(
+                            reward=reward_trajs.T, first=firsts_trajs[:-1].T
+                        )
+                        reward_trajs = reward_trajs_transpose.T
+
+                    # bootstrap value with GAE if not done - apply reward scaling with constant if specified
+                    obs_venv_ts = torch.from_numpy(obs_venv).float().to(self.device)
+                    with torch.no_grad():
+                        next_value = (
+                            self.model.critic(obs_venv_ts).reshape(1, -1).cpu().numpy()
+                        )
+                    advantages_trajs = np.zeros_like(reward_trajs)
+                    lastgaelam = 0
+                    for t in reversed(range(self.n_steps)):
+                        if t == self.n_steps - 1:
+                            nextnonterminal = 1.0 - done_venv
+                            nextvalues = next_value
+                        else:
+                            nextnonterminal = 1.0 - dones_trajs[t + 1]
+                            nextvalues = values_trajs[t + 1]
+                        # delta = r + gamma*V(st+1) - V(st)
+                        delta = (
+                            reward_trajs[t] * self.reward_scale_const
+                            + self.gamma * nextvalues * nextnonterminal
+                            - values_trajs[t]
+                        )
+                        # A = delta_t + gamma*lamdba*delta_{t+1} + ...
+                        advantages_trajs[t] = lastgaelam = (
+                            delta
+                            + self.gamma
+                            * self.gae_lambda
+                            * nextnonterminal
+                            * lastgaelam
+                        )
+                    returns_trajs = advantages_trajs + values_trajs
+
+                # k for environment step
+                obs_k = einops.rearrange(
+                    torch.tensor(obs_trajs).float().to(self.device),
+                    "s e h d -> (s e) h d",
+                )
+                samples_k = einops.rearrange(
+                    torch.tensor(samples_trajs).float().to(self.device),
+                    "s e h d -> (s e) h d",
+                )
+                returns_k = (
+                    torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+                )
+                values_k = (
+                    torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+                )
+                advantages_k = (
+                    torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+                )
+                logprobs_k = (
+                    torch.tensor(logprobs_trajs).float().to(self.device).reshape(-1)
+                )
+
+                # Update policy and critic
+                total_steps = self.n_steps * self.n_envs
+                inds_k = np.arange(total_steps)
+                clipfracs = []
+                for update_epoch in range(self.update_epochs):
+
+                    # for each epoch, go through all data in batches
+                    flag_break = False
+                    np.random.shuffle(inds_k)
+                    num_batch = max(1, total_steps // self.batch_size)  # skip last ones
+                    for batch in range(num_batch):
+                        start = batch * self.batch_size
+                        end = start + self.batch_size
+                        inds_b = inds_k[start:end]  # b for batch
+                        obs_b = obs_k[inds_b]
+                        samples_b = samples_k[inds_b]
+                        returns_b = returns_k[inds_b]
+                        values_b = values_k[inds_b]
+                        advantages_b = advantages_k[inds_b]
+                        logprobs_b = logprobs_k[inds_b]
+
+                        # get loss
+                        (
+                            pg_loss,
+                            entropy_loss,
+                            v_loss,
+                            clipfrac,
+                            approx_kl,
+                            ratio,
+                            bc_loss,
+                            std,
+                        ) = self.model.loss(
+                            obs_b,
+                            samples_b,
+                            returns_b,
+                            values_b,
+                            advantages_b,
+                            logprobs_b,
+                            use_bc_loss=self.use_bc_loss,
+                        )
+                        loss = (
+                            pg_loss
+                            + entropy_loss * self.ent_coef
+                            + v_loss * self.vf_coef
+                            + bc_loss * self.bc_loss_coeff
+                        )
+                        clipfracs += [clipfrac]
+
+                        # update policy and critic
+                        self.actor_optimizer.zero_grad()
+                        self.critic_optimizer.zero_grad()
+                        loss.backward()
+                        if self.itr >= self.n_critic_warmup_itr:
+                            if self.max_grad_norm is not None:
+                                torch.nn.utils.clip_grad_norm_(
+                                    self.model.actor_ft.parameters(), self.max_grad_norm
+                                )
+                            self.actor_optimizer.step()
+                        self.critic_optimizer.step()
+                        log.info(
+                            f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}"
+                        )
+
+                        # Stop gradient update if KL difference reaches target
+                        if self.target_kl is not None and approx_kl > self.target_kl:
+                            flag_break = True
+                            break
+                    if flag_break:
+                        break
+
+                # Explained variation of future rewards using value function
+                y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy()
+                var_y = np.var(y_true)
+                explained_var = (
+                    np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
+                )
+
+            # Plot state trajectories
+            if (
+                self.itr % self.render_freq == 0
+                and self.n_render > 0
+                and self.traj_plotter is not None
+            ):
+                self.traj_plotter(
+                    obs_full_trajs=obs_full_trajs,
+                    n_render=self.n_render,
+                    max_episode_steps=self.max_episode_steps,
+                    render_dir=self.render_dir,
+                    itr=self.itr,
+                )
+
+            # Update lr
+            if self.itr >= self.n_critic_warmup_itr:
+                self.actor_lr_scheduler.step()
+            self.critic_lr_scheduler.step()
+
+            # Save model
+            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
+                self.save_model()
+
+            # Log loss and save metrics
+            run_results.append(
+                {
+                    "itr": self.itr,
+                }
+            )
+            if self.save_trajs:
+                run_results[-1]["obs_full_trajs"] = obs_full_trajs
+                run_results[-1]["obs_trajs"] = obs_trajs
+                run_results[-1]["action_trajs"] = samples_trajs
+                run_results[-1]["reward_trajs"] = reward_trajs
+            if self.itr % self.log_freq == 0:
+                time = timer()
+                if eval_mode:
+                    log.info(
+                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "success rate - eval": success_rate,
+                                "avg episode reward - eval": avg_episode_reward,
+                                "avg best reward - eval": avg_best_reward,
+                                "num episode - eval": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=False,
+                        )
+                    run_results[-1]["eval_success_rate"] = success_rate
+                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
+                    run_results[-1]["eval_best_reward"] = avg_best_reward
+                else:
+                    log.info(
+                        f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | ent {-entropy_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "loss": loss,
+                                "pg loss": pg_loss,
+                                "value loss": v_loss,
+                                "entropy": -entropy_loss,
+                                "std": std,
+                                "approx kl": approx_kl,
+                                "ratio": ratio,
+                                "clipfrac": np.mean(clipfracs),
+                                "explained variance": explained_var,
+                                "avg episode reward - train": avg_episode_reward,
+                                "num episode - train": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=True,
+                        )
+                    run_results[-1]["loss"] = loss
+                    run_results[-1]["pg_loss"] = pg_loss
+                    run_results[-1]["value_loss"] = v_loss
+                    run_results[-1]["entropy_loss"] = entropy_loss
+                    run_results[-1]["approx_kl"] = approx_kl
+                    run_results[-1]["ratio"] = ratio
+                    run_results[-1]["clip_frac"] = np.mean(clipfracs)
+                    run_results[-1]["explained_variance"] = explained_var
+                    run_results[-1]["train_episode_reward"] = avg_episode_reward
+                run_results[-1]["time"] = time
+                with open(self.result_path, "wb") as f:
+                    pickle.dump(run_results, f)
+            self.itr += 1
diff --git a/agent/finetune/train_ppo_gaussian_img_agent.py b/agent/finetune/train_ppo_gaussian_img_agent.py
new file mode 100644
index 0000000..21251dd
--- /dev/null
+++ b/agent/finetune/train_ppo_gaussian_img_agent.py
@@ -0,0 +1,443 @@
+"""
+PPO training for Gaussian/GMM policy with pixel observations.
+
+"""
+
+import os
+import pickle
+import einops
+import numpy as np
+import torch
+import logging
+import wandb
+import math
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from agent.finetune.train_ppo_gaussian_agent import TrainPPOGaussianAgent
+from model.common.modules import RandomShiftsAug
+
+
+class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+        # Image randomization
+        self.augment = cfg.train.augment
+        if self.augment:
+            self.aug = RandomShiftsAug(pad=4)
+
+        # Set obs dim -  we will save the different obs in batch in a dict
+        shape_meta = cfg.shape_meta
+        self.obs_dims = {k: shape_meta.obs[k]["shape"] for k in shape_meta.obs.keys()}
+
+        # Gradient accumulation to deal with large GPU RAM usage
+        self.grad_accumulate = cfg.train.grad_accumulate
+
+    def run(self):
+
+        # Start training loop
+        timer = Timer()
+        run_results = []
+        last_itr_eval = False
+        done_venv = np.zeros((1, self.n_envs))
+        while self.itr < self.n_train_itr:
+
+            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
+            options_venv = [{} for _ in range(self.n_envs)]
+            if self.itr % self.render_freq == 0 and self.render_video:
+                for env_ind in range(self.n_render):
+                    options_venv[env_ind]["video_path"] = os.path.join(
+                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
+                    )
+
+            # Define train or eval - all envs restart
+            eval_mode = self.itr % self.val_freq == 0 and not self.force_train
+            self.model.eval() if eval_mode else self.model.train()
+            last_itr_eval = eval_mode
+
+            # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
+            dones_trajs = np.empty((0, self.n_envs))
+            firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
+            if self.reset_at_iteration or eval_mode or last_itr_eval:
+                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
+                firsts_trajs[0] = 1
+            else:
+                firsts_trajs[0] = (
+                    done_venv  # if done at the end of last iteration, then the envs are just reset
+                )
+
+            # Holder
+            obs_trajs = {
+                k: np.empty((0, self.n_envs, self.n_cond_step, *self.obs_dims[k]))
+                for k in self.obs_dims
+            }
+            samples_trajs = np.empty(
+                (
+                    0,
+                    self.n_envs,
+                    self.horizon_steps,
+                    self.action_dim,
+                )
+            )
+            reward_trajs = np.empty((0, self.n_envs))
+
+            # Collect a set of trajectories from env
+            for step in range(self.n_steps):
+                if step % 10 == 0:
+                    print(f"Processed step {step} of {self.n_steps}")
+
+                # Select action
+                with torch.no_grad():
+                    cond = {
+                        key: torch.from_numpy(prev_obs_venv[key])
+                        .float()
+                        .to(self.device)
+                        for key in self.obs_dims.keys()
+                    }  # batch each type of obs and put into dict
+                    samples = self.model(
+                        cond=cond,
+                        deterministic=eval_mode,
+                    )
+                    output_venv = samples.cpu().numpy()
+                action_venv = output_venv[:, : self.act_steps]
+
+                # Apply multi-step action
+                obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
+                    action_venv
+                )
+                for k in obs_trajs.keys():
+                    obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
+                samples_trajs = np.vstack((samples_trajs, output_venv[None]))
+                reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
+                dones_trajs = np.vstack((dones_trajs, done_venv[None]))
+                firsts_trajs[step + 1] = done_venv
+                prev_obs_venv = obs_venv
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
+            episodes_start_end = []
+            for env_ind in range(self.n_envs):
+                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
+                for i in range(len(env_steps) - 1):
+                    start = env_steps[i]
+                    end = env_steps[i + 1]
+                    if end - start > 1:
+                        episodes_start_end.append((env_ind, start, end - 1))
+            if len(episodes_start_end) > 0:
+                reward_trajs_split = [
+                    reward_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                num_episode_finished = len(reward_trajs_split)
+                episode_reward = np.array(
+                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
+                )
+                episode_best_reward = np.array(
+                    [
+                        np.max(reward_traj) / self.act_steps
+                        for reward_traj in reward_trajs_split
+                    ]
+                )
+                avg_episode_reward = np.mean(episode_reward)
+                avg_best_reward = np.mean(episode_best_reward)
+                success_rate = np.mean(
+                    episode_best_reward >= self.best_reward_threshold_for_success
+                )
+            else:
+                episode_reward = np.array([])
+                num_episode_finished = 0
+                avg_episode_reward = 0
+                avg_best_reward = 0
+                success_rate = 0
+                log.info("[WARNING] No episode completed within the iteration!")
+
+            # Update
+            if not eval_mode:
+                with torch.no_grad():
+                    # apply image randomization
+                    obs_trajs["rgb"] = (
+                        torch.from_numpy(obs_trajs["rgb"]).float().to(self.device)
+                    )
+                    obs_trajs["state"] = (
+                        torch.from_numpy(obs_trajs["state"]).float().to(self.device)
+                    )
+                    if self.augment:
+                        rgb = einops.rearrange(
+                            obs_trajs["rgb"],
+                            "s e t c h w -> (s e t) c h w",
+                        )
+                        rgb = self.aug(rgb)
+                        obs_trajs["rgb"] = einops.rearrange(
+                            rgb,
+                            "(s e t) c h w -> s e t c h w",
+                            s=self.n_steps,
+                            e=self.n_envs,
+                        )
+
+                    # Calculate value and logprobs - split into batches to prevent out of memory
+                    num_split = math.ceil(
+                        self.n_envs * self.n_steps / self.logprob_batch_size
+                    )
+                    obs_ts = [{} for _ in range(num_split)]
+                    for k in obs_trajs.keys():
+                        obs_k = einops.rearrange(
+                            obs_trajs[k],
+                            "s e ... -> (s e) ...",
+                        )
+                        obs_ts_k = torch.split(obs_k, self.logprob_batch_size, dim=0)
+                        for i, obs_t in enumerate(obs_ts_k):
+                            obs_ts[i][k] = obs_t
+                    values_trajs = np.empty((0, self.n_envs))
+                    for obs in obs_ts:
+                        values = (
+                            self.model.critic(obs, no_augment=True)
+                            .cpu()
+                            .numpy()
+                            .flatten()
+                        )
+                        values_trajs = np.vstack(
+                            (values_trajs, values.reshape(-1, self.n_envs))
+                        )
+                    samples_t = einops.rearrange(
+                        torch.from_numpy(samples_trajs).float().to(self.device),
+                        "s e h d -> (s e) h d",
+                    )
+                    samples_ts = torch.split(samples_t, self.logprob_batch_size, dim=0)
+                    logprobs_trajs = np.empty((0))
+                    for obs_t, samples_t in zip(obs_ts, samples_ts):
+                        logprobs = (
+                            self.model.get_logprobs(obs_t, samples_t)[0].cpu().numpy()
+                        )
+                        logprobs_trajs = np.concatenate(
+                            (
+                                logprobs_trajs,
+                                logprobs.reshape(-1),
+                            )
+                        )
+
+                    # normalize reward with running variance if specified
+                    if self.reward_scale_running:
+                        reward_trajs_transpose = self.running_reward_scaler(
+                            reward=reward_trajs.T, first=firsts_trajs[:-1].T
+                        )
+                        reward_trajs = reward_trajs_transpose.T
+
+                    # bootstrap value with GAE if not done - apply reward scaling with constant if specified
+                    obs_venv_ts = {
+                        key: torch.from_numpy(obs_venv[key]).float().to(self.device)
+                        for key in self.obs_dims.keys()
+                    }
+                    with torch.no_grad():
+                        next_value = (
+                            self.model.critic(obs_venv_ts, no_augment=True)
+                            .reshape(1, -1)
+                            .cpu()
+                            .numpy()
+                        )
+                    advantages_trajs = np.zeros_like(reward_trajs)
+                    lastgaelam = 0
+                    for t in reversed(range(self.n_steps)):
+                        if t == self.n_steps - 1:
+                            nextnonterminal = 1.0 - done_venv
+                            nextvalues = next_value
+                        else:
+                            nextnonterminal = 1.0 - dones_trajs[t + 1]
+                            nextvalues = values_trajs[t + 1]
+                        # delta = r + gamma*V(st+1) - V(st)
+                        delta = (
+                            reward_trajs[t] * self.reward_scale_const
+                            + self.gamma * nextvalues * nextnonterminal
+                            - values_trajs[t]
+                        )
+                        # A = delta_t + gamma*lamdba*delta_{t+1} + ...
+                        advantages_trajs[t] = lastgaelam = (
+                            delta
+                            + self.gamma
+                            * self.gae_lambda
+                            * nextnonterminal
+                            * lastgaelam
+                        )
+                    returns_trajs = advantages_trajs + values_trajs
+
+                # k for environment step
+                obs_k = {
+                    k: einops.rearrange(
+                        obs_trajs[k],
+                        "s e ... -> (s e) ...",
+                    )
+                    for k in obs_trajs.keys()
+                }
+                samples_k = einops.rearrange(
+                    torch.tensor(samples_trajs).float().to(self.device),
+                    "s e h d -> (s e) h d",
+                )
+                returns_k = (
+                    torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+                )
+                values_k = (
+                    torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+                )
+                advantages_k = (
+                    torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+                )
+                logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+
+                # Update policy and critic
+                total_steps = self.n_steps * self.n_envs
+                inds_k = np.arange(total_steps)
+                clipfracs = []
+                for update_epoch in range(self.update_epochs):
+
+                    # for each epoch, go through all data in batches
+                    flag_break = False
+                    np.random.shuffle(inds_k)
+                    num_batch = max(1, total_steps // self.batch_size)  # skip last ones
+                    for batch in range(num_batch):
+                        start = batch * self.batch_size
+                        end = start + self.batch_size
+                        inds_b = inds_k[start:end]  # b for batch
+                        obs_b = {k: obs_k[k][inds_b] for k in obs_k.keys()}
+                        samples_b = samples_k[inds_b]
+                        returns_b = returns_k[inds_b]
+                        values_b = values_k[inds_b]
+                        advantages_b = advantages_k[inds_b]
+                        logprobs_b = logprobs_k[inds_b]
+
+                        # get loss
+                        (
+                            pg_loss,
+                            entropy_loss,
+                            v_loss,
+                            clipfrac,
+                            approx_kl,
+                            ratio,
+                            bc_loss,
+                            std,
+                        ) = self.model.loss(
+                            obs_b,
+                            samples_b,
+                            returns_b,
+                            values_b,
+                            advantages_b,
+                            logprobs_b,
+                            use_bc_loss=self.use_bc_loss,
+                        )
+                        loss = (
+                            pg_loss
+                            + entropy_loss * self.ent_coef
+                            + v_loss * self.vf_coef
+                            + bc_loss * self.bc_loss_coeff
+                        )
+                        clipfracs += [clipfrac]
+
+                        # update policy and critic
+                        loss.backward()
+                        if (batch + 1) % self.grad_accumulate == 0:
+                            if self.itr >= self.n_critic_warmup_itr:
+                                if self.max_grad_norm is not None:
+                                    torch.nn.utils.clip_grad_norm_(
+                                        self.model.actor_ft.parameters(),
+                                        self.max_grad_norm,
+                                    )
+                                self.actor_optimizer.step()
+                            self.critic_optimizer.step()
+                            self.actor_optimizer.zero_grad()
+                            self.critic_optimizer.zero_grad()
+                            log.info(f"run grad update at batch {batch}")
+                            log.info(
+                                f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}"
+                            )
+
+                            # Stop gradient update if KL difference reaches target
+                            if (
+                                self.target_kl is not None
+                                and approx_kl > self.target_kl
+                                and self.itr >= self.n_critic_warmup_itr
+                            ):
+                                flag_break = True
+                                break
+                    if flag_break:
+                        break
+
+                # Explained variation of future rewards using value function
+                y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy()
+                var_y = np.var(y_true)
+                explained_var = (
+                    np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
+                )
+
+            # Update lr
+            if self.itr >= self.n_critic_warmup_itr:
+                self.actor_lr_scheduler.step()
+            self.critic_lr_scheduler.step()
+
+            # Save model
+            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
+                self.save_model()
+
+            # Log loss and save metrics
+            run_results.append(
+                {
+                    "itr": self.itr,
+                }
+            )
+            if self.itr % self.log_freq == 0:
+                if eval_mode:
+                    log.info(
+                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "success rate - eval": success_rate,
+                                "avg episode reward - eval": avg_episode_reward,
+                                "avg best reward - eval": avg_best_reward,
+                                "num episode - eval": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=False,
+                        )
+                    run_results[-1]["eval_success_rate"] = success_rate
+                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
+                    run_results[-1]["eval_best_reward"] = avg_best_reward
+                else:
+                    log.info(
+                        f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{timer():8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "loss": loss,
+                                "pg loss": pg_loss,
+                                "value loss": v_loss,
+                                "bc loss": bc_loss,
+                                "std": std,
+                                "approx kl": approx_kl,
+                                "ratio": ratio,
+                                "clipfrac": np.mean(clipfracs),
+                                "explained variance": explained_var,
+                                "avg episode reward - train": avg_episode_reward,
+                                "num episode - train": num_episode_finished,
+                                "actor lr": self.actor_optimizer.param_groups[0]["lr"],
+                                "critic lr": self.critic_optimizer.param_groups[0][
+                                    "lr"
+                                ],
+                            },
+                            step=self.itr,
+                            commit=True,
+                        )
+                    run_results[-1]["loss"] = loss
+                    run_results[-1]["pg_loss"] = pg_loss
+                    run_results[-1]["value_loss"] = v_loss
+                    run_results[-1]["bc_loss"] = bc_loss
+                    run_results[-1]["std"] = std
+                    run_results[-1]["approx_kl"] = approx_kl
+                    run_results[-1]["ratio"] = ratio
+                    run_results[-1]["clip_frac"] = np.mean(clipfracs)
+                    run_results[-1]["explained_variance"] = explained_var
+                    run_results[-1]["train_episode_reward"] = avg_episode_reward
+                run_results[-1]["time"] = timer()
+                with open(self.result_path, "wb") as f:
+                    pickle.dump(run_results, f)
+            self.itr += 1
diff --git a/agent/finetune/train_qsm_diffusion_agent.py b/agent/finetune/train_qsm_diffusion_agent.py
new file mode 100644
index 0000000..3ddacd2
--- /dev/null
+++ b/agent/finetune/train_qsm_diffusion_agent.py
@@ -0,0 +1,316 @@
+"""
+QSM (Q-Score Matching) for diffusion policy.
+
+"""
+
+import os
+import pickle
+import einops
+import numpy as np
+import torch
+import logging
+import wandb
+from copy import deepcopy
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from collections import deque
+from agent.finetune.train_agent import TrainAgent
+from util.scheduler import CosineAnnealingWarmupRestarts
+
+
+class TrainQSMDiffusionAgent(TrainAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
+        self.gamma = cfg.train.gamma
+
+        # Wwarm up period for critic before actor updates
+        self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
+
+        # Optimizer
+        self.actor_optimizer = torch.optim.AdamW(
+            self.model.actor.parameters(),
+            lr=cfg.train.actor_lr,
+            weight_decay=cfg.train.actor_weight_decay,
+        )
+        self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.actor_optimizer,
+            first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.actor_lr,
+            min_lr=cfg.train.actor_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+        self.critic_optimizer = torch.optim.AdamW(
+            self.model.critic_q.parameters(),
+            lr=cfg.train.critic_lr,
+            weight_decay=cfg.train.critic_weight_decay,
+        )
+        self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.critic_optimizer,
+            first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.critic_lr,
+            min_lr=cfg.train.critic_lr_scheduler.min_lr,
+            warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+
+        # Buffer size
+        self.buffer_size = cfg.train.buffer_size
+
+        # Scaling reward
+        self.scale_reward_factor = cfg.train.scale_reward_factor
+
+        # Updates
+        self.replay_ratio = cfg.train.replay_ratio
+        self.critic_tau = cfg.train.critic_tau
+        self.q_grad_coeff = cfg.train.q_grad_coeff
+
+    def run(self):
+
+        # make a FIFO replay buffer for obs, action, and reward
+        obs_buffer = deque(maxlen=self.buffer_size)
+        action_buffer = deque(maxlen=self.buffer_size)
+        next_obs_buffer = deque(maxlen=self.buffer_size)
+        reward_buffer = deque(maxlen=self.buffer_size)
+        done_buffer = deque(maxlen=self.buffer_size)
+        first_buffer = deque(maxlen=self.buffer_size)
+
+        # Start training loop
+        timer = Timer()
+        run_results = []
+        done_venv = np.zeros((1, self.n_envs))
+        while self.itr < self.n_train_itr:
+
+            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
+            options_venv = [{} for _ in range(self.n_envs)]
+            if self.itr % self.render_freq == 0 and self.render_video:
+                for env_ind in range(self.n_render):
+                    options_venv[env_ind]["video_path"] = os.path.join(
+                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
+                    )
+
+            # Define train or eval - all envs restart
+            eval_mode = self.itr % self.val_freq == 0 and not self.force_train
+            self.model.eval() if eval_mode else self.model.train()
+            firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
+
+            # Reset env at the beginning of an iteration
+            if self.reset_at_iteration or eval_mode or last_itr_eval:
+                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
+                firsts_trajs[0] = 1
+            else:
+                firsts_trajs[0] = (
+                    done_venv  # if done at the end of last iteration, then the envs are just reset
+                )
+            last_itr_eval = eval_mode
+            reward_trajs = np.empty((0, self.n_envs))
+
+            # Collect a set of trajectories from env
+            for step in range(self.n_steps):
+                if step % 10 == 0:
+                    print(f"Processed step {step} of {self.n_steps}")
+
+                # Select action
+                with torch.no_grad():
+                    samples = (
+                        self.model(
+                            cond=torch.from_numpy(prev_obs_venv)
+                            .float()
+                            .to(self.device),
+                            deterministic=eval_mode,
+                        )
+                        .cpu()
+                        .numpy()
+                    )  # n_env x horizon x act
+                action_venv = samples[:, : self.act_steps]
+
+                # Apply multi-step action
+                obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
+                    action_venv
+                )
+                reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
+
+                # add to buffer
+                obs_buffer.append(prev_obs_venv)
+                action_buffer.append(action_venv)
+                next_obs_buffer.append(obs_venv)
+                reward_buffer.append(reward_venv * self.scale_reward_factor)
+                done_buffer.append(done_venv)
+                first_buffer.append(firsts_trajs[step])
+
+                firsts_trajs[step + 1] = done_venv
+                prev_obs_venv = obs_venv
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
+            episodes_start_end = []
+            for env_ind in range(self.n_envs):
+                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
+                for i in range(len(env_steps) - 1):
+                    start = env_steps[i]
+                    end = env_steps[i + 1]
+                    if end - start > 1:
+                        episodes_start_end.append((env_ind, start, end - 1))
+            if len(episodes_start_end) > 0:
+                reward_trajs_split = [
+                    reward_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                num_episode_finished = len(reward_trajs_split)
+                episode_reward = np.array(
+                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
+                )
+                episode_best_reward = np.array(
+                    [
+                        np.max(reward_traj) / self.act_steps
+                        for reward_traj in reward_trajs_split
+                    ]
+                )
+                avg_episode_reward = np.mean(episode_reward)
+                avg_best_reward = np.mean(episode_best_reward)
+                success_rate = np.mean(
+                    episode_best_reward >= self.best_reward_threshold_for_success
+                )
+            else:
+                episode_reward = np.array([])
+                num_episode_finished = 0
+                avg_episode_reward = 0
+                avg_best_reward = 0
+                success_rate = 0
+                log.info("[WARNING] No episode completed within the iteration!")
+
+            # Update
+            if not eval_mode:
+
+                obs_trajs = np.array(deepcopy(obs_buffer))
+                action_trajs = np.array(deepcopy(action_buffer))
+                next_obs_trajs = np.array(deepcopy(next_obs_buffer))
+                reward_trajs = np.array(deepcopy(reward_buffer))
+                done_trajs = np.array(deepcopy(done_buffer))
+                first_trajs = np.array(deepcopy(first_buffer))
+
+                # flatten
+                obs_trajs = einops.rearrange(
+                    obs_trajs,
+                    "s e h d -> (s e) h d",
+                )
+                next_obs_trajs = einops.rearrange(
+                    next_obs_trajs,
+                    "s e h d -> (s e) h d",
+                )
+                action_trajs = einops.rearrange(
+                    action_trajs,
+                    "s e h d -> (s e) h d",
+                )
+                reward_trajs = reward_trajs.reshape(-1)
+                done_trajs = done_trajs.reshape(-1)
+                first_trajs = first_trajs.reshape(-1)
+
+                num_batch = int(
+                    self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
+                )
+
+                for _ in range(num_batch):
+
+                    # Sample batch
+                    inds = np.random.choice(len(obs_trajs), self.batch_size)
+                    obs_b = torch.from_numpy(obs_trajs[inds]).float().to(self.device)
+                    next_obs_b = (
+                        torch.from_numpy(next_obs_trajs[inds]).float().to(self.device)
+                    )
+                    actions_b = (
+                        torch.from_numpy(action_trajs[inds]).float().to(self.device)
+                    )
+                    reward_b = (
+                        torch.from_numpy(reward_trajs[inds]).float().to(self.device)
+                    )
+                    done_b = torch.from_numpy(done_trajs[inds]).float().to(self.device)
+
+                    # update critic q function
+                    critic_loss = self.model.loss_critic(
+                        obs_b, next_obs_b, actions_b, reward_b, done_b, self.gamma
+                    )
+                    self.critic_optimizer.zero_grad()
+                    critic_loss.backward()
+                    self.critic_optimizer.step()
+
+                    # update target q function
+                    self.model.update_target_critic(self.critic_tau)
+
+                    loss_critic = critic_loss.detach()
+
+                    # Update policy with collected trajectories
+                    loss = self.model.loss_actor(
+                        obs_b,
+                        actions_b,
+                        self.q_grad_coeff,
+                    )
+                    self.actor_optimizer.zero_grad()
+                    loss.backward()
+                    if self.itr >= self.n_critic_warmup_itr:
+                        if self.max_grad_norm is not None:
+                            torch.nn.utils.clip_grad_norm_(
+                                self.model.actor.parameters(), self.max_grad_norm
+                            )
+                        self.actor_optimizer.step()
+
+            # Update lr
+            self.actor_lr_scheduler.step()
+            self.critic_lr_scheduler.step()
+
+            # Save model
+            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
+                self.save_model()
+
+            # Log loss and save metrics
+            run_results.append(
+                {
+                    "itr": self.itr,
+                }
+            )
+            if self.itr % self.log_freq == 0:
+                if eval_mode:
+                    log.info(
+                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "success rate - eval": success_rate,
+                                "avg episode reward - eval": avg_episode_reward,
+                                "avg best reward - eval": avg_best_reward,
+                                "num episode - eval": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=False,
+                        )
+                    run_results[-1]["eval_success_rate"] = success_rate
+                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
+                    run_results[-1]["eval_best_reward"] = avg_best_reward
+                else:
+                    log.info(
+                        f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "loss": loss,
+                                "loss - critic": loss_critic,
+                                "avg episode reward - train": avg_episode_reward,
+                                "num episode - train": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=True,
+                        )
+                    run_results[-1]["loss"] = loss
+                    run_results[-1]["loss_critic"] = loss_critic
+                    run_results[-1]["train_episode_reward"] = avg_episode_reward
+                run_results[-1]["time"] = timer()
+                with open(self.result_path, "wb") as f:
+                    pickle.dump(run_results, f)
+            self.itr += 1
diff --git a/agent/finetune/train_rwr_diffusion_agent.py b/agent/finetune/train_rwr_diffusion_agent.py
new file mode 100644
index 0000000..30d0daa
--- /dev/null
+++ b/agent/finetune/train_rwr_diffusion_agent.py
@@ -0,0 +1,301 @@
+"""
+Reward-weighted regression (RWR) for diffusion policy.
+
+"""
+
+import os
+import pickle
+import numpy as np
+import torch
+import logging
+import wandb
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from agent.finetune.train_agent import TrainAgent
+from util.scheduler import CosineAnnealingWarmupRestarts
+
+
+class TrainRWRDiffusionAgent(TrainAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
+        self.gamma = cfg.train.gamma
+
+        # Build optimizer
+        self.optimizer = torch.optim.AdamW(
+            self.model.parameters(),
+            lr=cfg.train.lr,
+            weight_decay=cfg.train.weight_decay,
+        )
+        self.lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.optimizer,
+            first_cycle_steps=cfg.train.lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.lr,
+            min_lr=cfg.train.lr_scheduler.min_lr,
+            warmup_steps=cfg.train.lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+
+        # Reward exponential
+        self.beta = cfg.train.beta
+
+        # Max weight for AWR
+        self.max_reward_weight = cfg.train.max_reward_weight
+
+        # Updates
+        self.update_epochs = cfg.train.update_epochs
+
+    def run(self):
+
+        # Start training loop
+        timer = Timer()
+        run_results = []
+        done_venv = np.zeros((1, self.n_envs))
+        while self.itr < self.n_train_itr:
+
+            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
+            options_venv = [{} for _ in range(self.n_envs)]
+            if self.itr % self.render_freq == 0 and self.render_video:
+                for env_ind in range(self.n_render):
+                    options_venv[env_ind]["video_path"] = os.path.join(
+                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
+                    )
+
+            # Define train or eval - all envs restart
+            eval_mode = self.itr % self.val_freq == 0 and not self.force_train
+            self.model.eval() if eval_mode else self.model.train()
+            firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
+
+            # Reset env at the beginning of an iteration
+            if self.reset_at_iteration or eval_mode or last_itr_eval:
+                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
+                firsts_trajs[0] = 1
+            else:
+                firsts_trajs[0] = (
+                    done_venv  # if done at the end of last iteration, then the envs are just reset
+                )
+            last_itr_eval = eval_mode
+            reward_trajs = np.empty((0, self.n_envs))
+
+            # Holders
+            obs_trajs = np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
+            samples_trajs = np.empty(
+                (
+                    0,
+                    self.n_envs,
+                    self.horizon_steps,
+                    self.action_dim,
+                )
+            )
+
+            # Collect a set of trajectories from env
+            for step in range(self.n_steps):
+                if step % 10 == 0:
+                    print(f"Processed step {step} of {self.n_steps}")
+
+                # Select action
+                with torch.no_grad():
+                    samples = (
+                        self.model(
+                            cond=torch.from_numpy(prev_obs_venv)
+                            .float()
+                            .to(self.device),
+                            deterministic=eval_mode,
+                        )
+                        .cpu()
+                        .numpy()
+                    )  # n_env x horizon x act
+                action_venv = samples[:, : self.act_steps]
+                obs_trajs = np.vstack((obs_trajs, prev_obs_venv[None]))
+                samples_trajs = np.vstack((samples_trajs, samples[None]))
+
+                # Apply multi-step action
+                obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
+                    action_venv
+                )
+                reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
+                firsts_trajs[step + 1] = done_venv
+                prev_obs_venv = obs_venv
+
+            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
+            episodes_start_end = []
+            for env_ind in range(self.n_envs):
+                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
+                for i in range(len(env_steps) - 1):
+                    start = env_steps[i]
+                    end = env_steps[i + 1]
+                    if end - start > 1:
+                        episodes_start_end.append((env_ind, start, end - 1))
+            if len(episodes_start_end) > 0:
+                # Compute transitions for completed trajectories
+                obs_trajs_split = [
+                    obs_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                samples_trajs_split = [
+                    samples_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                reward_trajs_split = [
+                    reward_trajs[start : end + 1, env_ind]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                num_episode_finished = len(reward_trajs_split)
+
+                # Compute episode returns
+                discounted_reward_trajs_split = [
+                    [
+                        self.gamma**t * r
+                        for t, r in zip(
+                            list(range(end - start + 1)),
+                            reward_trajs[start : end + 1, env_ind],
+                        )
+                    ]
+                    for env_ind, start, end in episodes_start_end
+                ]
+                returns_trajs_split = [
+                    np.cumsum(y[::-1])[::-1] for y in discounted_reward_trajs_split
+                ]
+                returns_trajs_split = np.concatenate(returns_trajs_split)
+                episode_reward = np.array(
+                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
+                )
+                episode_best_reward = np.array(
+                    [
+                        np.max(reward_traj) / self.act_steps
+                        for reward_traj in reward_trajs_split
+                    ]
+                )
+                avg_episode_reward = np.mean(episode_reward)
+                avg_best_reward = np.mean(episode_best_reward)
+                success_rate = np.mean(
+                    episode_best_reward >= self.best_reward_threshold_for_success
+                )
+            else:
+                episode_reward = np.array([])
+                num_episode_finished = 0
+                avg_episode_reward = 0
+                avg_best_reward = 0
+                success_rate = 0
+                log.info("[WARNING] No episode completed within the iteration!")
+
+            # Update
+            if not eval_mode:
+
+                # Tensorize data and put them to device
+                # k for environment step
+                obs_k = (
+                    torch.tensor(np.concatenate(obs_trajs_split))
+                    .float()
+                    .to(self.device)
+                )
+
+                samples_k = (
+                    torch.tensor(np.concatenate(samples_trajs_split))
+                    .float()
+                    .to(self.device)
+                )
+
+                # Normalize reward
+                returns_trajs_split = (
+                    returns_trajs_split - np.mean(returns_trajs_split)
+                ) / (returns_trajs_split.std() + 1e-3)
+
+                rewards_k = (
+                    torch.tensor(returns_trajs_split)
+                    .float()
+                    .to(self.device)
+                    .reshape(-1)
+                )
+
+                rewards_k_scaled = torch.exp(self.beta * rewards_k)
+                rewards_k_scaled.clamp_(max=self.max_reward_weight)
+
+                # rewards_k_scaled = rewards_k_scaled / rewards_k_scaled.mean()
+
+                # Update policy and critic
+                total_steps = len(rewards_k_scaled)
+                inds_k = np.arange(total_steps)
+                for _ in range(self.update_epochs):
+
+                    # for each epoch, go through all data in batches
+                    np.random.shuffle(inds_k)
+                    num_batch = max(1, total_steps // self.batch_size)  # skip last ones
+                    for batch in range(num_batch):
+                        start = batch * self.batch_size
+                        end = start + self.batch_size
+                        inds_b = inds_k[start:end]  # b for batch
+                        obs_b = obs_k[inds_b]
+                        samples_b = samples_k[inds_b]
+                        rewards_b = rewards_k_scaled[inds_b]
+
+                        # Update policy with collected trajectories
+                        loss = self.model.loss(
+                            samples_b,
+                            obs_b,
+                            rewards_b,
+                        )
+                        self.optimizer.zero_grad()
+                        loss.backward()
+                        if self.max_grad_norm is not None:
+                            torch.nn.utils.clip_grad_norm_(
+                                self.model.parameters(), self.max_grad_norm
+                            )
+                        self.optimizer.step()
+
+            # Update lr
+            self.lr_scheduler.step()
+
+            # Save model
+            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
+                self.save_model()
+
+            # Log loss and save metrics
+            run_results.append(
+                {
+                    "itr": self.itr,
+                }
+            )
+            if self.itr % self.log_freq == 0:
+                if eval_mode:
+                    log.info(
+                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "success rate - eval": success_rate,
+                                "avg episode reward - eval": avg_episode_reward,
+                                "avg best reward - eval": avg_best_reward,
+                                "num episode - eval": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=False,
+                        )
+                    run_results[-1]["eval_success_rate"] = success_rate
+                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
+                    run_results[-1]["eval_best_reward"] = avg_best_reward
+                else:
+                    log.info(
+                        f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
+                    )
+                    if self.use_wandb:
+                        wandb.log(
+                            {
+                                "loss": loss,
+                                "avg episode reward - train": avg_episode_reward,
+                                "num episode - train": num_episode_finished,
+                            },
+                            step=self.itr,
+                            commit=True,
+                        )
+                    run_results[-1]["loss"] = loss
+                    run_results[-1]["train_episode_reward"] = avg_episode_reward
+                run_results[-1]["time"] = timer()
+                with open(self.result_path, "wb") as f:
+                    pickle.dump(run_results, f)
+            self.itr += 1
diff --git a/agent/pretrain/train_agent.py b/agent/pretrain/train_agent.py
new file mode 100644
index 0000000..a1a358a
--- /dev/null
+++ b/agent/pretrain/train_agent.py
@@ -0,0 +1,168 @@
+"""
+Parent pre-training agent class.
+
+"""
+
+import os
+import random
+import numpy as np
+from omegaconf import OmegaConf
+import torch
+import hydra
+import logging
+import wandb
+from copy import deepcopy
+
+log = logging.getLogger(__name__)
+from util.scheduler import CosineAnnealingWarmupRestarts
+
+DEVICE = "cuda:0"
+
+
+def to_device(x, device=DEVICE):
+    if torch.is_tensor(x):
+        return x.to(device)
+    elif type(x) is dict:
+        return {k: to_device(v, device) for k, v in x.items()}
+    else:
+        print(f"Unrecognized type in `to_device`: {type(x)}")
+
+
+def batch_to_device(batch, device="cuda:0"):
+    vals = [to_device(getattr(batch, field), device) for field in batch._fields]
+    return type(batch)(*vals)
+
+
+class EMA:
+    """
+    Empirical moving average
+
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.beta = cfg.decay
+
+    def update_model_average(self, ma_model, current_model):
+        for current_params, ma_params in zip(
+            current_model.parameters(), ma_model.parameters()
+        ):
+            old_weight, up_weight = ma_params.data, current_params.data
+            ma_params.data = self.update_average(old_weight, up_weight)
+
+    def update_average(self, old, new):
+        if old is None:
+            return new
+        return old * self.beta + (1 - self.beta) * new
+
+
+class PreTrainAgent:
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.seed = cfg.get("seed", 42)
+        random.seed(self.seed)
+        np.random.seed(self.seed)
+        torch.manual_seed(self.seed)
+
+        # Wandb
+        self.use_wandb = cfg.wandb is not None
+        if cfg.wandb is not None:
+            wandb.init(
+                entity=cfg.wandb.entity,
+                project=cfg.wandb.project,
+                name=cfg.wandb.run,
+                config=OmegaConf.to_container(cfg, resolve=True),
+            )
+
+        # Build model
+        self.model = hydra.utils.instantiate(cfg.model)
+        self.ema = EMA(cfg.ema)
+        self.ema_model = deepcopy(self.model)
+
+        # Training params
+        self.n_epochs = cfg.train.n_epochs
+        self.batch_size = cfg.train.batch_size
+        self.update_ema_freq = cfg.train.update_ema_freq
+        self.epoch_start_ema = cfg.train.epoch_start_ema
+        self.val_freq = cfg.train.get("val_freq", 100)
+
+        # Logging, checkpoints
+        self.logdir = cfg.logdir
+        self.checkpoint_dir = os.path.join(self.logdir, "checkpoint")
+        os.makedirs(self.checkpoint_dir, exist_ok=True)
+        self.log_freq = cfg.train.get("log_freq", 1)
+        self.save_model_freq = cfg.train.save_model_freq
+
+        # Build dataset
+        self.dataset_train = hydra.utils.instantiate(cfg.train_dataset)
+        self.dataloader_train = torch.utils.data.DataLoader(
+            self.dataset_train,
+            batch_size=self.batch_size,
+            num_workers=4 if self.dataset_train.device == "cpu" else 0,
+            shuffle=True,
+            pin_memory=True if self.dataset_train.device == "cpu" else False,
+        )
+        self.dataloader_val = None
+        if "train_split" in cfg.train and cfg.train.train_split < 1:
+            val_indices = self.dataset_train.set_train_val_split(cfg.train.train_split)
+            self.dataset_val = deepcopy(self.dataset_train)
+            self.dataset_val.set_indices(val_indices)
+            self.dataloader_val = torch.utils.data.DataLoader(
+                self.dataset_val,
+                batch_size=self.batch_size,
+                num_workers=4 if self.dataset_val.device == "cpu" else 0,
+                shuffle=True,
+                pin_memory=True if self.dataset_val.device == "cpu" else False,
+            )
+        self.optimizer = torch.optim.AdamW(
+            self.model.parameters(),
+            lr=cfg.train.learning_rate,
+            weight_decay=cfg.train.weight_decay,
+        )
+        self.lr_scheduler = CosineAnnealingWarmupRestarts(
+            self.optimizer,
+            first_cycle_steps=cfg.train.lr_scheduler.first_cycle_steps,
+            cycle_mult=1.0,
+            max_lr=cfg.train.learning_rate,
+            min_lr=cfg.train.lr_scheduler.min_lr,
+            warmup_steps=cfg.train.lr_scheduler.warmup_steps,
+            gamma=1.0,
+        )
+        self.reset_parameters()
+
+    def run(self):
+        raise NotImplementedError
+
+    def reset_parameters(self):
+        self.ema_model.load_state_dict(self.model.state_dict())
+
+    def step_ema(self):
+        if self.epoch < self.epoch_start_ema:
+            self.reset_parameters()
+            return
+        self.ema.update_model_average(self.ema_model, self.model)
+
+    def save_model(self):
+        """
+        saves model and ema to disk;
+        """
+        data = {
+            "epoch": self.epoch,
+            "model": self.model.state_dict(),
+            "ema": self.ema_model.state_dict(),
+        }
+        savepath = os.path.join(self.checkpoint_dir, f"state_{self.epoch}.pt")
+        torch.save(data, savepath)
+        log.info(f"Saved model to {savepath}")
+
+    def load(self, epoch):
+        """
+        loads model and ema from disk
+        """
+        loadpath = os.path.join(self.checkpoint_dir, f"state_{epoch}.pt")
+        data = torch.load(loadpath, weights_only=True)
+
+        self.epoch = data["epoch"]
+        self.model.load_state_dict(data["model"])
+        self.ema_model.load_state_dict(data["ema"])
diff --git a/agent/pretrain/train_diffusion_agent.py b/agent/pretrain/train_diffusion_agent.py
new file mode 100644
index 0000000..e7747b2
--- /dev/null
+++ b/agent/pretrain/train_diffusion_agent.py
@@ -0,0 +1,83 @@
+"""
+Pre-training diffusion policy
+
+"""
+
+import logging
+import wandb
+import numpy as np
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from agent.pretrain.train_agent import PreTrainAgent, batch_to_device
+
+
+class TrainDiffusionAgent(PreTrainAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+    def run(self):
+
+        timer = Timer()
+        self.epoch = 1
+        for _ in range(self.n_epochs):
+
+            # train
+            loss_train_epoch = []
+            for batch_train in self.dataloader_train:
+                if self.dataset_train.device == "cpu":
+                    batch_train = batch_to_device(batch_train)
+
+                self.model.train()
+                loss_train = self.model.loss(*batch_train)
+                loss_train.backward()
+                loss_train_epoch.append(loss_train.item())
+
+                self.optimizer.step()
+                self.optimizer.zero_grad()
+            loss_train = np.mean(loss_train_epoch)
+
+            # validate
+            loss_val_epoch = []
+            if self.dataloader_val is not None and self.epoch % self.val_freq == 0:
+                self.model.eval()
+                for batch_val in self.dataloader_val:
+                    if self.dataset_val.device == "cpu":
+                        batch_val = batch_to_device(batch_val)
+                    loss_val, infos_val = self.model.loss(*batch_val)
+                    loss_val_epoch.append(loss_val.item())
+                self.model.train()
+            loss_val = np.mean(loss_val_epoch) if len(loss_val_epoch) > 0 else None
+
+            # update lr
+            self.lr_scheduler.step()
+
+            # update ema
+            if self.epoch % self.update_ema_freq == 0:
+                self.step_ema()
+
+            # save model
+            if self.epoch % self.save_model_freq == 0 or self.epoch == self.n_epochs:
+                self.save_model()
+
+            # log loss
+            if self.epoch % self.log_freq == 0:
+                log.info(
+                    f"{self.epoch}: train loss {loss_train:8.4f} | t:{timer():8.4f}"
+                )
+                if self.use_wandb:
+                    if loss_val is not None:
+                        wandb.log(
+                            {"loss - val": loss_val}, step=self.epoch, commit=False
+                        )
+                    wandb.log(
+                        {
+                            "loss - train": loss_train,
+                        },
+                        step=self.epoch,
+                        commit=True,
+                    )
+
+            # count
+            self.epoch += 1
diff --git a/agent/pretrain/train_gaussian_agent.py b/agent/pretrain/train_gaussian_agent.py
new file mode 100644
index 0000000..7384785
--- /dev/null
+++ b/agent/pretrain/train_gaussian_agent.py
@@ -0,0 +1,99 @@
+"""
+Pre-training Gaussian/GMM policy
+
+"""
+
+import logging
+import wandb
+import numpy as np
+
+log = logging.getLogger(__name__)
+from util.timer import Timer
+from agent.pretrain.train_agent import PreTrainAgent, batch_to_device
+
+
+class TrainGaussianAgent(PreTrainAgent):
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+
+        # Entropy bonus - not used right now since using fixed_std
+        self.ent_coef = cfg.train.get("ent_coef", 0)
+
+    def run(self):
+
+        timer = Timer()
+        self.epoch = 1
+        for _ in range(self.n_epochs):
+
+            # train
+            loss_train_epoch = []
+            ent_train_epoch = []
+            for batch_train in self.dataloader_train:
+                if self.dataset_train.device == "cpu":
+                    batch_train = batch_to_device(batch_train)
+
+                self.model.train()
+                loss_train, infos_train = self.model.loss(
+                    *batch_train,
+                    ent_coef=self.ent_coef,
+                )
+                loss_train.backward()
+                loss_train_epoch.append(loss_train.item())
+                ent_train_epoch.append(infos_train["entropy"].item())
+
+                self.optimizer.step()
+                self.optimizer.zero_grad()
+            loss_train = np.mean(loss_train_epoch)
+            ent_train = np.mean(ent_train_epoch)
+
+            # validate
+            loss_val_epoch = []
+            if self.dataloader_val is not None and self.epoch % self.val_freq == 0:
+                self.model.eval()
+                for batch_val in self.dataloader_val:
+                    if self.dataset_val.device == "cpu":
+                        batch_val = batch_to_device(batch_val)
+                    loss_val, infos_val = self.model.loss(
+                        *batch_val,
+                        ent_coef=self.ent_coef,
+                    )
+                    loss_val_epoch.append(loss_val.item())
+                self.model.train()
+            loss_val = np.mean(loss_val_epoch) if len(loss_val_epoch) > 0 else None
+
+            # update lr
+            self.lr_scheduler.step()
+
+            # update ema
+            if self.epoch % self.update_ema_freq == 0:
+                self.step_ema()
+
+            # save model
+            if self.epoch % self.save_model_freq == 0 or self.epoch == self.n_epochs:
+                self.save_model()
+
+            # log loss
+            if self.epoch % self.log_freq == 0:
+                infos_str = " | ".join(
+                    [f"{key}: {val:8.4f}" for key, val in infos_train.items()]
+                )
+                log.info(
+                    f"{self.epoch}: train loss {loss_train:8.4f} | {infos_str} | t:{timer():8.4f}"
+                )
+                if self.use_wandb:
+                    if loss_val is not None:
+                        wandb.log(
+                            {"loss - val": loss_val}, step=self.epoch, commit=False
+                        )
+                    wandb.log(
+                        {
+                            "loss - train": loss_train,
+                            "entropy - train": ent_train,
+                        },
+                        step=self.epoch,
+                        commit=True,
+                    )
+
+            # count
+            self.epoch += 1
diff --git a/cfg/d3il/finetune/avoid_m1/ft_ppo_diffusion_mlp.yaml b/cfg/d3il/finetune/avoid_m1/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..f3ea689
--- /dev/null
+++ b/cfg/d3il/finetune/avoid_m1/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,119 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_m1_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m1/avoid_d56_r12_pre_diffusion_mlp_ta4_td20/2024-07-06_22-50-07/checkpoint/state_10000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: avoiding-m5
+mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  max_episode_steps: 100
+  reset_at_iteration: True
+  save_video: False
+  best_reward_threshold_for_success: 2
+  save_full_observations: True
+  wrappers:
+    d3il_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      pass_full_observations: ${env.save_full_observations}
+      reset_within_step: False
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env_name}-m1-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 51
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.99
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 100 # no eval, always train mode
+  force_train: True
+  render:
+    freq: 1
+    num: 10
+  plotter:
+    _target_: env.plot_traj.TrajPlotter
+    env_type: avoid
+    normalization_path: ${normalization_path}
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.95
+  clip_ploss_coef: 0.1
+  clip_ploss_coef_base: 0.1
+  clip_ploss_coef_rate: 1
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/finetune/avoid_m1/ft_ppo_gaussian_mlp.yaml b/cfg/d3il/finetune/avoid_m1/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..3bf6374
--- /dev/null
+++ b/cfg/d3il/finetune/avoid_m1/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,105 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_m1_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m1/avoid_d56_r12_pre_gaussian_mlp_ta4/2024-07-07_01-35-48/checkpoint/state_10000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: avoiding-m5
+mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  max_episode_steps: 100
+  reset_at_iteration: True
+  save_video: False
+  best_reward_threshold_for_success: 2
+  save_full_observations: True
+  wrappers:
+    d3il_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      pass_full_observations: ${env.save_full_observations}
+      reset_within_step: False
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env_name}-m1-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 51
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.99
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 100 # no eval, always train mode
+  force_train: True
+  render:
+    freq: 1
+    num: 10
+  plotter:
+    _target_: env.plot_traj.TrajPlotter
+    env_type: avoid
+    normalization_path: ${normalization_path}
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.1
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: False
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    residual_style: True
+    obs_dim: ${obs_dim}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/finetune/avoid_m1/ft_ppo_gmm_mlp.yaml b/cfg/d3il/finetune/avoid_m1/ft_ppo_gmm_mlp.yaml
new file mode 100644
index 0000000..35a6a81
--- /dev/null
+++ b/cfg/d3il/finetune/avoid_m1/ft_ppo_gmm_mlp.yaml
@@ -0,0 +1,106 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_m1_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m1/avoid_d56_r12_pre_gmm_mlp_ta4/2024-07-10_14-30-00/checkpoint/state_10000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: avoiding-m5
+mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+num_modes: 5
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  max_episode_steps: 100
+  reset_at_iteration: True
+  save_video: False
+  best_reward_threshold_for_success: 2
+  save_full_observations: True
+  wrappers:
+    d3il_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      pass_full_observations: ${env.save_full_observations}
+      reset_within_step: False
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env_name}-m1-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 51
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.99
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 100 # no eval, always train mode
+  force_train: True
+  render:
+    freq: 1
+    num: 10
+  plotter:
+    _target_: env.plot_traj.TrajPlotter
+    env_type: avoid
+    normalization_path: ${normalization_path}
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gmm_ppo.PPO_GMM
+  clip_ploss_coef: 0.1
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [256, 256] # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: False
+    fixed_std: 0.1
+    learn_fixed_std: False
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    residual_style: True
+    obs_dim: ${obs_dim}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/finetune/avoid_m2/ft_ppo_diffusion_mlp.yaml b/cfg/d3il/finetune/avoid_m2/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..9e69811
--- /dev/null
+++ b/cfg/d3il/finetune/avoid_m2/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,119 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_m2_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m2/avoid_d57_r12_pre_diffusion_mlp_ta4_td20/2024-07-07_13-12-09/checkpoint/state_15000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: avoiding-m5
+mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  max_episode_steps: 100
+  reset_at_iteration: True
+  save_video: False
+  best_reward_threshold_for_success: 2
+  save_full_observations: True
+  wrappers:
+    d3il_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      pass_full_observations: ${env.save_full_observations}
+      reset_within_step: False
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env_name}-m2-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 51
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.99
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 100 # no eval, always train mode
+  force_train: True
+  render:
+    freq: 1
+    num: 10
+  plotter:
+    _target_: env.plot_traj.TrajPlotter
+    env_type: avoid
+    normalization_path: ${normalization_path}
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.95
+  clip_ploss_coef: 0.1
+  clip_ploss_coef_base: 0.1
+  clip_ploss_coef_rate: 1
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/finetune/avoid_m2/ft_ppo_gaussian_mlp.yaml b/cfg/d3il/finetune/avoid_m2/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..539107f
--- /dev/null
+++ b/cfg/d3il/finetune/avoid_m2/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,105 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_m2_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m2/avoid_d57_r12_pre_gaussian_mlp_ta4/2024-07-07_02-15-50/checkpoint/state_10000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: avoiding-m5
+mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  max_episode_steps: 100
+  reset_at_iteration: True
+  save_video: False
+  best_reward_threshold_for_success: 2
+  save_full_observations: True
+  wrappers:
+    d3il_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      pass_full_observations: ${env.save_full_observations}
+      reset_within_step: False
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env_name}-m2-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 51
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.99
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 100 # no eval, always train mode
+  force_train: True
+  render:
+    freq: 1
+    num: 10
+  plotter:
+    _target_: env.plot_traj.TrajPlotter
+    env_type: avoid
+    normalization_path: ${normalization_path}
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.1
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: False
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    residual_style: True
+    obs_dim: ${obs_dim}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/finetune/avoid_m2/ft_ppo_gmm_mlp.yaml b/cfg/d3il/finetune/avoid_m2/ft_ppo_gmm_mlp.yaml
new file mode 100644
index 0000000..5a8b776
--- /dev/null
+++ b/cfg/d3il/finetune/avoid_m2/ft_ppo_gmm_mlp.yaml
@@ -0,0 +1,106 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_m2_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m2/avoid_d57_r12_pre_gmm_mlp_ta4/2024-07-10_15-44-32/checkpoint/state_10000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: avoiding-m5
+mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+num_modes: 5
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  max_episode_steps: 100
+  reset_at_iteration: True
+  save_video: False
+  best_reward_threshold_for_success: 2
+  save_full_observations: True
+  wrappers:
+    d3il_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      pass_full_observations: ${env.save_full_observations}
+      reset_within_step: False
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env_name}-m2-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 51
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.99
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 100 # no eval, always train mode
+  force_train: True
+  render:
+    freq: 1
+    num: 10
+  plotter:
+    _target_: env.plot_traj.TrajPlotter
+    env_type: avoid
+    normalization_path: ${normalization_path}
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gmm_ppo.PPO_GMM
+  clip_ploss_coef: 0.1
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [256, 256] # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: False
+    fixed_std: 0.1
+    learn_fixed_std: False
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    residual_style: True
+    obs_dim: ${obs_dim}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/finetune/avoid_m3/ft_ppo_diffusion_mlp.yaml b/cfg/d3il/finetune/avoid_m3/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..d350d5b
--- /dev/null
+++ b/cfg/d3il/finetune/avoid_m3/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,119 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_m3_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m3/avoid_d58_r12_pre_diffusion_mlp_ta4_td20/2024-07-07_13-54-54/checkpoint/state_15000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: avoiding-m5
+mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  max_episode_steps: 100
+  reset_at_iteration: True
+  save_video: False
+  best_reward_threshold_for_success: 2
+  save_full_observations: True
+  wrappers:
+    d3il_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      pass_full_observations: ${env.save_full_observations}
+      reset_within_step: False
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env_name}-m3-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 51
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.99
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 100 # no eval, always train mode
+  force_train: True
+  render:
+    freq: 1
+    num: 10
+  plotter:
+    _target_: env.plot_traj.TrajPlotter
+    env_type: avoid
+    normalization_path: ${normalization_path}
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.95
+  clip_ploss_coef: 0.1
+  clip_ploss_coef_base: 0.1
+  clip_ploss_coef_rate: 1
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/finetune/avoid_m3/ft_ppo_gaussian_mlp.yaml b/cfg/d3il/finetune/avoid_m3/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..a626f54
--- /dev/null
+++ b/cfg/d3il/finetune/avoid_m3/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,105 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_m3_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m3/avoid_d58_r12_pre_gaussian_mlp_ta4/2024-07-07_13-11-49/checkpoint/state_10000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: avoiding-m5
+mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  max_episode_steps: 100
+  reset_at_iteration: True
+  save_video: False
+  best_reward_threshold_for_success: 2
+  save_full_observations: True
+  wrappers:
+    d3il_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      pass_full_observations: ${env.save_full_observations}
+      reset_within_step: False
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env_name}-m3-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 51
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.99
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 100 # no eval, always train mode
+  force_train: True
+  render:
+    freq: 1
+    num: 10
+  plotter:
+    _target_: env.plot_traj.TrajPlotter
+    env_type: avoid
+    normalization_path: ${normalization_path}
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.1
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: False
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    residual_style: True
+    obs_dim: ${obs_dim}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/finetune/avoid_m3/ft_ppo_gmm_mlp.yaml b/cfg/d3il/finetune/avoid_m3/ft_ppo_gmm_mlp.yaml
new file mode 100644
index 0000000..8f240ae
--- /dev/null
+++ b/cfg/d3il/finetune/avoid_m3/ft_ppo_gmm_mlp.yaml
@@ -0,0 +1,106 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_m3_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m3/avoid_d58_r12_pre_gmm_mlp_ta4/2024-07-10_17-01-50/checkpoint/state_10000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: avoiding-m5
+mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+num_modes: 5
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  max_episode_steps: 100
+  reset_at_iteration: True
+  save_video: False
+  best_reward_threshold_for_success: 2
+  save_full_observations: True
+  wrappers:
+    d3il_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      pass_full_observations: ${env.save_full_observations}
+      reset_within_step: False
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env_name}-m3-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 51
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.99
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 100
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 100 # no eval, always train mode
+  force_train: True
+  render:
+    freq: 1
+    num: 10
+  plotter:
+    _target_: env.plot_traj.TrajPlotter
+    env_type: avoid
+    normalization_path: ${normalization_path}
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gmm_ppo.PPO_GMM
+  clip_ploss_coef: 0.1
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [256, 256] # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: False
+    fixed_std: 0.1
+    learn_fixed_std: False
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    mlp_dims: [256, 256, 256]
+    residual_style: True
+    obs_dim: ${obs_dim}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml b/cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..056692f
--- /dev/null
+++ b/cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml
@@ -0,0 +1,70 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: avoid_m1_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/train.pkl
+
+seed: 42
+device: cuda:0
+env: avoid
+mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 15000
+  batch_size: 16
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 15000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml b/cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..23e2884
--- /dev/null
+++ b/cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml
@@ -0,0 +1,62 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: avoid_m1_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/train.pkl
+
+seed: 42
+device: cuda:0
+env: avoid
+mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 10000
+  batch_size: 16
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml b/cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml
new file mode 100644
index 0000000..f20817f
--- /dev/null
+++ b/cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: avoid_m1_pre_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/train.pkl
+
+seed: 42
+device: cuda:0
+env: avoid
+mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+num_modes: 5
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 10000
+  batch_size: 32
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gmm.GMMModel
+  network:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [256, 256]  # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: False
+    fixed_std: 0.1
+    num_modes: ${num_modes}
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml b/cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..482c7d3
--- /dev/null
+++ b/cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml
@@ -0,0 +1,70 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: avoid_m2_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/train.pkl
+
+seed: 42
+device: cuda:0
+env: avoid
+mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 15000
+  batch_size: 16
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 15000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml b/cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..d6c84a6
--- /dev/null
+++ b/cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml
@@ -0,0 +1,62 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: avoid_m2_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/train.pkl
+
+seed: 42
+device: cuda:0
+env: avoid
+mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 10000
+  batch_size: 16
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml b/cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml
new file mode 100644
index 0000000..5f06db5
--- /dev/null
+++ b/cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: avoid_m2_pre_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/train.pkl
+
+seed: 42
+device: cuda:0
+env: avoid
+mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+num_modes: 5
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 10000
+  batch_size: 32
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gmm.GMMModel
+  network:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [256, 256]  # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: False
+    fixed_std: 0.1
+    num_modes: ${num_modes}
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml b/cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..e4d2864
--- /dev/null
+++ b/cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml
@@ -0,0 +1,70 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: avoid_m3_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/train.pkl
+
+seed: 42
+device: cuda:0
+env: avoid
+mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 15000
+  batch_size: 16
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 15000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml b/cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..a1d1bfc
--- /dev/null
+++ b/cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml
@@ -0,0 +1,62 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: avoid_m3_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/train.pkl
+
+seed: 42
+device: cuda:0
+env: avoid
+mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 10000
+  batch_size: 16
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml b/cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml
new file mode 100644
index 0000000..f8dca2d
--- /dev/null
+++ b/cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: avoid_m3_pre_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/train.pkl
+
+seed: 42
+device: cuda:0
+env: avoid
+mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
+obs_dim: 4
+action_dim: 2
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+num_modes: 5
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: d3il-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 10000
+  batch_size: 32
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gmm.GMMModel
+  network:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [256, 256]  # smaller MLP for less overfitting
+    activation_type: ReLU
+    residual_style: False
+    fixed_std: 0.1
+    num_modes: ${num_modes}
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/finetuning.md b/cfg/finetuning.md
new file mode 100644
index 0000000..b7532f2
--- /dev/null
+++ b/cfg/finetuning.md
@@ -0,0 +1,26 @@
+## Fine-tuning experiments
+
+### Comparing diffusion-based RL algorithms (Sec. 5.1)
+Gym configs are under `cfg/gym/finetune/<env_name>/`, and the naming follows `ft_<alg_name>_diffusion_mlp`, e.g., `ft_awr_diffusion_mlp`. `alg_name` is one of `rwr`, `awr`, `dipo`, `idql`, `dql`, `qsm`, `ppo` (DPPO), `ppo_exact` (exact likelihood). They share the same pre-trained checkpoint in each env.
+
+Robomimic configs are under `cfg/robomimic/finetune/<env_name>/`, and the naming follows the same.
+
+<!-- **Note**: For *Can* and *Lift* in Robomimic, we use earlier checkpoints from pre-training (epoch 5000) so there is more room for fine-tuning improvement. For comparing policy parameterizations, we use the final checkpoints for all tasks (epoch 8000). -->
+
+### Comparing policy parameterizations (Sec. 5.2, 5.3)
+
+Robomimic configs are under `cfg/robomimic/finetune/<env_name>/`, and the naming follows `ft_ppo_<diffusion/gaussian/gmm>_<mlp/unet/transformer>_<img?>`. For pixel experiments, we choose pre-trained checkpoints such that the pre-training performance is similar between DPPO and Gaussian baseline.
+
+**Note**: For *Can* and *Lift* in Robomimic with DPPO, you need to manually download the final checkpoints (epoch 8000). The default ones in the configs are from epoch 5000 (more room for fine-tuning improvement) and used for comparing diffusion-based RL algorithms, 
+
+Furniture-Bench configs are under `cfg/furniture/finetune/<env_name>/`, and the naming follows `ft_<diffusion/gaussian>_<mlp/unet>`. In the paper we did not show the results of `ft_diffusion_mlp`. Running IsaacGym for the first time may take a while for setting up the meshes. If you encounter the error about `libpython`, see instruction [here](installation/install_furniture.md#L27).
+
+### D3IL (Sec. 6)
+
+D3IL configs are under `cfg/d3il/finetune/avoid_<mode>/`, and the naming follows `ft_ppo_<diffusion/gaussian/gmm>_mlp`. The number of fine-tuned denoising steps can be specified with `ft_denoising_steps`.
+
+### Training from scratch (App. B.2)
+`ppo_diffusion_mlp` and `ppo_gaussian_mlp` under `cfg/gym/finetune/<env_name>` are for training DPPO or Gaussian policy from scratch.
+
+### Comparing to exact likelihood policy gradient (App. B.5)
+`ft_ppo_exact_diffusion_mlp` under `cfg/gym/finetune/hopper-v2`, `cfg/gym/finetune/halfcheetah-v2`, and `cfg/robomimic/finetune/can` are for training diffusion policy gradient with exact likelihood. `torchdiffeq` package needs to be installed first with `pip install -e .[exact]`.
diff --git a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..227efdc
--- /dev/null
+++ b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,121 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_low_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-20/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: lamp
+    randomness: low
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml
new file mode 100644
index 0000000..82652a8
--- /dev/null
+++ b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml
@@ -0,0 +1,123 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_low_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-16-48/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: lamp
+    randomness: low
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    groupnorm_eps: 1e-4
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/lamp_low/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/lamp_low/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..5efed86
--- /dev/null
+++ b/cfg/furniture/finetune/lamp_low/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_low_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-51/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: lamp
+    randomness: low
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims:
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.04
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..1ddd372
--- /dev/null
+++ b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,121 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_med_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-20/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: lamp
+    randomness: med
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml
new file mode 100644
index 0000000..a600120
--- /dev/null
+++ b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml
@@ -0,0 +1,122 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_med_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-17-21/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: lamp
+    randomness: med
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/lamp_med/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/lamp_med/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..7c538a9
--- /dev/null
+++ b/cfg/furniture/finetune/lamp_med/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_med_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-56/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: lamp
+    randomness: med
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims:
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.04
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..2e5aecc
--- /dev/null
+++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,121 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_low_dim_pre_diffusion_mlp_ta8_td100/2024-07-22_20-01-16/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 700
+  best_reward_threshold_for_success: 1
+  specific:
+    headless: true
+    furniture: one_leg
+    randomness: low
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml
new file mode 100644
index 0000000..0caead2
--- /dev/null
+++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml
@@ -0,0 +1,123 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_low_dim_pre_diffusion_unet_ta16_td100/2024-07-03_22-23-38/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 700
+  best_reward_threshold_for_success: 1
+  specific:
+    headless: true
+    furniture: one_leg
+    randomness: low
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    groupnorm_eps: 1e-4
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..31ea4ed
--- /dev/null
+++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_low_dim_pre_gaussian_mlp_ta8/2024-06-26_23-43-02/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 700
+  best_reward_threshold_for_success: 1
+  specific:
+    headless: true
+    furniture: one_leg
+    randomness: low
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims:
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.04
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..c74a22d
--- /dev/null
+++ b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,121 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_med_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-11/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 700
+  best_reward_threshold_for_success: 1
+  specific:
+    headless: true
+    furniture: one_leg
+    randomness: med
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml
new file mode 100644
index 0000000..8a0c042
--- /dev/null
+++ b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml
@@ -0,0 +1,123 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_med_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-16-16/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 700
+  best_reward_threshold_for_success: 1
+  specific:
+    headless: true
+    furniture: one_leg
+    randomness: med
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    groupnorm_eps: 1e-4
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/one_leg_med/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/one_leg_med/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..abd5c30
--- /dev/null
+++ b/cfg/furniture/finetune/one_leg_med/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_med_dim_pre_gaussian_mlp_ta8/2024-06-28_16-27-02/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 700
+  best_reward_threshold_for_success: 1
+  specific:
+    headless: true
+    furniture: one_leg
+    randomness: med
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims:
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.04
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..5294173
--- /dev/null
+++ b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,121 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_low_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-26/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: round_table
+    randomness: low
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml
new file mode 100644
index 0000000..16c27bf
--- /dev/null
+++ b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml
@@ -0,0 +1,123 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_low_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-19-48/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: round_table
+    randomness: low
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    groupnorm_eps: 1e-4
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/round_table_low/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/round_table_low/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..0d4a2d9
--- /dev/null
+++ b/cfg/furniture/finetune/round_table_low/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_low_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-51/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: round_table
+    randomness: low
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims:
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.04
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..aee7ec5
--- /dev/null
+++ b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,121 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_med_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-29/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: round_table
+    randomness: med
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml
new file mode 100644
index 0000000..72cf8a8
--- /dev/null
+++ b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml
@@ -0,0 +1,122 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_med_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-20-21/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: round_table
+    randomness: med
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.04
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/finetune/round_table_med/ft_ppo_gaussian_mlp.yaml b/cfg/furniture/finetune/round_table_med/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..7aba0ae
--- /dev/null
+++ b/cfg/furniture/finetune/round_table_med/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_med_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-38/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: round_table
+    randomness: med
+    normalization_path: ${normalization_path}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 1
+  n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 50
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 8800
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims:
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    - 1024
+    activation_type: ReLU
+    residual_style: True
+    fixed_std: 0.04
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [512, 512, 512]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/lamp_low/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/lamp_low/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..cfef0c5
--- /dev/null
+++ b/cfg/furniture/pretrain/lamp_low/pre_diffusion_mlp.yaml
@@ -0,0 +1,72 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: lamp
+randomness: low
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/lamp_low/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/lamp_low/pre_diffusion_unet.yaml
new file mode 100644
index 0000000..5e50540
--- /dev/null
+++ b/cfg/furniture/pretrain/lamp_low/pre_diffusion_unet.yaml
@@ -0,0 +1,74 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: lamp
+randomness: low
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 16
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    groupnorm_eps: 1e-4 # not important
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/lamp_low/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/lamp_low/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..caa91c9
--- /dev/null
+++ b/cfg/furniture/pretrain/lamp_low/pre_gaussian_mlp.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: lamp
+randomness: low
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 3000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    use_layernorm: False  # worse with layernorm
+    activation_type: ReLU # worse with Mish
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/lamp_med/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/lamp_med/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..646afc6
--- /dev/null
+++ b/cfg/furniture/pretrain/lamp_med/pre_diffusion_mlp.yaml
@@ -0,0 +1,72 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: lamp
+randomness: med
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/lamp_med/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/lamp_med/pre_diffusion_unet.yaml
new file mode 100644
index 0000000..1e26b27
--- /dev/null
+++ b/cfg/furniture/pretrain/lamp_med/pre_diffusion_unet.yaml
@@ -0,0 +1,73 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: lamp
+randomness: med
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 16
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/lamp_med/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/lamp_med/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..d65e4cb
--- /dev/null
+++ b/cfg/furniture/pretrain/lamp_med/pre_gaussian_mlp.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: lamp
+randomness: med
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 3000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    use_layernorm: False  # worse with layernorm
+    activation_type: ReLU # worse with Mish
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/one_leg_low/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..49b8e59
--- /dev/null
+++ b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_mlp.yaml
@@ -0,0 +1,72 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: one_leg
+randomness: low
+env: ${task}_${randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/one_leg_low/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_unet.yaml
new file mode 100644
index 0000000..1b3e1fd
--- /dev/null
+++ b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_unet.yaml
@@ -0,0 +1,74 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: one_leg
+randomness: low
+env: ${task}_${randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 16
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    groupnorm_eps: 1e-4 # not important
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/one_leg_low/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/one_leg_low/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..3357c4f
--- /dev/null
+++ b/cfg/furniture/pretrain/one_leg_low/pre_gaussian_mlp.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: one_leg
+randomness: low
+env: ${task}_${randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 3000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    use_layernorm: False  # worse with layernorm
+    activation_type: ReLU # worse with Mish
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/one_leg_med/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..b690ade
--- /dev/null
+++ b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_mlp.yaml
@@ -0,0 +1,72 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: one_leg
+randomness: med
+env: ${task}_${randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/one_leg_med/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_unet.yaml
new file mode 100644
index 0000000..0f5a3d5
--- /dev/null
+++ b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_unet.yaml
@@ -0,0 +1,73 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: one_leg
+randomness: med
+env: ${task}_${randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 16
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/one_leg_med/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/one_leg_med/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..05cdd7e
--- /dev/null
+++ b/cfg/furniture/pretrain/one_leg_med/pre_gaussian_mlp.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: one_leg
+randomness: med
+env: ${task}_${randomness}_dim
+obs_dim: 58
+action_dim: 10
+transition_dim: ${action_dim}
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 10000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    use_layernorm: False  # worse with layernorm
+    activation_type: ReLU # worse with Mish
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/round_table_low/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/round_table_low/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..6b6c9eb
--- /dev/null
+++ b/cfg/furniture/pretrain/round_table_low/pre_diffusion_mlp.yaml
@@ -0,0 +1,72 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: round_table
+randomness: low
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/round_table_low/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/round_table_low/pre_diffusion_unet.yaml
new file mode 100644
index 0000000..8186e2f
--- /dev/null
+++ b/cfg/furniture/pretrain/round_table_low/pre_diffusion_unet.yaml
@@ -0,0 +1,73 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: round_table
+randomness: low
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 16
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/round_table_low/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/round_table_low/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..f9488a2
--- /dev/null
+++ b/cfg/furniture/pretrain/round_table_low/pre_gaussian_mlp.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: round_table
+randomness: low
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 3000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    use_layernorm: False  # worse with layernorm
+    activation_type: ReLU # worse with Mish
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/round_table_med/pre_diffusion_mlp.yaml b/cfg/furniture/pretrain/round_table_med/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..d854f3a
--- /dev/null
+++ b/cfg/furniture/pretrain/round_table_med/pre_diffusion_mlp.yaml
@@ -0,0 +1,72 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: round_table
+randomness: med
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/round_table_med/pre_diffusion_unet.yaml b/cfg/furniture/pretrain/round_table_med/pre_diffusion_unet.yaml
new file mode 100644
index 0000000..a3dc85a
--- /dev/null
+++ b/cfg/furniture/pretrain/round_table_med/pre_diffusion_unet.yaml
@@ -0,0 +1,73 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: round_table
+randomness: med
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 16
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/furniture/pretrain/round_table_med/pre_gaussian_mlp.yaml b/cfg/furniture/pretrain/round_table_med/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..31764a5
--- /dev/null
+++ b/cfg/furniture/pretrain/round_table_med/pre_gaussian_mlp.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
+
+seed: 42
+device: cuda:0
+task: round_table
+randomness: med
+env: ${task}_${randomness}_dim
+obs_dim: 44
+action_dim: 10
+transition_dim: ${action_dim}
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: furniture-${task}-${randomness}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 3000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    use_layernorm: False  # worse with layernorm
+    activation_type: ReLU # worse with Mish
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_awr_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_awr_diffusion_mlp.yaml
new file mode 100644
index 0000000..4e733df
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_awr_diffusion_mlp.yaml
@@ -0,0 +1,102 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
+
+name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # AWR specific
+  scale_reward_factor: 0.01
+  max_adv_weight: 100
+  beta: 10
+  buffer_size: 5000
+  batch_size: 256
+  replay_ratio: 64
+  critic_update_ratio: 4
+
+model:
+  _target_: model.diffusion.diffusion_awr.AWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_dipo_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_dipo_diffusion_mlp.yaml
new file mode 100644
index 0000000..b9641ef
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_dipo_diffusion_mlp.yaml
@@ -0,0 +1,103 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
+
+name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DIPO specific
+  scale_reward_factor: 0.01
+  eta: 0.0001
+  action_gradient_steps: 10
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dipo.DIPODiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_dql_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_dql_diffusion_mlp.yaml
new file mode 100644
index 0000000..a26cbf7
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_dql_diffusion_mlp.yaml
@@ -0,0 +1,102 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
+
+name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DQL specific
+  scale_reward_factor: 0.01
+  eta: 1.0
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dql.DQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_idql_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_idql_diffusion_mlp.yaml
new file mode 100644
index 0000000..d45cfaf
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_idql_diffusion_mlp.yaml
@@ -0,0 +1,111 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
+
+name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # IDQL specific
+  scale_reward_factor: 0.01
+  eval_deterministic: True
+  eval_sample_num: 20 # how many samples to score during eval
+  critic_tau: 0.001  # rate of target q network update
+  use_expectile_exploration: True
+  buffer_size: 5000
+  batch_size: 512
+  replay_ratio: 16
+
+model:
+  _target_: model.diffusion.diffusion_idql.IDQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic_q:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  critic_v:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..cad9e82
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,110 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3  # success rate not relevant for gym tasks
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 5000
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.01
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_exact_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_exact_diffusion_mlp.yaml
new file mode 100644
index 0000000..49b45ab
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_exact_diffusion_mlp.yaml
@@ -0,0 +1,118 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_exact_diffusion_agent.TrainPPOExactDiffusionAgent
+
+name: ${env_name}_ppo_exact_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune-exact
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 5000
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo_exact.PPOExactDiffusion
+  sde:
+    _target_: model.diffusion.sde_lib.VPSDE # VP for DDPM
+    N: ${denoising_steps}
+  rtol: 1e-4
+  atol: 1e-4
+  sde_hutchinson_type: Rademacher
+  sde_step_size: 0.05
+  sde_method: euler
+  sde_num_epsilon: 2
+  sde_min_beta: 1e-10
+  sde_probability_flow: True
+  #
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_qsm_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_qsm_diffusion_mlp.yaml
new file mode 100644
index 0000000..a36aee6
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_qsm_diffusion_mlp.yaml
@@ -0,0 +1,103 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
+
+name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # QSM specific
+  scale_reward_factor: 0.01
+  q_grad_coeff: 50
+  critic_tau: 0.005  # rate of target q network update
+  buffer_size: 5000
+  batch_size: 256
+  replay_ratio: 32
+
+model:
+  _target_: model.diffusion.diffusion_qsm.QSMDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_rwr_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_rwr_diffusion_mlp.yaml
new file mode 100644
index 0000000..9c36ae1
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_rwr_diffusion_mlp.yaml
@@ -0,0 +1,87 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
+
+name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 500
+  gamma: 0.99
+  lr: 1e-4
+  weight_decay: 0
+  lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # RWR specific
+  max_reward_weight: 100
+  beta: 10
+  batch_size: 1000
+  update_epochs: 16
+
+model:
+  _target_: model.diffusion.diffusion_rwr.RWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..0c496ef
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_nopre_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 20
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 1000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.1
+  clip_ploss_coef_base: 0.1
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..2d3e9c9
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml
@@ -0,0 +1,94 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_nopre_ppo_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 1000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.1
+  randn_clip_value: 3
+  #
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ft_awr_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_awr_diffusion_mlp.yaml
new file mode 100644
index 0000000..b0239b6
--- /dev/null
+++ b/cfg/gym/finetune/hopper-v2/ft_awr_diffusion_mlp.yaml
@@ -0,0 +1,102 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
+
+name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # AWR specific
+  scale_reward_factor: 0.01
+  max_adv_weight: 100
+  beta: 10
+  buffer_size: 5000
+  batch_size: 256
+  replay_ratio: 64
+  critic_update_ratio: 4
+
+model:
+  _target_: model.diffusion.diffusion_awr.AWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ft_dipo_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_dipo_diffusion_mlp.yaml
new file mode 100644
index 0000000..259f4df
--- /dev/null
+++ b/cfg/gym/finetune/hopper-v2/ft_dipo_diffusion_mlp.yaml
@@ -0,0 +1,103 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
+
+name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DIPO specific
+  scale_reward_factor: 0.01
+  eta: 0.0001
+  action_gradient_steps: 10
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dipo.DIPODiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ft_dql_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_dql_diffusion_mlp.yaml
new file mode 100644
index 0000000..4d8d9d5
--- /dev/null
+++ b/cfg/gym/finetune/hopper-v2/ft_dql_diffusion_mlp.yaml
@@ -0,0 +1,102 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
+
+name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DQL specific
+  scale_reward_factor: 0.01
+  eta: 1.0
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dql.DQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ft_idql_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_idql_diffusion_mlp.yaml
new file mode 100644
index 0000000..3d29412
--- /dev/null
+++ b/cfg/gym/finetune/hopper-v2/ft_idql_diffusion_mlp.yaml
@@ -0,0 +1,111 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
+
+name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # IDQL specific
+  scale_reward_factor: 0.01
+  eval_deterministic: True
+  eval_sample_num: 20 # how many samples to score during eval
+  critic_tau: 0.001  # rate of target q network update
+  use_expectile_exploration: True
+  buffer_size: 5000
+  batch_size: 512
+  replay_ratio: 16
+
+model:
+  _target_: model.diffusion.diffusion_idql.IDQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic_q:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  critic_v:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..76364f8
--- /dev/null
+++ b/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,110 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3  # success rate not relevant for gym tasks
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 5000
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.01
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ft_ppo_exact_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_ppo_exact_diffusion_mlp.yaml
new file mode 100644
index 0000000..58cc997
--- /dev/null
+++ b/cfg/gym/finetune/hopper-v2/ft_ppo_exact_diffusion_mlp.yaml
@@ -0,0 +1,117 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_exact_diffusion_agent.TrainPPOExactDiffusionAgent
+
+name: ${env_name}_ppo_exact_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune-exact
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 5000
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo_exact.PPOExactDiffusion
+  sde:
+    _target_: model.diffusion.sde_lib.VPSDE # VP for DDPM
+    N: ${denoising_steps}
+  rtol: 1e-4
+  atol: 1e-4
+  sde_hutchinson_type: Rademacher
+  sde_step_size: 0.05
+  sde_method: euler
+  sde_num_epsilon: 2
+  sde_min_beta: 1e-10
+  sde_probability_flow: True
+  #
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ft_qsm_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_qsm_diffusion_mlp.yaml
new file mode 100644
index 0000000..a627f94
--- /dev/null
+++ b/cfg/gym/finetune/hopper-v2/ft_qsm_diffusion_mlp.yaml
@@ -0,0 +1,103 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
+
+name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # QSM specific
+  scale_reward_factor: 0.01
+  q_grad_coeff: 50
+  critic_tau: 0.005  # rate of target q network update
+  buffer_size: 5000
+  batch_size: 256
+  replay_ratio: 32
+
+model:
+  _target_: model.diffusion.diffusion_qsm.QSMDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ft_rwr_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_rwr_diffusion_mlp.yaml
new file mode 100644
index 0000000..7cdf82e
--- /dev/null
+++ b/cfg/gym/finetune/hopper-v2/ft_rwr_diffusion_mlp.yaml
@@ -0,0 +1,87 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
+
+name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 500
+  gamma: 0.99
+  lr: 1e-4
+  weight_decay: 0
+  lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # RWR specific
+  max_reward_weight: 100
+  beta: 10
+  batch_size: 1000
+  update_epochs: 16
+
+model:
+  _target_: model.diffusion.diffusion_rwr.RWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..8ce5397
--- /dev/null
+++ b/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_nopre_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 20
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 1000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.1
+  clip_ploss_coef_base: 0.1
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml b/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..033cdbf
--- /dev/null
+++ b/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml
@@ -0,0 +1,94 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_nopre_ppo_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 1000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.1
+  randn_clip_value: 3
+  #
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ft_awr_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_awr_diffusion_mlp.yaml
new file mode 100644
index 0000000..319d126
--- /dev/null
+++ b/cfg/gym/finetune/walker2d-v2/ft_awr_diffusion_mlp.yaml
@@ -0,0 +1,102 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
+
+name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: walker2d-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # AWR specific
+  scale_reward_factor: 0.01
+  max_adv_weight: 100
+  beta: 10
+  buffer_size: 5000
+  batch_size: 256
+  replay_ratio: 64
+  critic_update_ratio: 4
+
+model:
+  _target_: model.diffusion.diffusion_awr.AWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ft_dipo_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_dipo_diffusion_mlp.yaml
new file mode 100644
index 0000000..dcdf747
--- /dev/null
+++ b/cfg/gym/finetune/walker2d-v2/ft_dipo_diffusion_mlp.yaml
@@ -0,0 +1,103 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
+
+name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: walker2d-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DIPO specific
+  scale_reward_factor: 0.01
+  eta: 0.0001
+  action_gradient_steps: 10
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dipo.DIPODiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ft_dql_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_dql_diffusion_mlp.yaml
new file mode 100644
index 0000000..9f147f5
--- /dev/null
+++ b/cfg/gym/finetune/walker2d-v2/ft_dql_diffusion_mlp.yaml
@@ -0,0 +1,102 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
+
+name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: walker2d-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DQL specific
+  scale_reward_factor: 0.01
+  eta: 1.0
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dql.DQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ft_idql_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_idql_diffusion_mlp.yaml
new file mode 100644
index 0000000..d419dfe
--- /dev/null
+++ b/cfg/gym/finetune/walker2d-v2/ft_idql_diffusion_mlp.yaml
@@ -0,0 +1,111 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
+
+name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: walker2d-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # IDQL specific
+  scale_reward_factor: 0.01
+  eval_deterministic: True
+  eval_sample_num: 20 # how many samples to score during eval
+  critic_tau: 0.001  # rate of target q network update
+  use_expectile_exploration: True
+  buffer_size: 5000
+  batch_size: 512
+  replay_ratio: 16
+
+model:
+  _target_: model.diffusion.diffusion_idql.IDQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic_q:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  critic_v:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..2c25c54
--- /dev/null
+++ b/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,110 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: walker2d-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3  # success rate not relevant for gym tasks
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 5000
+  update_epochs: 5
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.01
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ft_qsm_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_qsm_diffusion_mlp.yaml
new file mode 100644
index 0000000..3ac51e0
--- /dev/null
+++ b/cfg/gym/finetune/walker2d-v2/ft_qsm_diffusion_mlp.yaml
@@ -0,0 +1,103 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
+
+name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: walker2d-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 500
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # QSM specific
+  scale_reward_factor: 0.01
+  q_grad_coeff: 50
+  critic_tau: 0.005  # rate of target q network update
+  buffer_size: 5000
+  batch_size: 256
+  replay_ratio: 32
+
+model:
+  _target_: model.diffusion.diffusion_qsm.QSMDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ft_rwr_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_rwr_diffusion_mlp.yaml
new file mode 100644
index 0000000..2d097d1
--- /dev/null
+++ b/cfg/gym/finetune/walker2d-v2/ft_rwr_diffusion_mlp.yaml
@@ -0,0 +1,87 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
+
+name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: walker2d-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 500
+  gamma: 0.99
+  lr: 1e-4
+  weight_decay: 0
+  lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # RWR specific
+  max_reward_weight: 100
+  beta: 10
+  batch_size: 1000
+  update_epochs: 16
+
+model:
+  _target_: model.diffusion.diffusion_rwr.RWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..0ee1f77
--- /dev/null
+++ b/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_nopre_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: walker2d-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 20
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 1000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.1
+  clip_ploss_coef_base: 0.1
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..1764603
--- /dev/null
+++ b/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml
@@ -0,0 +1,94 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_nopre_ppo_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+device: cuda:0
+env_name: walker2d-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+  n_envs: 10
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env_name}-scratch
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 0
+  n_steps: 1000
+  gamma: 0.99
+  actor_lr: 1e-4
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-4
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 1000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.1
+  randn_clip_value: 3
+  #
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/halfcheetah-medium-v2/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..bbc8772
--- /dev/null
+++ b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_diffusion_mlp.yaml
@@ -0,0 +1,71 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 3000
+  batch_size: 128
+  learning_rate: 1e-3
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 3000
+    warmup_steps: 1
+    min_lr: 1e-4
+  epoch_start_ema: 10
+  update_ema_freq: 5
+  save_model_freq: 100
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    out_activation_type: Identity
+    use_layernorm: False
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/hopper-medium-v2/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/hopper-medium-v2/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..29894e5
--- /dev/null
+++ b/cfg/gym/pretrain/hopper-medium-v2/pre_diffusion_mlp.yaml
@@ -0,0 +1,71 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 3000
+  batch_size: 128
+  learning_rate: 1e-3
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 3000
+    warmup_steps: 1
+    min_lr: 1e-4
+  epoch_start_ema: 10
+  update_ema_freq: 5
+  save_model_freq: 100
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    out_activation_type: Identity
+    use_layernorm: False
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/walker2d-medium-v2/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/walker2d-medium-v2/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..e638e9f
--- /dev/null
+++ b/cfg/gym/pretrain/walker2d-medium-v2/pre_diffusion_mlp.yaml
@@ -0,0 +1,71 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: walker2d-medium-v2
+obs_dim: 17
+action_dim: 6
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: gym-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 3000
+  batch_size: 128
+  learning_rate: 1e-3
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 3000
+    warmup_steps: 1
+    min_lr: 1e-4
+  epoch_start_ema: 10
+  update_ema_freq: 5
+  save_model_freq: 100
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    out_activation_type: Identity
+    use_layernorm: False
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/pretraining.md b/cfg/pretraining.md
new file mode 100644
index 0000000..c672d17
--- /dev/null
+++ b/cfg/pretraining.md
@@ -0,0 +1,14 @@
+## Pre-training experiments
+
+### Comparing diffusion-based RL algorithms (Sec. 5.1)
+Gym configs are under `cfg/gym/pretrain/<env_name>/`, and the config name is `pre_diffusion_mlp`. Robomimic configs are under `cfg/robomimic/pretrain/<env_name>/`, and the name is also `pre_diffusion_mlp`.
+
+**Note**: In both Gym and Robomimic experiments, for the experiments in the paper we used more than enough expert demonstrations for pre-training. You can specify `+train_dataset.max_n_episodes=<number_of_episodes>` to limit the number of episodes so the pre-training is faster.
+
+### Comparing policy parameterizations (Sec. 5.2, 5.3)
+
+Robomimic configs are under `cfg/robomimic/pretrain/<env_name>/`, and the naming follows `pre_<diffusion/gaussian/gmm>_<mlp/unet/transformer>_<img?>`. Furniture-Bench configs are under `cfg/furniture/pretrain/<env_name>/`, and the naming follows `pre_<diffusion/gaussian>_<mlp/unet>`.
+
+### D3IL (Sec. 6)
+
+D3IL configs are under `cfg/d3il/pretrain/avoid_<mode>/`, and the naming follows `pre_<diffusion/gaussian/gmm>_mlp`. In the paper we manually examine the pre-trained checkpoints and pick the ones that visually match the pre-training data the best. We also tune the Gaussian and GMM policy architecture extensively for best pre-training performance. The action chunk size can be specified with `horizon_steps` and the number of denoising steps can be specified with `denoising_steps`.
diff --git a/cfg/robomimic/env_meta/can-img.json b/cfg/robomimic/env_meta/can-img.json
new file mode 100644
index 0000000..ba76f61
--- /dev/null
+++ b/cfg/robomimic/env_meta/can-img.json
@@ -0,0 +1 @@
+{"env_name": "PickPlaceCan", "env_version": "1.4.1", "type": 1, "env_kwargs": {"has_renderer": false, "has_offscreen_renderer": true, "ignore_done": true, "use_object_obs": true, "use_camera_obs": true, "control_freq": 20, "controller_configs": {"type": "OSC_POSE", "input_max": 1, "input_min": -1, "output_max": [0.05, 0.05, 0.05, 0.5, 0.5, 0.5], "output_min": [-0.05, -0.05, -0.05, -0.5, -0.5, -0.5], "kp": 150, "damping": 1, "impedance_mode": "fixed", "kp_limits": [0, 300], "damping_limits": [0, 10], "position_limits": null, "orientation_limits": null, "uncouple_pos_ori": true, "control_delta": true, "interpolation": null, "ramp_ratio": 0.2}, "robots": ["Panda"], "camera_depths": false, "camera_heights": 96, "camera_widths": 96, "reward_shaping": false, "camera_names": ["robot0_eye_in_hand"], "render_gpu_device_id": 0}}
\ No newline at end of file
diff --git a/cfg/robomimic/env_meta/can.json b/cfg/robomimic/env_meta/can.json
new file mode 100644
index 0000000..3813cd2
--- /dev/null
+++ b/cfg/robomimic/env_meta/can.json
@@ -0,0 +1 @@
+{"env_name": "PickPlaceCan", "env_version": "1.4.1", "type": 1, "env_kwargs": {"has_renderer": false, "has_offscreen_renderer": false, "ignore_done": true, "use_object_obs": true, "use_camera_obs": false, "control_freq": 20, "controller_configs": {"type": "OSC_POSE", "input_max": 1, "input_min": -1, "output_max": [0.05, 0.05, 0.05, 0.5, 0.5, 0.5], "output_min": [-0.05, -0.05, -0.05, -0.5, -0.5, -0.5], "kp": 150, "damping": 1, "impedance_mode": "fixed", "kp_limits": [0, 300], "damping_limits": [0, 10], "position_limits": null, "orientation_limits": null, "uncouple_pos_ori": true, "control_delta": true, "interpolation": null, "ramp_ratio": 0.2}, "robots": ["Panda"], "camera_depths": false, "camera_heights": 84, "camera_widths": 84, "reward_shaping": false}}
\ No newline at end of file
diff --git a/cfg/robomimic/env_meta/lift-img.json b/cfg/robomimic/env_meta/lift-img.json
new file mode 100644
index 0000000..5a35f94
--- /dev/null
+++ b/cfg/robomimic/env_meta/lift-img.json
@@ -0,0 +1 @@
+{"env_name": "Lift", "env_version": "1.4.1", "type": 1, "env_kwargs": {"has_renderer": false, "has_offscreen_renderer": true, "ignore_done": true, "use_object_obs": true, "use_camera_obs": true, "control_freq": 20, "controller_configs": {"type": "OSC_POSE", "input_max": 1, "input_min": -1, "output_max": [0.05, 0.05, 0.05, 0.5, 0.5, 0.5], "output_min": [-0.05, -0.05, -0.05, -0.5, -0.5, -0.5], "kp": 150, "damping": 1, "impedance_mode": "fixed", "kp_limits": [0, 300], "damping_limits": [0, 10], "position_limits": null, "orientation_limits": null, "uncouple_pos_ori": true, "control_delta": true, "interpolation": null, "ramp_ratio": 0.2}, "robots": ["Panda"], "camera_depths": false, "camera_heights": 96, "camera_widths": 96, "reward_shaping": false, "camera_names": ["robot0_eye_in_hand"], "render_gpu_device_id": 0}}
\ No newline at end of file
diff --git a/cfg/robomimic/env_meta/lift.json b/cfg/robomimic/env_meta/lift.json
new file mode 100644
index 0000000..eeb4609
--- /dev/null
+++ b/cfg/robomimic/env_meta/lift.json
@@ -0,0 +1 @@
+{"env_name": "Lift", "env_version": "1.4.1", "type": 1, "env_kwargs": {"has_renderer": false, "has_offscreen_renderer": false, "ignore_done": true, "use_object_obs": true, "use_camera_obs": false, "control_freq": 20, "controller_configs": {"type": "OSC_POSE", "input_max": 1, "input_min": -1, "output_max": [0.05, 0.05, 0.05, 0.5, 0.5, 0.5], "output_min": [-0.05, -0.05, -0.05, -0.5, -0.5, -0.5], "kp": 150, "damping": 1, "impedance_mode": "fixed", "kp_limits": [0, 300], "damping_limits": [0, 10], "position_limits": null, "orientation_limits": null, "uncouple_pos_ori": true, "control_delta": true, "interpolation": null, "ramp_ratio": 0.2}, "robots": ["Panda"], "camera_depths": false, "camera_heights": 84, "camera_widths": 84, "reward_shaping": false}}
\ No newline at end of file
diff --git a/cfg/robomimic/env_meta/square-img.json b/cfg/robomimic/env_meta/square-img.json
new file mode 100644
index 0000000..ec219db
--- /dev/null
+++ b/cfg/robomimic/env_meta/square-img.json
@@ -0,0 +1 @@
+{"env_name": "NutAssemblySquare", "env_version": "1.4.1", "type": 1, "env_kwargs": {"has_renderer": false, "has_offscreen_renderer": true, "ignore_done": true, "use_object_obs": true, "use_camera_obs": true, "control_freq": 20, "controller_configs": {"type": "OSC_POSE", "input_max": 1, "input_min": -1, "output_max": [0.05, 0.05, 0.05, 0.5, 0.5, 0.5], "output_min": [-0.05, -0.05, -0.05, -0.5, -0.5, -0.5], "kp": 150, "damping": 1, "impedance_mode": "fixed", "kp_limits": [0, 300], "damping_limits": [0, 10], "position_limits": null, "orientation_limits": null, "uncouple_pos_ori": true, "control_delta": true, "interpolation": null, "ramp_ratio": 0.2}, "robots": ["Panda"], "camera_depths": false, "camera_heights": 96, "camera_widths": 96, "reward_shaping": false, "camera_names": ["agentview"], "render_gpu_device_id": 0}}
\ No newline at end of file
diff --git a/cfg/robomimic/env_meta/square.json b/cfg/robomimic/env_meta/square.json
new file mode 100644
index 0000000..8dfb59c
--- /dev/null
+++ b/cfg/robomimic/env_meta/square.json
@@ -0,0 +1 @@
+{"env_name": "NutAssemblySquare", "env_version": "1.4.1", "type": 1, "env_kwargs": {"has_renderer": false, "has_offscreen_renderer": false, "ignore_done": true, "use_object_obs": true, "use_camera_obs": false, "control_freq": 20, "controller_configs": {"type": "OSC_POSE", "input_max": 1, "input_min": -1, "output_max": [0.05, 0.05, 0.05, 0.5, 0.5, 0.5], "output_min": [-0.05, -0.05, -0.05, -0.5, -0.5, -0.5], "kp": 150, "damping": 1, "impedance_mode": "fixed", "kp_limits": [0, 300], "damping_limits": [0, 10], "position_limits": null, "orientation_limits": null, "uncouple_pos_ori": true, "control_delta": true, "interpolation": null, "ramp_ratio": 0.2}, "robots": ["Panda"], "camera_depths": false, "camera_heights": 84, "camera_widths": 84, "reward_shaping": false}}
\ No newline at end of file
diff --git a/cfg/robomimic/env_meta/transport-img.json b/cfg/robomimic/env_meta/transport-img.json
new file mode 100644
index 0000000..2e6b9af
--- /dev/null
+++ b/cfg/robomimic/env_meta/transport-img.json
@@ -0,0 +1 @@
+{"env_name": "TwoArmTransport", "env_version": "1.4.1", "type": 1, "env_kwargs": {"has_renderer": false, "has_offscreen_renderer": true, "ignore_done": true, "use_object_obs": true, "use_camera_obs": true, "control_freq": 20, "controller_configs": {"type": "OSC_POSE", "input_max": 1, "input_min": -1, "output_max": [0.05, 0.05, 0.05, 0.5, 0.5, 0.5], "output_min": [-0.05, -0.05, -0.05, -0.5, -0.5, -0.5], "kp": 150, "damping": 1, "impedance_mode": "fixed", "kp_limits": [0, 300], "damping_limits": [0, 10], "position_limits": null, "orientation_limits": null, "uncouple_pos_ori": true, "control_delta": true, "interpolation": null, "ramp_ratio": 0.2}, "robots": ["Panda", "Panda"], "env_configuration": "single-arm-opposed", "camera_depths": false, "camera_heights": 96, "camera_widths": 96, "reward_shaping": false, "camera_names": ["shouldercamera0", "shouldercamera1"], "render_gpu_device_id": 0}}
\ No newline at end of file
diff --git a/cfg/robomimic/env_meta/transport.json b/cfg/robomimic/env_meta/transport.json
new file mode 100644
index 0000000..9812350
--- /dev/null
+++ b/cfg/robomimic/env_meta/transport.json
@@ -0,0 +1 @@
+{"env_name": "TwoArmTransport", "env_version": "1.4.1", "type": 1, "env_kwargs": {"has_renderer": false, "has_offscreen_renderer": false, "ignore_done": true, "use_object_obs": true, "use_camera_obs": false, "control_freq": 20, "controller_configs": {"type": "OSC_POSE", "input_max": 1, "input_min": -1, "output_max": [0.05, 0.05, 0.05, 0.5, 0.5, 0.5], "output_min": [-0.05, -0.05, -0.05, -0.5, -0.5, -0.5], "kp": 150, "damping": 1, "impedance_mode": "fixed", "kp_limits": [0, 300], "damping_limits": [0, 10], "position_limits": null, "orientation_limits": null, "uncouple_pos_ori": true, "control_delta": true, "interpolation": null, "ramp_ratio": 0.2}, "robots": ["Panda", "Panda"], "env_configuration": "single-arm-opposed", "camera_depths": false, "camera_heights": 84, "camera_widths": 84, "reward_shaping": false}}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml
new file mode 100644
index 0000000..c0916f6
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml
@@ -0,0 +1,106 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
+
+name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # AWR specific
+  scale_reward_factor: 1
+  max_adv_weight: 100
+  beta: 10
+  buffer_size: 3000
+  batch_size: 256
+  replay_ratio: 64
+  critic_update_ratio: 4
+
+model:
+  _target_: model.diffusion.diffusion_awr.AWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_dipo_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_dipo_diffusion_mlp.yaml
new file mode 100644
index 0000000..b5c7867
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_dipo_diffusion_mlp.yaml
@@ -0,0 +1,107 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
+
+name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DIPO specific
+  scale_reward_factor: 1
+  eta: 0.0001
+  action_gradient_steps: 10
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dipo.DIPODiffusion
+  # HP to tune
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml
new file mode 100644
index 0000000..5b94c6f
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml
@@ -0,0 +1,106 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
+
+name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DQL specific
+  scale_reward_factor: 1
+  eta: 1.0
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dql.DQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml
new file mode 100644
index 0000000..4f03622
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml
@@ -0,0 +1,115 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
+
+name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 5
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # IDQL specific
+  scale_reward_factor: 1
+  eval_deterministic: True
+  eval_sample_num: 10 # how many samples to score during eval
+  critic_tau: 0.001  # rate of target q network update
+  use_expectile_exploration: True
+  buffer_size: 3000
+  batch_size: 512
+  replay_ratio: 16
+
+model:
+  _target_: model.diffusion.diffusion_idql.IDQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic_q:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  critic_v:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..988c2ef
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,114 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt  # use 8000 for comparing policy parameterizations
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
new file mode 100644
index 0000000..5d9c9a8
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
@@ -0,0 +1,164 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_img_ta4_td100/2024-07-30_22-23-55/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 9
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+use_ddim: True
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['robot0_eye_in_hand_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 200
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  augment: True
+  grad_accumulate: 15
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 200
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 200
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 500
+  logprob_batch_size: 500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: False
+    spatial_emb: 128
+    time_dim: 32
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.ViTCritic
+    spatial_emb: 128
+    augment: False
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet.yaml
new file mode 100644
index 0000000..addda3d
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet.yaml
@@ -0,0 +1,117 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_unet_ta4_td20/2024-06-29_02-49-45/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 40
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_ppo_exact_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_ppo_exact_diffusion_mlp.yaml
new file mode 100644
index 0000000..2995c60
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_exact_diffusion_mlp.yaml
@@ -0,0 +1,122 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_exact_diffusion_agent.TrainPPOExactDiffusionAgent
+
+name: ${env_name}_ft_exact_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo_exact.PPOExactDiffusion
+  sde:
+    _target_: model.diffusion.sde_lib.VPSDE # VP for DDPM
+    N: ${denoising_steps}
+  rtol: 1e-4
+  atol: 1e-4
+  sde_hutchinson_type: Rademacher
+  sde_step_size: 0.05
+  sde_method: euler
+  sde_num_epsilon: 2
+  sde_min_beta: 1e-10
+  sde_probability_flow: True
+  #
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp.yaml b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..f47e7a8
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,103 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_gaussian_mlp_ta4/2024-06-28_13-31-00/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp_img.yaml b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp_img.yaml
new file mode 100644
index 0000000..0d94f40
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp_img.yaml
@@ -0,0 +1,141 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_img_agent.TrainPPOImgGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_img_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_gaussian_mlp_img_ta4/2024-07-28_21-54-40/checkpoint/state_1000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 9
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['robot0_eye_in_hand_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 200
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  augment: True
+  grad_accumulate: 5
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 200
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 200
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 1500
+  logprob_batch_size: 1000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_VisionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: False
+    spatial_emb: 128
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.ViTCritic
+    spatial_emb: 128
+    augment: False
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_ppo_gaussian_transformer.yaml b/cfg/robomimic/finetune/can/ft_ppo_gaussian_transformer.yaml
new file mode 100644
index 0000000..948e20f
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_transformer.yaml
@@ -0,0 +1,104 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_gaussian_transformer_ta4/2024-06-28_13-42-20/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.transformer.Gaussian_Transformer
+    transformer_embed_dim: 96
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_ppo_gmm_mlp.yaml b/cfg/robomimic/finetune/can/ft_ppo_gmm_mlp.yaml
new file mode 100644
index 0000000..fffe5bb
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_gmm_mlp.yaml
@@ -0,0 +1,104 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_gmm_mlp_ta4/2024-06-28_13-32-19/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+num_modes: 5
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gmm_ppo.PPO_GMM
+  clip_ploss_coef: 0.01
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_ppo_gmm_transformer.yaml b/cfg/robomimic/finetune/can/ft_ppo_gmm_transformer.yaml
new file mode 100644
index 0000000..5bc098b
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_gmm_transformer.yaml
@@ -0,0 +1,105 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gmm_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_gmm_transformer_ta4/2024-06-28_13-43-21/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+num_modes: 5
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gmm_ppo.PPO_GMM
+  clip_ploss_coef: 0.01
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.transformer.GMM_Transformer
+    transformer_embed_dim: 96
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml
new file mode 100644
index 0000000..a590263
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml
@@ -0,0 +1,107 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
+
+name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 5
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # QSM specific
+  scale_reward_factor: 1
+  q_grad_coeff: 50
+  critic_tau: 0.005  # rate of target q network update
+  buffer_size: 3000
+  batch_size: 256
+  replay_ratio: 32
+
+model:
+  _target_: model.diffusion.diffusion_qsm.QSMDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml
new file mode 100644
index 0000000..3ff9c2d
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml
@@ -0,0 +1,91 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
+
+name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  lr: 1e-5
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # RWR specific
+  max_reward_weight: 100
+  beta: 10
+  batch_size: 1000
+  update_epochs: 16
+
+model:
+  _target_: model.diffusion.diffusion_rwr.RWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml
new file mode 100644
index 0000000..8a797e3
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml
@@ -0,0 +1,106 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
+
+name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # AWR specific
+  scale_reward_factor: 1
+  max_adv_weight: 100
+  beta: 10
+  buffer_size: 3000
+  batch_size: 256
+  replay_ratio: 64
+  critic_update_ratio: 4
+
+model:
+  _target_: model.diffusion.diffusion_awr.AWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_dipo_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_dipo_diffusion_mlp.yaml
new file mode 100644
index 0000000..f3feb20
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_dipo_diffusion_mlp.yaml
@@ -0,0 +1,107 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
+
+name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DIPO specific
+  scale_reward_factor: 1
+  eta: 0.0001
+  action_gradient_steps: 10
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dipo.DIPODiffusion
+  # HP to tune
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml
new file mode 100644
index 0000000..a9b0282
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml
@@ -0,0 +1,106 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
+
+name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DQL specific
+  scale_reward_factor: 1
+  eta: 1.0
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dql.DQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml
new file mode 100644
index 0000000..563f9eb
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml
@@ -0,0 +1,115 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
+
+name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 5
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # IDQL specific
+  scale_reward_factor: 1
+  eval_deterministic: True
+  eval_sample_num: 10 # how many samples to score during eval
+  critic_tau: 0.001  # rate of target q network update
+  use_expectile_exploration: True
+  buffer_size: 3000
+  batch_size: 512
+  replay_ratio: 16
+
+model:
+  _target_: model.diffusion.diffusion_idql.IDQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic_q:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  critic_v:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..ffaf8b5
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,114 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt # use 8000 for comparing policy parameterizations
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
new file mode 100644
index 0000000..82a2449
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
@@ -0,0 +1,164 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_img_ta4_td100/2024-07-30_22-24-35/checkpoint/state_2500.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 9
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+use_ddim: True
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['robot0_eye_in_hand_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 200
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  augment: True
+  grad_accumulate: 15
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 200
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 200
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 500
+  logprob_batch_size: 500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: False
+    spatial_emb: 128
+    time_dim: 32
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.ViTCritic
+    spatial_emb: 128
+    augment: False
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet.yaml
new file mode 100644
index 0000000..3bc6975
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet.yaml
@@ -0,0 +1,117 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_unet_ta4_td20/2024-06-29_02-49-45/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 40
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp.yaml b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..9f6be1b
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,103 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_gaussian_mlp_ta4/2024-06-28_14-48-24/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp_img.yaml b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp_img.yaml
new file mode 100644
index 0000000..7e055e8
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp_img.yaml
@@ -0,0 +1,141 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_img_agent.TrainPPOImgGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_img_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_gaussian_mlp_img_ta4/2024-07-28_23-00-48/checkpoint/state_500.pt  # trained very quickly
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 9
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['robot0_eye_in_hand_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 200
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  augment: True
+  grad_accumulate: 5
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 200
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 200
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 1500
+  logprob_batch_size: 1000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_VisionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: False
+    spatial_emb: 128
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.ViTCritic
+    spatial_emb: 128
+    augment: False
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_transformer.yaml b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_transformer.yaml
new file mode 100644
index 0000000..c543cb7
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_transformer.yaml
@@ -0,0 +1,104 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_gaussian_transformer_ta4/2024-06-28_14-49-23/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.transformer.Gaussian_Transformer
+    transformer_embed_dim: 96
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_gmm_mlp.yaml b/cfg/robomimic/finetune/lift/ft_ppo_gmm_mlp.yaml
new file mode 100644
index 0000000..4bd2406
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gmm_mlp.yaml
@@ -0,0 +1,104 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_gmm_mlp_ta4/2024-06-28_15-30-32/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+num_modes: 5
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gmm_ppo.PPO_GMM
+  clip_ploss_coef: 0.01
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_gmm_transformer.yaml b/cfg/robomimic/finetune/lift/ft_ppo_gmm_transformer.yaml
new file mode 100644
index 0000000..d4469f3
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gmm_transformer.yaml
@@ -0,0 +1,105 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gmm_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_gmm_transformer_ta4/2024-06-28_14-51-23/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+num_modes: 5
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 7500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gmm_ppo.PPO_GMM
+  clip_ploss_coef: 0.01
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.transformer.GMM_Transformer
+    transformer_embed_dim: 96
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml
new file mode 100644
index 0000000..ae5b684
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml
@@ -0,0 +1,107 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
+
+name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 5
+  n_steps: 300
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # QSM specific
+  scale_reward_factor: 1
+  q_grad_coeff: 50
+  critic_tau: 0.005  # rate of target q network update
+  buffer_size: 3000
+  batch_size: 256
+  replay_ratio: 32
+
+model:
+  _target_: model.diffusion.diffusion_qsm.QSMDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml
new file mode 100644
index 0000000..e870256
--- /dev/null
+++ b/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml
@@ -0,0 +1,91 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
+
+name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  lr: 1e-5
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # RWR specific
+  max_reward_weight: 100
+  beta: 10
+  batch_size: 1000
+  update_epochs: 16
+
+model:
+  _target_: model.diffusion.diffusion_rwr.RWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml
new file mode 100644
index 0000000..afcf62b
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml
@@ -0,0 +1,107 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
+
+name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_diffusion_mlp_ta4_td20/2024-07-10_01-46-16/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # AWR specific
+  scale_reward_factor: 1
+  max_adv_weight: 100
+  beta: 10
+  buffer_size: 3000
+  batch_size: 256
+  replay_ratio: 64
+  critic_update_ratio: 4
+
+model:
+  _target_: model.diffusion.diffusion_awr.AWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.10
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_dipo_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_dipo_diffusion_mlp.yaml
new file mode 100644
index 0000000..5f2baa3
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_dipo_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
+
+name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_diffusion_mlp_ta4_td20/2024-07-10_01-46-16/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DIPO specific
+  scale_reward_factor: 1
+  eta: 0.0001
+  action_gradient_steps: 10
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dipo.DIPODiffusion
+  # HP to tune
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml
new file mode 100644
index 0000000..d834dc9
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml
@@ -0,0 +1,107 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
+
+name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_diffusion_mlp_ta4_td20/2024-07-10_01-46-16/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DQL specific
+  scale_reward_factor: 1
+  eta: 1.0
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dql.DQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml
new file mode 100644
index 0000000..9245579
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml
@@ -0,0 +1,116 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
+
+name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_diffusion_mlp_ta4_td20/2024-07-10_01-46-16/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 5
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # IDQL specific
+  scale_reward_factor: 1
+  eval_deterministic: True
+  eval_sample_num: 10 # how many samples to score during eval
+  critic_tau: 0.001  # rate of target q network update
+  use_expectile_exploration: True
+  buffer_size: 3000
+  batch_size: 512
+  replay_ratio: 16
+
+model:
+  _target_: model.diffusion.diffusion_idql.IDQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic_q:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  critic_v:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..ee41faf
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,115 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_diffusion_mlp_ta4_td20/2024-07-10_01-46-16/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 500
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
new file mode 100644
index 0000000..ffc6320
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
@@ -0,0 +1,164 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_diffusion_mlp_img_ta4_td100/2024-07-30_22-27-34/checkpoint/state_4000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 9
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+use_ddim: True
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['agentview_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 500
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  augment: True
+  grad_accumulate: 20
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 500
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 500
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 500
+  logprob_batch_size: 1000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: False
+    spatial_emb: 128
+    time_dim: 32
+    mlp_dims: [768, 768, 768]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.ViTCritic
+    spatial_emb: 128
+    augment: False
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet.yaml
new file mode 100644
index 0000000..5020ef2
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet.yaml
@@ -0,0 +1,117 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_diffusion_unet_ta4_td20/2024-06-29_02-48-45/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp.yaml b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..9f7ec25
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,103 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_gaussian_mlp_ta4/2024-06-28_15-02-32/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp_img.yaml b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp_img.yaml
new file mode 100644
index 0000000..5a96b16
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp_img.yaml
@@ -0,0 +1,141 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_img_agent.TrainPPOImgGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_img_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_gaussian_mlp_img_ta4/2024-07-30_18-44-32/checkpoint/state_4000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 9
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['agentview_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 500
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  augment: True
+  grad_accumulate: 10
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 500
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 500
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 1000
+  logprob_batch_size: 1000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_VisionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: False
+    spatial_emb: 128
+    mlp_dims: [768, 768, 768]
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.ViTCritic
+    spatial_emb: 128
+    augment: False
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_ppo_gaussian_transformer.yaml b/cfg/robomimic/finetune/square/ft_ppo_gaussian_transformer.yaml
new file mode 100644
index 0000000..9bf6f10
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_transformer.yaml
@@ -0,0 +1,104 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_gaussian_transformer_ta4/2024-06-28_15-02-39/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.transformer.Gaussian_Transformer
+    transformer_embed_dim: 160
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_ppo_gmm_mlp.yaml b/cfg/robomimic/finetune/square/ft_ppo_gmm_mlp.yaml
new file mode 100644
index 0000000..62ca68a
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_gmm_mlp.yaml
@@ -0,0 +1,104 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_gmm_mlp_ta4/2024-06-28_15-03-08/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+num_modes: 5
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gmm_ppo.PPO_GMM
+  clip_ploss_coef: 0.01
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_ppo_gmm_transformer.yaml b/cfg/robomimic/finetune/square/ft_ppo_gmm_transformer.yaml
new file mode 100644
index 0000000..e185604
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_gmm_transformer.yaml
@@ -0,0 +1,105 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gmm_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_gmm_transformer_ta4/2024-06-28_15-03-15/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+num_modes: 5
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gmm_ppo.PPO_GMM
+  clip_ploss_coef: 0.01
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.transformer.GMM_Transformer
+    transformer_embed_dim: 160
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml
new file mode 100644
index 0000000..b1eb458
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
+
+name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_diffusion_mlp_ta4_td20/2024-07-10_01-46-16/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 5
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # QSM specific
+  scale_reward_factor: 1
+  q_grad_coeff: 50
+  critic_tau: 0.005  # rate of target q network update
+  buffer_size: 3000
+  batch_size: 256
+  replay_ratio: 32
+
+model:
+  _target_: model.diffusion.diffusion_qsm.QSMDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml
new file mode 100644
index 0000000..1c731ef
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml
@@ -0,0 +1,92 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
+
+name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_diffusion_mlp_ta4_td20/2024-07-10_01-46-16/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 300
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  lr: 1e-5
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-5
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # RWR specific
+  max_reward_weight: 100
+  beta: 10
+  batch_size: 1000
+  update_epochs: 16
+
+model:
+  _target_: model.diffusion.diffusion_rwr.RWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.1
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml
new file mode 100644
index 0000000..d3995ef
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
+
+name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_diffusion_mlp_ta8_td20/2024-07-08_11-18-59/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # AWR specific
+  scale_reward_factor: 1
+  max_adv_weight: 100
+  beta: 10
+  buffer_size: 3000
+  batch_size: 256
+  replay_ratio: 64
+  critic_update_ratio: 4
+
+model:
+  _target_: model.diffusion.diffusion_awr.AWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.08
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml
new file mode 100644
index 0000000..a7690fd
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml
@@ -0,0 +1,110 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
+
+name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_diffusion_mlp_ta8_td20/2024-07-08_11-18-59/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DIPO specific
+  scale_reward_factor: 1
+  eta: 0.0001
+  action_gradient_steps: 10
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dipo.DIPODiffusion
+  # HP to tune
+  min_sampling_denoising_std: 0.08
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml
new file mode 100644
index 0000000..40a3588
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
+
+name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_diffusion_mlp_ta8_td20/2024-07-08_11-18-59/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # DQL specific
+  scale_reward_factor: 1
+  eta: 1.0
+  buffer_size: 400000
+  batch_size: 5000
+  replay_ratio: 64
+
+model:
+  _target_: model.diffusion.diffusion_dql.DQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.08
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml
new file mode 100644
index 0000000..36eabe5
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml
@@ -0,0 +1,118 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
+
+name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_diffusion_mlp_ta8_td20/2024-07-08_11-18-59/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # IDQL specific
+  scale_reward_factor: 1
+  eval_deterministic: True
+  eval_sample_num: 10 # how many samples to score during eval
+  critic_tau: 0.001  # rate of target q network update
+  use_expectile_exploration: True
+  buffer_size: 3000
+  batch_size: 512
+  replay_ratio: 16
+
+model:
+  _target_: model.diffusion.diffusion_idql.IDQLDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.08
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic_q:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  critic_v:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..f962739
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,117 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_diffusion_mlp_ta8_td20/2024-07-08_11-18-59/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 8
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.08
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml
new file mode 100644
index 0000000..c5b5d64
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml
@@ -0,0 +1,170 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_diffusion_mlp_img_ta8_td100/2024-07-30_22-30-06/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 18
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+use_ddim: True
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos',
+                     "robot1_eef_pos",
+                     "robot1_eef_quat",
+                     "robot1_gripper_qpos"]
+      image_keys: ['shouldercamera0_image', 
+                   'shouldercamera1_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [6, 96, 96]
+    state:
+      shape: [18]
+  action: 
+    shape: [14]
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 500
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  augment: True
+  grad_accumulate: 20
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 500
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 500
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 500
+  logprob_batch_size: 1000
+  update_epochs: 8
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.9
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.08
+  min_logprob_denoising_std: 0.1
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: False
+    num_img: 2
+    spatial_emb: 128
+    time_dim: 32
+    mlp_dims: [768, 768, 768]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.ViTCritic
+    spatial_emb: 128
+    num_img: 2
+    augment: False
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_unet.yaml b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_unet.yaml
new file mode 100644
index 0000000..b9c7f15
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_unet.yaml
@@ -0,0 +1,120 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_diffusion_unet_ta16_td20/2024-07-04_02-20-53/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 8
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # HP to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.001
+  clip_ploss_coef_base: 0.0001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.08
+  min_logprob_denoising_std: 0.1
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  ft_denoising_steps: ${ft_denoising_steps}
+  transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  cond_steps: ${cond_steps}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp.yaml b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp.yaml
new file mode 100644
index 0000000..85b7db3
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp.yaml
@@ -0,0 +1,106 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_gaussian_mlp_ta8/2024-07-10_01-50-52/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 8
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    fixed_std: 0.08
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp_img.yaml b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp_img.yaml
new file mode 100644
index 0000000..24a5696
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_mlp_img.yaml
@@ -0,0 +1,147 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_img_agent.TrainPPOImgGaussianAgent
+
+name: ${env_name}_ft_gaussian_mlp_img_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_gaussian_mlp_img_ta8/2024-07-30_21-39-34/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 18
+action_dim: 14
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos',
+                     "robot1_eef_pos",
+                     "robot1_eef_quat",
+                     "robot1_gripper_qpos"]
+      image_keys: ['shouldercamera0_image', 
+                   'shouldercamera1_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [6, 96, 96]
+    state:
+      shape: [18]
+  action: 
+    shape: [14]
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 500
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  augment: True
+  grad_accumulate: 10
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 500
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 500
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 1000
+  logprob_batch_size: 1000
+  update_epochs: 8
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gaussian.Gaussian_VisionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: False
+    num_img: 2
+    spatial_emb: 128
+    mlp_dims: [768, 768, 768]
+    residual_style: True
+    fixed_std: 0.08
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.ViTCritic
+    spatial_emb: 128
+    num_img: 2
+    augment: False
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_gaussian_transformer.yaml b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_transformer.yaml
new file mode 100644
index 0000000..bbab06d
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_ppo_gaussian_transformer.yaml
@@ -0,0 +1,107 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gaussian_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_gaussian_transformer_ta8/2024-06-28_15-18-16/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 8
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gaussian_ppo.PPO_Gaussian
+  clip_ploss_coef: 0.01
+  randn_clip_value: 3
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.transformer.Gaussian_Transformer
+    transformer_embed_dim: 160
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.08
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_gmm_mlp.yaml b/cfg/robomimic/finetune/transport/ft_ppo_gmm_mlp.yaml
new file mode 100644
index 0000000..c961d88
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_ppo_gmm_mlp.yaml
@@ -0,0 +1,107 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_gmm_mlp_ta8/2024-07-10_01-51-21/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+num_modes: 5
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 8
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gmm_ppo.PPO_GMM
+  clip_ploss_coef: 0.01
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    fixed_std: 0.08
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_gmm_transformer.yaml b/cfg/robomimic/finetune/transport/ft_ppo_gmm_transformer.yaml
new file mode 100644
index 0000000..32f0d61
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_ppo_gmm_transformer.yaml
@@ -0,0 +1,108 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
+
+name: ${env_name}_ft_gmm_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_gmm_transformer_ta8/2024-06-28_15-18-43/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+num_modes: 5
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 10000
+  update_epochs: 8
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.rl.gmm_ppo.PPO_GMM
+  clip_ploss_coef: 0.01
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.common.transformer.GMM_Transformer
+    transformer_embed_dim: 160
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.08
+    learn_fixed_std: True
+    std_min: 0.01
+    std_max: 0.2
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObs
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml
new file mode 100644
index 0000000..235a747
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml
@@ -0,0 +1,110 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
+
+name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_diffusion_mlp_ta8_td20/2024-07-08_11-18-59/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 5
+  n_steps: 400
+  gamma: 0.999
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # QSM specific
+  scale_reward_factor: 1
+  q_grad_coeff: 50
+  critic_tau: 0.005  # rate of target q network update
+  buffer_size: 3000
+  batch_size: 256
+  replay_ratio: 32
+
+model:
+  _target_: model.diffusion.diffusion_qsm.QSMDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.08
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  critic:
+    _target_: model.common.critic.CriticObsAct
+    action_dim: ${action_dim}
+    action_steps: ${act_steps}
+    obs_dim: ${obs_dim}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml
new file mode 100644
index 0000000..a70659d
--- /dev/null
+++ b/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml
@@ -0,0 +1,94 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
+
+name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/transport/transport_pre_diffusion_mlp_ta8_td20/2024-07-08_11-18-59/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: false
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order of preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 1000
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  lr: 1e-5
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 1000
+    warmup_steps: 10
+    min_lr: 1e-6
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # RWR specific
+  max_reward_weight: 100
+  beta: 10
+  batch_size: 1000
+  update_epochs: 16
+
+model:
+  _target_: model.diffusion.diffusion_rwr.RWRDiffusion
+  # Sampling HPs
+  min_sampling_denoising_std: 0.08
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..7789a8f
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp.yaml
@@ -0,0 +1,68 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${obs_dim}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp_img.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_img.yaml
new file mode 100644
index 0000000..47faccd
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_img.yaml
@@ -0,0 +1,91 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-img/train.pkl
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 9  # proprioception only
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 500
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: True
+    spatial_emb: 128
+    time_dim: 32
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  use_img: True
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  max_n_episodes: 100
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_unet.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_unet.yaml
new file mode 100644
index 0000000..81e360c
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_diffusion_unet.yaml
@@ -0,0 +1,71 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 40
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_gaussian_mlp.yaml b/cfg/robomimic/pretrain/can/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..269d08a
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_gaussian_mlp.yaml
@@ -0,0 +1,60 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_gaussian_mlp_img.yaml b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_img.yaml
new file mode 100644
index 0000000..a57e911
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_img.yaml
@@ -0,0 +1,83 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_img_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-img/train.pkl
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 9  # proprioception only
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+train:
+  n_epochs: 1000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 500
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_VisionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: True
+    spatial_emb: 128
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  use_img: True
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  max_n_episodes: 100
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_gaussian_transformer.yaml b/cfg/robomimic/pretrain/can/pre_gaussian_transformer.yaml
new file mode 100644
index 0000000..239d9d5
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_gaussian_transformer.yaml
@@ -0,0 +1,62 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.transformer.Gaussian_Transformer
+    transformer_embed_dim: 96
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: False
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_gmm_mlp.yaml b/cfg/robomimic/pretrain/can/pre_gmm_mlp.yaml
new file mode 100644
index 0000000..d9fd909
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_gmm_mlp.yaml
@@ -0,0 +1,62 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+num_modes: 5
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gmm.GMMModel
+  network:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_gmm_transformer.yaml b/cfg/robomimic/pretrain/can/pre_gmm_transformer.yaml
new file mode 100644
index 0000000..6cc1041
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_gmm_transformer.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gmm_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+num_modes: 5
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gmm.GMMModel
+  network:
+    _target_: model.common.transformer.GMM_Transformer
+    transformer_embed_dim: 96
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: False
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/lift/pre_diffusion_mlp.yaml b/cfg/robomimic/pretrain/lift/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..5dc7b50
--- /dev/null
+++ b/cfg/robomimic/pretrain/lift/pre_diffusion_mlp.yaml
@@ -0,0 +1,68 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/lift/pre_diffusion_mlp_img.yaml b/cfg/robomimic/pretrain/lift/pre_diffusion_mlp_img.yaml
new file mode 100644
index 0000000..c10c47b
--- /dev/null
+++ b/cfg/robomimic/pretrain/lift/pre_diffusion_mlp_img.yaml
@@ -0,0 +1,91 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-img/train.pkl
+
+seed: 42
+device: cuda:0
+env: lift
+obs_dim: 9  # proprioception only
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+train:
+  n_epochs: 2500
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 8000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 500
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: True
+    spatial_emb: 128
+    time_dim: 32
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  use_img: True
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  max_n_episodes: 100
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/lift/pre_diffusion_unet.yaml b/cfg/robomimic/pretrain/lift/pre_diffusion_unet.yaml
new file mode 100644
index 0000000..09ca52b
--- /dev/null
+++ b/cfg/robomimic/pretrain/lift/pre_diffusion_unet.yaml
@@ -0,0 +1,71 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 40
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    transition_dim: ${transition_dim}
+    cond_dim: ${obs_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp.yaml b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..2d53bf4
--- /dev/null
+++ b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp.yaml
@@ -0,0 +1,60 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_img.yaml b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_img.yaml
new file mode 100644
index 0000000..796d2ea
--- /dev/null
+++ b/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_img.yaml
@@ -0,0 +1,83 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_img_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-img/train.pkl
+
+seed: 42
+device: cuda:0
+env: lift
+obs_dim: 9  # proprioception only
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+train:
+  n_epochs: 500
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 3000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 500
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_VisionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: True
+    spatial_emb: 128
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  use_img: True
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  max_n_episodes: 100
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_transformer.yaml b/cfg/robomimic/pretrain/lift/pre_gaussian_transformer.yaml
new file mode 100644
index 0000000..f6a884b
--- /dev/null
+++ b/cfg/robomimic/pretrain/lift/pre_gaussian_transformer.yaml
@@ -0,0 +1,62 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.transformer.Gaussian_Transformer
+    transformer_embed_dim: 96
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: False
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/lift/pre_gmm_mlp.yaml b/cfg/robomimic/pretrain/lift/pre_gmm_mlp.yaml
new file mode 100644
index 0000000..a4dbfaf
--- /dev/null
+++ b/cfg/robomimic/pretrain/lift/pre_gmm_mlp.yaml
@@ -0,0 +1,62 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+num_modes: 5
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gmm.GMMModel
+  network:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    fixed_std: 0.1
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/lift/pre_gmm_transformer.yaml b/cfg/robomimic/pretrain/lift/pre_gmm_transformer.yaml
new file mode 100644
index 0000000..ef55a5f
--- /dev/null
+++ b/cfg/robomimic/pretrain/lift/pre_gmm_transformer.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gmm_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: lift
+obs_dim: 19
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+num_modes: 5
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gmm.GMMModel
+  network:
+    _target_: model.common.transformer.GMM_Transformer
+    transformer_embed_dim: 96
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: False
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..6d39d54
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp.yaml
@@ -0,0 +1,69 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp_img.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_img.yaml
new file mode 100644
index 0000000..76bf985
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_img.yaml
@@ -0,0 +1,91 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-img/train.pkl
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 9  # proprioception only
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+train:
+  n_epochs: 4000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 8000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 500
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: True
+    spatial_emb: 128
+    time_dim: 32
+    mlp_dims: [768, 768, 768]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  use_img: True
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  max_n_episodes: 100
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_unet.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_unet.yaml
new file mode 100644
index 0000000..2ca4271
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_diffusion_unet.yaml
@@ -0,0 +1,71 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_gaussian_mlp.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..361271b
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_gaussian_mlp.yaml
@@ -0,0 +1,60 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_gaussian_mlp_img.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_img.yaml
new file mode 100644
index 0000000..552a52f
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_img.yaml
@@ -0,0 +1,83 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_img_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-img/train.pkl
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 9  # proprioception only
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+train:
+  n_epochs: 4000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 500
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_VisionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: True
+    spatial_emb: 128
+    mlp_dims: [768, 768, 768]
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  use_img: True
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  max_n_episodes: 100
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_gaussian_transformer.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_transformer.yaml
new file mode 100644
index 0000000..82c6a09
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_gaussian_transformer.yaml
@@ -0,0 +1,62 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.transformer.Gaussian_Transformer
+    transformer_embed_dim: 160
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: False
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_gmm_mlp.yaml b/cfg/robomimic/pretrain/square/pre_gmm_mlp.yaml
new file mode 100644
index 0000000..b80301c
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_gmm_mlp.yaml
@@ -0,0 +1,62 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+num_modes: 5
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gmm.GMMModel
+  network:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    fixed_std: 0.1
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_gmm_transformer.yaml b/cfg/robomimic/pretrain/square/pre_gmm_transformer.yaml
new file mode 100644
index 0000000..b3059dc
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_gmm_transformer.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gmm_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 23
+action_dim: 7
+transition_dim: ${action_dim}
+horizon_steps: 4
+cond_steps: 1
+num_modes: 5
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gmm.GMMModel
+  network:
+    _target_: model.common.transformer.GMM_Transformer
+    transformer_embed_dim: 160
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: False
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/transport/pre_diffusion_mlp.yaml b/cfg/robomimic/pretrain/transport/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..020c1af
--- /dev/null
+++ b/cfg/robomimic/pretrain/transport/pre_diffusion_mlp.yaml
@@ -0,0 +1,68 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/transport/pre_diffusion_mlp_img.yaml b/cfg/robomimic/pretrain/transport/pre_diffusion_mlp_img.yaml
new file mode 100644
index 0000000..08d2657
--- /dev/null
+++ b/cfg/robomimic/pretrain/transport/pre_diffusion_mlp_img.yaml
@@ -0,0 +1,92 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-img/train.pkl
+
+seed: 42
+device: cuda:0
+env: transport
+obs_dim: 18  # proprioception only
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 100
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 8000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 500
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: True
+    num_img: 2
+    spatial_emb: 128
+    time_dim: 32
+    mlp_dims: [768, 768, 768]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  use_img: True
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  max_n_episodes: 100
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/transport/pre_diffusion_unet.yaml b/cfg/robomimic/pretrain/transport/pre_diffusion_unet.yaml
new file mode 100644
index 0000000..7b31552
--- /dev/null
+++ b/cfg/robomimic/pretrain/transport/pre_diffusion_unet.yaml
@@ -0,0 +1,71 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+denoising_steps: 20
+horizon_steps: 16
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 8000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 10000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  transition_dim: ${transition_dim}
+  denoising_steps: ${denoising_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp.yaml b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..e3f8d86
--- /dev/null
+++ b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp.yaml
@@ -0,0 +1,60 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_img.yaml b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_img.yaml
new file mode 100644
index 0000000..9925dd9
--- /dev/null
+++ b/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_img.yaml
@@ -0,0 +1,84 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_img_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-img/train.pkl
+
+seed: 42
+device: cuda:0
+env: transport
+obs_dim: 18  # proprioception only
+action_dim: 14
+transition_dim: ${action_dim}
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 500
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.mlp_gaussian.Gaussian_VisionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: True
+    num_img: 2
+    spatial_emb: 128
+    mlp_dims: [768, 768, 768]
+    residual_style: True
+    fixed_std: 0.1
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  use_img: True
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  max_n_episodes: 100
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_transformer.yaml b/cfg/robomimic/pretrain/transport/pre_gaussian_transformer.yaml
new file mode 100644
index 0000000..ac79f9d
--- /dev/null
+++ b/cfg/robomimic/pretrain/transport/pre_gaussian_transformer.yaml
@@ -0,0 +1,62 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+horizon_steps: 8
+cond_steps: 1
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gaussian.GaussianModel
+  network:
+    _target_: model.common.transformer.Gaussian_Transformer
+    transformer_embed_dim: 160
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: False
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/transport/pre_gmm_mlp.yaml b/cfg/robomimic/pretrain/transport/pre_gmm_mlp.yaml
new file mode 100644
index 0000000..7e48026
--- /dev/null
+++ b/cfg/robomimic/pretrain/transport/pre_gmm_mlp.yaml
@@ -0,0 +1,62 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gmm_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+horizon_steps: 8
+cond_steps: 1
+num_modes: 5
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gmm.GMMModel
+  network:
+    _target_: model.common.mlp_gmm.GMM_MLP
+    mlp_dims: [1024, 1024, 1024]
+    residual_style: True
+    fixed_std: 0.1
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/transport/pre_gmm_transformer.yaml b/cfg/robomimic/pretrain/transport/pre_gmm_transformer.yaml
new file mode 100644
index 0000000..87c0f18
--- /dev/null
+++ b/cfg/robomimic/pretrain/transport/pre_gmm_transformer.yaml
@@ -0,0 +1,64 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gmm_transformer_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.pkl
+
+seed: 42
+device: cuda:0
+env: transport
+obs_dim: 59
+action_dim: 14
+transition_dim: ${action_dim}
+horizon_steps: 8
+cond_steps: 1
+num_modes: 5
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env}-pretrain
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_epochs: 5000
+  batch_size: 256
+  learning_rate: 1e-4
+  weight_decay: 1e-6
+  lr_scheduler:
+    first_cycle_steps: 5000
+    warmup_steps: 100
+    min_lr: 1e-5
+  epoch_start_ema: 20
+  update_ema_freq: 10
+  save_model_freq: 1000
+
+model:
+  _target_: model.common.gmm.GMMModel
+  network:
+    _target_: model.common.transformer.GMM_Transformer
+    transformer_embed_dim: 160
+    transformer_num_heads: 4
+    transformer_num_layers: 4
+    fixed_std: 0.1
+    learn_fixed_std: False
+    num_modes: ${num_modes}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    transition_dim: ${transition_dim}
+  horizon_steps: ${horizon_steps}
+  device: ${device}
+
+ema:
+  decay: 0.995
+
+train_dataset:
+  _target_: agent.dataset.sequence.StitchedActionSequenceDataset
+  dataset_path: ${train_dataset_path}
+  horizon_steps: ${horizon_steps}
+  cond_steps: ${cond_steps}
+  device: ${device}
\ No newline at end of file
diff --git a/env/gym_utils/__init__.py b/env/gym_utils/__init__.py
new file mode 100644
index 0000000..632b6a4
--- /dev/null
+++ b/env/gym_utils/__init__.py
@@ -0,0 +1,229 @@
+import os
+import json
+
+try:
+    from collections.abc import Iterable
+except ImportError:
+    Iterable = (tuple, list)
+
+
+def make_async(
+    id,
+    num_envs=1,
+    asynchronous=True,
+    wrappers=None,
+    render=False,
+    obs_dim=23,
+    action_dim=7,
+    env_type=None,
+    max_episode_steps=None,
+    # below for furniture only
+    gpu_id=0,
+    headless=True,
+    record=False,
+    normalization_path=None,
+    furniture="one_leg",
+    randomness="low",
+    act_steps=8,
+    sparse_reward=False,
+    # below for robomimic only
+    robomimic_env_cfg_path=None,
+    use_image_obs=False,
+    render_offscreen=False,
+    reward_shaping=False,
+    shape_meta=None,
+    **kwargs,
+):
+    """Create a vectorized environment from multiple copies of an environment,
+    from its id.
+
+    Parameters
+    ----------
+    id : str
+        The environment ID. This must be a valid ID from the registry.
+
+    num_envs : int
+        Number of copies of the environment.
+
+    asynchronous : bool
+        If `True`, wraps the environments in an :class:`AsyncVectorEnv` (which uses
+        `multiprocessing`_ to run the environments in parallel). If ``False``,
+        wraps the environments in a :class:`SyncVectorEnv`.
+
+    wrappers : dictionary, optional
+        Each key is a wrapper class, and each value is a dictionary of arguments
+
+    Returns
+    -------
+    :class:`gym.vector.VectorEnv`
+        The vectorized environment.
+
+    Example
+    -------
+    >>> env = gym.vector.make('CartPole-v1', num_envs=3)
+    >>> env.reset()
+    array([[-0.04456399,  0.04653909,  0.01326909, -0.02099827],
+           [ 0.03073904,  0.00145001, -0.03088818, -0.03131252],
+           [ 0.03468829,  0.01500225,  0.01230312,  0.01825218]],
+          dtype=float32)
+    """
+
+    if env_type == "furniture":
+        from furniture_bench.envs.observation import DEFAULT_STATE_OBS
+        from furniture_bench.envs.furniture_rl_sim_env import FurnitureRLSimEnv
+        from env.gym_utils.wrapper.furniture import FurnitureRLSimEnvMultiStepWrapper
+
+        env = FurnitureRLSimEnv(
+            act_rot_repr="rot_6d",
+            action_type="pos",
+            april_tags=False,
+            concat_robot_state=True,
+            ctrl_mode="diffik",
+            obs_keys=DEFAULT_STATE_OBS,
+            furniture=furniture,
+            gpu_id=gpu_id,
+            headless=headless,
+            num_envs=num_envs,
+            observation_space="state",
+            randomness=randomness,
+            max_env_steps=max_episode_steps,
+            record=record,
+            pos_scalar=1,
+            rot_scalar=1,
+            stiffness=1_000,
+            damping=200,
+        )
+
+        env = FurnitureRLSimEnvMultiStepWrapper(
+            env,
+            n_obs_steps=1,
+            n_action_steps=act_steps,
+            reward_agg_method="sum",
+            prev_action=False,
+            reset_within_step=False,
+            pass_full_observations=False,
+            normalization_path=normalization_path,
+            sparse_reward=sparse_reward,
+        )
+
+        return env
+
+    # avoid import error due incompatible gym versions
+    from gym import spaces
+    from env.gym_utils.async_vector_env import AsyncVectorEnv
+    from env.gym_utils.sync_vector_env import SyncVectorEnv
+    from env.gym_utils.wrapper import wrapper_dict
+
+    __all__ = [
+        "AsyncVectorEnv",
+        "SyncVectorEnv",
+        "VectorEnv",
+        "VectorEnvWrapper",
+        "make",
+    ]
+
+    # import the envs
+    if robomimic_env_cfg_path is not None:
+        import robomimic.utils.env_utils as EnvUtils
+        import robomimic.utils.obs_utils as ObsUtils
+    elif "avoiding" in id:
+        import gym_avoiding
+    else:
+        import d4rl.gym_mujoco
+    from gym.envs import make as make_
+
+    def _make_env():
+        if robomimic_env_cfg_path is not None:
+            obs_modality_dict = {
+                "low_dim": (
+                    wrappers.robomimic_image.low_dim_keys
+                    if "robomimic_image" in wrappers
+                    else wrappers.robomimic_lowdim.low_dim_keys
+                ),
+                "rgb": (
+                    wrappers.robomimic_image.image_keys
+                    if "robomimic_image" in wrappers
+                    else None
+                ),
+            }
+            if obs_modality_dict["rgb"] is None:
+                obs_modality_dict.pop("rgb")
+            ObsUtils.initialize_obs_modality_mapping_from_dict(obs_modality_dict)
+            if render_offscreen or use_image_obs:
+                os.environ["MUJOCO_GL"] = "egl"
+            with open(robomimic_env_cfg_path, "r") as f:
+                env_meta = json.load(f)
+            env_meta["reward_shaping"] = reward_shaping
+            env = EnvUtils.create_env_from_metadata(
+                env_meta=env_meta,
+                render=render,
+                # only way to not show collision geometry is to enable render_offscreen, which uses a lot of RAM.
+                render_offscreen=render_offscreen,
+                use_image_obs=use_image_obs,
+                # render_gpu_device_id=0,
+            )
+            # Robosuite's hard reset causes excessive memory consumption.
+            # Disabled to run more envs.
+            # https://github.com/ARISE-Initiative/robosuite/blob/92abf5595eddb3a845cd1093703e5a3ccd01e77e/robosuite/environments/base.py#L247-L248
+            env.env.hard_reset = False
+        else:  # d3il, gym
+            env = make_(id, render=render, **kwargs)
+
+        # add wrappers
+        if wrappers is not None:
+            for wrapper, args in wrappers.items():
+                env = wrapper_dict[wrapper](env, **args)
+        return env
+
+    def dummy_env_fn():
+        """TODO(allenzren): does this dummy env allow camera obs for other envs besides robomimic?"""
+        import gym
+        import numpy as np
+        from env.gym_utils.wrapper.multi_step import MultiStep
+
+        # Avoid importing or using env in the main process
+        # to prevent OpenGL context issue with fork.
+        # Create a fake env whose sole purpose is to provide
+        # obs/action spaces and metadata.
+        env = gym.Env()
+        if shape_meta is not None:  # rn only for images
+            observation_space = spaces.Dict()
+            for key, value in shape_meta["obs"].items():
+                shape = value["shape"]
+                if key.endswith("rgb"):
+                    min_value, max_value = -1, 1
+                elif key.endswith("state"):
+                    min_value, max_value = -1, 1
+                else:
+                    raise RuntimeError(f"Unsupported type {key}")
+                this_space = spaces.Box(
+                    low=min_value,
+                    high=max_value,
+                    shape=shape,
+                    dtype=np.float32,
+                )
+                observation_space[key] = this_space
+            env.observation_space = observation_space
+        else:
+            env.observation_space = gym.spaces.Box(
+                -1, 1, shape=(obs_dim,), dtype=np.float64
+            )
+        env.action_space = gym.spaces.Box(-1, 1, shape=(action_dim,), dtype=np.int64)
+        env.metadata = {
+            "render.modes": ["human", "rgb_array", "depth_array"],
+            "video.frames_per_second": 12,
+        }
+        return MultiStep(env=env)  # use all defaults
+
+    env_fns = [_make_env for _ in range(num_envs)]
+    return (
+        AsyncVectorEnv(
+            env_fns,
+            dummy_env_fn=(
+                dummy_env_fn if render or render_offscreen or use_image_obs else None
+            ),
+            delay_init="avoiding" in id,  # add delay for D3IL initialization
+        )
+        if asynchronous
+        else SyncVectorEnv(env_fns)
+    )
diff --git a/env/gym_utils/async_vector_env.py b/env/gym_utils/async_vector_env.py
new file mode 100644
index 0000000..92958a3
--- /dev/null
+++ b/env/gym_utils/async_vector_env.py
@@ -0,0 +1,839 @@
+"""
+From gym==0.22.0
+
+Disable auto-reset after done.
+Add reset_arg() that allows all environments with different options.
+Add reset_one_arg() that allows resetting a single environment with options.
+Add render().
+
+"""
+
+from typing import Optional, Union, List
+
+import numpy as np
+import multiprocessing as mp
+import time
+import sys
+from enum import Enum
+from copy import deepcopy
+import time
+
+from gym import logger
+
+# from gym.vector.vector_env import VectorEnv
+from .vector_env import VectorEnv
+from gym.error import (
+    AlreadyPendingCallError,
+    NoAsyncCallError,
+    ClosedEnvironmentError,
+    CustomSpaceError,
+)
+from gym.vector.utils import (
+    create_shared_memory,
+    create_empty_array,
+    write_to_shared_memory,
+    read_from_shared_memory,
+    concatenate,
+    iterate,
+    CloudpickleWrapper,
+    clear_mpi_env_vars,
+)
+
+
+__all__ = ["AsyncVectorEnv"]
+
+
+class AsyncState(Enum):
+    DEFAULT = "default"
+    WAITING_RESET = "reset"
+    WAITING_STEP = "step"
+    WAITING_CALL = "call"
+
+
+class AsyncVectorEnv(VectorEnv):
+    """Vectorized environment that runs multiple environments in parallel. It
+    uses `multiprocessing`_ processes, and pipes for communication.
+
+    Parameters
+    ----------
+    env_fns : iterable of callable
+        Functions that create the environments.
+
+    observation_space : :class:`gym.spaces.Space`, optional
+        Observation space of a single environment. If ``None``, then the
+        observation space of the first environment is taken.
+
+    action_space : :class:`gym.spaces.Space`, optional
+        Action space of a single environment. If ``None``, then the action space
+        of the first environment is taken.
+
+    shared_memory : bool
+        If ``True``, then the observations from the worker processes are
+        communicated back through shared variables. This can improve the
+        efficiency if the observations are large (e.g. images).
+
+    copy : bool
+        If ``True``, then the :meth:`~AsyncVectorEnv.reset` and
+        :meth:`~AsyncVectorEnv.step` methods return a copy of the observations.
+
+    context : str, optional
+        Context for `multiprocessing`_. If ``None``, then the default context is used.
+
+    daemon : bool
+        If ``True``, then subprocesses have ``daemon`` flag turned on; that is, they
+        will quit if the head process quits. However, ``daemon=True`` prevents
+        subprocesses to spawn children, so for some environments you may want
+        to have it set to ``False``.
+
+    worker : callable, optional
+        If set, then use that worker in a subprocess instead of a default one.
+        Can be useful to override some inner vector env logic, for instance,
+        how resets on done are handled.
+
+    Warning
+    -------
+    :attr:`worker` is an advanced mode option. It provides a high degree of
+    flexibility and a high chance to shoot yourself in the foot; thus,
+    if you are writing your own worker, it is recommended to start from the code
+    for ``_worker`` (or ``_worker_shared_memory``) method, and add changes.
+
+    Raises
+    ------
+    RuntimeError
+        If the observation space of some sub-environment does not match
+        :obj:`observation_space` (or, by default, the observation space of
+        the first sub-environment).
+
+    ValueError
+        If :obj:`observation_space` is a custom space (i.e. not a default
+        space in Gym, such as :class:`~gym.spaces.Box`, :class:`~gym.spaces.Discrete`,
+        or :class:`~gym.spaces.Dict`) and :obj:`shared_memory` is ``True``.
+
+    Example
+    -------
+
+    .. code-block::
+
+        >>> env = gym.vector.AsyncVectorEnv([
+        ...     lambda: gym.make("Pendulum-v0", g=9.81),
+        ...     lambda: gym.make("Pendulum-v0", g=1.62)
+        ... ])
+        >>> env.reset()
+        array([[-0.8286432 ,  0.5597771 ,  0.90249056],
+               [-0.85009176,  0.5266346 ,  0.60007906]], dtype=float32)
+    """
+
+    def __init__(
+        self,
+        env_fns,
+        dummy_env_fn=None,
+        observation_space=None,
+        action_space=None,
+        shared_memory=True,
+        copy=True,
+        context=None,
+        daemon=True,
+        worker=None,
+        delay_init=False,
+    ):
+        ctx = mp.get_context(context)
+        self.env_fns = env_fns
+        self.shared_memory = shared_memory
+        self.copy = copy
+        if dummy_env_fn is None:
+            dummy_env_fn = env_fns[0]
+        dummy_env = dummy_env_fn()
+        self.metadata = dummy_env.metadata
+
+        self.n_envs = len(env_fns)
+        if (observation_space is None) or (action_space is None):
+            observation_space = observation_space or dummy_env.observation_space
+            action_space = action_space or dummy_env.action_space
+        dummy_env.close()
+        del dummy_env
+        super().__init__(
+            num_envs=len(env_fns),
+            observation_space=observation_space,
+            action_space=action_space,
+        )
+
+        if self.shared_memory:
+            try:
+                _obs_buffer = create_shared_memory(
+                    self.single_observation_space, n=self.num_envs, ctx=ctx
+                )
+                self.observations = read_from_shared_memory(
+                    self.single_observation_space, _obs_buffer, n=self.num_envs
+                )
+            except CustomSpaceError:
+                raise ValueError(
+                    "Using `shared_memory=True` in `AsyncVectorEnv` "
+                    "is incompatible with non-standard Gym observation spaces "
+                    "(i.e. custom spaces inheriting from `gym.Space`), and is "
+                    "only compatible with default Gym spaces (e.g. `Box`, "
+                    "`Tuple`, `Dict`) for batching. Set `shared_memory=False` "
+                    "if you use custom observation spaces."
+                )
+        else:
+            _obs_buffer = None
+            self.observations = create_empty_array(
+                self.single_observation_space, n=self.num_envs, fn=np.zeros
+            )
+
+        self.parent_pipes, self.processes = [], []
+        self.error_queue = ctx.Queue()
+        target = _worker_shared_memory if self.shared_memory else _worker
+        target = worker or target
+        with clear_mpi_env_vars():
+            for idx, env_fn in enumerate(self.env_fns):
+                parent_pipe, child_pipe = ctx.Pipe()
+                process = ctx.Process(
+                    target=target,
+                    name=f"Worker<{type(self).__name__}>-{idx}",
+                    args=(
+                        idx,
+                        CloudpickleWrapper(env_fn),
+                        child_pipe,
+                        parent_pipe,
+                        _obs_buffer,
+                        self.error_queue,
+                    ),
+                )
+
+                self.parent_pipes.append(parent_pipe)
+                self.processes.append(process)
+
+                process.daemon = daemon
+                process.start()
+                child_pipe.close()
+                if (
+                    delay_init
+                ):  # D3IL complains about temporary XML if n_envs is too large. Adding a delay avoids the error.
+                    time.sleep(0.1)
+
+        self._state = AsyncState.DEFAULT
+        # self._check_spaces()
+
+    def seed(self, seed=None):
+        super().seed(seed=seed)
+        self._assert_is_running()
+        if seed is None:
+            seed = [None for _ in range(self.num_envs)]
+        if isinstance(seed, int):
+            seed = [seed + i for i in range(self.num_envs)]
+        assert len(seed) == self.num_envs
+
+        if self._state != AsyncState.DEFAULT:
+            raise AlreadyPendingCallError(
+                f"Calling `seed` while waiting for a pending call to `{self._state.value}` to complete.",
+                self._state.value,
+            )
+
+        for pipe, seed in zip(self.parent_pipes, seed):
+            pipe.send(("seed", seed))
+        _, successes = zip(*[pipe.recv() for pipe in self.parent_pipes])
+        self._raise_if_errors(successes)
+
+    def reset_async(
+        self,
+        seed: Optional[Union[int, List[int]]] = None,
+        return_info: bool = False,
+        options: Optional[dict] = None,
+    ):
+        """Send the calls to :obj:`reset` to each sub-environment.
+
+        Raises
+        ------
+        ClosedEnvironmentError
+            If the environment was closed (if :meth:`close` was previously called).
+
+        AlreadyPendingCallError
+            If the environment is already waiting for a pending call to another
+            method (e.g. :meth:`step_async`). This can be caused by two consecutive
+            calls to :meth:`reset_async`, with no call to :meth:`reset_wait` in
+            between.
+        """
+        self._assert_is_running()
+
+        if seed is None:
+            seed = [None for _ in range(self.num_envs)]
+        if isinstance(seed, int):
+            seed = [seed + i for i in range(self.num_envs)]
+        assert len(seed) == self.num_envs
+
+        if self._state != AsyncState.DEFAULT:
+            raise AlreadyPendingCallError(
+                f"Calling `reset_async` while waiting for a pending call to `{self._state.value}` to complete",
+                self._state.value,
+            )
+
+        for pipe, single_seed in zip(self.parent_pipes, seed):
+            single_kwargs = {}
+            if single_seed is not None:
+                single_kwargs["seed"] = single_seed
+            if return_info:
+                single_kwargs["return_info"] = return_info
+            if options is not None:
+                single_kwargs["options"] = options
+
+            pipe.send(("reset", single_kwargs))
+        self._state = AsyncState.WAITING_RESET
+
+    def reset_wait(
+        self,
+        timeout=None,
+        seed: Optional[int] = None,
+        return_info: bool = False,
+        options: Optional[dict] = None,
+    ):
+        """
+        Parameters
+        ----------
+        timeout : int or float, optional
+            Number of seconds before the call to `reset_wait` times out. If
+            `None`, the call to `reset_wait` never times out.
+        seed: ignored
+        options: ignored
+
+        Returns
+        -------
+        element of :attr:`~VectorEnv.observation_space`
+            A batch of observations from the vectorized environment.
+        infos : list of dicts containing metadata
+
+        Raises
+        ------
+        ClosedEnvironmentError
+            If the environment was closed (if :meth:`close` was previously called).
+
+        NoAsyncCallError
+            If :meth:`reset_wait` was called without any prior call to
+            :meth:`reset_async`.
+
+        TimeoutError
+            If :meth:`reset_wait` timed out.
+        """
+        self._assert_is_running()
+        if self._state != AsyncState.WAITING_RESET:
+            raise NoAsyncCallError(
+                "Calling `reset_wait` without any prior " "call to `reset_async`.",
+                AsyncState.WAITING_RESET.value,
+            )
+
+        if not self._poll(timeout):
+            self._state = AsyncState.DEFAULT
+            raise mp.TimeoutError(
+                f"The call to `reset_wait` has timed out after {timeout} second(s)."
+            )
+
+        results, successes = zip(*[pipe.recv() for pipe in self.parent_pipes])
+        self._raise_if_errors(successes)
+        self._state = AsyncState.DEFAULT
+
+        if return_info:
+            results, infos = zip(*results)
+            infos = list(infos)
+
+            if not self.shared_memory:
+                self.observations = concatenate(
+                    self.single_observation_space, results, self.observations
+                )
+
+            return (
+                deepcopy(self.observations) if self.copy else self.observations
+            ), infos
+        else:
+            if not self.shared_memory:
+                self.observations = concatenate(
+                    self.single_observation_space, results, self.observations
+                )
+
+            return deepcopy(self.observations) if self.copy else self.observations
+
+    def step_async(self, actions):
+        """Send the calls to :obj:`step` to each sub-environment.
+
+        Parameters
+        ----------
+        actions : element of :attr:`~VectorEnv.action_space`
+            Batch of actions.
+
+        Raises
+        ------
+        ClosedEnvironmentError
+            If the environment was closed (if :meth:`close` was previously called).
+
+        AlreadyPendingCallError
+            If the environment is already waiting for a pending call to another
+            method (e.g. :meth:`reset_async`). This can be caused by two consecutive
+            calls to :meth:`step_async`, with no call to :meth:`step_wait` in
+            between.
+        """
+        self._assert_is_running()
+        if self._state != AsyncState.DEFAULT:
+            raise AlreadyPendingCallError(
+                f"Calling `step_async` while waiting for a pending call to `{self._state.value}` to complete.",
+                self._state.value,
+            )
+
+        actions = iterate(self.action_space, actions)
+        for pipe, action in zip(self.parent_pipes, actions):
+            pipe.send(("step", action))
+        self._state = AsyncState.WAITING_STEP
+
+    def step_wait(self, timeout=None):
+        """Wait for the calls to :obj:`step` in each sub-environment to finish.
+
+        Parameters
+        ----------
+        timeout : int or float, optional
+            Number of seconds before the call to :meth:`step_wait` times out. If
+            ``None``, the call to :meth:`step_wait` never times out.
+
+        Returns
+        -------
+        observations : element of :attr:`~VectorEnv.observation_space`
+            A batch of observations from the vectorized environment.
+
+        rewards : :obj:`np.ndarray`, dtype :obj:`np.float_`
+            A vector of rewards from the vectorized environment.
+
+        dones : :obj:`np.ndarray`, dtype :obj:`np.bool_`
+            A vector whose entries indicate whether the episode has ended.
+
+        infos : list of dict
+            A list of auxiliary diagnostic information dicts from sub-environments.
+
+        Raises
+        ------
+        ClosedEnvironmentError
+            If the environment was closed (if :meth:`close` was previously called).
+
+        NoAsyncCallError
+            If :meth:`step_wait` was called without any prior call to
+            :meth:`step_async`.
+
+        TimeoutError
+            If :meth:`step_wait` timed out.
+        """
+        self._assert_is_running()
+        if self._state != AsyncState.WAITING_STEP:
+            raise NoAsyncCallError(
+                "Calling `step_wait` without any prior call " "to `step_async`.",
+                AsyncState.WAITING_STEP.value,
+            )
+
+        if not self._poll(timeout):
+            self._state = AsyncState.DEFAULT
+            raise mp.TimeoutError(
+                f"The call to `step_wait` has timed out after {timeout} second(s)."
+            )
+
+        results, successes = zip(*[pipe.recv() for pipe in self.parent_pipes])
+        self._raise_if_errors(successes)
+        self._state = AsyncState.DEFAULT
+        observations_list, rewards, dones, infos = zip(*results)
+
+        if not self.shared_memory:
+            self.observations = concatenate(
+                self.single_observation_space,
+                observations_list,
+                self.observations,
+            )
+
+        return (
+            deepcopy(self.observations) if self.copy else self.observations,
+            np.array(rewards),
+            np.array(dones, dtype=np.bool_),
+            infos,
+        )
+
+    def call_async(self, name, *args, **kwargs):
+        """
+        Parameters
+        ----------
+        name : string
+            Name of the method or property to call.
+
+        *args
+            Arguments to apply to the method call.
+
+        **kwargs
+            Keywoard arguments to apply to the method call.
+        """
+        self._assert_is_running()
+        if self._state != AsyncState.DEFAULT:
+            raise AlreadyPendingCallError(
+                "Calling `call_async` while waiting "
+                f"for a pending call to `{self._state.value}` to complete.",
+                self._state.value,
+            )
+
+        for pipe in self.parent_pipes:
+            pipe.send(("_call", (name, args, kwargs)))
+        self._state = AsyncState.WAITING_CALL
+
+    def call_wait(self, timeout=None):
+        """
+        Parameters
+        ----------
+        timeout : int or float, optional
+            Number of seconds before the call to `step_wait` times out. If
+            `None` (default), the call to `step_wait` never times out.
+
+        Returns
+        -------
+        results : list
+            List of the results of the individual calls to the method or
+            property for each environment.
+        """
+        self._assert_is_running()
+        if self._state != AsyncState.WAITING_CALL:
+            raise NoAsyncCallError(
+                "Calling `call_wait` without any prior call to `call_async`.",
+                AsyncState.WAITING_CALL.value,
+            )
+
+        if not self._poll(timeout):
+            self._state = AsyncState.DEFAULT
+            raise mp.TimeoutError(
+                f"The call to `call_wait` has timed out after {timeout} second(s)."
+            )
+
+        results, successes = zip(*[pipe.recv() for pipe in self.parent_pipes])
+        self._raise_if_errors(successes)
+        self._state = AsyncState.DEFAULT
+
+        return results
+
+    def set_attr(self, name, values):
+        """
+        Parameters
+        ----------
+        name : string
+            Name of the property to be set in each individual environment.
+
+        values : list, tuple, or object
+            Values of the property to be set to. If `values` is a list or
+            tuple, then it corresponds to the values for each individual
+            environment, otherwise a single value is set for all environments.
+        """
+        self._assert_is_running()
+        if not isinstance(values, (list, tuple)):
+            values = [values for _ in range(self.num_envs)]
+        if len(values) != self.num_envs:
+            raise ValueError(
+                "Values must be a list or tuple with length equal to the "
+                f"number of environments. Got `{len(values)}` values for "
+                f"{self.num_envs} environments."
+            )
+
+        if self._state != AsyncState.DEFAULT:
+            raise AlreadyPendingCallError(
+                "Calling `set_attr` while waiting "
+                f"for a pending call to `{self._state.value}` to complete.",
+                self._state.value,
+            )
+
+        for pipe, value in zip(self.parent_pipes, values):
+            pipe.send(("_setattr", (name, value)))
+        _, successes = zip(*[pipe.recv() for pipe in self.parent_pipes])
+        self._raise_if_errors(successes)
+
+    def close_extras(self, timeout=None, terminate=False):
+        """Close the environments & clean up the extra resources
+        (processes and pipes).
+
+        Parameters
+        ----------
+        timeout : int or float, optional
+            Number of seconds before the call to :meth:`close` times out. If ``None``,
+            the call to :meth:`close` never times out. If the call to :meth:`close`
+            times out, then all processes are terminated.
+
+        terminate : bool
+            If ``True``, then the :meth:`close` operation is forced and all processes
+            are terminated.
+
+        Raises
+        ------
+        TimeoutError
+            If :meth:`close` timed out.
+        """
+        timeout = 0 if terminate else timeout
+        try:
+            if self._state != AsyncState.DEFAULT:
+                logger.warn(
+                    f"Calling `close` while waiting for a pending call to `{self._state.value}` to complete."
+                )
+                function = getattr(self, f"{self._state.value}_wait")
+                function(timeout)
+        except mp.TimeoutError:
+            terminate = True
+
+        if terminate:
+            for process in self.processes:
+                if process.is_alive():
+                    process.terminate()
+        else:
+            for pipe in self.parent_pipes:
+                if (pipe is not None) and (not pipe.closed):
+                    pipe.send(("close", None))
+            for pipe in self.parent_pipes:
+                if (pipe is not None) and (not pipe.closed):
+                    pipe.recv()
+
+        for pipe in self.parent_pipes:
+            if pipe is not None:
+                pipe.close()
+        for process in self.processes:
+            process.join()
+
+    def _poll(self, timeout=None):
+        self._assert_is_running()
+        if timeout is None:
+            return True
+        end_time = time.perf_counter() + timeout
+        delta = None
+        for pipe in self.parent_pipes:
+            delta = max(end_time - time.perf_counter(), 0)
+            if pipe is None:
+                return False
+            if pipe.closed or (not pipe.poll(delta)):
+                return False
+        return True
+
+    def _check_spaces(self):
+        self._assert_is_running()
+        spaces = (self.single_observation_space, self.single_action_space)
+        for pipe in self.parent_pipes:
+            pipe.send(("_check_spaces", spaces))
+        results, successes = zip(*[pipe.recv() for pipe in self.parent_pipes])
+        self._raise_if_errors(successes)
+        same_observation_spaces, same_action_spaces = zip(*results)
+        if not all(same_observation_spaces):
+            raise RuntimeError(
+                "Some environments have an observation space different from "
+                f"`{self.single_observation_space}`. In order to batch observations, "
+                "the observation spaces from all environments must be equal."
+            )
+        if not all(same_action_spaces):
+            raise RuntimeError(
+                "Some environments have an action space different from "
+                f"`{self.single_action_space}`. In order to batch actions, the "
+                "action spaces from all environments must be equal."
+            )
+
+    def _assert_is_running(self):
+        if self.closed:
+            raise ClosedEnvironmentError(
+                f"Trying to operate on `{type(self).__name__}`, after a call to `close()`."
+            )
+
+    def _raise_if_errors(self, successes):
+        if all(successes):
+            return
+
+        num_errors = self.num_envs - sum(successes)
+        assert num_errors > 0
+        for _ in range(num_errors):
+            index, exctype, value = self.error_queue.get()
+            logger.error(
+                f"Received the following error from Worker-{index}: {exctype.__name__}: {value}"
+            )
+            logger.error(f"Shutting down Worker-{index}.")
+            self.parent_pipes[index].close()
+            self.parent_pipes[index] = None
+
+        logger.error("Raising the last exception back to the main process.")
+        raise exctype(value)
+
+    def __del__(self):
+        if not getattr(self, "closed", True):
+            self.close(terminate=True)
+
+    ######################### Added #########################
+    def call_sync(self, method_name, indices=None, **method_kwargs):
+        """Call instance methods of vectorized environments."""
+        target_remotes = self._get_target_remotes(indices)
+        for remote in target_remotes:
+            remote.send(("_call_sync", (method_name, method_kwargs)))
+        return [remote.recv() for remote in target_remotes]
+
+    def call_sync_arg(
+        self, method_name, method_arg_name, method_arg_list, indices=None
+    ):
+        """Call instance methods of vectorized environments with args."""
+        target_remotes = self._get_target_remotes(indices)
+        for method_arg, remote in zip(method_arg_list, target_remotes):
+            method_kwargs = {method_arg_name: method_arg}
+            remote.send(("_call_sync", (method_name, method_kwargs)))
+        return [remote.recv() for remote in target_remotes]
+
+    def _get_target_remotes(self, indices):
+        """Get the connection object needed to communicate with the wanted
+        envs that are in subprocesses."""
+        if indices is None:
+            indices = range(self.n_envs)
+        return [self.parent_pipes[i] for i in indices]
+
+    def reset_arg(self, options_list, **kwargs):
+        results = self.call_sync_arg("reset", "options", options_list)
+        obs = [result[0] for result in results]
+        if isinstance(obs[0], np.ndarray):
+            return np.stack(obs)
+        else:
+            assert isinstance(obs[0], dict)
+            return obs
+
+    def reset_one_arg(self, env_ind, options=None):
+        """
+        Reset one environment with options.
+        """
+        obs, success = self.call_sync(
+            "reset",
+            options=options,
+            indices=[env_ind],
+        )[0]
+        return obs
+
+    def render(self, *args, **kwargs):
+        return self.call("render", *args, **kwargs)
+
+
+def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
+    assert shared_memory is None
+    env = env_fn()
+    parent_pipe.close()
+    try:
+        while True:
+            command, data = pipe.recv()
+            if command == "reset":
+                if "return_info" in data and data["return_info"] == True:
+                    observation, info = env.reset(**data)
+                    pipe.send(((observation, info), True))
+                else:
+                    observation = env.reset(**data)
+                    pipe.send((observation, True))
+
+            elif command == "step":
+                observation, reward, done, info = env.step(data)
+                # if done:
+                #     info["terminal_observation"] = observation
+                #     observation = env.reset()
+                pipe.send(((observation, reward, done, info), True))
+            elif command == "seed":
+                env.seed(data)
+                pipe.send((None, True))
+            elif command == "close":
+                pipe.send((None, True))
+                break
+            elif command == "_call_sync":
+                function = getattr(env, data[0])
+                pipe.send((function(**data[1]), True))
+            elif command == "_call":
+                name, args, kwargs = data
+                if name in ["reset", "step", "seed", "close"]:
+                    raise ValueError(
+                        f"Trying to call function `{name}` with "
+                        f"`_call`. Use `{name}` directly instead."
+                    )
+                function = getattr(env, name)
+                if callable(function):
+                    pipe.send((function(**kwargs), True))
+                else:
+                    pipe.send((function, True))
+            elif command == "_setattr":
+                name, value = data
+                setattr(env, name, value)
+                pipe.send((None, True))
+            elif command == "_check_spaces":
+                pipe.send(
+                    (
+                        (data[0] == env.observation_space, data[1] == env.action_space),
+                        True,
+                    )
+                )
+            else:
+                raise RuntimeError(
+                    f"Received unknown command `{command}`. Must "
+                    "be one of {`reset`, `step`, `seed`, `close`, `_call`, "
+                    "`_setattr`, `_check_spaces`}."
+                )
+    except (KeyboardInterrupt, Exception):
+        error_queue.put((index,) + sys.exc_info()[:2])
+        pipe.send((None, False))
+    finally:
+        env.close()
+
+
+def _worker_shared_memory(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
+    assert shared_memory is not None
+    env = env_fn()
+    observation_space = env.observation_space
+    parent_pipe.close()
+    try:
+        while True:
+            command, data = pipe.recv()
+            if command == "reset":
+                if "return_info" in data and data["return_info"] == True:
+                    observation, info = env.reset(**data)
+                    write_to_shared_memory(
+                        observation_space, index, observation, shared_memory
+                    )
+                    pipe.send(((None, info), True))
+                else:
+                    observation = env.reset(**data)
+                    write_to_shared_memory(
+                        observation_space, index, observation, shared_memory
+                    )
+                    pipe.send((None, True))
+            elif command == "step":
+                observation, reward, done, info = env.step(data)
+                # if done:
+                #     info["terminal_observation"] = observation
+                #     observation = env.reset()
+                write_to_shared_memory(
+                    observation_space, index, observation, shared_memory
+                )
+                pipe.send(((None, reward, done, info), True))
+            elif command == "seed":
+                env.seed(data)
+                pipe.send((None, True))
+            elif command == "close":
+                pipe.send((None, True))
+                break
+            elif command == "_call_sync":
+                function = getattr(env, data[0])
+                pipe.send((function(**data[1]), True))
+            elif command == "_call":
+                name, args, kwargs = data
+                if name in ["reset", "step", "seed", "close"]:
+                    raise ValueError(
+                        f"Trying to call function `{name}` with "
+                        f"`_call`. Use `{name}` directly instead."
+                    )
+                function = getattr(env, name)
+                if callable(function):
+                    pipe.send((function(*args, **kwargs), True))
+                else:
+                    pipe.send((function, True))
+            elif command == "_setattr":
+                name, value = data
+                setattr(env, name, value)
+                pipe.send((None, True))
+            elif command == "_check_spaces":
+                pipe.send(
+                    ((data[0] == observation_space, data[1] == env.action_space), True)
+                )
+            else:
+                raise RuntimeError(
+                    f"Received unknown command `{command}`. Must "
+                    "be one of {`reset`, `step`, `seed`, `close`, `_call`, "
+                    "`_setattr`, `_check_spaces`}."
+                )
+    except (KeyboardInterrupt, Exception):
+        error_queue.put((index,) + sys.exc_info()[:2])
+        pipe.send((None, False))
+    finally:
+        env.close()
diff --git a/env/gym_utils/furniture_normalizer.py b/env/gym_utils/furniture_normalizer.py
new file mode 100644
index 0000000..3ad721a
--- /dev/null
+++ b/env/gym_utils/furniture_normalizer.py
@@ -0,0 +1,80 @@
+"""
+Normalization for Furniture-Bench environments.
+
+TODO: use this normalizer for all benchmarks.
+
+"""
+
+import torch.nn as nn
+
+
+class LinearNormalizer(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.stats = nn.ParameterDict()
+
+    def fit(self, data_dict):
+        for key, tensor in data_dict.items():
+            min_value = tensor.min(dim=0)[0]
+            max_value = tensor.max(dim=0)[0]
+
+            # Check if any column has only one value throughout
+            diff = max_value - min_value
+            constant_columns = diff == 0
+
+            # Set a small range for constant columns to avoid division by zero
+            min_value[constant_columns] -= 1
+            max_value[constant_columns] += 1
+
+            self.stats[key] = nn.ParameterDict(
+                {
+                    "min": nn.Parameter(min_value, requires_grad=False),
+                    "max": nn.Parameter(max_value, requires_grad=False),
+                },
+            )
+        self._turn_off_gradients()
+
+    def _normalize(self, x, key):
+        stats = self.stats[key]
+        x = (x - stats["min"]) / (stats["max"] - stats["min"])
+        x = 2 * x - 1
+        return x
+
+    def _denormalize(self, x, key):
+        stats = self.stats[key]
+        x = (x + 1) / 2
+        x = x * (stats["max"] - stats["min"]) + stats["min"]
+        return x
+
+    def forward(self, x, key, forward=True):
+        if forward:
+            return self._normalize(x, key)
+        else:
+            return self._denormalize(x, key)
+
+    def _turn_off_gradients(self):
+        for key in self.stats.keys():
+            for stat in self.stats[key].keys():
+                self.stats[key][stat].requires_grad = False
+
+    def load_state_dict(self, state_dict):
+
+        stats = nn.ParameterDict()
+        for key, value in state_dict.items():
+            if key.startswith("stats."):
+                param_key = key[6:]
+                keys = param_key.split(".")
+                current_dict = stats
+                for k in keys[:-1]:
+                    if k not in current_dict:
+                        current_dict[k] = nn.ParameterDict()
+                    current_dict = current_dict[k]
+                current_dict[keys[-1]] = nn.Parameter(value)
+
+        self.stats = stats
+        self._turn_off_gradients()
+
+        return f"<Added keys {self.stats.keys()} to the normalizer.>"
+
+    def keys(self):
+        return self.stats.keys()
diff --git a/env/gym_utils/sync_vector_env.py b/env/gym_utils/sync_vector_env.py
new file mode 100644
index 0000000..9804ca5
--- /dev/null
+++ b/env/gym_utils/sync_vector_env.py
@@ -0,0 +1,201 @@
+from typing import List, Union, Optional
+
+import numpy as np
+from copy import deepcopy
+
+from gym import logger
+from gym.logger import warn
+from gym.vector.vector_env import VectorEnv
+from gym.vector.utils import concatenate, iterate, create_empty_array
+
+
+__all__ = ["SyncVectorEnv"]
+
+
+class SyncVectorEnv(VectorEnv):
+    """Vectorized environment that serially runs multiple environments.
+
+    Parameters
+    ----------
+    env_fns : iterable of callable
+        Functions that create the environments.
+
+    observation_space : :class:`gym.spaces.Space`, optional
+        Observation space of a single environment. If ``None``, then the
+        observation space of the first environment is taken.
+
+    action_space : :class:`gym.spaces.Space`, optional
+        Action space of a single environment. If ``None``, then the action space
+        of the first environment is taken.
+
+    copy : bool
+        If ``True``, then the :meth:`reset` and :meth:`step` methods return a
+        copy of the observations.
+
+    Raises
+    ------
+    RuntimeError
+        If the observation space of some sub-environment does not match
+        :obj:`observation_space` (or, by default, the observation space of
+        the first sub-environment).
+
+    Example
+    -------
+
+    .. code-block::
+
+        >>> env = gym.vector.SyncVectorEnv([
+        ...     lambda: gym.make("Pendulum-v0", g=9.81),
+        ...     lambda: gym.make("Pendulum-v0", g=1.62)
+        ... ])
+        >>> env.reset()
+        array([[-0.8286432 ,  0.5597771 ,  0.90249056],
+               [-0.85009176,  0.5266346 ,  0.60007906]], dtype=float32)
+    """
+
+    def __init__(self, env_fns, observation_space=None, action_space=None, copy=True):
+        self.env_fns = env_fns
+        self.envs = [env_fn() for env_fn in env_fns]
+        self.copy = copy
+        self.metadata = self.envs[0].metadata
+
+        if (observation_space is None) or (action_space is None):
+            observation_space = observation_space or self.envs[0].observation_space
+            action_space = action_space or self.envs[0].action_space
+        super().__init__(
+            num_envs=len(env_fns),
+            observation_space=observation_space,
+            action_space=action_space,
+        )
+
+        self._check_spaces()
+        self.observations = create_empty_array(
+            self.single_observation_space, n=self.num_envs, fn=np.zeros
+        )
+        self._rewards = np.zeros((self.num_envs,), dtype=np.float64)
+        self._dones = np.zeros((self.num_envs,), dtype=np.bool_)
+        self._actions = None
+
+    def seed(self, seed=None):
+        super().seed(seed=seed)
+        if seed is None:
+            seed = [None for _ in range(self.num_envs)]
+        if isinstance(seed, int):
+            seed = [seed + i for i in range(self.num_envs)]
+        assert len(seed) == self.num_envs
+
+        for env, single_seed in zip(self.envs, seed):
+            env.seed(single_seed)
+
+    def reset_wait(
+        self,
+        seed: Optional[Union[int, List[int]]] = None,
+        return_info: bool = False,
+        options: Optional[dict] = None,
+    ):
+        if seed is None:
+            seed = [None for _ in range(self.num_envs)]
+        if isinstance(seed, int):
+            seed = [seed + i for i in range(self.num_envs)]
+        assert len(seed) == self.num_envs
+
+        self._dones[:] = False
+        observations = []
+        data_list = []
+        for env, single_seed in zip(self.envs, seed):
+
+            kwargs = {}
+            if single_seed is not None:
+                kwargs["seed"] = single_seed
+            if options is not None:
+                kwargs["options"] = options
+            if return_info == True:
+                kwargs["return_info"] = return_info
+
+            if not return_info:
+                observation = env.reset(**kwargs)
+                observations.append(observation)
+            else:
+                observation, data = env.reset(**kwargs)
+                observations.append(observation)
+                data_list.append(data)
+
+        self.observations = concatenate(
+            self.single_observation_space, observations, self.observations
+        )
+        if not return_info:
+            return deepcopy(self.observations) if self.copy else self.observations
+        else:
+            return (
+                deepcopy(self.observations) if self.copy else self.observations
+            ), data_list
+
+    def step_async(self, actions):
+        self._actions = iterate(self.action_space, actions)
+
+    def step_wait(self):
+        observations, infos = [], []
+        for i, (env, action) in enumerate(zip(self.envs, self._actions)):
+            observation, self._rewards[i], self._dones[i], info = env.step(action)
+            if self._dones[i]:
+                info["terminal_observation"] = observation
+                observation = env.reset()
+            observations.append(observation)
+            infos.append(info)
+        self.observations = concatenate(
+            self.single_observation_space, observations, self.observations
+        )
+
+        return (
+            deepcopy(self.observations) if self.copy else self.observations,
+            np.copy(self._rewards),
+            np.copy(self._dones),
+            infos,
+        )
+
+    def call(self, name, *args, **kwargs):
+        results = []
+        for env in self.envs:
+            function = getattr(env, name)
+            if callable(function):
+                results.append(function(*args, **kwargs))
+            else:
+                results.append(function)
+
+        return tuple(results)
+
+    def set_attr(self, name, values):
+        if not isinstance(values, (list, tuple)):
+            values = [values for _ in range(self.num_envs)]
+        if len(values) != self.num_envs:
+            raise ValueError(
+                "Values must be a list or tuple with length equal to the "
+                f"number of environments. Got `{len(values)}` values for "
+                f"{self.num_envs} environments."
+            )
+
+        for env, value in zip(self.envs, values):
+            setattr(env, name, value)
+
+    def close_extras(self, **kwargs):
+        """Close the environments."""
+        [env.close() for env in self.envs]
+
+    def _check_spaces(self):
+        for env in self.envs:
+            if not (env.observation_space == self.single_observation_space):
+                raise RuntimeError(
+                    "Some environments have an observation space different from "
+                    f"`{self.single_observation_space}`. In order to batch observations, "
+                    "the observation spaces from all environments must be equal."
+                )
+
+            if not (env.action_space == self.single_action_space):
+                raise RuntimeError(
+                    "Some environments have an action space different from "
+                    f"`{self.single_action_space}`. In order to batch actions, the "
+                    "action spaces from all environments must be equal."
+                )
+
+        else:
+            return True
diff --git a/env/gym_utils/vector_env.py b/env/gym_utils/vector_env.py
new file mode 100644
index 0000000..e1cbc1b
--- /dev/null
+++ b/env/gym_utils/vector_env.py
@@ -0,0 +1,276 @@
+from typing import Optional, Union, List
+
+import gym
+from gym.logger import warn, deprecation
+from gym.spaces import Tuple
+from gym.vector.utils.spaces import batch_space
+
+
+__all__ = ["VectorEnv"]
+
+
+class VectorEnv(gym.Env):
+    r"""Base class for vectorized environments.
+
+    Each observation returned from vectorized environment is a batch of observations
+    for each sub-environment. And :meth:`step` is also expected to receive a batch of
+    actions for each sub-environment.
+
+    .. note::
+
+        All sub-environments should share the identical observation and action spaces.
+        In other words, a vector of multiple different environments is not supported.
+
+    Parameters
+    ----------
+    num_envs : int
+        Number of environments in the vectorized environment.
+
+    observation_space : :class:`gym.spaces.Space`
+        Observation space of a single environment.
+
+    action_space : :class:`gym.spaces.Space`
+        Action space of a single environment.
+    """
+
+    def __init__(self, num_envs, observation_space, action_space):
+        self.num_envs = num_envs
+        self.is_vector_env = True
+        self.observation_space = batch_space(observation_space, n=num_envs)
+        self.action_space = batch_space(action_space, n=num_envs)
+
+        self.closed = False
+        self.viewer = None
+
+        # The observation and action spaces of a single environment are
+        # kept in separate properties
+        self.single_observation_space = observation_space
+        self.single_action_space = action_space
+
+    def reset_async(
+        self,
+        seed: Optional[Union[int, List[int]]] = None,
+        return_info: bool = False,
+        options: Optional[dict] = None,
+    ):
+        pass
+
+    def reset_wait(
+        self,
+        seed: Optional[Union[int, List[int]]] = None,
+        return_info: bool = False,
+        options: Optional[dict] = None,
+    ):
+        raise NotImplementedError()
+
+    def reset(
+        self,
+        *,
+        seed: Optional[Union[int, List[int]]] = None,
+        return_info: bool = False,
+        options: Optional[dict] = None,
+    ):
+        r"""Reset all sub-environments and return a batch of initial observations.
+
+        Returns
+        -------
+        element of :attr:`observation_space`
+            A batch of observations from the vectorized environment.
+        """
+        self.reset_async(seed=seed, return_info=return_info, options=options)
+        return self.reset_wait(seed=seed, return_info=return_info, options=options)
+
+    def step_async(self, actions):
+        pass
+
+    def step_wait(self, **kwargs):
+        raise NotImplementedError()
+
+    def step(self, actions):
+        r"""Take an action for each sub-environments.
+
+        Parameters
+        ----------
+        actions : element of :attr:`action_space`
+            Batch of actions.
+
+        Returns
+        -------
+        observations : element of :attr:`observation_space`
+            A batch of observations from the vectorized environment.
+
+        rewards : :obj:`np.ndarray`, dtype :obj:`np.float_`
+            A vector of rewards from the vectorized environment.
+
+        dones : :obj:`np.ndarray`, dtype :obj:`np.bool_`
+            A vector whose entries indicate whether the episode has ended.
+
+        infos : list of dict
+            A list of auxiliary diagnostic information dicts from sub-environments.
+        """
+
+        self.step_async(actions)
+        return self.step_wait()
+
+    def call_async(self, name, *args, **kwargs):
+        pass
+
+    def call_wait(self, **kwargs):
+        raise NotImplementedError()
+
+    def call(self, name, *args, **kwargs):
+        """Call a method, or get a property, from each sub-environment.
+
+        Parameters
+        ----------
+        name : string
+            Name of the method or property to call.
+
+        *args
+            Arguments to apply to the method call.
+
+        **kwargs
+            Keywoard arguments to apply to the method call.
+
+        Returns
+        -------
+        results : list
+            List of the results of the individual calls to the method or
+            property for each environment.
+        """
+        self.call_async(name, *args, **kwargs)
+        return self.call_wait()
+
+    def get_attr(self, name):
+        """Get a property from each sub-environment.
+
+        Parameters
+        ----------
+        name : string
+            Name of the property to be get from each individual environment.
+        """
+        return self.call(name)
+
+    def set_attr(self, name, values):
+        """Set a property in each sub-environment.
+
+        Parameters
+        ----------
+        name : string
+            Name of the property to be set in each individual environment.
+
+        values : list, tuple, or object
+            Values of the property to be set to. If `values` is a list or
+            tuple, then it corresponds to the values for each individual
+            environment, otherwise a single value is set for all environments.
+        """
+        raise NotImplementedError()
+
+    def close_extras(self, **kwargs):
+        r"""Clean up the extra resources e.g. beyond what's in this base class."""
+        pass
+
+    def close(self, **kwargs):
+        r"""Close all sub-environments and release resources.
+
+        It also closes all the existing image viewers, then calls :meth:`close_extras` and set
+        :attr:`closed` as ``True``.
+
+        .. warning::
+
+            This function itself does not close the environments, it should be handled
+            in :meth:`close_extras`. This is generic for both synchronous and asynchronous
+            vectorized environments.
+
+        .. note::
+
+            This will be automatically called when garbage collected or program exited.
+
+        """
+        if self.closed:
+            return
+        if self.viewer is not None:
+            self.viewer.close()
+        self.close_extras(**kwargs)
+        self.closed = True
+
+    def seed(self, seed=None):
+        """Set the random seed in all sub-environments.
+
+        Parameters
+        ----------
+        seed : list of int, or int, optional
+            Random seed for each sub-environment. If ``seed`` is a list of
+            length ``num_envs``, then the items of the list are chosen as random
+            seeds. If ``seed`` is an int, then each sub-environment uses the random
+            seed ``seed + n``, where ``n`` is the index of the sub-environment
+            (between ``0`` and ``num_envs - 1``).
+        """
+        deprecation(
+            "Function `env.seed(seed)` is marked as deprecated and will be removed in the future. "
+            "Please use `env.reset(seed=seed) instead in VectorEnvs."
+        )
+
+    def __del__(self):
+        if not getattr(self, "closed", True):
+            self.close()
+
+    def __repr__(self):
+        if self.spec is None:
+            return f"{self.__class__.__name__}({self.num_envs})"
+        else:
+            return f"{self.__class__.__name__}({self.spec.id}, {self.num_envs})"
+
+
+class VectorEnvWrapper(VectorEnv):
+    r"""Wraps the vectorized environment to allow a modular transformation.
+
+    This class is the base class for all wrappers for vectorized environments. The subclass
+    could override some methods to change the behavior of the original vectorized environment
+    without touching the original code.
+
+    .. note::
+
+        Don't forget to call ``super().__init__(env)`` if the subclass overrides :meth:`__init__`.
+
+    """
+
+    def __init__(self, env):
+        assert isinstance(env, VectorEnv)
+        self.env = env
+
+    # explicitly forward the methods defined in VectorEnv
+    # to self.env (instead of the base class)
+    def reset_async(self, **kwargs):
+        return self.env.reset_async(**kwargs)
+
+    def reset_wait(self, **kwargs):
+        return self.env.reset_wait(**kwargs)
+
+    def step_async(self, actions):
+        return self.env.step_async(actions)
+
+    def step_wait(self):
+        return self.env.step_wait()
+
+    def close(self, **kwargs):
+        return self.env.close(**kwargs)
+
+    def close_extras(self, **kwargs):
+        return self.env.close_extras(**kwargs)
+
+    def seed(self, seed=None):
+        return self.env.seed(seed)
+
+    # implicitly forward all other methods and attributes to self.env
+    def __getattr__(self, name):
+        if name.startswith("_"):
+            raise AttributeError(f"attempted to get missing private attribute '{name}'")
+        return getattr(self.env, name)
+
+    @property
+    def unwrapped(self):
+        return self.env.unwrapped
+
+    def __repr__(self):
+        return f"<{self.__class__.__name__}, {self.env}>"
diff --git a/env/gym_utils/wrapper/__init__.py b/env/gym_utils/wrapper/__init__.py
new file mode 100644
index 0000000..a0959d1
--- /dev/null
+++ b/env/gym_utils/wrapper/__init__.py
@@ -0,0 +1,14 @@
+from .multi_step import MultiStep
+from .robomimic_lowdim import RobomimicLowdimWrapper
+from .robomimic_image import RobomimicImageWrapper
+from .d3il_lowdim import D3ilLowdimWrapper
+from .mujoco_locomotion_lowdim import MujocoLocomotionLowdimWrapper
+
+
+wrapper_dict = {
+    "multi_step": MultiStep,
+    "robomimic_lowdim": RobomimicLowdimWrapper,
+    "robomimic_image": RobomimicImageWrapper,
+    "d3il_lowdim": D3ilLowdimWrapper,
+    "mujoco_locomotion_lowdim": MujocoLocomotionLowdimWrapper,
+}
diff --git a/env/gym_utils/wrapper/d3il_lowdim.py b/env/gym_utils/wrapper/d3il_lowdim.py
new file mode 100644
index 0000000..80d5591
--- /dev/null
+++ b/env/gym_utils/wrapper/d3il_lowdim.py
@@ -0,0 +1,87 @@
+"""
+Environment wrapper for D3IL environments with state observations.
+
+"""
+
+import numpy as np
+import gym
+
+
+class D3ilLowdimWrapper(gym.Env):
+    def __init__(
+        self,
+        env,
+        normalization_path,
+        # init_state=None,
+        # render_hw=(256, 256),
+        # render_camera_name="agentview",
+    ):
+        self.env = env
+        # self.init_state = init_state
+        # self.render_hw = render_hw
+        # self.render_camera_name = render_camera_name
+
+        # setup spaces
+        self.action_space = env.action_space
+        self.observation_space = env.observation_space
+        normalization = np.load(normalization_path)
+        self.obs_min = normalization["obs_min"]
+        self.obs_max = normalization["obs_max"]
+        self.action_min = normalization["action_min"]
+        self.action_max = normalization["action_max"]
+
+    # def get_observation(self):
+    #     raw_obs = self.env.get_observation()
+    #     obs = np.concatenate([raw_obs[key] for key in self.obs_keys], axis=0)
+    #     return obs
+
+    def seed(self, seed=None):
+        if seed is not None:
+            np.random.seed(seed=seed)
+        else:
+            np.random.seed()
+
+    def reset(self, **kwargs):
+        """Ignore passed-in arguments like seed"""
+        options = kwargs.get("options", {})
+
+        new_seed = options.get(
+            "seed", None
+        )  # used to set all environments to specified seeds
+        # if self.init_state is not None:
+        #     # always reset to the same state to be compatible with gym
+        #     self.env.reset_to({"states": self.init_state})
+        if new_seed is not None:
+            self.seed(seed=new_seed)
+            obs = self.env.reset()
+        else:
+            # random reset
+            obs = self.env.reset()
+
+        # normalize
+        obs = self.normalize_obs(obs)
+        return obs
+
+    def normalize_obs(self, obs):
+        return 2 * ((obs - self.obs_min) / (self.obs_max - self.obs_min + 1e-6) - 0.5)
+
+    def unnormaliza_action(self, action):
+        action = (action + 1) / 2  # [-1, 1] -> [0, 1]
+        return action * (self.action_max - self.action_min) + self.action_min
+
+    def step(self, action):
+        action = self.unnormaliza_action(action)
+        obs, reward, done, info = self.env.step(action)
+
+        # normalize
+        obs = self.normalize_obs(obs)
+        return obs, reward, done, info
+
+    def render(self, mode="rgb_array"):
+        h, w = self.render_hw
+        return self.env.render(
+            mode=mode,
+            height=h,
+            width=w,
+            camera_name=self.render_camera_name,
+        )
diff --git a/env/gym_utils/wrapper/furniture.py b/env/gym_utils/wrapper/furniture.py
new file mode 100644
index 0000000..0c5776d
--- /dev/null
+++ b/env/gym_utils/wrapper/furniture.py
@@ -0,0 +1,152 @@
+"""
+Environment wrapper for Furniture-Bench environments.
+
+"""
+
+import gym
+import numpy as np
+from furniture_bench.envs.furniture_rl_sim_env import FurnitureRLSimEnv
+import torch
+from furniture_bench.controllers.control_utils import proprioceptive_quat_to_6d_rotation
+from ..furniture_normalizer import LinearNormalizer
+from .multi_step import repeated_space
+
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class FurnitureRLSimEnvMultiStepWrapper(gym.Wrapper):
+    env: FurnitureRLSimEnv
+
+    def __init__(
+        self,
+        env: FurnitureRLSimEnv,
+        n_obs_steps=1,
+        n_action_steps=1,
+        max_episode_steps=None,
+        sparse_reward=False,
+        reward_agg_method="sum",  # never use other types
+        reset_within_step=False,
+        pass_full_observations=False,
+        normalization_path=None,
+        prev_action=False,
+    ):
+        assert (
+            not reset_within_step
+        ), "reset_within_step must be False for furniture envs"
+        assert n_obs_steps == 1, "n_obs_steps must be 1"
+        assert reward_agg_method == "sum", "reward_agg_method must be sum"
+        assert (
+            not pass_full_observations
+        ), "pass_full_observations is not implemented yet"
+        assert not prev_action, "prev_action is not implemented yet"
+
+        super().__init__(env)
+        self._single_action_space = env.action_space
+        self._action_space = repeated_space(env.action_space, n_action_steps)
+        self._observation_space = repeated_space(env.observation_space, n_obs_steps)
+        self.max_episode_steps = max_episode_steps
+        self.n_obs_steps = n_obs_steps
+        self.n_action_steps = n_action_steps
+        self.pass_full_observations = pass_full_observations
+
+        # Use the original reward function where the robot does not receive new reward after completing one part
+        self.sparse_reward = sparse_reward
+
+        # set up normalization
+        self.normalize = normalization_path is not None
+        self.normalizer = LinearNormalizer()
+        self.normalizer.load_state_dict(
+            torch.load(normalization_path, map_location=self.device, weights_only=True)
+        )
+        log.info(f"Loaded normalization from {normalization_path}")
+
+    def reset(
+        self,
+        **kwargs,
+    ):
+        """Resets the environment."""
+        obs = self.env.reset()
+        nobs = self.process_obs(obs)
+        self.best_reward = torch.zeros(self.env.num_envs).to(self.device)
+        self.done = list()
+
+        return nobs
+
+    def reset_arg(self, options_list=None):
+        return self.reset()
+
+    def reset_one_arg(self, env_ind=None, options=None):
+        if env_ind is not None:
+            env_ind = torch.tensor([env_ind], device=self.device)
+
+        return self.reset()
+
+    def step(self, action: np.ndarray):
+        """
+        Takes in a chunk of actions of length n_action_steps
+        and steps the environment n_action_steps times
+        and returns an aggregated observation, reward, and done signal
+        """
+        # action: (n_envs, n_action_steps, action_dim)
+        action = torch.tensor(action, device=self.device)
+
+        # Denormalize the action
+        action = self.normalizer(action, "actions", forward=False)
+
+        # Step the environment n_action_steps times
+        obs, sparse_reward, dense_reward, done, info = self._inner_step(action)
+        if self.sparse_reward:
+            reward = sparse_reward.clone().cpu().numpy()
+        else:
+            reward = dense_reward.clone().cpu().numpy()
+
+        # Only mark the environment as done if it times out, ignore done from inner steps
+        truncated = self.env.env_steps >= self.max_env_steps
+        done = truncated
+
+        nobs: np.ndarray = self.process_obs(obs)
+        done: np.ndarray = done.squeeze().cpu().numpy()
+
+        return (nobs, reward, done, info)
+
+    def _inner_step(self, action_chunk: torch.Tensor):
+        dones = torch.zeros(
+            action_chunk.shape[0], dtype=torch.bool, device=action_chunk.device
+        )
+        dense_reward = torch.zeros(action_chunk.shape[0], device=action_chunk.device)
+        sparse_reward = torch.zeros(action_chunk.shape[0], device=action_chunk.device)
+        for i in range(self.n_action_steps):
+            # The dimensions of the action_chunk are (num_envs, chunk_size, action_dim)
+            obs, reward, done, info = self.env.step(action_chunk[:, i, :])
+
+            # track raw reward
+            sparse_reward += reward.squeeze()
+
+            # track best reward --- reward nonzero only one part is assembled
+            self.best_reward += reward.squeeze()
+
+            # assign "permanent" rewards
+            dense_reward += self.best_reward
+
+            dones = dones | done.squeeze()
+
+        return obs, sparse_reward, dense_reward, dones, info
+
+    def process_obs(self, obs: torch.Tensor) -> np.ndarray:
+        robot_state = obs["robot_state"]
+
+        # Convert the robot state to have 6D pose
+        robot_state = proprioceptive_quat_to_6d_rotation(robot_state)
+
+        parts_poses = obs["parts_poses"]
+
+        obs = torch.cat([robot_state, parts_poses], dim=-1)
+        nobs = self.normalizer(obs, "observations", forward=True)
+        nobs = torch.clamp(nobs, -5, 5)
+
+        # Insert a dummy dimension for the n_obs_steps (n_envs, obs_dim) -> (n_envs, n_obs_steps, obs_dim)
+        nobs = nobs.unsqueeze(1).cpu().numpy()
+
+        return nobs
diff --git a/env/gym_utils/wrapper/mujoco_locomotion_lowdim.py b/env/gym_utils/wrapper/mujoco_locomotion_lowdim.py
new file mode 100644
index 0000000..7b2247f
--- /dev/null
+++ b/env/gym_utils/wrapper/mujoco_locomotion_lowdim.py
@@ -0,0 +1,61 @@
+"""
+Environment wrapper for Gym environments (MuJoCo locomotion tasks) with state observations.
+
+"""
+
+import numpy as np
+import gym
+
+
+class MujocoLocomotionLowdimWrapper(gym.Env):
+    def __init__(
+        self,
+        env,
+        normalization_path,
+    ):
+        self.env = env
+
+        # setup spaces
+        self.action_space = env.action_space
+        self.observation_space = env.observation_space
+        normalization = np.load(normalization_path)
+        self.obs_min = normalization["obs_min"]
+        self.obs_max = normalization["obs_max"]
+        self.action_min = normalization["action_min"]
+        self.action_max = normalization["action_max"]
+
+    def seed(self, seed=None):
+        if seed is not None:
+            np.random.seed(seed=seed)
+        else:
+            np.random.seed()
+
+    def reset(self, **kwargs):
+        """Ignore passed-in arguments like seed"""
+        options = kwargs.get("options", {})
+        new_seed = options.get("seed", None)
+        if new_seed is not None:
+            self.seed(seed=new_seed)
+        raw_obs = self.env.reset()
+
+        # normalize
+        obs = self.normalize_obs(raw_obs)
+        return obs
+
+    def normalize_obs(self, obs):
+        return 2 * ((obs - self.obs_min) / (self.obs_max - self.obs_min + 1e-6) - 0.5)
+
+    def unnormaliza_action(self, action):
+        action = (action + 1) / 2  # [-1, 1] -> [0, 1]
+        return action * (self.action_max - self.action_min) + self.action_min
+
+    def step(self, action):
+        raw_action = self.unnormaliza_action(action)
+        raw_obs, reward, done, info = self.env.step(raw_action)
+
+        # normalize
+        obs = self.normalize_obs(raw_obs)
+        return obs, reward, done, info
+
+    def render(self, **kwargs):
+        return self.env.render()
diff --git a/env/gym_utils/wrapper/multi_step.py b/env/gym_utils/wrapper/multi_step.py
new file mode 100644
index 0000000..d3b031d
--- /dev/null
+++ b/env/gym_utils/wrapper/multi_step.py
@@ -0,0 +1,283 @@
+"""
+Multi-step wrapper. Allow executing multiple environmnt steps. Returns stacked observation and optionally stacked previous action.
+
+Modified from https://github.com/real-stanford/diffusion_policy/blob/main/diffusion_policy/gym_util/multistep_wrapper.py
+
+"""
+
+import gym
+from typing import Optional
+from gym import spaces
+import numpy as np
+from collections import defaultdict, deque
+
+# import dill
+
+
+def stack_repeated(x, n):
+    return np.repeat(np.expand_dims(x, axis=0), n, axis=0)
+
+
+def repeated_box(box_space, n):
+    return spaces.Box(
+        low=stack_repeated(box_space.low, n),
+        high=stack_repeated(box_space.high, n),
+        shape=(n,) + box_space.shape,
+        dtype=box_space.dtype,
+    )
+
+
+def repeated_space(space, n):
+    if isinstance(space, spaces.Box):
+        return repeated_box(space, n)
+    elif isinstance(space, spaces.Dict):
+        result_space = spaces.Dict()
+        for key, value in space.items():
+            result_space[key] = repeated_space(value, n)
+        return result_space
+    else:
+        raise RuntimeError(f"Unsupported space type {type(space)}")
+
+
+def take_last_n(x, n):
+    x = list(x)
+    n = min(len(x), n)
+    return np.array(x[-n:])
+
+
+def dict_take_last_n(x, n):
+    result = dict()
+    for key, value in x.items():
+        result[key] = take_last_n(value, n)
+    return result
+
+
+def aggregate(data, method="max"):
+    if method == "max":
+        # equivalent to any
+        return np.max(data)
+    elif method == "min":
+        # equivalent to all
+        return np.min(data)
+    elif method == "mean":
+        return np.mean(data)
+    elif method == "sum":
+        return np.sum(data)
+    else:
+        raise NotImplementedError()
+
+
+def stack_last_n_obs(all_obs, n_steps):
+    """Apply padding"""
+    assert len(all_obs) > 0
+    all_obs = list(all_obs)
+    result = np.zeros((n_steps,) + all_obs[-1].shape, dtype=all_obs[-1].dtype)
+    start_idx = -min(n_steps, len(all_obs))
+    result[start_idx:] = np.array(all_obs[start_idx:])
+    if n_steps > len(all_obs):
+        # pad
+        result[:start_idx] = result[start_idx]
+    return result
+
+
+class MultiStep(gym.Wrapper):
+
+    def __init__(
+        self,
+        env,
+        n_obs_steps=1,
+        n_action_steps=1,
+        max_episode_steps=None,
+        reward_agg_method="sum",  # never use other types
+        prev_action=True,
+        reset_within_step=False,
+        pass_full_observations=False,
+        verbose=False,
+        **kwargs,
+    ):
+        super().__init__(env)
+        self._single_action_space = env.action_space
+        self._action_space = repeated_space(env.action_space, n_action_steps)
+        self._observation_space = repeated_space(env.observation_space, n_obs_steps)
+        self.max_episode_steps = max_episode_steps
+        self.n_obs_steps = n_obs_steps
+        self.n_action_steps = n_action_steps
+        self.reward_agg_method = reward_agg_method
+        self.prev_action = prev_action
+        self.reset_within_step = reset_within_step
+        self.pass_full_observations = pass_full_observations
+        self.verbose = verbose
+
+    def reset(
+        self,
+        seed: Optional[int] = None,
+        return_info: bool = False,
+        options: dict = {},
+    ):
+        """Resets the environment."""
+        obs = self.env.reset(
+            seed=seed,
+            options=options,
+            return_info=return_info,
+        )
+        self.obs = deque([obs], maxlen=max(self.n_obs_steps + 1, self.n_action_steps))
+        if self.prev_action:
+            self.action = deque(
+                [self._single_action_space.sample()], maxlen=self.n_obs_steps
+            )
+        self.reward = list()
+        self.done = list()
+        self.info = defaultdict(lambda: deque(maxlen=self.n_obs_steps + 1))
+        obs = self._get_obs(self.n_obs_steps)
+
+        self.cnt = 0
+        return obs
+
+    def step(self, action):
+        """
+        actions: (n_action_steps,) + action_shape
+        """
+        if action.ndim == 1:  # in case action_steps = 1
+            action = action[None]
+        for act_step, act in enumerate(action):
+            self.cnt += 1
+
+            if len(self.done) > 0 and self.done[-1]:
+                # termination
+                break
+            observation, reward, done, info = self.env.step(act)
+
+            self.obs.append(observation)
+            self.action.append(act)
+            self.reward.append(reward)
+            if (
+                self.max_episode_steps is not None
+            ) and self.cnt >= self.max_episode_steps:
+                # truncation
+                done = True
+            self.done.append(done)
+            self._add_info(info)
+
+        observation = self._get_obs(self.n_obs_steps)
+        reward = aggregate(self.reward, self.reward_agg_method)
+        done = aggregate(self.done, "max")
+        info = dict_take_last_n(self.info, self.n_obs_steps)
+        if self.pass_full_observations:  # right now this assume n_obs_steps = 1
+            info["full_obs"] = self._get_obs(act_step + 1)
+
+        # In mujoco case, done can happen within the loop above
+        if self.reset_within_step and self.done[-1]:
+            observation = (
+                self.reset()
+            )  # TODO: arguments? this cannot handle video recording right now since needs to pass in options
+            self.verbose and print("Reset env within wrapper.")
+
+        # reset reward and done for next step
+        self.reward = list()
+        self.done = list()
+        return observation, reward, done, info
+
+    def _get_obs(self, n_steps=1):
+        """
+        Output (n_steps,) + obs_shape
+        """
+        assert len(self.obs) > 0
+        if isinstance(self.observation_space, spaces.Box):
+            return stack_last_n_obs(self.obs, n_steps)
+        elif isinstance(self.observation_space, spaces.Dict):
+            result = dict()
+            for key in self.observation_space.keys():
+                result[key] = stack_last_n_obs([obs[key] for obs in self.obs], n_steps)
+            return result
+        else:
+            raise RuntimeError("Unsupported space type")
+
+    def get_prev_action(self, n_steps=None):
+        if n_steps is None:
+            n_steps = self.n_obs_steps - 1  # exclude current step
+        assert len(self.action) > 0
+        return stack_last_n_obs(self.action, n_steps)
+
+    def _add_info(self, info):
+        for key, value in info.items():
+            self.info[key].append(value)
+
+    def render(self, **kwargs):
+        """Not the best design"""
+        return self.env.render(**kwargs)
+
+    # def get_rewards(self):
+    #     return self.reward
+
+    # def get_attr(self, name):
+    #     return getattr(self, name)
+
+    # def run_dill_function(self, dill_fn):
+    #     fn = dill.loads(dill_fn)
+    #     return fn(self)
+
+    # def get_infos(self):
+    #     result = dict()
+    #     for k, v in self.info.items():
+    #         result[k] = list(v)
+    #     return result
+
+
+if __name__ == "__main__":
+    import os
+    from omegaconf import OmegaConf
+    import json
+
+    os.environ["MUJOCO_GL"] = "egl"
+
+    cfg = OmegaConf.load("cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml")
+    shape_meta = cfg["shape_meta"]
+
+    import robomimic.utils.env_utils as EnvUtils
+    import robomimic.utils.obs_utils as ObsUtils
+    import matplotlib.pyplot as plt
+    from env.gym_utils.wrapper.robomimic_image import RobomimicImageWrapper
+
+    wrappers = cfg.env.wrappers
+    obs_modality_dict = {
+        "low_dim": (
+            wrappers.robomimic_image.low_dim_keys
+            if "robomimic_image" in wrappers
+            else wrappers.robomimic_lowdim.low_dim_keys
+        ),
+        "rgb": (
+            wrappers.robomimic_image.image_keys
+            if "robomimic_image" in wrappers
+            else None
+        ),
+    }
+    if obs_modality_dict["rgb"] is None:
+        obs_modality_dict.pop("rgb")
+    ObsUtils.initialize_obs_modality_mapping_from_dict(obs_modality_dict)
+
+    with open(cfg.robomimic_env_cfg_path, "r") as f:
+        env_meta = json.load(f)
+    env = EnvUtils.create_env_from_metadata(
+        env_meta=env_meta,
+        render=False,
+        render_offscreen=False,
+        use_image_obs=True,
+    )
+    env.env.hard_reset = False
+
+    wrapper = MultiStep(
+        env=RobomimicImageWrapper(
+            env=env,
+            shape_meta=shape_meta,
+            image_keys=["robot0_eye_in_hand_image"],
+        ),
+        n_obs_steps=1,
+        n_action_steps=1,
+    )
+    wrapper.seed(0)
+    obs = wrapper.reset()
+    print(obs.keys())
+    img = wrapper.render()
+    wrapper.close()
+    plt.imshow(img)
+    plt.savefig("test.png")
diff --git a/env/gym_utils/wrapper/robomimic_image.py b/env/gym_utils/wrapper/robomimic_image.py
new file mode 100644
index 0000000..aa078ea
--- /dev/null
+++ b/env/gym_utils/wrapper/robomimic_image.py
@@ -0,0 +1,227 @@
+"""
+Environment wrapper for Robomimic environments with image observations.
+
+Modified from https://github.com/real-stanford/diffusion_policy/blob/main/diffusion_policy/env/robomimic/robomimic_image_wrapper.py
+
+"""
+
+import numpy as np
+import gym
+from gym import spaces
+import imageio
+
+
+class RobomimicImageWrapper(gym.Env):
+    def __init__(
+        self,
+        env,
+        shape_meta: dict,
+        normalization_path=None,
+        low_dim_keys=[
+            "robot0_eef_pos",
+            "robot0_eef_quat",
+            "robot0_gripper_qpos",
+        ],
+        image_keys=[
+            "agentview_image",
+            "robot0_eye_in_hand_image",
+        ],
+        clamp_obs=False,
+        init_state=None,
+        render_hw=(256, 256),
+        render_camera_name="agentview",
+    ):
+        self.env = env
+        self.init_state = init_state
+        self.has_reset_before = False
+        self.render_hw = render_hw
+        self.render_camera_name = render_camera_name
+        self.video_writer = None
+        self.clamp_obs = clamp_obs
+
+        # set up normalization
+        self.normalize = normalization_path is not None
+        if self.normalize:
+            normalization = np.load(normalization_path)
+            self.obs_min = normalization["obs_min"]
+            self.obs_max = normalization["obs_max"]
+            self.action_min = normalization["action_min"]
+            self.action_max = normalization["action_max"]
+
+        # setup spaces
+        low = np.full(env.action_dimension, fill_value=-1)
+        high = np.full(env.action_dimension, fill_value=1)
+        self.action_space = gym.spaces.Box(
+            low=low,
+            high=high,
+            shape=low.shape,
+            dtype=low.dtype,
+        )
+        self.low_dim_keys = low_dim_keys
+        self.image_keys = image_keys
+        self.obs_keys = low_dim_keys + image_keys
+        observation_space = spaces.Dict()
+        for key, value in shape_meta["obs"].items():
+            shape = value["shape"]
+            if key.endswith("rgb"):
+                min_value, max_value = 0, 1
+            elif key.endswith("state"):
+                min_value, max_value = -1, 1
+            else:
+                raise RuntimeError(f"Unsupported type {key}")
+            this_space = spaces.Box(
+                low=min_value,
+                high=max_value,
+                shape=shape,
+                dtype=np.float32,
+            )
+            observation_space[key] = this_space
+        self.observation_space = observation_space
+
+    def normalize_obs(self, obs):
+        obs = 2 * (
+            (obs - self.obs_min) / (self.obs_max - self.obs_min + 1e-6) - 0.5
+        )  # -> [-1, 1]
+        if self.clamp_obs:
+            obs = np.clip(obs, -1, 1)
+        return obs
+
+    def unnormalize_action(self, action):
+        action = (action + 1) / 2  # [-1, 1] -> [0, 1]
+        return action * (self.action_max - self.action_min) + self.action_min
+
+    def get_observation(self, raw_obs=None):
+        if raw_obs is None:
+            raw_obs = self.env.get_observation()
+        obs = {"rgb": None, "state": None}  # stack rgb if multiple cameras
+        for key in self.obs_keys:
+            if key in self.image_keys:
+                if obs["rgb"] is None:
+                    obs["rgb"] = raw_obs[key]
+                else:
+                    obs["rgb"] = np.concatenate(
+                        [obs["rgb"], raw_obs[key]], axis=0
+                    )  # C H W
+            else:
+                if obs["state"] is None:
+                    obs["state"] = raw_obs[key]
+                else:
+                    obs["state"] = np.concatenate([obs["state"], raw_obs[key]], axis=-1)
+        if self.normalize:
+            obs["state"] = self.normalize_obs(obs["state"])
+        obs["rgb"] *= 255  # [0, 1] -> [0, 255], in float64
+        return obs
+
+    def seed(self, seed=None):
+        if seed is not None:
+            np.random.seed(seed=seed)
+        else:
+            np.random.seed()
+
+    def reset(self, options={}, **kwargs):
+        """Ignore passed-in arguments like seed"""
+        # Close video if exists
+        if self.video_writer is not None:
+            self.video_writer.close()
+            self.video_writer = None
+
+        # Start video if specified
+        if "video_path" in options:
+            self.video_writer = imageio.get_writer(options["video_path"], fps=30)
+
+        # Call reset
+        new_seed = options.get(
+            "seed", None
+        )  # used to set all environments to specified seeds
+        if self.init_state is not None:
+            if not self.has_reset_before:
+                # the env must be fully reset at least once to ensure correct rendering
+                self.env.reset()
+                self.has_reset_before = True
+
+            # always reset to the same state to be compatible with gym
+            raw_obs = self.env.reset_to({"states": self.init_state})
+        elif new_seed is not None:
+            self.seed(seed=new_seed)
+            raw_obs = self.env.reset()
+        else:
+            # random reset
+            raw_obs = self.env.reset()
+        return self.get_observation(raw_obs)
+
+    def step(self, action):
+        if self.normalize:
+            action = self.unnormalize_action(action)
+        raw_obs, reward, done, info = self.env.step(action)
+        obs = self.get_observation(raw_obs)
+
+        # render if specified
+        if self.video_writer is not None:
+            video_img = self.render(mode="rgb_array")
+            self.video_writer.append_data(video_img)
+
+        return obs, reward, done, info
+
+    def render(self, mode="rgb_array"):
+        h, w = self.render_hw
+        return self.env.render(
+            mode=mode,
+            height=h,
+            width=w,
+            camera_name=self.render_camera_name,
+        )
+
+
+if __name__ == "__main__":
+    import os
+    from omegaconf import OmegaConf
+    import json
+
+    os.environ["MUJOCO_GL"] = "egl"
+
+    cfg = OmegaConf.load("cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml")
+    shape_meta = cfg["shape_meta"]
+
+    import robomimic.utils.env_utils as EnvUtils
+    import robomimic.utils.obs_utils as ObsUtils
+    import matplotlib.pyplot as plt
+
+    wrappers = cfg.env.wrappers
+    obs_modality_dict = {
+        "low_dim": (
+            wrappers.robomimic_image.low_dim_keys
+            if "robomimic_image" in wrappers
+            else wrappers.robomimic_lowdim.low_dim_keys
+        ),
+        "rgb": (
+            wrappers.robomimic_image.image_keys
+            if "robomimic_image" in wrappers
+            else None
+        ),
+    }
+    if obs_modality_dict["rgb"] is None:
+        obs_modality_dict.pop("rgb")
+    ObsUtils.initialize_obs_modality_mapping_from_dict(obs_modality_dict)
+
+    with open(cfg.robomimic_env_cfg_path, "r") as f:
+        env_meta = json.load(f)
+    env = EnvUtils.create_env_from_metadata(
+        env_meta=env_meta,
+        render=False,
+        render_offscreen=False,
+        use_image_obs=True,
+    )
+    env.env.hard_reset = False
+
+    wrapper = RobomimicImageWrapper(
+        env=env,
+        shape_meta=shape_meta,
+        image_keys=["robot0_eye_in_hand_image"],
+    )
+    wrapper.seed(0)
+    obs = wrapper.reset()
+    print(obs.keys())
+    img = wrapper.render()
+    wrapper.close()
+    plt.imshow(img)
+    plt.savefig("test.png")
diff --git a/env/gym_utils/wrapper/robomimic_lowdim.py b/env/gym_utils/wrapper/robomimic_lowdim.py
new file mode 100644
index 0000000..adbb43f
--- /dev/null
+++ b/env/gym_utils/wrapper/robomimic_lowdim.py
@@ -0,0 +1,142 @@
+"""
+Environment wrapper for Robomimic environments with state observations.
+
+Modified from https://github.com/real-stanford/diffusion_policy/blob/main/diffusion_policy/env/robomimic/robomimic_lowdim_wrapper.py
+
+"""
+
+import numpy as np
+import gym
+from gym.spaces import Box
+import imageio
+
+
+class RobomimicLowdimWrapper(gym.Env):
+    def __init__(
+        self,
+        env,
+        normalization_path=None,
+        low_dim_keys=[
+            "robot0_eef_pos",
+            "robot0_eef_quat",
+            "robot0_gripper_qpos",
+            "object",
+        ],
+        clamp_obs=False,
+        init_state=None,
+        render_hw=(256, 256),
+        render_camera_name="agentview",
+    ):
+        self.env = env
+        self.obs_keys = low_dim_keys
+        self.init_state = init_state
+        self.render_hw = render_hw
+        self.render_camera_name = render_camera_name
+        self.video_writer = None
+        self.clamp_obs = clamp_obs
+
+        # set up normalization
+        self.normalize = normalization_path is not None
+        if self.normalize:
+            normalization = np.load(normalization_path)
+            self.obs_min = normalization["obs_min"]
+            self.obs_max = normalization["obs_max"]
+            self.action_min = normalization["action_min"]
+            self.action_max = normalization["action_max"]
+
+        # setup spaces - use [-1, 1]
+        low = np.full(env.action_dimension, fill_value=-1)
+        high = np.full(env.action_dimension, fill_value=1)
+        self.action_space = Box(
+            low=low,
+            high=high,
+            shape=low.shape,
+            dtype=low.dtype,
+        )
+        obs_example = self.get_observation()
+        low = np.full_like(obs_example, fill_value=-1)
+        high = np.full_like(obs_example, fill_value=1)
+        self.observation_space = Box(
+            low=low,
+            high=high,
+            shape=low.shape,
+            dtype=low.dtype,
+        )
+
+    def normalize_obs(self, obs):
+        obs = 2 * (
+            (obs - self.obs_min) / (self.obs_max - self.obs_min + 1e-6) - 0.5
+        )  # -> [-1, 1]
+        if self.clamp_obs:
+            obs = np.clip(obs, -1, 1)
+        return obs
+
+    def unnormalize_action(self, action):
+        action = (action + 1) / 2  # [-1, 1] -> [0, 1]
+        return action * (self.action_max - self.action_min) + self.action_min
+
+    def get_observation(self):
+        raw_obs = self.env.get_observation()
+        raw_obs = np.concatenate([raw_obs[key] for key in self.obs_keys], axis=0)
+        if self.normalize:
+            return self.normalize_obs(raw_obs)
+        return raw_obs
+
+    def seed(self, seed=None):
+        if seed is not None:
+            np.random.seed(seed=seed)
+        else:
+            np.random.seed()
+
+    def reset(self, options={}, **kwargs):
+        """Ignore passed-in arguments like seed"""
+
+        # Close video if exists
+        if self.video_writer is not None:
+            self.video_writer.close()
+            self.video_writer = None
+
+        # Start video if specified
+        if "video_path" in options:
+            self.video_writer = imageio.get_writer(options["video_path"], fps=30)
+
+        # Call reset
+        new_seed = options.get(
+            "seed", None
+        )  # used to set all environments to specified seeds
+        if self.init_state is not None:
+            # always reset to the same state to be compatible with gym
+            self.env.reset_to({"states": self.init_state})
+        elif new_seed is not None:
+            self.seed(seed=new_seed)
+            self.env.reset()
+        else:
+            # random reset
+            self.env.reset()
+        return self.get_observation()
+
+    def step(self, action):
+        if self.normalize:
+            action = self.unnormalize_action(action)
+        raw_obs, reward, done, info = self.env.step(action)
+        raw_obs = np.concatenate([raw_obs[key] for key in self.obs_keys], axis=0)
+        if self.normalize:
+            obs = self.normalize_obs(raw_obs)
+        else:
+            obs = raw_obs
+
+        # render if specified
+        if self.video_writer is not None:
+            video_img = self.render(mode="rgb_array")
+            self.video_writer.append_data(video_img)
+
+        return obs, reward, done, info
+
+    def render(self, mode="rgb_array"):
+        h, w = self.render_hw
+        return self.env.render(
+            mode=mode,
+            height=h,
+            width=w,
+            camera_name=self.render_camera_name,
+        )
diff --git a/env/plot_traj.py b/env/plot_traj.py
new file mode 100644
index 0000000..5a4e6d6
--- /dev/null
+++ b/env/plot_traj.py
@@ -0,0 +1,169 @@
+"""
+Plotting D3IL trajectories
+
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+from functools import partial
+
+
+class TrajPlotter:
+
+    def __init__(self, env_type, **kwargs):
+        if env_type == "toy":
+            self.save_traj = save_toy_traj
+        elif env_type == "avoid":
+            self.save_traj = partial(save_avoid_traj, **kwargs)
+        else:
+            self.save_traj = dummy
+
+    def __call__(self, **kwargs):
+        self.save_traj(**kwargs)
+
+
+def dummy(*args, **kwargs):
+    pass
+
+
+def save_avoid_traj(
+    obs_full_trajs,
+    n_render,
+    max_episode_steps,
+    render_dir,
+    itr,
+    normalization_path,
+):
+    normalization = np.load(normalization_path)
+    obs_min = normalization["obs_min"]
+    obs_max = normalization["obs_max"]
+
+    # action_min = normalization['action_min']
+    # action_max = normalization['action_max']
+    def unnormalize_obs(obs):
+        obs = (obs + 1) / 2  # [-1, 1] -> [0, 1]
+        return obs * (obs_max - obs_min) + obs_min
+
+    def get_obj_xy_list():
+        mid_pos = 0.5
+        offset = 0.075
+        first_level_y = -0.1
+        level_distance = 0.18
+        return [
+            [mid_pos, first_level_y],
+            [mid_pos - offset, first_level_y + level_distance],
+            [mid_pos + offset, first_level_y + level_distance],
+            [mid_pos - 2 * offset, first_level_y + 2 * level_distance],
+            [mid_pos, first_level_y + 2 * level_distance],
+            [mid_pos + 2 * offset, first_level_y + 2 * level_distance],
+        ]
+
+    pillar_xys = get_obj_xy_list()
+    chosen_i = np.random.choice(
+        range(obs_full_trajs.shape[1]),
+        n_render,
+        replace=False,
+    )
+    fig = plt.figure()
+    for i in chosen_i:
+        obs_traj_env = obs_full_trajs[:max_episode_steps, i, :]
+        obs_traj_env = unnormalize_obs(obs_traj_env)
+
+        # bnds = np.array([[0, 8], [-3, 3]])  # denormalize
+        # obs_traj_env = obs_traj_env * (bnds[:, 1] - bnds[:, 0]) + bnds[:, 0]
+        # for j in range(len(obs_traj_env) - 4, len(obs_traj_env)):
+        for j in range(len(obs_traj_env)):
+            plt.scatter(
+                obs_traj_env[j, 0],
+                obs_traj_env[j, 1],
+                marker="o",
+                s=2,
+                # s=0.2,
+                # c=plt.cm.Blues(1 - j / 50 + 0.1),
+                color=(0.3, 0.3, 0.3),
+            )
+            if j > 0:  # connect
+                plt.plot(
+                    [obs_traj_env[j - 1, 0], obs_traj_env[j, 0]],
+                    [obs_traj_env[j - 1, 1], obs_traj_env[j, 1]],
+                    color=(0.3, 0.3, 0.3),
+                )
+    # finish line
+    plt.axhline(y=0.4, color=np.array([31, 119, 180]) / 255, linestyle="-")
+    for xy in pillar_xys:
+        circle = plt.Circle(xy, 0.01, color=(0.0, 0.0, 0.0), fill=True)
+        plt.gca().add_patch(circle)
+
+    plt.xlabel("X pos")
+    plt.ylabel("Y pos")
+
+    plt.xlim([0.2, 0.8])
+    plt.ylim([-0.3, 0.5])
+    ax = plt.gca()
+    ax.set_aspect("equal", adjustable="box")
+    ax.set_facecolor("white")
+    plt.savefig(os.path.join(render_dir, f"traj-{itr}.png"))
+    plt.close(fig)
+
+
+def save_toy_traj(
+    obs_full_trajs,
+    n_render,
+    max_episode_steps,
+    render_dir,
+    itr,
+):
+    chosen_i = np.random.choice(
+        range(obs_full_trajs.shape[1]),
+        n_render,
+        replace=False,
+    )
+    for i in chosen_i:
+        obs_traj_env = obs_full_trajs[:max_episode_steps, i, :]
+        bnds = np.array([[0, 8], [-3, 3]])  # denormalize
+        obs_traj_env = obs_traj_env * (bnds[:, 1] - bnds[:, 0]) + bnds[:, 0]
+        fig = plt.figure()
+        for j in range(max_episode_steps):
+            plt.scatter(
+                obs_traj_env[j, 0],
+                obs_traj_env[j, 1],
+                marker="o",
+                s=20,
+                c=plt.cm.Blues(1 - j / 50 + 0.1),
+            )
+            if j > 0:  # connect
+                plt.plot(
+                    [obs_traj_env[j - 1, 0], obs_traj_env[j, 0]],
+                    [obs_traj_env[j - 1, 1], obs_traj_env[j, 1]],
+                    "k-",
+                )
+        plt.scatter(
+            obs_traj_env[0, 0],
+            obs_traj_env[0, 1],
+            marker="*",
+            s=100,
+            c="g",
+        )
+        plt.scatter(6, 0, marker="*", s=100, c="r")  # target
+        circle = plt.Circle((3, 0), 1, color="r", fill=True)
+        plt.gca().add_patch(circle)
+        plt.plot(
+            [
+                bnds[0, 0],
+                bnds[0, 1],
+                bnds[0, 1],
+                bnds[0, 0],
+                bnds[0, 0],
+            ],
+            [
+                bnds[1, 0],
+                bnds[1, 0],
+                bnds[1, 1],
+                bnds[1, 1],
+                bnds[1, 0],
+            ],
+            "k-",
+        )
+        plt.savefig(os.path.join(render_dir, f"traj-{itr}-{i}.png"))
+        plt.close(fig)
diff --git a/installation/install_d3il.md b/installation/install_d3il.md
new file mode 100644
index 0000000..27f1a05
--- /dev/null
+++ b/installation/install_d3il.md
@@ -0,0 +1,9 @@
+## Install D3IL
+
+Install our fork at your desired location: 
+```console
+git clone https://github.com/allenzren/d3il
+cd d3il
+pip install -e environments/d3il
+pip install -e environments/d3il/envs/gym_avoiding_env/
+```
\ No newline at end of file
diff --git a/installation/install_furniture.md b/installation/install_furniture.md
new file mode 100644
index 0000000..9795f60
--- /dev/null
+++ b/installation/install_furniture.md
@@ -0,0 +1,41 @@
+## Install IsaacGym and Furniture-Bench (borrowed from [robust-rearrangement](https://github.com/ankile/robust-rearrangement/tree/main?tab=readme-ov-file#install-isaacgym)) 
+
+### IsaacGym
+Download the IsaacGym installer from the [IsaacGym website](https://developer.nvidia.com/isaac-gym) and follow the instructions to download the package by running:
+* Click "Join now" and log into your NVIDIA account.
+* Click "Member area".
+* Read and check the box for the license agreement.
+* Download and unzip Isaac Gym - Ubuntu Linux 18.04 / 20.04 Preview 4 release.
+
+You can also download a copy of the file from their AWS S3 bucket for your convenience:
+```console
+cd ~
+wget https://iai-robust-rearrangement.s3.us-east-2.amazonaws.com/IsaacGym_Preview_4_Package.tar.gz
+```
+
+Once the zipped file is downloaded, move it to the desired location and unzip it by running:
+```console
+tar -xzf IsaacGym_Preview_4_Package.tar.gz
+```
+
+Now, you can install the IsaacGym package by navigating to the isaacgym directory and running:
+```
+cd isaacgym
+pip install -e python --no-cache-dir --force-reinstall
+```
+<!-- The --no-cache-dir and --force-reinstall flags are used to avoid potential issues with the installation we encountered. Also ignore Pip's notice that [notice] To update, run: pip install --upgrade pip as the current version of Pip is necessary for compatibility with the codebase. -->
+
+**Note:** You will most likely encounter the error ```ImportError: libpython3.8.so.1.0: cannot open shared object file: No such file or directory``` when you run fine-tuning. You may fix it by (add to .bashrc if you want it permanently):
+```console 
+conda env list # print your conda_path
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<conda_path>/envs/dppo/lib
+```
+
+### Furniture-Bench
+
+Install our fork at your desired location:
+```console
+git clone git@github.com:ankile/furniture-bench.git
+cd furniture-bench
+pip install -e .
+```
\ No newline at end of file
diff --git a/installation/install_mujoco.md b/installation/install_mujoco.md
new file mode 100644
index 0000000..cf5f7fc
--- /dev/null
+++ b/installation/install_mujoco.md
@@ -0,0 +1,15 @@
+## Install MuJoCo
+
+```console
+cd ~
+wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz
+mkdir $HOME/.mujoco
+tar -xvf mujoco210-linux-x86_64.tar.gz -C ~/.mujoco/
+echo -e 'export LD_LIBRARY_PATH=$HOME/.mujoco/mujoco210/bin' >> ~/.bashrc
+echo -e 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/nvidia' >> ~/.bashrc
+echo -e 'export PATH="$LD_LIBRARY_PATH:$PATH"' >> ~/.bashrc
+```
+For visualizing mujoco in a GUI viewer:
+```console
+export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so
+```
\ No newline at end of file
diff --git a/model/__init__.py b/model/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/model/common/__init__.py b/model/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/model/common/critic.py b/model/common/critic.py
new file mode 100644
index 0000000..dc59265
--- /dev/null
+++ b/model/common/critic.py
@@ -0,0 +1,164 @@
+"""
+Critic networks.
+
+"""
+
+import torch
+import copy
+import einops
+from copy import deepcopy
+
+from model.common.mlp import MLP, ResidualMLP
+from model.common.modules import SpatialEmb, RandomShiftsAug
+
+
+class CriticObs(torch.nn.Module):
+    """State-only critic network."""
+
+    def __init__(
+        self,
+        obs_dim,
+        mlp_dims,
+        activation_type="Mish",
+        use_layernorm=False,
+        residual_style=False,
+        **kwargs,
+    ):
+        super().__init__()
+        mlp_dims = [obs_dim] + mlp_dims + [1]
+        if residual_style:
+            self.Q1 = ResidualMLP(
+                mlp_dims,
+                activation_type=activation_type,
+                out_activation_type="Identity",
+                use_layernorm=use_layernorm,
+            )
+        else:
+            self.Q1 = MLP(
+                mlp_dims,
+                activation_type=activation_type,
+                out_activation_type="Identity",
+                use_layernorm=use_layernorm,
+                verbose=False,
+            )
+
+    def forward(self, x):
+        x = x.view(x.size(0), -1)
+        q1 = self.Q1(x)
+        return q1
+
+
+class CriticObsAct(torch.nn.Module):
+    """State-action double critic network."""
+
+    def __init__(
+        self,
+        obs_dim,
+        mlp_dims,
+        action_dim,
+        action_steps=1,
+        activation_type="Mish",
+        use_layernorm=False,
+        residual_tyle=False,
+        **kwargs,
+    ):
+        super().__init__()
+        mlp_dims = [obs_dim + action_dim * action_steps] + mlp_dims + [1]
+        if residual_tyle:
+            self.Q1 = ResidualMLP(
+                mlp_dims,
+                activation_type=activation_type,
+                out_activation_type="Identity",
+                use_layernorm=use_layernorm,
+            )
+        else:
+            self.Q1 = MLP(
+                mlp_dims,
+                activation_type=activation_type,
+                out_activation_type="Identity",
+                use_layernorm=use_layernorm,
+                verbose=False,
+            )
+        self.Q2 = copy.deepcopy(self.Q1)
+
+    def forward(self, x, action):
+        x = x.view(x.size(0), -1)
+        x = torch.cat((x, action), dim=-1)
+        q1 = self.Q1(x)
+        q2 = self.Q2(x)
+        return q1.squeeze(1), q2.squeeze(1)
+
+
+class ViTCritic(CriticObs):
+    """ViT + MLP, state only"""
+
+    def __init__(
+        self,
+        backbone,
+        obs_dim,
+        spatial_emb=128,
+        patch_repr_dim=128,
+        dropout=0,
+        augment=False,
+        num_img=1,
+        **kwargs,
+    ):
+        # update input dim to mlp
+        mlp_obs_dim = spatial_emb * num_img + obs_dim
+        super().__init__(obs_dim=mlp_obs_dim, **kwargs)
+        self.backbone = backbone
+        if num_img > 1:
+            self.compress1 = SpatialEmb(
+                num_patch=121,  # TODO: repr_dim // patch_repr_dim,
+                patch_dim=patch_repr_dim,
+                prop_dim=obs_dim,
+                proj_dim=spatial_emb,
+                dropout=dropout,
+            )
+            self.compress2 = deepcopy(self.compress1)
+        else:  # TODO: clean up
+            self.compress = SpatialEmb(
+                num_patch=121,
+                patch_dim=patch_repr_dim,
+                prop_dim=obs_dim,
+                proj_dim=spatial_emb,
+                dropout=dropout,
+            )
+        if augment:
+            self.aug = RandomShiftsAug(pad=4)
+        self.augment = augment
+
+    def forward(
+        self,
+        obs: dict,
+        no_augment=False,
+    ):
+        # flatten cond_dim if exists
+        if obs["rgb"].ndim == 5:
+            rgb = einops.rearrange(obs["rgb"], "b d c h w -> (b d) c h w")
+        else:
+            rgb = obs["rgb"]
+        if obs["state"].ndim == 3:
+            state = einops.rearrange(obs["state"], "b d c -> (b d) c")
+        else:
+            state = obs["state"]
+
+        # get vit output - pass in two images separately
+        if rgb.shape[1] == 6:  # TODO: properly handle multiple images
+            rgb1 = rgb[:, :3]
+            rgb2 = rgb[:, 3:]
+            if self.augment and not no_augment:
+                rgb1 = self.aug(rgb1)
+                rgb2 = self.aug(rgb2)
+            feat1 = self.backbone(rgb1)
+            feat2 = self.backbone(rgb2)
+            feat1 = self.compress1.forward(feat1, state)
+            feat2 = self.compress2.forward(feat2, state)
+            feat = torch.cat([feat1, feat2], dim=-1)
+        else:  # single image
+            if self.augment and not no_augment:
+                rgb = self.aug(rgb)  # uint8 -> float32
+            feat = self.backbone(rgb)
+            feat = self.compress.forward(feat, state)
+        feat = torch.cat([feat, state], dim=-1)
+        return super().forward(feat)
diff --git a/model/common/gaussian.py b/model/common/gaussian.py
new file mode 100644
index 0000000..2ce7fb6
--- /dev/null
+++ b/model/common/gaussian.py
@@ -0,0 +1,97 @@
+"""
+Gaussian policy parameterization.
+
+"""
+
+import torch
+import torch.distributions as D
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class GaussianModel(torch.nn.Module):
+
+    def __init__(
+        self,
+        network,
+        horizon_steps,
+        network_path=None,
+        device="cuda:0",
+    ):
+        super().__init__()
+        self.device = device
+        self.network = network.to(device)
+        self.horizon_steps = horizon_steps
+        if network_path is not None:
+            checkpoint = torch.load(
+                network_path, map_location=self.device, weights_only=True
+            )
+            self.load_state_dict(
+                checkpoint["model"],
+                strict=False,
+            )
+            log.info("Loaded actor from %s", network_path)
+        log.info(
+            f"Number of network parameters: {sum(p.numel() for p in self.parameters())}"
+        )
+
+    def loss(self, true_action, cond, ent_coef):
+        B = len(true_action)
+        if isinstance(
+            cond, dict
+        ):  # image and state, only using one step observation right now
+            cond = cond[0]
+        else:
+            cond = cond[0].reshape(B, -1)
+        dist = self.forward_train(
+            cond,
+            deterministic=False,
+        )
+        true_action = true_action.view(B, -1)
+        loss = -dist.log_prob(true_action)  # [B]
+        entropy = dist.entropy().mean()
+        loss = loss.mean() - entropy * ent_coef
+        return loss, {"entropy": entropy}
+
+    def forward_train(
+        self,
+        cond,
+        deterministic=False,
+        network_override=None,
+    ):
+        """
+        Calls the MLP to compute the mean, scale, and logits of the GMM. Returns the torch.Distribution object.
+        """
+        if network_override is not None:
+            means, scales = network_override(cond)
+        else:
+            means, scales = self.network(cond)
+        if deterministic:
+            # low-noise for all Gaussian dists
+            scales = torch.ones_like(means) * 1e-4
+        return D.Normal(loc=means, scale=scales)
+
+    def forward(
+        self,
+        cond,
+        deterministic=False,
+        randn_clip_value=10,
+        network_override=None,
+    ):
+        if isinstance(cond, dict):
+            B = cond["state"].shape[0]
+        else:
+            B = cond.shape[0]
+        T = self.horizon_steps
+        dist = self.forward_train(
+            cond,
+            deterministic=deterministic,
+            network_override=network_override,
+        )
+        sampled_action = dist.sample()
+        sampled_action.clamp_(
+            dist.loc - randn_clip_value * dist.scale,
+            dist.loc + randn_clip_value * dist.scale,
+        )
+        return sampled_action.view(B, T, -1)
diff --git a/model/common/gmm.py b/model/common/gmm.py
new file mode 100644
index 0000000..fd42666
--- /dev/null
+++ b/model/common/gmm.py
@@ -0,0 +1,83 @@
+"""
+GMM policy parameterization.
+
+"""
+
+import torch
+import torch.distributions as D
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class GMMModel(torch.nn.Module):
+
+    def __init__(
+        self,
+        network,
+        horizon_steps,
+        device="cuda:0",
+        **kwargs,
+    ):
+        super().__init__()
+        self.device = device
+        self.network = network.to(device)
+        self.horizon_steps = horizon_steps
+        log.info(
+            f"Number of network parameters: {sum(p.numel() for p in self.parameters())}"
+        )
+
+    def loss(self, true_action, obs_cond, **kwargs):
+        B = len(true_action)
+        cond = obs_cond[0].reshape(B, -1)
+        dist, entropy, _ = self.forward_train(
+            cond,
+            deterministic=False,
+        )
+        true_action = true_action.view(B, -1)
+        loss = -dist.log_prob(true_action)  # [B]
+        loss = loss.mean()
+        return loss, {"entropy": entropy}
+
+    def forward_train(
+        self,
+        cond,
+        deterministic=False,
+    ):
+        """
+        Calls the MLP to compute the mean, scale, and logits of the GMM. Returns the torch.Distribution object.
+        """
+        means, scales, logits = self.network(cond)
+        if deterministic:
+            # low-noise for all Gaussian dists
+            scales = torch.ones_like(means) * 1e-4
+
+        # mixture components - make sure that `batch_shape` for the distribution is equal to (batch_size, num_modes) since MixtureSameFamily expects this shape
+        # Each mode has mean vector of dim T*D
+        component_distribution = D.Normal(loc=means, scale=scales)
+        component_distribution = D.Independent(component_distribution, 1)
+
+        component_entropy = component_distribution.entropy()
+        approx_entropy = torch.mean(
+            torch.sum(logits.softmax(-1) * component_entropy, dim=-1)
+        )
+        std = torch.mean(torch.sum(logits.softmax(-1) * scales.mean(-1), dim=-1))
+
+        # unnormalized logits to categorical distribution for mixing the modes
+        mixture_distribution = D.Categorical(logits=logits)
+        dist = D.MixtureSameFamily(
+            mixture_distribution=mixture_distribution,
+            component_distribution=component_distribution,
+        )
+        return dist, approx_entropy, std
+
+    def forward(self, cond, deterministic=False):
+        B = cond.shape[0]
+        T = self.horizon_steps
+        dist, _, _ = self.forward_train(
+            cond,
+            deterministic=deterministic,
+        )
+        sampled_action = dist.sample()
+        sampled_action = sampled_action.view(B, T, -1)
+        return sampled_action
diff --git a/model/common/mlp.py b/model/common/mlp.py
new file mode 100644
index 0000000..fe154d1
--- /dev/null
+++ b/model/common/mlp.py
@@ -0,0 +1,160 @@
+"""
+Implementation of Multi-layer Perception (MLP).
+
+Residual model is taken from https://github.com/ALRhub/d3il/blob/main/agents/models/common/mlp.py
+
+"""
+
+import torch
+from torch import nn
+from torch.nn.utils import spectral_norm
+from collections import OrderedDict
+import logging
+
+
+activation_dict = nn.ModuleDict(
+    {
+        "ReLU": nn.ReLU(),
+        "ELU": nn.ELU(),
+        "GELU": nn.GELU(),
+        "Tanh": nn.Tanh(),
+        "Mish": nn.Mish(),
+        "Identity": nn.Identity(),
+        "Softplus": nn.Softplus(),
+    }
+)
+
+
+class MLP(nn.Module):
+
+    def __init__(
+        self,
+        dim_list,
+        append_dim=0,
+        append_layers=None,
+        activation_type="Tanh",
+        out_activation_type="Identity",
+        use_layernorm=False,
+        use_spectralnorm=False,
+        verbose=False,
+    ):
+        super(MLP, self).__init__()
+
+        # Construct module list: if use `Python List`, the modules are not
+        # added to computation graph. Instead, we should use `nn.ModuleList()`.
+        self.moduleList = nn.ModuleList()
+        self.append_layers = append_layers
+        num_layer = len(dim_list) - 1
+        for idx in range(num_layer):
+            i_dim = dim_list[idx]
+            o_dim = dim_list[idx + 1]
+            if append_dim > 0 and idx in append_layers:
+                i_dim += append_dim
+
+            linear_layer = nn.Linear(i_dim, o_dim)
+            if use_spectralnorm:
+                linear_layer = spectral_norm(linear_layer)
+            if idx == num_layer - 1:
+                module = nn.Sequential(
+                    OrderedDict(
+                        [
+                            ("linear_1", linear_layer),
+                            ("act_1", activation_dict[out_activation_type]),
+                        ]
+                    )
+                )
+            else:
+                if use_layernorm:
+                    module = nn.Sequential(
+                        OrderedDict(
+                            [
+                                ("linear_1", linear_layer),
+                                ("norm_1", nn.LayerNorm(o_dim)),
+                                ("act_1", activation_dict[activation_type]),
+                            ]
+                        )
+                    )
+                else:
+                    module = nn.Sequential(
+                        OrderedDict(
+                            [
+                                ("linear_1", linear_layer),
+                                ("act_1", activation_dict[activation_type]),
+                            ]
+                        )
+                    )
+            self.moduleList.append(module)
+        if verbose:
+            logging.info(self.moduleList)
+
+    def forward(self, x, append=None):
+        for layer_ind, m in enumerate(self.moduleList):
+            if append is not None and layer_ind in self.append_layers:
+                x = torch.cat((x, append), dim=-1)
+            x = m(x)
+        return x
+
+
+class ResidualMLP(nn.Module):
+    """
+    Simple multi layer perceptron network with residual connections for
+    benchmarking the performance of different networks. The resiudal layers
+    are based on the IBC paper implementation, which uses 2 residual lalyers
+    with pre-actication with or without dropout and normalization.
+    """
+
+    def __init__(
+        self,
+        dim_list,
+        activation_type="Mish",
+        out_activation_type="Identity",
+        use_layernorm=False,
+    ):
+        super(ResidualMLP, self).__init__()
+        hidden_dim = dim_list[1]
+        num_hidden_layers = len(dim_list) - 3
+        assert num_hidden_layers % 2 == 0
+        self.layers = nn.ModuleList([nn.Linear(dim_list[0], hidden_dim)])
+        self.layers.extend(
+            [
+                TwoLayerPreActivationResNetLinear(
+                    hidden_dim=hidden_dim,
+                    activation_type=activation_type,
+                    use_layernorm=use_layernorm,
+                )
+                for _ in range(1, num_hidden_layers, 2)
+            ]
+        )
+        self.layers.append(nn.Linear(hidden_dim, dim_list[-1]))
+        self.layers.append(activation_dict[out_activation_type])
+
+    def forward(self, x):
+        for _, layer in enumerate(self.layers):
+            x = layer(x)
+        return x
+
+
+class TwoLayerPreActivationResNetLinear(nn.Module):
+    def __init__(
+        self,
+        hidden_dim,
+        activation_type="Mish",
+        use_layernorm=False,
+    ):
+        super().__init__()
+        self.l1 = nn.Linear(hidden_dim, hidden_dim)
+        self.l2 = nn.Linear(hidden_dim, hidden_dim)
+        self.act = activation_dict[activation_type]
+        if use_layernorm:
+            self.norm1 = nn.LayerNorm(hidden_dim, eps=1e-06)
+            self.norm2 = nn.LayerNorm(hidden_dim, eps=1e-06)
+
+    def forward(self, x):
+        x_input = x
+        if hasattr(self, "norm1"):
+            x = self.norm1(x)
+        x = self.l1(self.act(x))
+        if hasattr(self, "norm2"):
+            x = self.norm2(x)
+        x = self.l2(self.act(x))
+        return x + x_input
diff --git a/model/common/mlp_gaussian.py b/model/common/mlp_gaussian.py
new file mode 100644
index 0000000..b5ea047
--- /dev/null
+++ b/model/common/mlp_gaussian.py
@@ -0,0 +1,248 @@
+"""
+MLP models for Gaussian policy.
+
+"""
+
+import torch
+import torch.nn as nn
+import einops
+from copy import deepcopy
+
+from model.common.mlp import MLP, ResidualMLP
+from model.common.modules import SpatialEmb, RandomShiftsAug
+
+
+class Gaussian_VisionMLP(nn.Module):
+    """With ViT backbone"""
+
+    def __init__(
+        self,
+        backbone,
+        transition_dim,
+        horizon_steps,
+        cond_dim,
+        mlp_dims=[256, 256, 256],
+        activation_type="Mish",
+        residual_style=False,
+        use_layernorm=False,
+        fixed_std=None,
+        learn_fixed_std=False,
+        std_min=0.01,
+        std_max=1,
+        spatial_emb=0,
+        visual_feature_dim=128,
+        repr_dim=96 * 96,
+        patch_repr_dim=128,
+        dropout=0,
+        num_img=1,
+        augment=False,
+    ):
+        super().__init__()
+
+        # vision
+        self.backbone = backbone
+        if augment:
+            self.aug = RandomShiftsAug(pad=4)
+        self.augment = augment
+        if spatial_emb > 0:
+            assert spatial_emb > 1, "this is the dimension"
+            if num_img > 1:
+                self.compress1 = SpatialEmb(
+                    num_patch=121,  # TODO: repr_dim // patch_repr_dim,
+                    patch_dim=patch_repr_dim,
+                    prop_dim=cond_dim,
+                    proj_dim=spatial_emb,
+                    dropout=dropout,
+                )
+                self.compress2 = deepcopy(self.compress1)
+            else:  # TODO: clean up
+                self.compress = SpatialEmb(
+                    num_patch=121,
+                    patch_dim=patch_repr_dim,
+                    prop_dim=cond_dim,
+                    proj_dim=spatial_emb,
+                    dropout=dropout,
+                )
+            visual_feature_dim = spatial_emb * num_img
+        else:
+            self.compress = nn.Sequential(
+                nn.Linear(repr_dim, visual_feature_dim),
+                nn.LayerNorm(visual_feature_dim),
+                nn.Dropout(dropout),
+                nn.ReLU(),
+            )
+
+        # head
+        self.transition_dim = transition_dim
+        self.horizon_steps = horizon_steps
+        input_dim = visual_feature_dim + cond_dim
+        output_dim = transition_dim * horizon_steps
+        if residual_style:
+            model = ResidualMLP
+        else:
+            model = MLP
+        self.mlp_mean = model(
+            [input_dim] + mlp_dims + [output_dim],
+            activation_type=activation_type,
+            out_activation_type="Identity",
+            use_layernorm=use_layernorm,
+        )
+        if fixed_std is None:
+            self.mlp_logvar = MLP(
+                [input_dim] + mlp_dims[-1:] + [output_dim],
+                activation_type=activation_type,
+                out_activation_type="Identity",
+                use_layernorm=use_layernorm,
+            )
+        elif learn_fixed_std:  # initialize to fixed_std
+            self.logvar = torch.nn.Parameter(
+                torch.log(torch.tensor([fixed_std**2 for _ in range(transition_dim)])),
+                requires_grad=True,
+            )
+        self.logvar_min = torch.nn.Parameter(
+            torch.log(torch.tensor(std_min**2)), requires_grad=False
+        )
+        self.logvar_max = torch.nn.Parameter(
+            torch.log(torch.tensor(std_max**2)), requires_grad=False
+        )
+        self.use_fixed_std = fixed_std is not None
+        self.fixed_std = fixed_std
+        self.learn_fixed_std = learn_fixed_std
+
+    def forward(self, x):
+        B = len(x["state"])
+        device = x["state"].device
+
+        # flatten cond_dim if exists
+        if x["rgb"].ndim == 5:
+            rgb = einops.rearrange(x["rgb"], "b d c h w -> (b d) c h w")
+        else:
+            rgb = x["rgb"]
+        if x["state"].ndim == 3:
+            state = einops.rearrange(x["state"], "b d c -> (b d) c")
+        else:
+            state = x["state"]
+
+        # get vit output - pass in two images separately
+        if rgb.shape[1] == 6:  # TODO: properly handle multiple images
+            rgb1 = rgb[:, :3]
+            rgb2 = rgb[:, 3:]
+            if self.augment:
+                rgb1 = self.aug(rgb1)
+                rgb2 = self.aug(rgb2)
+            feat1 = self.backbone(rgb1)
+            feat2 = self.backbone(rgb2)
+            feat1 = self.compress1.forward(feat1, state)
+            feat2 = self.compress2.forward(feat2, state)
+            feat = torch.cat([feat1, feat2], dim=-1)
+        else:  # single image
+            if self.augment:
+                rgb = self.aug(rgb)  # uint8 -> float32
+            feat = self.backbone(rgb)
+
+            # compress
+            if isinstance(self.compress, SpatialEmb):
+                feat = self.compress.forward(feat, state)
+            else:
+                feat = feat.flatten(1, -1)
+                feat = self.compress(feat)
+
+        # mlp
+        x_encoded = torch.cat([feat, state], dim=-1)
+        out_mean = self.mlp_mean(x_encoded)
+        out_mean = torch.tanh(out_mean).view(
+            B, self.horizon_steps * self.transition_dim
+        )  # tanh squashing in [-1, 1]
+
+        if self.learn_fixed_std:
+            out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max)
+            out_scale = torch.exp(0.5 * out_logvar)
+            out_scale = out_scale.view(1, self.transition_dim)
+            out_scale = out_scale.repeat(B, self.horizon_steps)
+        elif self.use_fixed_std:
+            out_scale = torch.ones_like(out_mean).to(device) * self.fixed_std
+        else:
+            out_logvar = self.mlp_logvar(x_encoded).view(
+                B, self.horizon_steps * self.transition_dim
+            )
+            out_logvar = torch.clamp(out_logvar, self.logvar_min, self.logvar_max)
+            out_scale = torch.exp(0.5 * out_logvar)
+        return out_mean, out_scale
+
+
+class Gaussian_MLP(nn.Module):
+
+    def __init__(
+        self,
+        transition_dim,
+        horizon_steps,
+        cond_dim,
+        mlp_dims=[256, 256, 256],
+        activation_type="Mish",
+        residual_style=False,
+        use_layernorm=False,
+        fixed_std=None,
+        learn_fixed_std=False,
+        std_min=0.01,
+        std_max=1,
+    ):
+        super().__init__()
+        self.transition_dim = transition_dim
+        self.horizon_steps = horizon_steps
+        input_dim = cond_dim
+        output_dim = transition_dim * horizon_steps
+        if residual_style:
+            model = ResidualMLP
+        else:
+            model = MLP
+        self.mlp_mean = model(
+            [input_dim] + mlp_dims + [output_dim],
+            activation_type=activation_type,
+            out_activation_type="Identity",
+            use_layernorm=use_layernorm,
+        )
+        if fixed_std is None:
+            self.mlp_logvar = MLP(
+                [input_dim] + mlp_dims[-1:] + [output_dim],
+                activation_type=activation_type,
+                out_activation_type="Identity",
+                use_layernorm=use_layernorm,
+            )
+        elif learn_fixed_std:  # initialize to fixed_std
+            self.logvar = torch.nn.Parameter(
+                torch.log(torch.tensor([fixed_std**2 for _ in range(transition_dim)])),
+                requires_grad=True,
+            )
+        self.logvar_min = torch.nn.Parameter(
+            torch.log(torch.tensor(std_min**2)), requires_grad=False
+        )
+        self.logvar_max = torch.nn.Parameter(
+            torch.log(torch.tensor(std_max**2)), requires_grad=False
+        )
+        self.use_fixed_std = fixed_std is not None
+        self.fixed_std = fixed_std
+        self.learn_fixed_std = learn_fixed_std
+
+    def forward(self, x):
+        B = len(x)
+
+        # mlp
+        out_mean = self.mlp_mean(x)
+        out_mean = torch.tanh(out_mean).view(
+            B, self.horizon_steps * self.transition_dim
+        )  # tanh squashing in [-1, 1]
+
+        if self.learn_fixed_std:
+            out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max)
+            out_scale = torch.exp(0.5 * out_logvar)
+            out_scale = out_scale.view(1, self.transition_dim)
+            out_scale = out_scale.repeat(B, self.horizon_steps)
+        elif self.use_fixed_std:
+            out_scale = torch.ones_like(out_mean).to(x.device) * self.fixed_std
+        else:
+            out_logvar = self.mlp_logvar(x).view(
+                B, self.horizon_steps * self.transition_dim
+            )
+            out_logvar = torch.clamp(out_logvar, self.logvar_min, self.logvar_max)
+            out_scale = torch.exp(0.5 * out_logvar)
+        return out_mean, out_scale
diff --git a/model/common/mlp_gmm.py b/model/common/mlp_gmm.py
new file mode 100644
index 0000000..9262d97
--- /dev/null
+++ b/model/common/mlp_gmm.py
@@ -0,0 +1,106 @@
+"""
+MLP models for GMM policy.
+
+"""
+
+import torch
+import torch.nn as nn
+from model.common.mlp import MLP, ResidualMLP
+
+
+class GMM_MLP(nn.Module):
+
+    def __init__(
+        self,
+        transition_dim,
+        horizon_steps,
+        cond_dim=None,
+        mlp_dims=[256, 256, 256],
+        num_modes=5,
+        activation_type="Mish",
+        residual_style=False,
+        use_layernorm=False,
+        fixed_std=None,
+        learn_fixed_std=False,
+        std_min=0.01,
+        std_max=1,
+    ):
+        super().__init__()
+        self.transition_dim = transition_dim
+        self.horizon_steps = horizon_steps
+        input_dim = cond_dim
+        output_dim = transition_dim * horizon_steps * num_modes
+        self.num_modes = num_modes
+        if residual_style:
+            model = ResidualMLP
+        else:
+            model = MLP
+        self.mlp_mean = model(
+            [input_dim] + mlp_dims + [output_dim],
+            activation_type=activation_type,
+            out_activation_type="Identity",
+            use_layernorm=use_layernorm,
+        )
+        if fixed_std is None:
+            self.mlp_logvar = model(
+                [input_dim] + mlp_dims + [output_dim],
+                activation_type=activation_type,
+                out_activation_type="Identity",
+                use_layernorm=use_layernorm,
+            )
+        elif (
+            learn_fixed_std
+        ):  # initialize to fixed_std, separate for each action and mode
+            self.logvar = torch.nn.Parameter(
+                torch.log(
+                    torch.tensor(
+                        [fixed_std**2 for _ in range(transition_dim * num_modes)]
+                    )
+                ),
+                requires_grad=True,
+            )
+        self.logvar_min = torch.nn.Parameter(
+            torch.log(torch.tensor(std_min**2)), requires_grad=False
+        )
+        self.logvar_max = torch.nn.Parameter(
+            torch.log(torch.tensor(std_max**2)), requires_grad=False
+        )
+        self.use_fixed_std = fixed_std is not None
+        self.fixed_std = fixed_std
+        self.learn_fixed_std = learn_fixed_std
+
+        # mode weights
+        self.mlp_weights = model(
+            [input_dim] + mlp_dims + [num_modes],
+            activation_type=activation_type,
+            out_activation_type="Identity",
+            use_layernorm=use_layernorm,
+        )
+
+    def forward(self, x):
+        B = len(x)
+
+        # mlp
+        out_mean = self.mlp_mean(x)
+        out_mean = torch.tanh(out_mean).view(
+            B, self.num_modes, self.horizon_steps * self.transition_dim
+        )  # tanh squashing in [-1, 1]
+
+        if self.learn_fixed_std:
+            out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max)
+            out_scale = torch.exp(0.5 * out_logvar)
+            out_scale = out_scale.view(1, self.num_modes, self.transition_dim)
+            out_scale = out_scale.repeat(B, 1, self.horizon_steps)
+        elif self.use_fixed_std:
+            out_scale = torch.ones_like(out_mean).to(x.device) * self.fixed_std
+        else:
+            out_logvar = self.mlp_logvar(x).view(
+                B, self.num_modes, self.horizon_steps * self.transition_dim
+            )
+            out_logvar = torch.clamp(out_logvar, self.logvar_min, self.logvar_max)
+            out_scale = torch.exp(0.5 * out_logvar)
+
+        out_weights = self.mlp_weights(x)
+        out_weights = out_weights.view(B, self.num_modes)
+
+        return out_mean, out_scale, out_weights
diff --git a/model/common/modules.py b/model/common/modules.py
new file mode 100644
index 0000000..0a72e4f
--- /dev/null
+++ b/model/common/modules.py
@@ -0,0 +1,69 @@
+"""
+Additional implementation of the ViT image encoder from https://github.com/hengyuan-hu/ibrl/tree/main
+
+"""
+
+import torch
+import torch.nn as nn
+
+
+class SpatialEmb(nn.Module):
+    def __init__(self, num_patch, patch_dim, prop_dim, proj_dim, dropout):
+        super().__init__()
+
+        proj_in_dim = num_patch + prop_dim
+        num_proj = patch_dim
+        self.patch_dim = patch_dim
+        self.prop_dim = prop_dim
+
+        self.input_proj = nn.Sequential(
+            nn.Linear(proj_in_dim, proj_dim),
+            nn.LayerNorm(proj_dim),
+            nn.ReLU(inplace=True),
+        )
+        self.weight = nn.Parameter(torch.zeros(1, num_proj, proj_dim))
+        self.dropout = nn.Dropout(dropout)
+        nn.init.normal_(self.weight)
+
+    def extra_repr(self) -> str:
+        return f"weight: nn.Parameter ({self.weight.size()})"
+
+    def forward(self, feat: torch.Tensor, prop: torch.Tensor):
+        feat = feat.transpose(1, 2)
+
+        if self.prop_dim > 0:
+            repeated_prop = prop.unsqueeze(1).repeat(1, feat.size(1), 1)
+            feat = torch.cat((feat, repeated_prop), dim=-1)
+
+        y = self.input_proj(feat)
+        z = (self.weight * y).sum(1)
+        z = self.dropout(z)
+        return z
+
+
+class RandomShiftsAug:
+    def __init__(self, pad):
+        self.pad = pad
+
+    def __call__(self, x):
+        n, c, h, w = x.size()
+        assert h == w
+        padding = tuple([self.pad] * 4)
+        x = nn.functional.pad(x, padding, "replicate")
+        eps = 1.0 / (h + 2 * self.pad)
+        arange = torch.linspace(
+            -1.0 + eps, 1.0 - eps, h + 2 * self.pad, device=x.device, dtype=x.dtype
+        )[:h]
+        arange = arange.unsqueeze(0).repeat(h, 1).unsqueeze(2)
+        base_grid = torch.cat([arange, arange.transpose(1, 0)], dim=2)
+        base_grid = base_grid.unsqueeze(0).repeat(n, 1, 1, 1)
+
+        shift = torch.randint(
+            0, 2 * self.pad + 1, size=(n, 1, 1, 2), device=x.device, dtype=x.dtype
+        )
+        shift *= 2.0 / (h + 2 * self.pad)
+
+        grid = base_grid + shift
+        return nn.functional.grid_sample(
+            x, grid, padding_mode="zeros", align_corners=False
+        )
diff --git a/model/common/transformer.py b/model/common/transformer.py
new file mode 100644
index 0000000..653f231
--- /dev/null
+++ b/model/common/transformer.py
@@ -0,0 +1,415 @@
+"""
+Implementation of Transformer, parameterized as Gaussian and GMM.
+
+Modified from https://github.com/real-stanford/diffusion_policy/blob/main/diffusion_policy/model/diffusion/transformer_for_diffusion.py
+
+"""
+
+import logging
+import torch
+import torch.nn as nn
+from model.diffusion.modules import SinusoidalPosEmb
+
+logger = logging.getLogger(__name__)
+
+
+class Gaussian_Transformer(nn.Module):
+    def __init__(
+        self,
+        transition_dim,
+        horizon_steps,
+        cond_dim,
+        transformer_embed_dim=256,
+        transformer_num_heads=8,
+        transformer_num_layers=6,
+        transformer_activation="gelu",
+        p_drop_emb=0.0,
+        p_drop_attn=0.0,
+        fixed_std=None,
+        learn_fixed_std=False,
+        std_min=0.01,
+        std_max=1,
+    ):
+
+        super().__init__()
+        self.transition_dim = transition_dim
+        self.horizon_steps = horizon_steps
+        output_dim = transition_dim
+
+        if fixed_std is None:  # learn the logvar
+            output_dim *= 2  # mean and logvar
+            logger.info("Using learned std")
+        elif learn_fixed_std:  # learn logvar
+            self.logvar = torch.nn.Parameter(
+                torch.log(torch.tensor([fixed_std**2 for _ in range(transition_dim)])),
+                requires_grad=True,
+            )
+            logger.info(f"Using fixed std {fixed_std} with learning")
+        else:
+            logger.info(f"Using fixed std {fixed_std} without learning")
+        self.logvar_min = torch.nn.Parameter(
+            torch.log(torch.tensor(std_min**2)), requires_grad=False
+        )
+        self.logvar_max = torch.nn.Parameter(
+            torch.log(torch.tensor(std_max**2)), requires_grad=False
+        )
+        self.learn_fixed_std = learn_fixed_std
+        self.fixed_std = fixed_std
+
+        self.transformer = Transformer(
+            output_dim,
+            horizon_steps,
+            cond_dim,
+            T_cond=1,  # right now we assume only one step of observation everywhere
+            n_layer=transformer_num_layers,
+            n_head=transformer_num_heads,
+            n_emb=transformer_embed_dim,
+            p_drop_emb=p_drop_emb,
+            p_drop_attn=p_drop_attn,
+            activation=transformer_activation,
+        )
+
+    def forward(self, cond):
+        """
+        cond: (B,cond_dim)
+        output: (B,horizon*transition)
+        """
+        B = len(cond)
+        cond = cond.unsqueeze(1)  # (B,1,cond_dim)
+        out, _ = self.transformer(cond)  # (B,horizon,output_dim)
+
+        # use the first half of the output as mean
+        out_mean = torch.tanh(out[:, :, : self.transition_dim])
+        out_mean = out_mean.view(B, self.horizon_steps * self.transition_dim)
+
+        if self.learn_fixed_std:
+            out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max)
+            out_scale = torch.exp(0.5 * out_logvar)
+            out_scale = out_scale.view(1, self.transition_dim)
+            out_scale = out_scale.repeat(B, self.horizon_steps)
+        elif self.fixed_std is not None:
+            out_scale = torch.ones_like(out_mean).to(cond.device) * self.fixed_std
+        else:
+            out_logvar = out[:, :, self.transition_dim :]
+            out_logvar = out_logvar.reshape(B, self.horizon_steps * self.transition_dim)
+            out_logvar = torch.clamp(out_logvar, self.logvar_min, self.logvar_max)
+            out_scale = torch.exp(0.5 * out_logvar)
+        return out_mean, out_scale
+
+
+class GMM_Transformer(nn.Module):
+    def __init__(
+        self,
+        transition_dim,
+        horizon_steps,
+        cond_dim,
+        num_modes=5,
+        transformer_embed_dim=256,
+        transformer_num_heads=8,
+        transformer_num_layers=6,
+        transformer_activation="gelu",
+        p_drop_emb=0,
+        p_drop_attn=0,
+        fixed_std=None,
+        learn_fixed_std=False,
+        std_min=0.01,
+        std_max=1,
+    ):
+
+        super().__init__()
+        self.num_modes = num_modes
+        self.transition_dim = transition_dim
+        self.horizon_steps = horizon_steps
+        output_dim = transition_dim * num_modes
+        # + num_modes  # mean and modes
+
+        if fixed_std is None:
+            output_dim += num_modes * transition_dim  # logvar for each mode
+            logger.info("Using learned std")
+        elif (
+            learn_fixed_std
+        ):  # initialize to fixed_std, separate for each action and mode, but same along horizon
+            self.logvar = torch.nn.Parameter(
+                torch.log(
+                    torch.tensor(
+                        [fixed_std**2 for _ in range(num_modes * transition_dim)]
+                    )
+                ),
+                requires_grad=True,
+            )
+            logger.info(f"Using fixed std {fixed_std} with learning")
+        else:
+            logger.info(f"Using fixed std {fixed_std} without learning")
+        self.logvar_min = torch.nn.Parameter(
+            torch.log(torch.tensor(std_min**2)), requires_grad=False
+        )
+        self.logvar_max = torch.nn.Parameter(
+            torch.log(torch.tensor(std_max**2)), requires_grad=False
+        )
+        self.fixed_std = fixed_std
+        self.learn_fixed_std = learn_fixed_std
+
+        self.transformer = Transformer(
+            output_dim,
+            horizon_steps,
+            cond_dim,
+            T_cond=1,  # right now we assume only one step of observation everywhere
+            n_layer=transformer_num_layers,
+            n_head=transformer_num_heads,
+            n_emb=transformer_embed_dim,
+            p_drop_emb=p_drop_emb,
+            p_drop_attn=p_drop_attn,
+            activation=transformer_activation,
+        )
+        self.modes_head = nn.Linear(horizon_steps * transformer_embed_dim, num_modes)
+
+    def forward(self, cond):
+        """
+        cond: (B,cond_dim)
+        output: (B,horizon*transition)
+        """
+        B = len(cond)
+        cond = cond.unsqueeze(1)  # (B,1,cond_dim)
+        out, out_prehead = self.transformer(
+            cond
+        )  # (B,horizon,output_dim), (B,horizon,emb_dim)
+
+        # use the first half of the output as mean
+        out_mean = torch.tanh(out[:, :, : self.num_modes * self.transition_dim])
+        out_mean = out_mean.reshape(
+            B, self.horizon_steps, self.num_modes, self.transition_dim
+        )
+        out_mean = out_mean.permute(0, 2, 1, 3)  # flip horizons and modes
+        out_mean = out_mean.reshape(
+            B, self.num_modes, self.horizon_steps * self.transition_dim
+        )
+
+        if self.learn_fixed_std:
+            out_logvar = torch.clamp(self.logvar, self.logvar_min, self.logvar_max)
+            out_scale = torch.exp(0.5 * out_logvar)
+            out_scale = out_scale.view(1, self.num_modes, self.transition_dim)
+            out_scale = out_scale.repeat(B, 1, self.horizon_steps)
+        elif self.fixed_std is not None:
+            out_scale = torch.ones_like(out_mean).to(cond.device) * self.fixed_std
+        else:
+            out_logvar = out[
+                :, :, self.num_modes * self.transition_dim : -self.num_modes
+            ]
+            out_logvar = out_logvar.reshape(
+                B, self.horizon_steps, self.num_modes, self.transition_dim
+            )
+            out_logvar = out_logvar.permute(0, 2, 1, 3)  # flip horizons and modes
+            out_logvar = out_logvar.reshape(
+                B, self.num_modes, self.horizon_steps * self.transition_dim
+            )
+            out_logvar = torch.clamp(out_logvar, self.logvar_min, self.logvar_max)
+            out_scale = torch.exp(0.5 * out_logvar)
+
+        # use last horizon step as the mode weights - as it depends on the entire context
+        # out_weights = out[:, -1, -self.num_modes :]  # (B,num_modes)
+        out_weights = self.modes_head(out_prehead.view(B, -1))
+        return out_mean, out_scale, out_weights
+
+
+class Transformer(nn.Module):
+    def __init__(
+        self,
+        output_dim,
+        horizon,
+        cond_dim,
+        T_cond=1,
+        n_layer=12,
+        n_head=12,
+        n_emb=768,
+        p_drop_emb=0.0,
+        p_drop_attn=0.0,
+        causal_attn=False,
+        n_cond_layers=0,
+        activation="gelu",
+    ):
+        super().__init__()
+
+        # encoder for observations
+        self.cond_obs_emb = nn.Linear(cond_dim, n_emb)
+        self.cond_pos_emb = nn.Parameter(torch.zeros(1, T_cond, n_emb))
+        if n_cond_layers > 0:
+            encoder_layer = nn.TransformerEncoderLayer(
+                d_model=n_emb,
+                nhead=n_head,
+                dim_feedforward=4 * n_emb,
+                dropout=p_drop_attn,
+                activation=activation,
+                batch_first=True,
+                norm_first=True,
+            )
+            self.encoder = nn.TransformerEncoder(
+                encoder_layer=encoder_layer,
+                num_layers=n_cond_layers,
+            )
+        else:
+            self.encoder = nn.Sequential(
+                nn.Linear(n_emb, 4 * n_emb),
+                nn.Mish(),
+                nn.Linear(4 * n_emb, n_emb),
+            )
+
+        # decoder
+        self.pos_emb = nn.Parameter(torch.zeros(1, horizon, n_emb))
+        self.drop = nn.Dropout(p_drop_emb)
+        decoder_layer = nn.TransformerDecoderLayer(
+            d_model=n_emb,
+            nhead=n_head,
+            dim_feedforward=4 * n_emb,
+            dropout=p_drop_attn,
+            activation=activation,
+            batch_first=True,
+            norm_first=True,  # important for stability
+        )
+        self.decoder = nn.TransformerDecoder(
+            decoder_layer=decoder_layer, num_layers=n_layer
+        )
+
+        # attention mask
+        if causal_attn:
+            # causal mask to ensure that attention is only applied to the left in the input sequence
+            # torch.nn.Transformer uses additive mask as opposed to multiplicative mask in minGPT
+            # therefore, the upper triangle should be -inf and others (including diag) should be 0.
+            sz = horizon
+            mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
+            mask = (
+                mask.float()
+                .masked_fill(mask == 0, float("-inf"))
+                .masked_fill(mask == 1, float(0.0))
+            )
+            self.register_buffer("mask", mask)
+
+            t, s = torch.meshgrid(
+                torch.arange(horizon), torch.arange(T_cond), indexing="ij"
+            )
+            mask = t >= (
+                s - 1
+            )  # add one dimension since time is the first token in cond
+            mask = (
+                mask.float()
+                .masked_fill(mask == 0, float("-inf"))
+                .masked_fill(mask == 1, float(0.0))
+            )
+            self.register_buffer("memory_mask", mask)
+        else:
+            self.mask = None
+            self.memory_mask = None
+
+        # decoder head
+        self.ln_f = nn.LayerNorm(n_emb)
+        self.head = nn.Linear(n_emb, output_dim)
+
+        # constants
+        self.T_cond = T_cond
+        self.horizon = horizon
+
+        # init
+        self.apply(self._init_weights)
+
+    def _init_weights(self, module):
+        ignore_types = (
+            nn.Dropout,
+            SinusoidalPosEmb,
+            nn.TransformerEncoderLayer,
+            nn.TransformerDecoderLayer,
+            nn.TransformerEncoder,
+            nn.TransformerDecoder,
+            nn.ModuleList,
+            nn.Mish,
+            nn.Sequential,
+        )
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.MultiheadAttention):
+            weight_names = [
+                "in_proj_weight",
+                "q_proj_weight",
+                "k_proj_weight",
+                "v_proj_weight",
+            ]
+            for name in weight_names:
+                weight = getattr(module, name)
+                if weight is not None:
+                    torch.nn.init.normal_(weight, mean=0.0, std=0.02)
+            bias_names = ["in_proj_bias", "bias_k", "bias_v"]
+            for name in bias_names:
+                bias = getattr(module, name)
+                if bias is not None:
+                    torch.nn.init.zeros_(bias)
+        elif isinstance(module, nn.LayerNorm):
+            torch.nn.init.zeros_(module.bias)
+            torch.nn.init.ones_(module.weight)
+        elif isinstance(module, Transformer):
+            torch.nn.init.normal_(module.pos_emb, mean=0.0, std=0.02)
+            if module.cond_obs_emb is not None:
+                torch.nn.init.normal_(module.cond_pos_emb, mean=0.0, std=0.02)
+        elif isinstance(module, ignore_types):
+            # no param
+            pass
+        else:
+            raise RuntimeError("Unaccounted module {}".format(module))
+
+    def forward(
+        self,
+        cond: torch.Tensor,
+        **kwargs,
+    ):
+        """
+        cond: (B, T, cond_dim)
+        output: (B, T, output_dim)
+        """
+        # encoder
+        cond_embeddings = self.cond_obs_emb(cond)  # (B,To,n_emb)
+        tc = cond_embeddings.shape[1]
+        position_embeddings = self.cond_pos_emb[
+            :, :tc, :
+        ]  # each position maps to a (learnable) vector
+        x = self.drop(cond_embeddings + position_embeddings)
+        x = self.encoder(x)
+        memory = x
+        # (B,T_cond,n_emb)
+
+        # decoder
+        position_embeddings = self.pos_emb[
+            :, : self.horizon, :
+        ]  # each position maps to a (learnable) vector
+        position_embeddings = position_embeddings.expand(
+            cond.shape[0], self.horizon, -1
+        )  # repeat for batch dimension
+        x = self.drop(position_embeddings)
+        # (B,T,n_emb)
+        x = self.decoder(
+            tgt=x,
+            memory=memory,
+            tgt_mask=self.mask,
+            memory_mask=self.memory_mask,
+        )
+        # (B,T,n_emb)
+
+        # head
+        x_prehead = self.ln_f(x)
+        x = self.head(x_prehead)
+        # (B,T,n_out)
+        return x, x_prehead
+
+
+if __name__ == "__main__":
+    transformer = Transformer(
+        output_dim=10,
+        horizon=4,
+        T_cond=1,
+        cond_dim=16,
+        causal_attn=False,  # no need to use for delta control
+        # From Cheng: I found the causal attention masking to be critical to get the transformer variant of diffusion policy to work. My suspicion is that when used without it, the model "cheats" by looking ahead into future end-effector poses, which is almost identical to the action of the current timestep.
+        n_cond_layers=0,
+    )
+    # opt = transformer.configure_optimizers()
+
+    cond = torch.zeros((4, 1, 16))  # B x 1 x cond_dim
+    out, _ = transformer(cond)
diff --git a/model/common/vit.py b/model/common/vit.py
new file mode 100644
index 0000000..942b9bb
--- /dev/null
+++ b/model/common/vit.py
@@ -0,0 +1,236 @@
+"""
+ViT image encoder implementation from IBRL, https://github.com/hengyuan-hu/ibrl
+
+"""
+
+from dataclasses import dataclass, field
+from typing import List
+import einops
+import torch
+from torch import nn
+from torch.nn.init import trunc_normal_
+
+
+@dataclass
+class VitEncoderConfig:
+    patch_size: int = 8
+    depth: int = 1
+    embed_dim: int = 128
+    num_heads: int = 4
+    act_layer = nn.GELU
+    stride: int = -1
+    embed_style: str = "embed2"
+    embed_norm: int = 0
+
+
+class VitEncoder(nn.Module):
+    def __init__(self, obs_shape: List[int], cfg: VitEncoderConfig):
+        super().__init__()
+        self.obs_shape = obs_shape
+        self.cfg = cfg
+        self.vit = MinVit(
+            embed_style=cfg.embed_style,
+            embed_dim=cfg.embed_dim,
+            embed_norm=cfg.embed_norm,
+            num_head=cfg.num_heads,
+            depth=cfg.depth,
+        )
+
+        self.num_patch = self.vit.num_patches
+        self.patch_repr_dim = self.cfg.embed_dim
+        self.repr_dim = self.cfg.embed_dim * self.vit.num_patches
+
+    def forward(self, obs, flatten=False) -> torch.Tensor:
+        # assert obs.max() > 5
+        obs = obs / 255.0 - 0.5
+        feats: torch.Tensor = self.vit.forward(obs)
+        if flatten:
+            feats = feats.flatten(1, 2)
+        return feats
+
+
+class PatchEmbed1(nn.Module):
+    def __init__(self, embed_dim):
+        super().__init__()
+        self.conv = nn.Conv2d(3, embed_dim, kernel_size=8, stride=8)
+
+        self.num_patch = 144
+        self.patch_dim = embed_dim
+
+    def forward(self, x: torch.Tensor):
+        y = self.conv(x)
+        y = einops.rearrange(y, "b c h w -> b (h  w) c")
+        return y
+
+
+class PatchEmbed2(nn.Module):
+    def __init__(self, embed_dim, use_norm):
+        super().__init__()
+        layers = [
+            nn.Conv2d(3, embed_dim, kernel_size=8, stride=4),
+            nn.GroupNorm(embed_dim, embed_dim) if use_norm else nn.Identity(),
+            nn.ReLU(),
+            nn.Conv2d(embed_dim, embed_dim, kernel_size=3, stride=2),
+        ]
+        self.embed = nn.Sequential(*layers)
+
+        self.num_patch = 121  # TODO: specifically for 96x96 set by Hengyuan?
+        self.patch_dim = embed_dim
+
+    def forward(self, x: torch.Tensor):
+        y = self.embed(x)
+        y = einops.rearrange(y, "b c h w -> b (h  w) c")
+        return y
+
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, embed_dim, num_head):
+        super().__init__()
+        assert embed_dim % num_head == 0
+
+        self.num_head = num_head
+        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim)
+        self.out_proj = nn.Linear(embed_dim, embed_dim)
+
+    def forward(self, x, attn_mask):
+        """
+        x: [batch, seq, embed_dim]
+        """
+        qkv = self.qkv_proj(x)
+        q, k, v = einops.rearrange(
+            qkv, "b t (k h d) -> b k h t d", k=3, h=self.num_head
+        ).unbind(1)
+        # force flash/mem-eff attention, it will raise error if flash cannot be applied
+        with torch.backends.cuda.sdp_kernel(enable_math=False):
+            attn_v = torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, dropout_p=0.0, attn_mask=attn_mask
+            )
+        attn_v = einops.rearrange(attn_v, "b h t d -> b t (h d)")
+        return self.out_proj(attn_v)
+
+
+class TransformerLayer(nn.Module):
+    def __init__(self, embed_dim, num_head, dropout):
+        super().__init__()
+
+        self.layer_norm1 = nn.LayerNorm(embed_dim)
+        self.mha = MultiHeadAttention(embed_dim, num_head)
+
+        self.layer_norm2 = nn.LayerNorm(embed_dim)
+        self.linear1 = nn.Linear(embed_dim, 4 * embed_dim)
+        self.linear2 = nn.Linear(4 * embed_dim, embed_dim)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x, attn_mask=None):
+        x = x + self.dropout(self.mha(self.layer_norm1(x), attn_mask))
+        x = x + self.dropout(self._ff_block(self.layer_norm2(x)))
+        return x
+
+    def _ff_block(self, x):
+        x = self.linear2(nn.functional.gelu(self.linear1(x)))
+        return x
+
+
+class MinVit(nn.Module):
+    def __init__(self, embed_style, embed_dim, embed_norm, num_head, depth):
+        super().__init__()
+
+        if embed_style == "embed1":
+            self.patch_embed = PatchEmbed1(embed_dim)
+        elif embed_style == "embed2":
+            self.patch_embed = PatchEmbed2(embed_dim, use_norm=embed_norm)
+        else:
+            assert False
+
+        self.pos_embed = nn.Parameter(
+            torch.zeros(1, self.patch_embed.num_patch, embed_dim)
+        )
+        layers = [
+            TransformerLayer(embed_dim, num_head, dropout=0) for _ in range(depth)
+        ]
+
+        self.net = nn.Sequential(*layers)
+        self.norm = nn.LayerNorm(embed_dim)
+        self.num_patches = self.patch_embed.num_patch
+
+        # weight init
+        trunc_normal_(self.pos_embed, std=0.02)
+        named_apply(init_weights_vit_timm, self)
+
+    def forward(self, x):
+        x = self.patch_embed(x)
+        x = x + self.pos_embed
+        x = self.net(x)
+        return self.norm(x)
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+    """ViT weight initialization, original timm impl (for reproducibility)"""
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+
+
+def named_apply(
+    fn, module: nn.Module, name="", depth_first=True, include_root=False
+) -> nn.Module:
+    if not depth_first and include_root:
+        fn(module=module, name=name)
+    for child_name, child_module in module.named_children():
+        child_name = ".".join((name, child_name)) if name else child_name
+        named_apply(
+            fn=fn,
+            module=child_module,
+            name=child_name,
+            depth_first=depth_first,
+            include_root=True,
+        )
+    if depth_first and include_root:
+        fn(module=module, name=name)
+    return module
+
+
+def test_patch_embed():
+    print("embed 1")
+    embed = PatchEmbed1(128)
+    x = torch.rand(10, 3, 96, 96)
+    y = embed(x)
+    print(y.size())
+
+    print("embed 2")
+    embed = PatchEmbed2(128, True)
+    x = torch.rand(10, 3, 96, 96)
+    y = embed(x)
+    print(y.size())
+
+
+def test_transformer_layer():
+    embed = PatchEmbed1(128)
+    x = torch.rand(10, 3, 96, 96)
+    y = embed(x)
+    print(y.size())
+
+    transformer = TransformerLayer(128, 4, False, 0)
+    z = transformer(y)
+    print(z.size())
+
+
+if __name__ == "__main__":
+    import rich.traceback
+    import pyrallis
+
+    @dataclass
+    class MainConfig:
+        net_type: str = "vit"
+        obs_shape: list[int] = field(default_factory=lambda: [3, 96, 96])
+        vit: VitEncoderConfig = field(default_factory=lambda: VitEncoderConfig())
+
+    rich.traceback.install()
+    cfg = pyrallis.parse(config_class=MainConfig)  # type: ignore
+    enc = VitEncoder(cfg.obs_shape, cfg.vit)
+
+    print(enc)
+    x = torch.rand(1, *cfg.obs_shape) * 255
+    print("output size:", enc(x, flatten=False).size())
+    print("repr dim:", enc.repr_dim, ", real dim:", enc(x, flatten=True).size())
diff --git a/model/diffusion/__init__.py b/model/diffusion/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/model/diffusion/diffusion.py b/model/diffusion/diffusion.py
new file mode 100644
index 0000000..796925c
--- /dev/null
+++ b/model/diffusion/diffusion.py
@@ -0,0 +1,318 @@
+"""
+Gaussian diffusion with DDPM and optionally DDIM sampling.
+
+References:
+Diffuser: https://github.com/jannerm/diffuser
+Diffusion Policy: https://github.com/columbia-ai-robotics/diffusion_policy/blob/main/diffusion_policy/policy/diffusion_unet_lowdim_policy.py
+Annotated DDIM/DDPM: https://nn.labml.ai/diffusion/stable_diffusion/sampler/ddpm.html
+
+"""
+
+from typing import Optional, Union
+import logging
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+log = logging.getLogger(__name__)
+
+from model.diffusion.sampling import (
+    make_timesteps,
+    extract,
+    cosine_beta_schedule,
+)
+
+from collections import namedtuple
+Sample = namedtuple("Sample", "trajectories values chains")
+
+
+class DiffusionModel(nn.Module):
+
+    def __init__(
+        self,
+        network,
+        horizon_steps,
+        obs_dim,
+        action_dim,
+        transition_dim,
+        network_path=None,
+        cond_steps=1,
+        device="cuda:0",
+        # DDPM parameters
+        denoising_steps=100,
+        predict_epsilon=True,
+        denoised_clip_value=1.0,
+        # DDIM sampling
+        use_ddim=False,
+        ddim_discretize='uniform',
+        ddim_steps=None,
+        **kwargs,
+    ):
+        super().__init__()
+        self.device = device
+        self.horizon_steps = horizon_steps
+        self.obs_dim = obs_dim
+        self.action_dim = action_dim
+        self.transition_dim = transition_dim
+        self.denoising_steps = int(denoising_steps)
+        self.denoised_clip_value = denoised_clip_value
+        self.predict_epsilon = predict_epsilon
+        self.cond_steps = cond_steps
+        self.use_ddim = use_ddim
+        self.ddim_steps = ddim_steps
+
+        # Set up models
+        self.network = network.to(device)
+        if network_path is not None:
+            checkpoint = torch.load(network_path, map_location=device, weights_only=True)
+            if "ema" in checkpoint:
+                self.load_state_dict(checkpoint["ema"], strict=False)
+                logging.info("Loaded SL-trained policy from %s", network_path)
+            else:
+                self.load_state_dict(checkpoint["model"], strict=False)
+                logging.info("Loaded RL-trained policy from %s", network_path)
+        logging.info(
+            f"Number of network parameters: {sum(p.numel() for p in self.parameters())}"
+        )
+
+        """
+        DDPM parameters
+
+        """
+        """
+        βₜ
+        """
+        self.betas = cosine_beta_schedule(denoising_steps).to(device)
+        """
+        αₜ = 1 - βₜ
+        """
+        self.alphas = 1.0 - self.betas
+        """
+        α̅ₜ= ∏ᵗₛ₌₁ αₛ 
+        """
+        self.alphas_cumprod = torch.cumprod(self.alphas, axis=0)
+        """
+        α̅ₜ₋₁
+        """
+        self.alphas_cumprod_prev = torch.cat([torch.ones(1).to(device), self.alphas_cumprod[:-1]])
+        """
+        √ α̅ₜ
+        """
+        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
+        """
+        √ 1-α̅ₜ
+        """
+        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - self.alphas_cumprod)
+        """
+        √ 1\α̅ₜ
+        """
+        self.sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod)
+        """
+        √ 1\α̅ₜ-1
+        """
+        self.sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod - 1)
+        """
+        β̃ₜ = σₜ² = βₜ (1-α̅ₜ₋₁)/(1-α̅ₜ)
+        """
+        self.ddpm_var = (
+            self.betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        )
+        self.ddpm_logvar_clipped = torch.log(torch.clamp(self.ddpm_var, min=1e-20))
+        """
+        μₜ = β̃ₜ √ α̅ₜ₋₁/(1-α̅ₜ)x₀ + √ αₜ (1-α̅ₜ₋₁)/(1-α̅ₜ)xₜ
+        """
+        self.ddpm_mu_coef1 = self.betas * torch.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        self.ddpm_mu_coef2 = (1.0 - self.alphas_cumprod_prev) * torch.sqrt(self.alphas) / (1.0 - self.alphas_cumprod)
+
+        """
+        DDIM parameters
+
+        In DDIM paper https://arxiv.org/pdf/2010.02502, alpha is alpha_cumprod in DDPM https://arxiv.org/pdf/2102.09672
+        """
+        if use_ddim:
+            assert predict_epsilon, "DDIM requires predicting epsilon for now."
+            if ddim_discretize == 'uniform':    # use the HF "leading" style
+                step_ratio = self.denoising_steps // ddim_steps
+                self.ddim_t = torch.arange(0, ddim_steps, device=self.device) * step_ratio
+            else:
+                raise 'Unknown discretization method for DDIM.'
+            self.ddim_alphas = self.alphas_cumprod[self.ddim_t].clone().to(torch.float32)
+            self.ddim_alphas_sqrt = torch.sqrt(self.ddim_alphas)
+            self.ddim_alphas_prev = torch.cat([
+                torch.tensor([1.]).to(torch.float32).to(self.device), 
+                self.alphas_cumprod[self.ddim_t[:-1]]])
+            self.ddim_sqrt_one_minus_alphas = (1. - self.ddim_alphas) ** .5
+
+            # Initialize fixed sigmas for inference - eta=0
+            ddim_eta = 0
+            self.ddim_sigmas = (ddim_eta * \
+                    ((1 - self.ddim_alphas_prev) / (1 - self.ddim_alphas) * \
+                    (1 - self.ddim_alphas / self.ddim_alphas_prev)) ** .5)
+
+            # Flip all
+            self.ddim_t = torch.flip(self.ddim_t, [0])
+            self.ddim_alphas = torch.flip(self.ddim_alphas, [0])
+            self.ddim_alphas_sqrt = torch.flip(self.ddim_alphas_sqrt, [0])
+            self.ddim_alphas_prev = torch.flip(self.ddim_alphas_prev, [0])
+            self.ddim_sqrt_one_minus_alphas = torch.flip(self.ddim_sqrt_one_minus_alphas, [0])
+            self.ddim_sigmas = torch.flip(self.ddim_sigmas, [0])
+
+    # ---------- Sampling ----------#
+
+    def p_mean_var(self, x, t, cond=None, index=None):
+        noise = self.network(x, t, cond=cond)
+
+        # Predict x_0
+        if self.predict_epsilon:
+            if self.use_ddim:
+                """
+                x₀ = (xₜ - √ (1-αₜ) ε )/ √ αₜ
+                """
+                alpha = extract(self.ddim_alphas, index, x.shape)
+                alpha_prev = extract(self.ddim_alphas_prev, index, x.shape)
+                sqrt_one_minus_alpha = extract(self.ddim_sqrt_one_minus_alphas, index, x.shape)
+                x_recon = (x - sqrt_one_minus_alpha * noise) / (alpha ** 0.5)
+            else:
+                """
+                x₀ = √ 1\α̅ₜ xₜ - √ 1\α̅ₜ-1 ε
+                """
+                x_recon = (
+                    extract(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
+                    - extract(self.sqrt_recipm1_alphas_cumprod, t, x.shape) * noise
+                )
+        else:   # directly predicting x₀
+            x_recon = noise
+        if self.denoised_clip_value is not None:
+            x_recon.clamp_(-self.denoised_clip_value, self.denoised_clip_value)
+            if self.use_ddim:
+                # re-calculate noise based on clamped x_recon - default to false in HF, but let's use it here
+                noise = (x - alpha ** (0.5) * x_recon) / sqrt_one_minus_alpha
+
+        # Get mu
+        if self.use_ddim:
+            """
+            μ = √ αₜ₋₁ x₀ + √(1-αₜ₋₁ - σₜ²) ε 
+            
+            var should be zero here as self.ddim_eta=0
+            """
+            sigma = extract(self.ddim_sigmas, index, x.shape)
+            dir_xt = (1. - alpha_prev - sigma ** 2).sqrt() * noise
+            mu = (alpha_prev ** 0.5) * x_recon + dir_xt
+            var = sigma ** 2
+            logvar = torch.log(var)
+        else:
+            """
+            μₜ = β̃ₜ √ α̅ₜ₋₁/(1-α̅ₜ)x₀ + √ αₜ (1-α̅ₜ₋₁)/(1-α̅ₜ)xₜ
+            """
+            mu = (
+                extract(self.ddpm_mu_coef1, t, x.shape) * x_recon
+                + extract(self.ddpm_mu_coef2, t, x.shape) * x
+            )
+            logvar = extract(
+                self.ddpm_logvar_clipped, t, x.shape
+            )
+        return mu, logvar
+
+    @torch.no_grad()
+    def forward(
+        self,
+        cond: Optional[torch.Tensor],
+        return_chain=True,
+    ):
+        """
+        Forward sampling through denoising steps.
+        
+        Args:
+            cond: (batch_size, horizon, transition_dim)
+            return_chain: whether to return the chain of samples or only the final denoised sample
+        Return:
+            Sample: namedtuple with fields:
+                trajectories: (batch_size, horizon_steps, transition_dim)
+                values: (batch_size, )
+                chain: (batch_size, denoising_steps + 1, horizon_steps, transition_dim)
+        """
+        device = self.betas.device
+        if isinstance(cond, dict):
+            B = cond[list(cond.keys())[0]].shape[0]
+        else:
+            B = cond.shape[0]
+            cond = cond[:, : self.cond_steps].reshape(B, -1)
+        shape = (B, self.horizon_steps, self.transition_dim)
+
+        # Loop
+        x = torch.randn(shape, device=device)
+        chain = [x] if return_chain else None
+        if self.use_ddim:
+            t_all = self.ddim_t
+        else:
+            t_all = list(reversed(range(self.denoising_steps)))
+        for i, t in enumerate(t_all):
+            t_b = make_timesteps(B, t, device)
+            index_b = make_timesteps(B, i, device)
+            mu, logvar = self.p_mean_var(x=x, t=t_b, cond=cond, index=index_b)
+            std = torch.exp(0.5 * logvar)
+
+            # no noise when t == 0
+            noise = torch.randn_like(x)
+            noise[t == 0] = 0
+            x = mu + std * noise
+            if return_chain:
+                chain.append(x)
+        if return_chain:
+            chain = torch.stack(chain, dim=1)
+        values = torch.zeros(len(x), device=x.device)   # not considering the value for now
+        return Sample(x, values, chain)
+
+    # ---------- Supervised training ----------#
+
+    def loss(self, x, *args):
+        batch_size = len(x)
+        t = torch.randint(
+            0, self.denoising_steps, (batch_size,), device=x.device
+        ).long()
+        return self.p_losses(x, *args, t)
+
+    def p_losses(
+        self,
+        x_start,
+        obs_cond: Union[dict, torch.Tensor],
+        t,
+    ):
+        """
+        If predicting epsilon: E_{t, x0, ε} [||ε - ε_θ(√α̅ₜx0 + √(1-α̅ₜ)ε, t)||²
+
+        Args:
+            x_start: (batch_size, horizon_steps, transition_dim)
+            obs_cond: dict with keys as step and value as observation
+            t: batch of integers
+        """
+        device = x_start.device
+        B = x_start.shape[0]
+        if isinstance(obs_cond[0], dict):
+            cond = obs_cond[0]  # keep the dictionary and the network will extract img and prio
+        else:
+            cond = obs_cond[0].reshape(B, -1)
+
+        # Forward process
+        noise = torch.randn_like(x_start, device=device)
+        x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+
+        # Predict
+        x_recon = self.network(x_noisy, t, cond=cond)
+        if self.predict_epsilon:
+            return F.mse_loss(x_recon, noise, reduction="mean") 
+        else:
+            return F.mse_loss(x_recon, x_noisy, reduction="mean")
+
+    def q_sample(self, x_start, t, noise=None):
+        """
+        q(xₜ | x₀) = 𝒩(xₜ; √ α̅ₜ x₀, (1-α̅ₜ)I)
+        xₜ = √ α̅ₜ xₒ + √ (1-α̅ₜ) ε
+        """
+        if noise is None:
+            device = x_start.device
+            noise = torch.randn_like(x_start, device=device)
+        return (
+            extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+            + extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
+        )
diff --git a/model/diffusion/diffusion_awr.py b/model/diffusion/diffusion_awr.py
new file mode 100644
index 0000000..f5fc38d
--- /dev/null
+++ b/model/diffusion/diffusion_awr.py
@@ -0,0 +1,34 @@
+"""
+Advantage-weighted regression (AWR) for diffusion policy.
+
+"""
+
+import logging
+import torch
+
+log = logging.getLogger(__name__)
+
+from model.diffusion.diffusion_rwr import RWRDiffusion
+
+
+class AWRDiffusion(RWRDiffusion):
+
+    def __init__(
+        self,
+        actor,
+        critic,
+        **kwargs,
+    ):
+        super().__init__(network=actor, **kwargs)
+        self.critic = critic.to(self.device)
+
+        # assign actor
+        self.actor = self.network
+
+    def loss_critic(self, obs, advantages):
+        # get advantage
+        adv = self.critic(obs)
+
+        # Update critic
+        loss_critic = torch.mean((adv - advantages) ** 2)
+        return loss_critic
diff --git a/model/diffusion/diffusion_dipo.py b/model/diffusion/diffusion_dipo.py
new file mode 100644
index 0000000..4e4630c
--- /dev/null
+++ b/model/diffusion/diffusion_dipo.py
@@ -0,0 +1,118 @@
+"""
+Actor and Critic models for model-free online RL with DIffusion POlicy (DIPO).
+
+"""
+
+import torch
+import logging
+
+log = logging.getLogger(__name__)
+
+from model.diffusion.diffusion import DiffusionModel
+from model.diffusion.sampling import make_timesteps
+
+
+class DIPODiffusion(DiffusionModel):
+
+    def __init__(
+        self,
+        actor,
+        critic,
+        use_ddim=False,
+        randn_clip_value=10,
+        clamp_action=False,
+        min_sampling_denoising_std=0.1,
+        **kwargs,
+    ):
+        super().__init__(network=actor, use_ddim=use_ddim, **kwargs)
+        assert not self.use_ddim, "DQL does not support DDIM"
+        self.critic = critic.to(self.device)
+
+        # reassign actor
+        self.actor = self.network
+
+        # Minimum std used in denoising process when sampling action - helps exploration
+        self.min_sampling_denoising_std = min_sampling_denoising_std
+
+        # For each denoising step, we clip sampled randn (from standard deviation) such that the sampled action is not too far away from mean
+        self.randn_clip_value = randn_clip_value
+
+        # Whether to clamp sampled action between [-1, 1]
+        self.clamp_action = clamp_action
+
+    def loss_critic(self, obs, next_obs, actions, rewards, dones, gamma):
+
+        # get current Q-function
+        actions_flat = torch.flatten(actions, start_dim=-2)
+        current_q1, current_q2 = self.critic(obs, actions_flat)
+
+        # get next Q-function
+        next_actions = self.forward(
+            cond=next_obs,
+            deterministic=False,
+        )  # in DiffusionModel, forward() has no gradient, which is desired here.
+        next_actions_flat = torch.flatten(next_actions, start_dim=-2)
+        next_q1, next_q2 = self.critic(next_obs, next_actions_flat)
+        next_q = torch.min(next_q1, next_q2)
+
+        # terminal state mask
+        mask = 1 - dones
+
+        # flatten
+        rewards = rewards.view(-1)
+        next_q = next_q.view(-1)
+        mask = mask.view(-1)
+
+        # target value
+        target_q = rewards + gamma * next_q * mask
+
+        # Update critic
+        loss_critic = torch.mean((current_q1 - target_q) ** 2) + torch.mean(
+            (current_q2 - target_q) ** 2
+        )
+
+        return loss_critic
+
+    # override
+    @torch.no_grad()
+    def forward(
+        self,
+        cond,
+        deterministic=False,
+    ):
+        device = self.betas.device
+        B = cond.shape[0]
+        if isinstance(cond, dict):
+            raise NotImplementedError("Not implemented for images")
+        else:
+            B = cond.shape[0]
+            cond = cond[:, : self.cond_steps]
+
+        # Loop
+        x = torch.randn((B, self.horizon_steps, self.transition_dim), device=device)
+        t_all = list(reversed(range(self.denoising_steps)))
+        for i, t in enumerate(t_all):
+            t_b = make_timesteps(B, t, device)
+            mean, logvar = self.p_mean_var(
+                x=x,
+                t=t_b,
+                cond=cond,
+            )
+            std = torch.exp(0.5 * logvar)
+
+            # Determine the noise level
+            if deterministic and t == 0:
+                std = torch.zeros_like(std)
+            elif deterministic:  # For DDPM, sample with noise
+                std = torch.clip(std, min=1e-3)
+            else:
+                std = torch.clip(std, min=self.min_sampling_denoising_std)
+            noise = torch.randn_like(x).clamp_(
+                -self.randn_clip_value, self.randn_clip_value
+            )
+            x = mean + std * noise
+
+            # clamp action at final step
+            if self.clamp_action and i == len(t_all) - 1:
+                x = torch.clamp(x, -1, 1)
+        return x
diff --git a/model/diffusion/diffusion_dql.py b/model/diffusion/diffusion_dql.py
new file mode 100644
index 0000000..f6ef0ed
--- /dev/null
+++ b/model/diffusion/diffusion_dql.py
@@ -0,0 +1,173 @@
+"""
+Diffusion Q-Learning (DQL)
+
+"""
+
+import torch
+import logging
+import numpy as np
+
+log = logging.getLogger(__name__)
+
+from model.diffusion.diffusion import DiffusionModel
+from model.diffusion.sampling import make_timesteps
+
+
+class DQLDiffusion(DiffusionModel):
+
+    def __init__(
+        self,
+        actor,
+        critic,
+        use_ddim=False,
+        randn_clip_value=10,
+        clamp_action=False,
+        min_sampling_denoising_std=0.1,
+        **kwargs,
+    ):
+        super().__init__(network=actor, use_ddim=use_ddim, **kwargs)
+        assert not self.use_ddim, "DQL does not support DDIM"
+        self.critic = critic.to(self.device)
+
+        # reassign actor
+        self.actor = self.network
+
+        # Minimum std used in denoising process when sampling action - helps exploration
+        self.min_sampling_denoising_std = min_sampling_denoising_std
+
+        # For each denoising step, we clip sampled randn (from standard deviation) such that the sampled action is not too far away from mean
+        self.randn_clip_value = randn_clip_value
+
+        # Whether to clamp sampled action between [-1, 1]
+        self.clamp_action = clamp_action
+
+    def loss_critic(self, obs, next_obs, actions, rewards, dones, gamma):
+
+        # get current Q-function
+        actions_flat = torch.flatten(actions, start_dim=-2)
+        current_q1, current_q2 = self.critic(obs, actions_flat)
+
+        # get next Q-function
+        next_actions = self.forward(
+            cond=next_obs,
+            deterministic=False,
+        )  # in DiffusionModel, forward() has no gradient, which is desired here.
+        next_actions_flat = torch.flatten(next_actions, start_dim=-2)
+        next_q1, next_q2 = self.critic(next_obs, next_actions_flat)
+        next_q = torch.min(next_q1, next_q2)
+
+        # terminal state mask
+        mask = 1 - dones
+
+        # flatten
+        rewards = rewards.view(-1)
+        next_q = next_q.view(-1)
+        mask = mask.view(-1)
+
+        # target value
+        target_q = rewards + gamma * next_q * mask
+
+        # Update critic
+        loss_critic = torch.mean((current_q1 - target_q) ** 2) + torch.mean(
+            (current_q2 - target_q) ** 2
+        )
+
+        return loss_critic
+
+    def loss_actor(self, obs, actions, q1, q2, eta):
+        bc_loss = self.loss(actions, {0: obs})
+        if np.random.uniform() > 0.5:
+            q_loss = -q1.mean() / q2.abs().mean().detach()
+        else:
+            q_loss = -q2.mean() / q1.abs().mean().detach()
+        actor_loss = bc_loss + eta * q_loss
+        return actor_loss
+
+    # override
+    @torch.no_grad()
+    def forward(
+        self,
+        cond,
+        deterministic=False,
+    ):
+        device = self.betas.device
+        B = cond.shape[0]
+        if isinstance(cond, dict):
+            raise NotImplementedError("Not implemented for images")
+        else:
+            B = cond.shape[0]
+            cond = cond[:, : self.cond_steps]
+
+        # Loop
+        x = torch.randn((B, self.horizon_steps, self.transition_dim), device=device)
+        t_all = list(reversed(range(self.denoising_steps)))
+        for i, t in enumerate(t_all):
+            t_b = make_timesteps(B, t, device)
+            mean, logvar = self.p_mean_var(
+                x=x,
+                t=t_b,
+                cond=cond,
+            )
+            std = torch.exp(0.5 * logvar)
+
+            # Determine the noise level
+            if deterministic and t == 0:
+                std = torch.zeros_like(std)
+            elif deterministic:  # For DDPM, sample with noise
+                std = torch.clip(std, min=1e-3)
+            else:
+                std = torch.clip(std, min=self.min_sampling_denoising_std)
+            noise = torch.randn_like(x).clamp_(
+                -self.randn_clip_value, self.randn_clip_value
+            )
+            x = mean + std * noise
+
+            # clamp action at final step
+            if self.clamp_action and i == len(t_all) - 1:
+                x = torch.clamp(x, -1, 1)
+        return x
+
+    def forward_train(
+        self,
+        cond,
+        deterministic=False,
+    ):
+        """
+        Differentiable forward pass used in actor training.
+        """
+        device = self.betas.device
+        B = cond.shape[0]
+        if isinstance(cond, dict):
+            raise NotImplementedError("Not implemented for images")
+        else:
+            B = cond.shape[0]
+            cond = cond[:, : self.cond_steps]
+
+        # Loop
+        x = torch.randn((B, self.horizon_steps, self.transition_dim), device=device)
+        t_all = list(reversed(range(self.denoising_steps)))
+        for i, t in enumerate(t_all):
+            t_b = make_timesteps(B, t, device)
+            mean, logvar = self.p_mean_var(
+                x=x,
+                t=t_b,
+                cond=cond,
+            )
+            std = torch.exp(0.5 * logvar)
+
+            # Determine the noise level
+            if deterministic and t == 0:
+                std = torch.zeros_like(std)
+            elif deterministic:  # For DDPM, sample with noise
+                std = torch.clip(std, min=1e-3)
+            else:
+                std = torch.clip(std, min=self.min_sampling_denoising_std)
+            noise = torch.randn_like(x).clamp_(
+                -self.randn_clip_value, self.randn_clip_value
+            )
+            x = mean + std * noise
+
+            # clamp action at final step
+            if self.clamp_action and i == len(t_all) - 1:
+                x = torch.clamp(x, -1, 1)
+        return x
diff --git a/model/diffusion/diffusion_idql.py b/model/diffusion/diffusion_idql.py
new file mode 100644
index 0000000..121dc26
--- /dev/null
+++ b/model/diffusion/diffusion_idql.py
@@ -0,0 +1,197 @@
+"""
+Implicit diffusion Q-learning (IDQL) for diffusion policy.
+
+"""
+
+import logging
+import torch
+import einops
+import copy
+
+import torch.nn.functional as F
+
+log = logging.getLogger(__name__)
+
+from model.diffusion.diffusion_rwr import RWRDiffusion
+
+
+def expectile_loss(diff, expectile=0.8):
+    weight = torch.where(diff > 0, expectile, (1 - expectile))
+    return weight * (diff**2)
+
+
+def soft_update(target, source, tau):
+    for target_param, param in zip(target.parameters(), source.parameters()):
+        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
+
+
+class IDQLDiffusion(RWRDiffusion):
+
+    def __init__(
+        self,
+        actor,
+        critic_q,
+        critic_v,
+        **kwargs,
+    ):
+        super().__init__(network=actor, **kwargs)
+        self.critic_q = critic_q.to(self.device)
+        self.target_q = copy.deepcopy(critic_q)
+        self.critic_v = critic_v.to(self.device)
+
+        # assign actor
+        self.actor = self.network
+
+    def compute_advantages(self, obs, actions):
+
+        # get current Q-function
+        actions_flat = torch.flatten(actions, start_dim=-2)
+        with torch.no_grad():  # no gradients for q-function when we update value function
+            current_q1, current_q2 = self.target_q(obs, actions_flat)
+        q = torch.min(current_q1, current_q2)
+
+        # get the current V-function
+        v = self.critic_v(obs).reshape(-1)
+
+        # compute advantage
+        adv = q - v
+
+        return adv
+
+    def loss_critic_v(self, obs, actions):
+
+        adv = self.compute_advantages(obs, actions)
+
+        # get the value loss
+        v_loss = expectile_loss(adv).mean()
+
+        return v_loss
+
+    def loss_critic_q(self, obs, next_obs, actions, rewards, dones, gamma):
+
+        # get current Q-function
+        actions_flat = torch.flatten(actions, start_dim=-2)
+        current_q1, current_q2 = self.critic_q(obs, actions_flat)
+
+        # get the next V-function
+        with torch.no_grad():  # no gradients for value function when we update q function
+            next_v = self.critic_v(next_obs)
+
+        # terminal state mask
+        mask = 1 - dones
+
+        # flatten
+        rewards = rewards.view(-1)
+        next_v = next_v.view(-1)
+        mask = mask.view(-1)
+
+        # target value
+        discounted_q = rewards + gamma * next_v * mask
+
+        # Update critic
+        q_loss = torch.mean((current_q1 - discounted_q) ** 2) + torch.mean(
+            (current_q2 - discounted_q) ** 2
+        )
+
+        return q_loss
+
+    def update_target_critic(self, tau):
+        soft_update(self.target_q, self.critic_q, tau)
+
+    @torch.no_grad()
+    def forward(  # override
+        self,
+        cond,
+        deterministic=False,
+        num_sample=10,
+        critic_hyperparam=0.7,  # sampling weight for implicit policy
+        use_expectile_exploration=True,
+    ):
+        # repeat obs num_sample times along dim 0
+        cond_shape_repeat_dims = tuple(1 for _ in cond.shape)
+        B, T, D = cond.shape
+        S = num_sample
+        cond_repeat = cond[None].repeat(num_sample, *cond_shape_repeat_dims)
+        cond_repeat = cond_repeat.view(-1, T, D)  # [B*S, T, D]
+
+        # for eval, use less noisy samples --- there is still DDPM noise, but final action uses small min_sampling_std
+        samples = super(IDQLDiffusion, self).forward(
+            cond_repeat,
+            deterministic=deterministic,
+        )
+        _, H, A = samples.shape
+
+        # get current Q-function
+        actions_flat = torch.flatten(samples, start_dim=-2)
+        current_q1, current_q2 = self.target_q(cond_repeat, actions_flat)
+        q = torch.min(current_q1, current_q2)
+        q = q.view(S, B)
+
+        # Use argmax
+        if deterministic or (not use_expectile_exploration):
+            # gather the best sample -- filter out suboptimal Q during inference
+            best_indices = q.argmax(0)
+            samples_expanded = samples.view(S, B, H, A)
+
+            # dummy dimension @ dim 0 for batched indexing
+            sample_indices = best_indices[None, :, None, None]  # [1, B, 1, 1]
+            sample_indices = sample_indices.repeat(S, 1, H, A)
+
+            samples_best = torch.gather(samples_expanded, 0, sample_indices)
+        # Sample as an implicit policy for exploration
+        else:
+            # get the current value function for probabilistic exploration
+            current_v = self.critic_v(cond_repeat)
+            v = current_v.view(S, B)
+            adv = q - v
+
+            # Compute weights for sampling
+            samples_expanded = samples.view(S, B, H, A)
+
+            # expectile exploration policy
+            tau_weights = torch.where(adv > 0, critic_hyperparam, 1 - critic_hyperparam)
+            tau_weights = tau_weights / tau_weights.sum(0)  # normalize
+
+            # select a sample from DP probabilistically -- sample index per batch and compile
+            sample_indices = torch.multinomial(tau_weights.T, 1)  # [B, 1]
+
+            # dummy dimension @ dim 0 for batched indexing
+            sample_indices = sample_indices[None, :, None]  # [1, B, 1, 1]
+            sample_indices = sample_indices.repeat(S, 1, H, A)
+
+            samples_best = torch.gather(samples_expanded, 0, sample_indices)
+
+        # squeeze dummy dimension
+        samples = samples_best[0]
+        return samples
+
+    # override
+    def p_losses(
+        self,
+        x_start,
+        obs_cond,
+        t,
+    ):
+        device = x_start.device
+        B, T, D = x_start.shape
+
+        # handle different ways of passing observation
+        if isinstance(obs_cond[0], dict):
+            cond = obs_cond[0]
+        else:
+            cond = obs_cond.reshape(B, -1)
+
+        # Forward process
+        noise = torch.randn_like(x_start, device=device)
+        x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+
+        # Predict
+        x_recon = self.network(x_noisy, t, cond=cond)
+
+        # Loss with mask
+        if self.predict_epsilon:
+            loss = F.mse_loss(x_recon, noise, reduction="none")
+        else:
+            loss = F.mse_loss(x_recon, x_start, reduction="none")
+        loss = einops.reduce(loss, "b h d -> b", "mean")
+        return loss.mean()
diff --git a/model/diffusion/diffusion_ppo.py b/model/diffusion/diffusion_ppo.py
new file mode 100644
index 0000000..0e8081f
--- /dev/null
+++ b/model/diffusion/diffusion_ppo.py
@@ -0,0 +1,193 @@
+"""
+DPPO: Diffusion Policy Policy Optimization. 
+
+"""
+
+from typing import Optional
+import torch
+import logging
+import math
+
+log = logging.getLogger(__name__)
+from model.diffusion.diffusion_vpg import VPGDiffusion
+
+
+class PPODiffusion(VPGDiffusion):
+
+    def __init__(
+        self,
+        gamma_denoising: float,
+        clip_ploss_coef: float,
+        clip_ploss_coef_base: float = 1e-3,
+        clip_ploss_coef_rate: float = 3,
+        clip_vloss_coef: Optional[float] = None,
+        clip_advantage_lower_quantile: float = 0,
+        clip_advantage_upper_quantile: float = 1,
+        norm_adv: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Whether to normalize advantages within batch
+        self.norm_adv = norm_adv
+
+        # Clipping value for policy loss
+        self.clip_ploss_coef = clip_ploss_coef
+        self.clip_ploss_coef_base = clip_ploss_coef_base
+        self.clip_ploss_coef_rate = clip_ploss_coef_rate
+
+        # Clipping value for value loss
+        self.clip_vloss_coef = clip_vloss_coef
+
+        # Discount factor for diffusion MDP
+        self.gamma_denoising = gamma_denoising
+
+        # Quantiles for clipping advantages
+        self.clip_advantage_lower_quantile = clip_advantage_lower_quantile
+        self.clip_advantage_upper_quantile = clip_advantage_upper_quantile
+
+    def loss(
+        self,
+        obs,
+        chains,
+        returns,
+        oldvalues,
+        advantages,
+        oldlogprobs,
+        use_bc_loss=False,
+        reward_horizon=4,
+    ):
+        """
+        PPO loss
+
+        obs: (B, obs_step, obs_dim)
+        chains: (B, num_denoising_step+1, horizon_step, action_dim)
+        returns: (B, )
+        values: (B, )
+        advantages: (B,)
+        oldlogprobs: (B, num_denoising_step, horizon_step, action_dim)
+        use_bc_loss: add BC regularization loss
+        reward_horizon: action horizon that backpropagates gradient
+        """
+        # Get new logprobs for denoising steps from T-1 to 0 - entropy is fixed fod diffusion
+        newlogprobs, eta = self.get_logprobs(
+            obs,
+            chains,
+            get_ent=True,
+        )
+        entropy_loss = -eta.mean()
+        newlogprobs = newlogprobs.clamp(min=-5, max=2)
+        oldlogprobs = oldlogprobs.clamp(min=-5, max=2)
+
+        # only backpropagate through the earlier steps (e.g., ones actually executed in the environment)
+        newlogprobs = newlogprobs[:, :reward_horizon, :]
+        oldlogprobs = oldlogprobs[:, :, :reward_horizon, :]
+
+        # Get the logprobs - batch over B and denoising steps
+        newlogprobs = newlogprobs.mean(dim=(-1, -2)).view(-1)
+        oldlogprobs = oldlogprobs.mean(dim=(-1, -2)).view(-1)
+
+        bc_loss = 0
+        if use_bc_loss:
+            # See Eqn. 2 of https://arxiv.org/pdf/2403.03949.pdf
+            # Give a reward for maximizing probability of teacher policy's action with current policy.
+            # Actions are chosen along trajectory induced by current policy.
+
+            # Get counterfactual teacher actions
+            samples = self.forward(
+                cond=obs.float()
+                .unsqueeze(1)
+                .to(self.device),  # B x horizon=1 x obs_dim
+                deterministic=False,
+                return_chain=True,
+                use_base_policy=True,
+            )
+            # Get logprobs of teacher actions under this policy
+            bc_logprobs = self.get_logprobs(
+                obs,
+                samples.chains,  # n_env x denoising x horizon x act
+                get_ent=False,
+                use_base_policy=False,
+            )
+            bc_logprobs = bc_logprobs.clamp(min=-5, max=2)
+            bc_logprobs = bc_logprobs.mean(dim=(-1, -2)).view(-1)
+            bc_loss = -bc_logprobs.mean()
+
+        # normalize advantages
+        if self.norm_adv:
+            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
+
+        # Clip advantages by 5th and 95th percentile
+        advantage_min = torch.quantile(advantages, self.clip_advantage_lower_quantile)
+        advantage_max = torch.quantile(advantages, self.clip_advantage_upper_quantile)
+        advantages = advantages.clamp(min=advantage_min, max=advantage_max)
+
+        # repeat advantages for denoising steps and horizon steps
+        advantages = advantages.repeat_interleave(self.ft_denoising_steps)
+
+        # denoising discount
+        discount = torch.tensor(
+            [self.gamma_denoising**i for i in reversed(range(self.ft_denoising_steps))]
+        ).to(self.device)
+        discount = discount.repeat(len(advantages) // self.ft_denoising_steps)
+        advantages *= discount
+
+        # get ratio
+        logratio = newlogprobs - oldlogprobs
+        ratio = logratio.exp()
+
+        # exponentially interpolate between the base and the current clipping value over denoising steps and repeat
+        t = torch.arange(self.ft_denoising_steps).float().to(self.device) / (
+            self.ft_denoising_steps - 1
+        )  # 0 to 1
+        if self.ft_denoising_steps > 1:
+            clip_ploss_coef = self.clip_ploss_coef_base + (
+                self.clip_ploss_coef - self.clip_ploss_coef_base
+            ) * (torch.exp(self.clip_ploss_coef_rate * t) - 1) / (
+                math.exp(self.clip_ploss_coef_rate) - 1
+            )
+        else:
+            clip_ploss_coef = torch.tensor([self.clip_ploss_coef]).to(self.device)
+        clip_ploss_coef = clip_ploss_coef.repeat(
+            len(advantages) // self.ft_denoising_steps
+        )
+
+        # get kl difference and whether value clipped
+        with torch.no_grad():
+            # old_approx_kl: the approximate Kullback–Leibler divergence, measured by (-logratio).mean(), which corresponds to the k1 estimator in John Schulman’s blog post on approximating KL http://joschu.net/blog/kl-approx.html
+            # approx_kl: better alternative to old_approx_kl measured by (logratio.exp() - 1) - logratio, which corresponds to the k3 estimator in approximating KL http://joschu.net/blog/kl-approx.html
+            # old_approx_kl = (-logratio).mean()
+            approx_kl = ((ratio - 1) - logratio).mean()
+            clipfrac = ((ratio - 1.0).abs() > clip_ploss_coef).float().mean().item()
+
+        # Policy loss with clipping
+        pg_loss1 = -advantages * ratio
+        pg_loss2 = -advantages * torch.clamp(
+            ratio, 1 - clip_ploss_coef, 1 + clip_ploss_coef
+        )
+        pg_loss = torch.max(pg_loss1, pg_loss2).mean()
+
+        # Value loss optionally with clipping
+        newvalues = self.critic(obs).view(-1)
+        if self.clip_vloss_coef is not None:
+            v_loss_unclipped = (newvalues - returns) ** 2
+            v_clipped = oldvalues + torch.clamp(
+                newvalues - oldvalues,
+                -self.clip_vloss_coef,
+                self.clip_vloss_coef,
+            )
+            v_loss_clipped = (v_clipped - returns) ** 2
+            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+            v_loss = 0.5 * v_loss_max.mean()
+        else:
+            v_loss = 0.5 * ((newvalues - returns) ** 2).mean()
+        return (
+            pg_loss,
+            entropy_loss,
+            v_loss,
+            clipfrac,
+            approx_kl.item(),
+            ratio.mean().item(),
+            bc_loss,
+            eta.mean().item(),
+        )
diff --git a/model/diffusion/diffusion_ppo_exact.py b/model/diffusion/diffusion_ppo_exact.py
new file mode 100644
index 0000000..6d442a8
--- /dev/null
+++ b/model/diffusion/diffusion_ppo_exact.py
@@ -0,0 +1,139 @@
+"""
+Diffusion policy gradient with exact likelihood estimation.
+
+Based on score_sde_pytorch https://github.com/yang-song/score_sde_pytorch
+
+"""
+
+import torch
+import logging
+
+log = logging.getLogger(__name__)
+from .diffusion_ppo import PPODiffusion
+from .exact_likelihood import get_likelihood_fn
+
+
+class PPOExactDiffusion(PPODiffusion):
+
+    def __init__(
+        self,
+        sde,
+        sde_hutchinson_type="Rademacher",
+        sde_rtol=1e-4,
+        sde_atol=1e-4,
+        sde_eps=1e-4,
+        sde_step_size=1e-3,
+        sde_method="RK23",
+        sde_continuous=False,
+        sde_probability_flow=False,
+        sde_num_epsilon=1,
+        sde_min_beta=1e-2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.sde = sde
+        self.sde.set_betas(
+            self.betas,
+            sde_min_beta,
+        )
+
+        # set up likelihood function
+        self.likelihood_fn = get_likelihood_fn(
+            sde,
+            hutchinson_type=sde_hutchinson_type,
+            rtol=sde_rtol,
+            atol=sde_atol,
+            eps=sde_eps,
+            step_size=sde_step_size,
+            method=sde_method,
+            continuous=sde_continuous,
+            probability_flow=sde_probability_flow,
+            predict_epsilon=self.predict_epsilon,
+            num_epsilon=sde_num_epsilon,
+        )
+
+    def get_exact_logprobs(self, obs, samples):
+        """Use torchdiffeq
+
+        samples: B x horizon x transition_dim
+        """
+        # TODO: image input
+        cond = obs.reshape(-1, self.obs_dim)
+        return self.likelihood_fn(
+            self.actor,
+            self.actor_ft,
+            samples,
+            self.denoising_steps,
+            self.ft_denoising_steps,
+            cond=cond,
+        )
+
+    def loss(
+        self,
+        obs,
+        samples,
+        returns,
+        oldvalues,
+        advantages,
+        oldlogprobs,
+        use_bc_loss=False,
+        **kwargs,
+    ):
+        # Get new logprobs for final x
+        newlogprobs = self.get_exact_logprobs(obs, samples)
+        newlogprobs = newlogprobs.clamp(min=-5, max=2)
+        oldlogprobs = oldlogprobs.clamp(min=-5, max=2)
+
+        bc_loss = 0
+        if use_bc_loss:
+            raise NotImplementedError
+
+        # get ratio
+        logratio = newlogprobs - oldlogprobs
+        ratio = logratio.exp()
+
+        # get kl difference and whether value clipped
+        with torch.no_grad():
+            # old_approx_kl: the approximate Kullback–Leibler divergence, measured by (-logratio).mean(), which corresponds to the k1 estimator in John Schulman’s blog post on approximating KL http://joschu.net/blog/kl-approx.html
+            # approx_kl: better alternative to old_approx_kl measured by (logratio.exp() - 1) - logratio, which corresponds to the k3 estimator in approximating KL http://joschu.net/blog/kl-approx.html
+            # old_approx_kl = (-logratio).mean()
+            approx_kl = ((ratio - 1) - logratio).mean()
+            clipfrac = (
+                ((ratio - 1.0).abs() > self.clip_ploss_coef).float().mean().item()
+            )
+
+        # normalize advantages
+        if self.norm_adv:
+            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
+
+        # Policy loss with clipping
+        pg_loss1 = -advantages * ratio
+        pg_loss2 = -advantages * torch.clamp(
+            ratio, 1 - self.clip_ploss_coef, 1 + self.clip_ploss_coef
+        )
+        pg_loss = torch.max(pg_loss1, pg_loss2).mean()
+
+        # Value loss optionally with clipping
+        newvalues = self.critic(obs).view(-1)
+        if self.clip_vloss_coef is not None:
+            v_loss_unclipped = (newvalues - returns) ** 2
+            v_clipped = oldvalues + torch.clamp(
+                newvalues - oldvalues,
+                -self.clip_vloss_coef,
+                self.clip_vloss_coef,
+            )
+            v_loss_clipped = (v_clipped - returns) ** 2
+            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+            v_loss = 0.5 * v_loss_max.mean()
+        else:
+            v_loss = 0.5 * ((newvalues - returns) ** 2).mean()
+
+        # entropy is maximized - only effective if residual is learned
+        return (
+            pg_loss,
+            v_loss,
+            clipfrac,
+            approx_kl.item(),
+            ratio.mean().item(),
+            bc_loss,
+        )
diff --git a/model/diffusion/diffusion_qsm.py b/model/diffusion/diffusion_qsm.py
new file mode 100644
index 0000000..0b3e808
--- /dev/null
+++ b/model/diffusion/diffusion_qsm.py
@@ -0,0 +1,110 @@
+"""
+QSM (Q-Score Matching) for diffusion policy.
+
+"""
+
+import logging
+import torch
+import copy
+
+import torch.nn.functional as F
+
+log = logging.getLogger(__name__)
+
+from model.diffusion.diffusion_rwr import RWRDiffusion
+
+
+def expectile_loss(diff, expectile=0.8):
+    weight = torch.where(diff > 0, expectile, (1 - expectile))
+    return weight * (diff**2)
+
+
+def soft_update(target, source, tau):
+    for target_param, param in zip(target.parameters(), source.parameters()):
+        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
+
+
+class QSMDiffusion(RWRDiffusion):
+
+    def __init__(
+        self,
+        actor,
+        critic,
+        **kwargs,
+    ):
+        super().__init__(network=actor, **kwargs)
+        self.critic_q = critic.to(self.device)
+        self.target_q = copy.deepcopy(critic)
+
+        # assign actor
+        self.actor = self.network
+
+    def loss_actor(self, obs, actions, q_grad_coeff):
+        x_start = actions
+        device = x_start.device
+        B, T, D = x_start.shape
+        cond = obs.reshape(B, -1)
+
+        # Forward process
+        noise = torch.randn_like(x_start, device=device)
+        t = torch.randint(
+            0, self.denoising_steps, (B,), device=device
+        ).long()  # sample random denoising time index
+        x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+
+        # get current value for noisy actions as the code does --- the algorthm block in the paper is wrong, it says using a_t, the final denoised action
+        x_noisy_flat = torch.flatten(x_noisy, start_dim=-2)
+        x_noisy_flat.requires_grad_(True)
+        current_q1, current_q2 = self.critic_q(obs, x_noisy_flat)
+
+        # Compute dQ/da|a=noise_actions
+        gradient_q1 = torch.autograd.grad(current_q1.sum(), x_noisy_flat)[0]
+        gradient_q2 = torch.autograd.grad(current_q2.sum(), x_noisy_flat)[0]
+        gradient_q = torch.stack((gradient_q1, gradient_q2), 0).mean(0).detach()
+
+        # Predict noise from noisy actions
+        x_recon = self.network(x_noisy, t, cond=cond)
+        x_recon = torch.flatten(x_recon, start_dim=-2)
+
+        # Loss with mask - align predicted noise with critic gradient of noisy actions
+        # Note: the gradient of mu wrt. epsilon has a negative sign
+        loss = F.mse_loss(-x_recon, q_grad_coeff * gradient_q, reduction="none").mean()
+
+        return loss
+
+    def loss_critic(self, obs, next_obs, actions, rewards, dones, gamma):
+
+        # get current Q-function
+        actions_flat = torch.flatten(actions, start_dim=-2)
+        current_q1, current_q2 = self.critic_q(obs, actions_flat)
+
+        # get next Q-function - with noise, same as QSM https://github.com/Alescontrela/score_matching_rl/blob/f02a21969b17e322eb229ceb2b0f5a9111b1b968/jaxrl5/agents/score_matching/score_matching_learner.py#L193
+        next_actions = self.forward(
+            cond=next_obs,
+            deterministic=False,
+        )  # in DiffusionModel, forward() has no gradient, which is desired here.
+        next_actions_flat = torch.flatten(next_actions, start_dim=-2)
+        with torch.no_grad():
+            next_q1, next_q2 = self.target_q(next_obs, next_actions_flat)
+        next_q = torch.min(next_q1, next_q2)
+
+        # terminal state mask
+        mask = 1 - dones
+
+        # flatten
+        rewards = rewards.view(-1)
+        next_q = next_q.view(-1)
+        mask = mask.view(-1)
+
+        # target value
+        discounted_q = rewards + gamma * next_q * mask
+
+        # Update critic
+        loss_critic = torch.mean((current_q1 - discounted_q) ** 2) + torch.mean(
+            (current_q2 - discounted_q) ** 2
+        )
+
+        return loss_critic
+
+    def update_target_critic(self, tau):
+        soft_update(self.target_q, self.critic_q, tau)
diff --git a/model/diffusion/diffusion_rwr.py b/model/diffusion/diffusion_rwr.py
new file mode 100644
index 0000000..d60cc65
--- /dev/null
+++ b/model/diffusion/diffusion_rwr.py
@@ -0,0 +1,153 @@
+"""
+Reward-weighted regression (RWR) for diffusion policy.
+
+"""
+
+import torch
+import logging
+import einops
+
+log = logging.getLogger(__name__)
+import torch.nn.functional as F
+
+from model.diffusion.diffusion import DiffusionModel
+from model.diffusion.sampling import make_timesteps, extract
+
+
+class RWRDiffusion(DiffusionModel):
+
+    def __init__(
+        self,
+        use_ddim=False,
+        # various clipping
+        randn_clip_value=10,
+        clamp_action=None,
+        min_sampling_denoising_std=0.1,
+        **kwargs,
+    ):
+        super().__init__(use_ddim=use_ddim, **kwargs)
+        assert not self.use_ddim, "RWR does not support DDIM"
+
+        # For each denoising step, we clip sampled randn (from standard deviation) such that the sampled action is not too far away from mean
+        self.randn_clip_value = randn_clip_value
+
+        # Action clamp range
+        self.clamp_action = clamp_action
+
+        # Minimum std used in denoising process when sampling action - helps exploration
+        self.min_sampling_denoising_std = min_sampling_denoising_std
+
+    # ---------- RL training ----------#
+
+    # override
+    def p_losses(
+        self,
+        x_start,
+        obs_cond,
+        rewards,
+        t,
+    ):
+        device = x_start.device
+        B, T, D = x_start.shape
+
+        # handle different ways of passing observation
+        if isinstance(obs_cond[0], dict):
+            cond = obs_cond[0]
+        else:
+            cond = obs_cond.reshape(B, -1)
+
+        # Forward process
+        noise = torch.randn_like(x_start, device=device)
+        x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+
+        # Predict
+        x_recon = self.network(x_noisy, t, cond=cond)
+
+        # Loss with mask
+        if self.predict_epsilon:
+            loss = F.mse_loss(x_recon, noise, reduction="none")
+        else:
+            loss = F.mse_loss(x_recon, x_start, reduction="none")
+        loss = einops.reduce(loss, "b h d -> b", "mean")
+        loss *= rewards
+        return loss.mean()
+
+    # ---------- Sampling ----------#
+
+    # override
+    def p_mean_var(
+        self,
+        x,
+        t,
+        cond=None,
+    ):
+        noise = self.network(x, t, cond=cond)
+
+        # Predict x_0
+        if self.predict_epsilon:
+            """
+            x₀ = √ 1\α̅ₜ xₜ - √ 1\α̅ₜ-1 ε
+            """
+            x_recon = (
+                extract(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
+                - extract(self.sqrt_recipm1_alphas_cumprod, t, x.shape) * noise
+            )
+        else:  # directly predicting x₀
+            x_recon = noise
+        if self.denoised_clip_value is not None:
+            x_recon.clamp_(-self.denoised_clip_value, self.denoised_clip_value)
+
+        # Get mu
+        """
+        μₜ = β̃ₜ √ α̅ₜ₋₁/(1-α̅ₜ)x₀ + √ αₜ (1-α̅ₜ₋₁)/(1-α̅ₜ)xₜ
+        """
+        mu = (
+            extract(self.ddpm_mu_coef1, t, x.shape) * x_recon
+            + extract(self.ddpm_mu_coef2, t, x.shape) * x
+        )
+        logvar = extract(self.ddpm_logvar_clipped, t, x.shape)
+        return mu, logvar
+
+    # override
+    @torch.no_grad()
+    def forward(
+        self,
+        cond,
+        deterministic=False,
+    ):
+        device = self.betas.device
+        B = cond.shape[0]
+        if isinstance(cond, dict):
+            raise NotImplementedError("Not implemented for images")
+        else:
+            B = cond.shape[0]
+            cond = cond[:, : self.cond_steps]
+
+        # Loop
+        x = torch.randn((B, self.horizon_steps, self.transition_dim), device=device)
+        t_all = list(reversed(range(self.denoising_steps)))
+        for i, t in enumerate(t_all):
+            t_b = make_timesteps(B, t, device)
+            mean, logvar = self.p_mean_var(
+                x=x,
+                t=t_b,
+                cond=cond,
+            )
+            std = torch.exp(0.5 * logvar)
+
+            # Determine noise level
+            if deterministic and t == 0:
+                std = torch.zeros_like(std)
+            elif deterministic:  # For DDPM, sample with noise
+                std = torch.clip(std, min=1e-3)
+            else:
+                std = torch.clip(std, min=self.min_sampling_denoising_std)
+            noise = torch.randn_like(x).clamp_(
+                -self.randn_clip_value, self.randn_clip_value
+            )
+            x = mean + std * noise
+
+            # clamp action at final step
+            if self.clamp_action is not None and i == len(t_all) - 1:
+                x = torch.clamp(x, -self.clamp_action, self.clamp_action)
+        return x
diff --git a/model/diffusion/diffusion_vpg.py b/model/diffusion/diffusion_vpg.py
new file mode 100644
index 0000000..c0b17eb
--- /dev/null
+++ b/model/diffusion/diffusion_vpg.py
@@ -0,0 +1,465 @@
+"""
+Policy gradient with diffusion policy.
+
+VPG: vanilla policy gradient
+
+"""
+
+import copy
+import torch
+import logging
+import einops
+
+log = logging.getLogger(__name__)
+import torch.nn.functional as F
+
+from model.diffusion.diffusion import DiffusionModel, Sample
+from model.diffusion.sampling import make_timesteps, extract
+from torch.distributions import Normal
+
+
+class VPGDiffusion(DiffusionModel):
+
+    def __init__(
+        self,
+        actor,
+        critic,
+        ft_denoising_steps,
+        ft_denoising_steps_d=0,
+        ft_denoising_steps_t=0,
+        network_path=None,
+        # various clipping
+        randn_clip_value=10,
+        clamp_action=False,
+        min_sampling_denoising_std=0.1,
+        min_logprob_denoising_std=0.1,  # or the scheduler class
+        eps_clip_value=None,  # only used with DDIM
+        # DDIM related
+        eta=None,
+        learn_eta=False,
+        **kwargs,
+    ):
+        super().__init__(
+            network=actor,
+            network_path=network_path,
+            **kwargs,
+        )
+        assert ft_denoising_steps <= self.denoising_steps
+        assert ft_denoising_steps <= self.ddim_steps if self.use_ddim else True
+        assert not (learn_eta and not self.use_ddim), "Cannot learn eta with DDPM."
+
+        # Number of denoising steps to use with fine-tuned model. Thus denoising_step - ft_denoising_steps is the number of denoising steps to use with original model.
+        self.ft_denoising_steps = ft_denoising_steps
+        self.ft_denoising_steps_d = ft_denoising_steps_d  # annealing step size
+        self.ft_denoising_steps_t = ft_denoising_steps_t  # annealing interval
+        self.ft_denoising_steps_cnt = 0
+
+        # Clip noise for numerical stability in policy gradient
+        self.eps_clip_value = eps_clip_value
+
+        # For each denoising step, we clip sampled randn (from standard deviation) such that the sampled action is not too far away from mean
+        self.randn_clip_value = randn_clip_value
+
+        # Whether to clamp sampled action between [-1, 1]
+        self.clamp_action = clamp_action
+
+        # Minimum std used in denoising process when sampling action - helps exploration
+        self.min_sampling_denoising_std = min_sampling_denoising_std
+
+        # Minimum std used in calculating denoising logprobs - for stability
+        self.min_logprob_denoising_std = min_logprob_denoising_std
+
+        # Learnable eta
+        self.learn_eta = learn_eta
+        if eta is not None:
+            self.eta = eta.to(self.device)
+            if not learn_eta:
+                for param in self.eta.parameters():
+                    param.requires_grad = False
+                logging.info("Turned off gradients for eta")
+
+        # Re-name network to actor
+        self.actor = self.network
+
+        # Make a copy of the original model
+        self.actor_ft = copy.deepcopy(self.actor)
+        logging.info("Cloned model for fine-tuning")
+
+        # Turn off gradients for original model
+        for param in self.actor.parameters():
+            param.requires_grad = False
+        logging.info("Turned off gradients of the pretrained network")
+        logging.info(
+            f"Number of finetuned parameters: {sum(p.numel() for p in self.actor_ft.parameters() if p.requires_grad)}"
+        )
+
+        # Value function
+        self.critic = critic.to(self.device)
+        if network_path is not None:
+            checkpoint = torch.load(
+                network_path, map_location=self.device, weights_only=True
+            )
+            if "ema" not in checkpoint:  # load trained RL model
+                self.load_state_dict(checkpoint["model"], strict=False)
+                logging.info("Loaded critic from %s", network_path)
+
+    # ---------- Sampling ----------#
+
+    def step(self):
+        """Update min_sampling_denoising_std annealing and fine-tuning denoising steps annealing. Both not used currently"""
+        # anneal min_sampling_denoising_std
+        if type(self.min_sampling_denoising_std) is not float:
+            self.min_sampling_denoising_std.step()
+
+        # anneal denoising steps
+        self.ft_denoising_steps_cnt += 1
+        if (
+            self.ft_denoising_steps_d > 0
+            and self.ft_denoising_steps_t > 0
+            and self.ft_denoising_steps_cnt % self.ft_denoising_steps_t == 0
+        ):
+            self.ft_denoising_steps = max(
+                0, self.ft_denoising_steps - self.ft_denoising_steps_d
+            )
+
+            # update actor
+            self.actor = self.actor_ft
+            self.actor_ft = copy.deepcopy(self.actor)
+            for param in self.actor.parameters():
+                param.requires_grad = False
+            logging.info(
+                f"Finished annealing fine-tuning denoising steps to {self.ft_denoising_steps}"
+            )
+
+    def get_min_sampling_denoising_std(self):
+        if type(self.min_sampling_denoising_std) is float:
+            return self.min_sampling_denoising_std
+        else:
+            return self.min_sampling_denoising_std()
+
+    # override
+    def p_mean_var(
+        self,
+        x,
+        t,
+        cond=None,
+        index=None,
+        use_base_policy=False,
+        deterministic=False,
+    ):
+        noise = self.actor(x, t, cond=cond)
+        if self.use_ddim:
+            ft_indices = torch.where(
+                index >= (self.ddim_steps - self.ft_denoising_steps)
+            )[0]
+        else:
+            ft_indices = torch.where(t < self.ft_denoising_steps)[0]
+
+        # Use base policy to query expert model, e.g. for imitation loss
+        actor = self.actor if use_base_policy else self.actor_ft
+
+        # overwrite noise for fine-tuning steps
+        if len(ft_indices) > 0:
+            if cond is not None:
+                if isinstance(cond, dict):
+                    cond = {key: cond[key][ft_indices] for key in cond}
+                else:
+                    cond = cond[ft_indices]
+            noise_ft = actor(x[ft_indices], t[ft_indices], cond=cond)
+            noise[ft_indices] = noise_ft
+
+        # Predict x_0
+        if self.predict_epsilon:
+            if self.use_ddim:
+                """
+                x₀ = (xₜ - √ (1-αₜ) ε )/ √ αₜ
+                """
+                alpha = extract(self.ddim_alphas, index, x.shape)
+                alpha_prev = extract(self.ddim_alphas_prev, index, x.shape)
+                sqrt_one_minus_alpha = extract(
+                    self.ddim_sqrt_one_minus_alphas, index, x.shape
+                )
+                x_recon = (x - sqrt_one_minus_alpha * noise) / (alpha**0.5)
+            else:
+                """
+                x₀ = √ 1\α̅ₜ xₜ - √ 1\α̅ₜ-1 ε
+                """
+                x_recon = (
+                    extract(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
+                    - extract(self.sqrt_recipm1_alphas_cumprod, t, x.shape) * noise
+                )
+        else:  # directly predicting x₀
+            x_recon = noise
+        if self.denoised_clip_value is not None:
+            x_recon.clamp_(-self.denoised_clip_value, self.denoised_clip_value)
+            if self.use_ddim:
+                # re-calculate noise based on clamped x_recon - default to false in HF, but let's use it here
+                noise = (x - alpha ** (0.5) * x_recon) / sqrt_one_minus_alpha
+
+        # Clip epsilon for numerical stability in policy gradient - not sure if this is helpful yet, but the value can be huge sometimes. This has no effect if DDPM is used
+        if self.use_ddim and self.eps_clip_value is not None:
+            noise.clamp_(-self.eps_clip_value, self.eps_clip_value)
+
+        # Get mu
+        if self.use_ddim:
+            """
+            μ = √ αₜ₋₁ x₀ + √(1-αₜ₋₁ - σₜ²) ε
+            """
+            if deterministic:
+                etas = torch.zeros((x.shape[0], 1, 1)).to(x.device)
+            else:
+                etas = self.eta(cond).unsqueeze(1)  # B x 1 x (transition_dim or 1)
+            sigma = (
+                etas
+                * ((1 - alpha_prev) / (1 - alpha) * (1 - alpha / alpha_prev)) ** 0.5
+            ).clamp_(min=1e-10)
+            dir_xt_coef = (1.0 - alpha_prev - sigma**2).clamp_(min=0).sqrt()
+            mu = (alpha_prev**0.5) * x_recon + dir_xt_coef * noise
+            var = sigma**2
+            logvar = torch.log(var)
+        else:
+            """
+            μₜ = β̃ₜ √ α̅ₜ₋₁/(1-α̅ₜ)x₀ + √ αₜ (1-α̅ₜ₋₁)/(1-α̅ₜ)xₜ
+            """
+            mu = (
+                extract(self.ddpm_mu_coef1, t, x.shape) * x_recon
+                + extract(self.ddpm_mu_coef2, t, x.shape) * x
+            )
+            logvar = extract(self.ddpm_logvar_clipped, t, x.shape)
+            etas = torch.ones_like(mu).to(mu.device)  # always one for DDPM
+        return mu, logvar, etas
+
+    # override
+    @torch.no_grad()
+    def forward(
+        self,
+        cond,
+        deterministic=False,
+        return_chain=True,
+        use_base_policy=False,
+    ):
+        """
+        Forward pass for sampling actions.
+
+        Args:
+            cond: (batch_size, obs_step, obs_dim)
+            deterministic: whether to sample deterministically
+            return_chain: whether to return the chain of samples
+            use_base_policy: whether to use the base policy instead
+        Return:
+            Sample: namedtuple with fields:
+                trajectories: (batch_size, horizon_steps, transition_dim)
+                values: (batch_size, )
+                chain: (batch_size, denoising_steps + 1, horizon_steps, transition_dim)
+        """
+        device = self.betas.device
+        if isinstance(cond, dict):
+            B = cond["state"].shape[0]
+            cond["state"] = cond["state"][:, : self.cond_steps]
+            cond["rgb"] = cond["rgb"][:, : self.cond_steps]
+        else:
+            B = cond.shape[0]
+            cond = cond[:, : self.cond_steps]
+
+        # Get updated minimum sampling denoising std
+        min_sampling_denoising_std = self.get_min_sampling_denoising_std()
+
+        # Loop
+        x = torch.randn((B, self.horizon_steps, self.transition_dim), device=device)
+        if self.use_ddim:
+            t_all = self.ddim_t
+        else:
+            t_all = list(reversed(range(self.denoising_steps)))
+        chain = [] if return_chain else None
+        if not self.use_ddim and self.ft_denoising_steps == self.denoising_steps:
+            chain.append(x)
+        if self.use_ddim and self.ft_denoising_steps == self.ddim_steps:
+            chain.append(x)
+        for i, t in enumerate(t_all):
+            t_b = make_timesteps(B, t, device)
+            index_b = make_timesteps(B, i, device)
+            mean, logvar, _ = self.p_mean_var(
+                x=x,
+                t=t_b,
+                cond=cond,
+                index=index_b,
+                use_base_policy=use_base_policy,
+                deterministic=deterministic,
+            )
+            std = torch.exp(0.5 * logvar)
+
+            # Determine noise level
+            if self.use_ddim:
+                if deterministic:
+                    std = torch.zeros_like(std)
+                else:
+                    std = torch.clip(std, min=min_sampling_denoising_std)
+            else:
+                if deterministic and t == 0:
+                    std = torch.zeros_like(std)
+                elif deterministic:  # still keep the original noise
+                    std = torch.clip(std, min=1e-3)
+                else:  # use higher minimum noise
+                    std = torch.clip(std, min=min_sampling_denoising_std)
+            noise = torch.randn_like(x).clamp_(
+                -self.randn_clip_value, self.randn_clip_value
+            )
+            x = mean + std * noise
+
+            # clamp action at final step
+            if self.clamp_action and i == len(t_all) - 1:
+                x = torch.clamp(x, -1, 1)
+
+            if return_chain:
+                if not self.use_ddim and t <= self.ft_denoising_steps:
+                    chain.append(x)
+                elif self.use_ddim and i >= (
+                    self.ddim_steps - self.ft_denoising_steps - 1
+                ):
+                    chain.append(x)
+            values = torch.zeros(len(x), device=x.device)
+
+        if return_chain:
+            chain = torch.stack(chain, dim=1)
+        return Sample(x, values, chain)
+
+    # ---------- RL training ----------#
+
+    def get_logprobs(
+        self,
+        obs,
+        chains,
+        get_ent: bool = False,
+        use_base_policy: bool = False,
+    ):
+        """
+        Calculating the logprobs of the entire chain of denoised actions.
+
+        Args:
+        obs: (B, obs_step, obs_dim)
+        chains: (B, num_denoising_step+1, horizon_step, action_dim)
+        get_ent: flag for returning entropy
+        use_base_policy: flag for using base policy
+
+        Returns:
+        logprobs: (B x num_denoising_steps, horizon_step, action_dim)
+        entropy (if get_ent=True):  (B x num_denoising_steps, horizon_step)
+        """
+        # Repeat obs conditioning for denoising_steps
+        if isinstance(obs, dict):
+            obs = {
+                key: obs[key]
+                .unsqueeze(1)
+                .repeat(1, self.ft_denoising_steps, *(1,) * (obs[key].ndim - 1))
+                for key in obs
+            }
+        else:
+            obs = einops.repeat(obs, "b h d -> b t h d", t=self.ft_denoising_steps)
+
+        # flatten the first two dimensions
+        if isinstance(obs, dict):
+            cond = obs
+            for key in cond:
+                cond[key] = einops.rearrange(cond[key], "b t ... -> (b t) ...")
+                cond[key] = cond[key][:, : self.cond_steps]
+        else:
+            cond = einops.rearrange(obs, "b t h d -> (b t) h d")
+            cond = cond[:, : self.cond_steps]
+
+        # Repeat t for batch dim, keep it 1-dim
+        if self.use_ddim:
+            t_single = self.ddim_t[-self.ft_denoising_steps :]
+        else:
+            t_single = torch.arange(
+                start=self.ft_denoising_steps - 1,
+                end=-1,
+                step=-1,
+                device=self.device,
+            )
+            # 4,3,2,1,0,4,3,2,1,0,...,4,3,2,1,0
+        t_all = t_single.repeat(chains.shape[0], 1).flatten()
+        if self.use_ddim:
+            indices_single = torch.arange(
+                start=self.ddim_steps - self.ft_denoising_steps,
+                end=self.ddim_steps,
+                device=self.device,
+            )  # only used for DDIM
+            indices = indices_single.repeat(chains.shape[0])
+        else:
+            indices = None
+
+        # Split chains
+        chains_prev = chains[:, :-1]
+        chains_next = chains[:, 1:]
+
+        # Flatten first two dimensions
+        chains_prev = chains_prev.reshape(-1, self.horizon_steps, self.transition_dim)
+        chains_next = chains_next.reshape(-1, self.horizon_steps, self.transition_dim)
+
+        # Forward pass with previous chains
+        next_mean, logvar, eta = self.p_mean_var(
+            chains_prev,
+            t_all,
+            cond=cond,
+            index=indices,
+            use_base_policy=use_base_policy,
+        )
+        std = torch.exp(0.5 * logvar)
+        std = torch.clip(std, min=self.min_logprob_denoising_std)
+        dist = Normal(next_mean, std)
+
+        # Get logprobs with gaussian
+        log_prob = dist.log_prob(chains_next)
+        if get_ent:
+            return log_prob, eta
+        return log_prob
+
+    def loss(self, obs, chains, reward):
+        """
+        REINFORCE loss. Not used right now.
+
+        Args:
+        obs: (n_steps, n_envs, obs_dim)
+        chains: (n_steps, n_envs, num_denoising_step+1, horizon_step, action_dim)
+        reward (to go): (n_steps, n_envs)
+        """
+        if torch.is_tensor(reward):
+            assert not reward.requires_grad
+
+        n_steps, n_envs, _ = obs.shape
+
+        # Flatten first two dimensions
+        obs = einops.rearrange(obs, "s e d -> (s e) d")
+        chains = einops.rearrange(chains, "s e t h d -> (s e) t h d")
+        reward = reward.reshape(-1)
+
+        # Get advantage
+        with torch.no_grad():
+            value = self.critic(obs).squeeze()
+        advantage = reward - value
+
+        # Get logprobs for denoising steps from T-1 to 0
+        logprobs, eta = self.get_logprobs(obs, chains, get_ent=True)
+        # (n_steps x n_envs x denoising_steps) x horizon_steps x (obs_dim+action_dim)
+
+        # Ignore obs dimension, and then sum over action dimension
+        logprobs = logprobs[:, :, : self.action_dim].sum(-1)
+        # -> (n_steps x n_envs x denoising_steps) x horizon_steps
+
+        # -> (n_steps x n_envs) x denoising_steps x horizon_steps
+        logprobs = logprobs.reshape(
+            (n_steps * n_envs, self.denoising_steps, self.horizon_steps)
+        )
+        # Sum/avg over denoising steps
+        logprobs = logprobs.mean(-2)  # -> (n_steps x n_envs) x horizon_steps
+
+        # Sum/avg over horizon steps
+        logprobs = logprobs.mean(-1)  # -> (n_steps x n_envs)
+
+        # Get REINFORCE loss
+        loss_actor = torch.mean(-logprobs * advantage)
+
+        # Train critic to predict state value
+        pred = self.critic(obs).squeeze()
+        loss_critic = F.mse_loss(pred, reward)
+        return loss_actor, loss_critic, eta
diff --git a/model/diffusion/eta.py b/model/diffusion/eta.py
new file mode 100644
index 0000000..b1068f7
--- /dev/null
+++ b/model/diffusion/eta.py
@@ -0,0 +1,164 @@
+"""
+Eta in DDIM.
+
+Can be learned but always fixed to 1 during training and 0 during eval right now.
+
+"""
+
+import torch
+from model.common.mlp import MLP
+
+
+class EtaFixed(torch.nn.Module):
+
+    def __init__(
+        self,
+        base_eta=0.5,
+        min_eta=0.1,
+        max_eta=1.0,
+        **kwargs,
+    ):
+        super().__init__()
+        self.eta_logit = torch.nn.Parameter(torch.ones(1))
+        self.min = min_eta
+        self.max = max_eta
+
+        # initialize such that eta = base_eta
+        self.eta_logit.data = torch.atanh(
+            torch.tensor([2 * (base_eta - min_eta) / (max_eta - min_eta) - 1])
+        )
+
+    def __call__(self, x):
+        """Match input batch size, but do not depend on input"""
+        if isinstance(x, dict):
+            B = x["state"].shape[0]
+            device = x["state"].device
+        else:
+            B = x.size(0)
+            device = x.device
+        eta_normalized = torch.tanh(self.eta_logit)
+
+        # map to min and max from [-1, 1]
+        eta = 0.5 * (eta_normalized + 1) * (self.max - self.min) + self.min
+        return torch.full((B, 1), eta.item()).to(device)
+
+
+class EtaAction(torch.nn.Module):
+
+    def __init__(
+        self,
+        action_dim,
+        base_eta=0.5,
+        min_eta=0.1,
+        max_eta=1.0,
+        **kwargs,
+    ):
+        super().__init__()
+        # initialize such that eta = base_eta
+        self.eta_logit = torch.nn.Parameter(
+            torch.ones(action_dim)
+            * torch.atanh(
+                torch.tensor([2 * (base_eta - min_eta) / (max_eta - min_eta) - 1])
+            )
+        )
+        self.min = min_eta
+        self.max = max_eta
+
+    def __call__(self, x):
+        """Match input batch size, but do not depend on input"""
+        if isinstance(x, dict):
+            B = x["state"].shape[0]
+            device = x["state"].device
+        else:
+            B = x.size(0)
+            device = x.device
+        eta_normalized = torch.tanh(self.eta_logit)
+
+        # map to min and max from [-1, 1]
+        eta = 0.5 * (eta_normalized + 1) * (self.max - self.min) + self.min
+        return eta.repeat(B, 1).to(device)
+
+
+class EtaState(torch.nn.Module):
+
+    def __init__(
+        self,
+        input_dim,
+        mlp_dims,
+        activation_type="ReLU",
+        out_activation_type="Identity",
+        base_eta=0.5,
+        min_eta=0.1,
+        max_eta=1.0,
+        gain=1e-2,
+        **kwargs,
+    ):
+        super().__init__()
+        self.base = base_eta
+        self.min_res = min_eta - base_eta
+        self.max_res = max_eta - base_eta
+        self.mlp_res = MLP(
+            [input_dim] + mlp_dims + [1],
+            activation_type=activation_type,
+            out_activation_type=out_activation_type,
+        )
+
+        # initialize such that mlp(x) = 0
+        for m in self.mlp_res.modules():
+            if isinstance(m, torch.nn.Linear):
+                torch.nn.init.xavier_normal_(m.weight, gain=gain)
+                m.bias.data.fill_(0)
+
+    def __call__(self, x):
+        if isinstance(x, dict):
+            raise NotImplementedError(
+                "State-based eta not implemented for image-based training!"
+            )
+        x = x.view(x.size(0), -1)
+        eta_res = self.mlp_res(x)
+        eta_res = torch.tanh(eta_res)  # [-1, 1]
+        eta = eta_res + self.base  # [0, 2]
+        return torch.clamp(eta, self.min_res + self.base, self.max_res + self.base)
+
+
+class EtaStateAction(torch.nn.Module):
+
+    def __init__(
+        self,
+        input_dim,
+        mlp_dims,
+        action_dim,
+        activation_type="ReLU",
+        out_activation_type="Identity",
+        base_eta=1,
+        min_eta=1e-3,
+        max_eta=2,
+        gain=1e-2,
+        **kwargs,
+    ):
+        super().__init__()
+        self.base = base_eta
+        self.min_res = min_eta - base_eta
+        self.max_res = max_eta - base_eta
+        self.mlp_res = MLP(
+            [input_dim] + mlp_dims + [action_dim],
+            activation_type=activation_type,
+            out_activation_type=out_activation_type,
+        )
+
+        # initialize such that mlp(x) = 0
+        for m in self.mlp_res.modules():
+            if isinstance(m, torch.nn.Linear):
+                torch.nn.init.xavier_normal_(m.weight, gain=gain)
+                m.bias.data.fill_(0)
+
+    def __call__(self, x):
+        if isinstance(x, dict):
+            raise NotImplementedError(
+                "State-action-based eta not implemented for image-based training!"
+            )
+        x = x.view(x.size(0), -1)
+        eta_res = self.mlp_res(x)
+        eta_res = torch.tanh(eta_res)  # [-1, 1]
+        eta = eta_res + self.base
+        return torch.clamp(eta, self.min_res + self.base, self.max_res + self.base)
diff --git a/model/diffusion/exact_likelihood.py b/model/diffusion/exact_likelihood.py
new file mode 100644
index 0000000..fdaa721
--- /dev/null
+++ b/model/diffusion/exact_likelihood.py
@@ -0,0 +1,188 @@
+"""
+Solving probabilistic ODE for exact likelihood, from https://github.com/yang-song/score_sde_pytorch
+
+"""
+
+import torch
+import numpy as np
+from torchdiffeq import odeint
+
+# adjoint can reduce memory, but not faster
+# from torchdiffeq import odeint_adjoint as odeint
+from model.diffusion.sde_lib import get_score_fn
+
+
+def get_likelihood_fn(
+    sde,
+    hutchinson_type="Rademacher",
+    rtol=1e-5,
+    atol=1e-5,
+    method="RK45",
+    steps=10,  # should not matter, only t_eval
+    step_size=1e-3,
+    eps=1e-5,
+    continuous=False,
+    probability_flow=False,
+    predict_epsilon=False,
+    num_epsilon=1,
+):
+    """Create a function to compute the unbiased log-likelihood estimate of a given data point.
+
+    Args:
+      sde: A `sde_lib.SDE` object that represents the forward SDE.
+      inverse_scaler: The inverse data normalizer.
+      hutchinson_type: "Rademacher" or "Gaussian". The type of noise for Hutchinson-Skilling trace estimator.
+      rtol: A `float` number. The relative tolerance level of the black-box ODE solver.
+      atol: A `float` number. The absolute tolerance level of the black-box ODE solver.
+      method: A `str`. The algorithm for the black-box ODE solver.
+        See documentation for `scipy.integrate.solve_ivp`.
+      eps: A `float` number. The probability flow ODE is integrated to `eps` for numerical stability.
+
+    Returns:
+      A function that a batch of data points and returns the log-likelihoods in bits/dim,
+        the latent code, and the number of function evaluations cost by computation.
+    """
+
+    def drift_fn(
+        model,
+        x,
+        t,
+        **kwargs,
+    ):
+        """The drift function of the reverse-time SDE."""
+        score_fn = get_score_fn(
+            sde,
+            model,
+            continuous=continuous,
+            predict_epsilon=predict_epsilon,
+        )
+        # Probability flow ODE is a special case of Reverse SDE
+        rsde = sde.reverse(score_fn, probability_flow=probability_flow)
+        sde_out = rsde.sde(x, t, **kwargs)[0]
+        return sde_out
+
+    def div_fn(
+        model,
+        x,
+        t,
+        noise,
+        create_graph=False,
+        **kwargs,
+    ):
+        with torch.enable_grad():
+            x.requires_grad_(True)
+            fn_eps = torch.sum(drift_fn(model, x, t, **kwargs) * noise)
+            grad_fn_eps = torch.autograd.grad(
+                fn_eps,
+                x,
+                create_graph=create_graph,
+            )[0]
+        if not create_graph:
+            x.requires_grad_(False)
+        return torch.sum(grad_fn_eps * noise, dim=(1, 2))
+
+    def likelihood_fn(
+        model,
+        model_ft,
+        data,
+        denoising_steps,
+        ft_denoising_steps,
+        cond,
+        **kwargs,
+    ):
+        """Compute an unbiased estimate to the log-likelihood in bits/dim.
+
+        Args:
+          model: A score model.
+          data: A PyTorch tensor. B x horizon x transition_dim
+
+        Returns:
+          logprob: B
+        """
+        shape = data.shape
+        B, H, A = shape
+        device = data.device
+
+        # sample epsilon
+        if hutchinson_type == "Gaussian":
+            epsilon = torch.randn(size=(B * num_epsilon, H, A), device=device)
+        elif hutchinson_type == "Rademacher":
+            epsilon = (
+                torch.randint(
+                    low=0, high=2, size=(B * num_epsilon, H, A), device=device
+                ).float()
+                * 2
+                - 1.0
+            )
+        else:
+            raise NotImplementedError(f"Hutchinson type {hutchinson_type} unknown.")
+
+        # repeat for expectation
+        cond_eps = cond.repeat_interleave(num_epsilon, dim=0)
+
+        def ode_func(t, x):
+            x = x[:, :-1]
+            vec_t = torch.full(
+                (x.shape[0],),
+                torch.round(t * denoising_steps),
+                device=x.device,
+                dtype=int,
+            )
+            if torch.round(t * denoising_steps) <= ft_denoising_steps:
+                model_fn = model_ft
+            else:
+                model_fn = model
+            x = x.view(shape)  # B x horizon x transition_dim
+            drift = drift_fn(
+                model_fn,
+                x,
+                vec_t,
+                cond=cond,
+                **kwargs,
+            ).reshape(B, -1)
+
+            # repeat for expectation
+            x = x.repeat_interleave(num_epsilon, dim=0)
+            vec_t = vec_t.repeat_interleave(num_epsilon)
+
+            logp_grad = div_fn(
+                model,
+                x,
+                vec_t,
+                epsilon,
+                create_graph=True,
+                cond=cond_eps,
+                **kwargs,
+            )[:, None].reshape(B, num_epsilon, -1)
+            logp_grad = logp_grad.mean(dim=1)  # expectation over epsilon
+            return torch.cat(
+                [drift, logp_grad], dim=-1
+            )  # Concatenate along the feature dimension
+
+        # flatten data
+        data = data.view(shape[0], -1)
+        init = torch.hstack(
+            (data, torch.zeros((shape[0], 1)).to(data.dtype).to(device))
+        )
+        t_eval = torch.linspace(eps, sde.T, steps=steps).to(device)  # eval points
+        solution = odeint(
+            ode_func,
+            init,
+            t_eval,
+            method=method,
+            rtol=rtol,
+            atol=atol,
+            options={"step_size": step_size},
+            # args=(model, epsilon),
+        )  # steps x batch x 3
+        zp = solution[-1]  # batch x 3
+        z = zp[:, :-1].view(shape)
+        delta_logp = zp[:, -1]
+        prior_logp = sde.prior_logp(z)
+        N = torch.prod(torch.tensor(shape[1:]))
+        # print("prior:", prior_logp / (np.log(2) * N))
+        # print("delta:", delta_logp / (np.log(2) * N))
+        logprob = (prior_logp + delta_logp) / (np.log(2) * N)
+        return logprob
+
+    return likelihood_fn
diff --git a/model/diffusion/mlp_diffusion.py b/model/diffusion/mlp_diffusion.py
new file mode 100644
index 0000000..beccfde
--- /dev/null
+++ b/model/diffusion/mlp_diffusion.py
@@ -0,0 +1,233 @@
+"""
+MLP models for diffusion policies.
+
+"""
+
+import torch
+import torch.nn as nn
+import logging
+import einops
+from copy import deepcopy
+
+from model.common.mlp import MLP, ResidualMLP
+from model.diffusion.modules import SinusoidalPosEmb
+from model.common.modules import SpatialEmb, RandomShiftsAug
+
+log = logging.getLogger(__name__)
+
+
+class VisionDiffusionMLP(nn.Module):
+    """With ViT backbone"""
+
+    def __init__(
+        self,
+        backbone,
+        transition_dim,
+        horizon_steps,
+        cond_dim,
+        time_dim=16,
+        mlp_dims=[256, 256],
+        activation_type="Mish",
+        out_activation_type="Identity",
+        use_layernorm=False,
+        residual_style=False,
+        spatial_emb=0,
+        visual_feature_dim=128,
+        repr_dim=96 * 96,
+        patch_repr_dim=128,
+        dropout=0,
+        num_img=1,
+        augment=False,
+    ):
+        super().__init__()
+
+        # vision
+        self.backbone = backbone
+        if augment:
+            self.aug = RandomShiftsAug(pad=4)
+        self.augment = augment
+        if spatial_emb > 0:
+            assert spatial_emb > 1, "this is the dimension"
+            if num_img > 1:
+                self.compress1 = SpatialEmb(
+                    num_patch=121,  # TODO: repr_dim // patch_repr_dim,
+                    patch_dim=patch_repr_dim,
+                    prop_dim=cond_dim,
+                    proj_dim=spatial_emb,
+                    dropout=dropout,
+                )
+                self.compress2 = deepcopy(self.compress1)
+            else:  # TODO: clean up
+                self.compress = SpatialEmb(
+                    num_patch=121,
+                    patch_dim=patch_repr_dim,
+                    prop_dim=cond_dim,
+                    proj_dim=spatial_emb,
+                    dropout=dropout,
+                )
+            visual_feature_dim = spatial_emb * num_img
+        else:
+            self.compress = nn.Sequential(
+                nn.Linear(repr_dim, visual_feature_dim),
+                nn.LayerNorm(visual_feature_dim),
+                nn.Dropout(dropout),
+                nn.ReLU(),
+            )
+
+        # diffusion
+        input_dim = (
+            time_dim + transition_dim * horizon_steps + visual_feature_dim + cond_dim
+        )
+        output_dim = transition_dim * horizon_steps
+        self.time_embedding = nn.Sequential(
+            SinusoidalPosEmb(time_dim),
+            nn.Linear(time_dim, time_dim * 2),
+            nn.Mish(),
+            nn.Linear(time_dim * 2, time_dim),
+        )
+        if residual_style:
+            model = ResidualMLP
+        else:
+            model = MLP
+        self.mlp_mean = model(
+            [input_dim] + mlp_dims + [output_dim],
+            activation_type=activation_type,
+            out_activation_type=out_activation_type,
+            use_layernorm=use_layernorm,
+        )
+        self.time_dim = time_dim
+
+    def forward(
+        self,
+        x,
+        time,
+        cond=None,
+        **kwargs,
+    ):
+        """
+        x: (B,T,obs_dim)
+        time: (B,) or int, diffusion step
+        cond: dict (B,cond_step,cond_dim)
+        output: (B,T,input_dim)
+        """
+        # flatten T and input_dim
+        B, T, input_dim = x.shape
+        x = x.view(B, -1)
+
+        # flatten cond_dim if exists
+        if cond["rgb"].ndim == 5:
+            rgb = einops.rearrange(cond["rgb"], "b d c h w -> (b d) c h w")
+        else:
+            rgb = cond["rgb"]
+        if cond["state"].ndim == 3:
+            state = einops.rearrange(cond["state"], "b d c -> (b d) c")
+        else:
+            state = cond["state"]
+
+        # get vit output - pass in two images separately
+        if rgb.shape[1] == 6:  # TODO: properly handle multiple images
+            rgb1 = rgb[:, :3]
+            rgb2 = rgb[:, 3:]
+            if self.augment:
+                rgb1 = self.aug(rgb1)
+                rgb2 = self.aug(rgb2)
+            feat1 = self.backbone(rgb1)
+            feat2 = self.backbone(rgb2)
+            feat1 = self.compress1.forward(feat1, state)
+            feat2 = self.compress2.forward(feat2, state)
+            feat = torch.cat([feat1, feat2], dim=-1)
+        else:  # single image
+            if self.augment:
+                rgb = self.aug(rgb)  # uint8 -> float32
+            feat = self.backbone(rgb)
+
+            # compress
+            if isinstance(self.compress, SpatialEmb):
+                feat = self.compress.forward(feat, state)
+            else:
+                feat = feat.flatten(1, -1)
+                feat = self.compress(feat)
+        cond_encoded = torch.cat([feat, state], dim=-1)
+
+        # append time and cond
+        time = time.view(B, 1)
+        time_emb = self.time_embedding(time).view(B, self.time_dim)
+        x = torch.cat([x, time_emb, cond_encoded], dim=-1)
+
+        # mlp
+        out = self.mlp_mean(x)
+        return out.view(B, T, input_dim)
+
+
+class DiffusionMLP(nn.Module):
+
+    def __init__(
+        self,
+        transition_dim,
+        horizon_steps,
+        cond_dim,
+        time_dim=16,
+        mlp_dims=[256, 256],
+        cond_mlp_dims=None,
+        activation_type="Mish",
+        out_activation_type="Identity",
+        use_layernorm=False,
+        residual_style=False,
+    ):
+        super().__init__()
+        output_dim = transition_dim * horizon_steps
+        self.time_embedding = nn.Sequential(
+            SinusoidalPosEmb(time_dim),
+            nn.Linear(time_dim, time_dim * 2),
+            nn.Mish(),
+            nn.Linear(time_dim * 2, time_dim),
+        )
+        if residual_style:
+            model = ResidualMLP
+        else:
+            model = MLP
+        if cond_mlp_dims is not None:
+            self.cond_mlp = MLP(
+                [cond_dim] + cond_mlp_dims,
+                activation_type=activation_type,
+                out_activation_type="Identity",
+            )
+            input_dim = time_dim + transition_dim * horizon_steps + cond_mlp_dims[-1]
+        else:
+            input_dim = time_dim + transition_dim * horizon_steps + cond_dim
+        self.mlp_mean = model(
+            [input_dim] + mlp_dims + [output_dim],
+            activation_type=activation_type,
+            out_activation_type=out_activation_type,
+            use_layernorm=use_layernorm,
+        )
+        self.time_dim = time_dim
+
+    def forward(
+        self,
+        x,
+        time,
+        cond=None,
+        **kwargs,
+    ):
+        """
+        x: (B,T,obs_dim)
+        time: (B,) or int, diffusion step
+        cond: (B,cond_step,cond_dim)
+        output: (B,T,input_dim)
+        """
+        # flatten T and input_dim
+        B, T, input_dim = x.shape
+        x = x.view(B, -1)
+        cond = cond.view(B, -1) if cond is not None else None
+        if hasattr(self, "cond_mlp"):
+            cond = self.cond_mlp(cond)
+
+        # append time and cond
+        time = time.view(B, 1)
+        time_emb = self.time_embedding(time).view(B, self.time_dim)
+        x = torch.cat([x, time_emb, cond], dim=-1)
+
+        # mlp
+        out = self.mlp_mean(x)
+        return out.view(B, T, input_dim)
diff --git a/model/diffusion/modules.py b/model/diffusion/modules.py
new file mode 100644
index 0000000..528f9a9
--- /dev/null
+++ b/model/diffusion/modules.py
@@ -0,0 +1,95 @@
+"""
+From Diffuser https://github.com/jannerm/diffuser
+
+For MLP and UNet diffusion models.
+
+"""
+
+import math
+import torch
+import torch.nn as nn
+from einops.layers.torch import Rearrange
+
+
+class SinusoidalPosEmb(nn.Module):
+
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x):
+        device = x.device
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+        emb = x[:, None] * emb[None, :]
+        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+        return emb
+
+
+class Downsample1d(nn.Module):
+
+    def __init__(self, dim):
+        super().__init__()
+        self.conv = nn.Conv1d(dim, dim, 3, 2, 1)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class Upsample1d(nn.Module):
+
+    def __init__(self, dim):
+        super().__init__()
+        self.conv = nn.ConvTranspose1d(dim, dim, 4, 2, 1)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class Conv1dBlock(nn.Module):
+    """
+    Conv1d --> GroupNorm --> Mish
+    """
+
+    def __init__(
+        self,
+        inp_channels,
+        out_channels,
+        kernel_size,
+        n_groups=None,
+        activation_type="Mish",
+        eps=1e-5,
+    ):
+        super().__init__()
+        if activation_type == "Mish":
+            act = nn.Mish()
+        elif activation_type == "ReLU":
+            act = nn.ReLU()
+        else:
+            raise "Unknown activation type for Conv1dBlock"
+
+        self.block = nn.Sequential(
+            nn.Conv1d(
+                inp_channels, out_channels, kernel_size, padding=kernel_size // 2
+            ),
+            (
+                Rearrange("batch channels horizon -> batch channels 1 horizon")
+                if n_groups is not None
+                else nn.Identity()
+            ),
+            (
+                nn.GroupNorm(n_groups, out_channels, eps=eps)
+                if n_groups is not None
+                else nn.Identity()
+            ),
+            (
+                Rearrange("batch channels 1 horizon -> batch channels horizon")
+                if n_groups is not None
+                else nn.Identity()
+            ),
+            act,
+        )
+
+    def forward(self, x):
+        return self.block(x)
diff --git a/model/diffusion/sampling.py b/model/diffusion/sampling.py
new file mode 100644
index 0000000..4b8e279
--- /dev/null
+++ b/model/diffusion/sampling.py
@@ -0,0 +1,37 @@
+"""
+From Diffuser https://github.com/jannerm/diffuser
+
+"""
+
+import torch
+import numpy as np
+
+
+def cosine_beta_schedule(timesteps, s=0.008, dtype=torch.float32):
+    """
+    cosine schedule as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
+    """
+    steps = timesteps + 1
+    x = np.linspace(0, steps, steps)
+    alphas_cumprod = np.cos(((x / steps) + s) / (1 + s) * np.pi * 0.5) ** 2
+    alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
+    betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
+    betas_clipped = np.clip(betas, a_min=0, a_max=0.999)
+    return torch.tensor(betas_clipped, dtype=dtype)
+
+
+def extract(a, t, x_shape):
+    b, *_ = t.shape
+    out = a.gather(-1, t)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+def apply_obs_conditioning(x, conditions, action_dim):
+    for t, val in conditions.items():
+        x[:, t, action_dim:] = val.clone()
+    return x
+
+
+def make_timesteps(batch_size, i, device):
+    t = torch.full((batch_size,), i, device=device, dtype=torch.long)
+    return t
diff --git a/model/diffusion/sde_lib.py b/model/diffusion/sde_lib.py
new file mode 100644
index 0000000..edee0ee
--- /dev/null
+++ b/model/diffusion/sde_lib.py
@@ -0,0 +1,213 @@
+"""
+Abstract SDE classes, Reverse SDE, and VE/VP SDEs.
+
+From https://github.com/yang-song/score_sde_pytorch
+
+"""
+
+import abc
+import torch
+import numpy as np
+
+
+def get_score_fn(
+    sde,
+    model,
+    continuous=False,
+    predict_epsilon=False,
+):
+    """Wraps `score_fn` so that the model output corresponds to a real time-dependent score function.
+
+    Args:
+      sde: An `sde_lib.SDE` object that represents the forward SDE.
+      model: A score model.
+      continuous: If `True`, the score-based model is expected to directly take continuous time steps.
+
+    Returns:
+      A score function.
+    """
+
+    def score_fn(x, t, **kwargs):
+        """
+        Use [:, None, None] to add two dimensions (horizon and transition)
+        """
+        score = model(x, t, **kwargs)
+
+        if not predict_epsilon:  # get epsilon first from predicted mu
+            score = (
+                -(x - score * sde.sqrt_alphas[t.long()][:, None, None])
+                / sde.discrete_betas[t.long()][:, None, None]
+            )
+        else:
+            std = sde.sqrt_1m_alpha_bar[t.long()]
+            score = -score / std[:, None, None]
+        return score
+
+    return score_fn
+
+
+class SDE(abc.ABC):
+    """SDE abstract class. Functions are designed for a mini-batch of inputs."""
+
+    def __init__(self, N):
+        """Construct an SDE.
+
+        Args:
+          N: number of discretization time steps.
+        """
+        super().__init__()
+        self.N = N
+
+    @property
+    @abc.abstractmethod
+    def T(self):
+        """End time of the SDE."""
+        pass
+
+    @abc.abstractmethod
+    def sde(self, x, t):
+        pass
+
+    @abc.abstractmethod
+    def marginal_prob(self, x, t):
+        """Parameters to determine the marginal distribution of the SDE, $p_t(x)$."""
+        pass
+
+    @abc.abstractmethod
+    def prior_sampling(self, shape):
+        """Generate one sample from the prior distribution, $p_T(x)$."""
+        pass
+
+    @abc.abstractmethod
+    def prior_logp(self, z):
+        """Compute log-density of the prior distribution.
+
+        Useful for computing the log-likelihood via probability flow ODE.
+
+        Args:
+          z: latent code
+        Returns:
+          log probability density
+        """
+        pass
+
+    def discretize(self, x, t):
+        """Discretize the SDE in the form: x_{i+1} = x_i + f_i(x_i) + G_i z_i.
+
+        Useful for reverse diffusion sampling and probabiliy flow sampling.
+        Defaults to Euler-Maruyama discretization.
+
+        Args:
+          x: a torch tensor
+          t: a torch float representing the time step (from 0 to `self.T`)
+
+        Returns:
+          f, G
+        """
+        dt = 1 / self.N
+        drift, diffusion = self.sde(x, t)
+        f = drift * dt
+        G = diffusion * torch.sqrt(torch.tensor(dt, device=t.device))
+        return f, G
+
+    def reverse(self, score_fn, probability_flow=False):
+        """Create the reverse-time SDE/ODE.
+
+        Args:
+          score_fn: A time-dependent score-based model that takes x and t and returns the score.
+          probability_flow: If `True`, create the reverse-time ODE used for probability flow sampling.
+        """
+        N = self.N
+        T = self.T
+        sde_fn = self.sde
+        discretize_fn = self.discretize
+
+        # Build the class for reverse-time SDE.
+        class RSDE(self.__class__):
+            def __init__(self):
+                self.N = N
+                self.probability_flow = probability_flow
+
+            @property
+            def T(self):
+                return T
+
+            def sde(self, x, t, **kwargs):
+                """Create the drift and diffusion functions for the reverse SDE/ODE."""
+                drift, diffusion = sde_fn(x, t)
+                score = score_fn(x, t, **kwargs)
+                drift = drift - diffusion[:, None, None] ** 2 * score * (
+                    0.5 if self.probability_flow else 1.0
+                )
+                # Set the diffusion function to zero for ODEs.
+                diffusion = 0.0 if self.probability_flow else diffusion
+                return drift, diffusion
+
+            def discretize(self, x, t):
+                """Create discretized iteration rules for the reverse diffusion sampler."""
+                f, G = discretize_fn(x, t)
+                rev_f = f - G[:, None] ** 2 * score_fn(x, t) * (
+                    0.5 if self.probability_flow else 1.0
+                )
+                rev_G = torch.zeros_like(G) if self.probability_flow else G
+                return rev_f, rev_G
+
+        return RSDE()
+
+
+class VPSDE(SDE):
+    def __init__(self, N=1000):
+        """Construct a Variance Preserving SDE.
+
+        Args:
+          beta_min: value of beta(0)
+          beta_max: value of beta(1)
+          N: number of discretization steps
+        """
+        super().__init__(N)
+
+    def set_betas(self, betas, min_beta=0.01):
+        self.discrete_betas = betas.clamp(min=min_beta)  # cosine schedule from our DDPM
+        self.alphas = 1.0 - self.discrete_betas
+        self.sqrt_alphas = torch.sqrt(self.alphas)
+        self.alphas_bar = torch.cumprod(self.alphas, axis=0)
+        self.sqrt_1m_alpha_bar = torch.sqrt(1 - self.alphas_bar)
+
+    @property
+    def T(self):
+        return 1
+
+    def sde(self, x, t):
+        # dx = - 1/2 beta(t) x dt + sqrt(beta(t)) dW
+        beta_t = self.discrete_betas[t]
+        drift = -0.5 * beta_t[:, None, None] * x
+        diffusion = torch.sqrt(beta_t)
+        return drift, diffusion
+
+    def marginal_prob(self, x, t):
+        raise NotImplementedError
+        # log_mean_coeff = (
+        #     -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
+        # )
+        # mean = torch.exp(log_mean_coeff[:, None, None]) * x
+        # std = torch.sqrt(1.0 - torch.exp(2.0 * log_mean_coeff))
+        # return mean, std
+
+    def prior_sampling(self, shape):
+        return torch.randn(*shape)
+
+    def prior_logp(self, z):
+        shape = z.shape
+        N = np.prod(shape[1:])
+        logps = -N / 2.0 * np.log(2 * np.pi) - torch.sum(z**2, dim=(1, 2)) / 2.0
+        return logps
+
+    def discretize(self, x, t):
+        """DDPM discretization."""
+        timestep = (t * (self.N - 1) / self.T).long()
+        beta = self.discrete_betas.to(x.device)[timestep]
+        alpha = self.alphas.to(x.device)[timestep]
+        sqrt_beta = torch.sqrt(beta)
+        f = torch.sqrt(alpha)[:, None, None] * x - x
+        G = sqrt_beta
+        return f, G
diff --git a/model/diffusion/unet.py b/model/diffusion/unet.py
new file mode 100644
index 0000000..621b507
--- /dev/null
+++ b/model/diffusion/unet.py
@@ -0,0 +1,318 @@
+"""
+UNet implementation. Minorly modified from Diffusion Policy: https://github.com/columbia-ai-robotics/diffusion_policy/blob/main/diffusion_policy/model/diffusion/conv1d_components.py
+
+Set `smaller_encoder` to False for using larger observation encoder in ResidualBlock1D
+
+"""
+
+import torch
+import torch.nn as nn
+import einops
+from einops.layers.torch import Rearrange
+import logging
+
+log = logging.getLogger(__name__)
+
+from model.diffusion.modules import (
+    SinusoidalPosEmb,
+    Downsample1d,
+    Upsample1d,
+    Conv1dBlock,
+)
+from model.common.mlp import ResidualMLP
+
+
+class ResidualBlock1D(nn.Module):
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        cond_dim,
+        kernel_size=5,
+        n_groups=None,
+        cond_predict_scale=False,
+        larger_encoder=False,
+        activation_type="Mish",
+        groupnorm_eps=1e-5,
+    ):
+        super().__init__()
+
+        self.blocks = nn.ModuleList(
+            [
+                Conv1dBlock(
+                    in_channels,
+                    out_channels,
+                    kernel_size,
+                    n_groups=n_groups,
+                    activation_type=activation_type,
+                    eps=groupnorm_eps,
+                ),
+                Conv1dBlock(
+                    out_channels,
+                    out_channels,
+                    kernel_size,
+                    n_groups=n_groups,
+                    activation_type=activation_type,
+                    eps=groupnorm_eps,
+                ),
+            ]
+        )
+        if activation_type == "Mish":
+            act = nn.Mish()
+        elif activation_type == "ReLU":
+            act = nn.ReLU()
+        else:
+            raise "Unknown activation type for ConditionalResidualBlock1D"
+
+        # FiLM modulation https://arxiv.org/abs/1709.07871
+        # predicts per-channel scale and bias
+        cond_channels = out_channels
+        if cond_predict_scale:
+            cond_channels = out_channels * 2
+        self.cond_predict_scale = cond_predict_scale
+        self.out_channels = out_channels
+        if larger_encoder:
+            self.cond_encoder = nn.Sequential(
+                nn.Linear(cond_dim, cond_channels),
+                act,
+                nn.Linear(cond_channels, cond_channels),
+                act,
+                nn.Linear(cond_channels, cond_channels),
+                Rearrange("batch t -> batch t 1"),
+            )
+        else:
+            self.cond_encoder = nn.Sequential(
+                act,
+                nn.Linear(cond_dim, cond_channels),
+                Rearrange("batch t -> batch t 1"),
+            )
+
+        # make sure dimensions compatible
+        self.residual_conv = (
+            nn.Conv1d(in_channels, out_channels, 1)
+            if in_channels != out_channels
+            else nn.Identity()
+        )
+
+    def forward(self, x, cond):
+        """
+        x : [ batch_size x in_channels x horizon_steps ]
+        cond : [ batch_size x cond_dim]
+
+        returns:
+        out : [ batch_size x out_channels x horizon_steps ]
+        """
+        out = self.blocks[0](x)
+        embed = self.cond_encoder(cond)
+        if self.cond_predict_scale:
+            embed = embed.reshape(embed.shape[0], 2, self.out_channels, 1)
+            scale = embed[:, 0, ...]
+            bias = embed[:, 1, ...]
+            out = scale * out + bias
+        else:
+            out = out + embed
+        out = self.blocks[1](out)
+        return out + self.residual_conv(x)
+
+
+class Unet1D(nn.Module):
+
+    def __init__(
+        self,
+        transition_dim,
+        cond_dim=None,
+        diffusion_step_embed_dim=32,
+        dim=32,
+        dim_mults=(1, 2, 4, 8),
+        smaller_encoder=False,
+        cond_mlp_dims=None,
+        kernel_size=5,
+        n_groups=None,
+        activation_type="Mish",
+        cond_predict_scale=False,
+        groupnorm_eps=1e-5,
+    ):
+        super().__init__()
+        dims = [transition_dim, *map(lambda m: dim * m, dim_mults)]
+        in_out = list(zip(dims[:-1], dims[1:]))
+        log.info(f"Channel dimensions: {in_out}")
+
+        dsed = diffusion_step_embed_dim
+        self.time_mlp = nn.Sequential(
+            SinusoidalPosEmb(dsed),
+            nn.Linear(dsed, dsed * 4),
+            nn.Mish(),
+            nn.Linear(dsed * 4, dsed),
+        )
+        if cond_mlp_dims is not None:
+            self.cond_mlp = ResidualMLP(
+                dim_list=[cond_dim] + cond_mlp_dims,
+                activation_type=activation_type,
+                out_activation_type="Identity",
+            )
+            cond_block_dim = dsed + cond_mlp_dims[-1]
+        else:
+            cond_block_dim = dsed + cond_dim
+        use_large_encoder_in_block = cond_mlp_dims is None and not smaller_encoder
+
+        mid_dim = dims[-1]
+        self.mid_modules = nn.ModuleList(
+            [
+                ResidualBlock1D(
+                    mid_dim,
+                    mid_dim,
+                    cond_dim=cond_block_dim,
+                    kernel_size=kernel_size,
+                    n_groups=n_groups,
+                    cond_predict_scale=cond_predict_scale,
+                    larger_encoder=use_large_encoder_in_block,
+                    activation_type=activation_type,
+                    groupnorm_eps=groupnorm_eps,
+                ),
+                ResidualBlock1D(
+                    mid_dim,
+                    mid_dim,
+                    cond_dim=cond_block_dim,
+                    kernel_size=kernel_size,
+                    n_groups=n_groups,
+                    cond_predict_scale=cond_predict_scale,
+                    larger_encoder=use_large_encoder_in_block,
+                    activation_type=activation_type,
+                    groupnorm_eps=groupnorm_eps,
+                ),
+            ]
+        )
+
+        self.down_modules = nn.ModuleList([])
+        for ind, (dim_in, dim_out) in enumerate(in_out):
+            is_last = ind >= (len(in_out) - 1)
+            self.down_modules.append(
+                nn.ModuleList(
+                    [
+                        ResidualBlock1D(
+                            dim_in,
+                            dim_out,
+                            cond_dim=cond_block_dim,
+                            kernel_size=kernel_size,
+                            n_groups=n_groups,
+                            cond_predict_scale=cond_predict_scale,
+                            larger_encoder=use_large_encoder_in_block,
+                            activation_type=activation_type,
+                            groupnorm_eps=groupnorm_eps,
+                        ),
+                        ResidualBlock1D(
+                            dim_out,
+                            dim_out,
+                            cond_dim=cond_block_dim,
+                            kernel_size=kernel_size,
+                            n_groups=n_groups,
+                            cond_predict_scale=cond_predict_scale,
+                            larger_encoder=use_large_encoder_in_block,
+                            activation_type=activation_type,
+                            groupnorm_eps=groupnorm_eps,
+                        ),
+                        Downsample1d(dim_out) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        self.up_modules = nn.ModuleList([])
+        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
+            is_last = ind >= (len(in_out) - 1)
+            self.up_modules.append(
+                nn.ModuleList(
+                    [
+                        ResidualBlock1D(
+                            dim_out * 2,
+                            dim_in,
+                            cond_dim=cond_block_dim,
+                            kernel_size=kernel_size,
+                            n_groups=n_groups,
+                            cond_predict_scale=cond_predict_scale,
+                            larger_encoder=use_large_encoder_in_block,
+                            activation_type=activation_type,
+                            groupnorm_eps=groupnorm_eps,
+                        ),
+                        ResidualBlock1D(
+                            dim_in,
+                            dim_in,
+                            cond_dim=cond_block_dim,
+                            kernel_size=kernel_size,
+                            n_groups=n_groups,
+                            cond_predict_scale=cond_predict_scale,
+                            larger_encoder=use_large_encoder_in_block,
+                            activation_type=activation_type,
+                            groupnorm_eps=groupnorm_eps,
+                        ),
+                        Upsample1d(dim_in) if not is_last else nn.Identity(),
+                    ]
+                )
+            )
+
+        self.final_conv = nn.Sequential(
+            Conv1dBlock(
+                dim,
+                dim,
+                kernel_size=kernel_size,
+                n_groups=n_groups,
+                activation_type=activation_type,
+                eps=groupnorm_eps,
+            ),
+            nn.Conv1d(dim, transition_dim, 1),
+        )
+
+    def forward(
+        self,
+        x,
+        time,
+        cond,
+        **kwargs,
+    ):
+        """
+        x: (B,T,input_dim)
+        time: (B,) or int, diffusion step
+        cond: (B,obs_step,cond_dim)
+        output: (B,T,input_dim)
+        """
+        x = einops.rearrange(x, "b h t -> b t h")
+        cond = cond.view(cond.shape[0], -1)
+        if hasattr(self, "cond_mlp"):
+            cond = self.cond_mlp(cond)
+
+        # 1. time
+        if not torch.is_tensor(time):
+            time = torch.tensor([time], dtype=torch.long, device=x.device)
+        elif torch.is_tensor(time) and len(time.shape) == 0:
+            time = time[None].to(x.device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        time = time.expand(x.shape[0])
+        global_feature = self.time_mlp(time)
+        global_feature = torch.cat([global_feature, cond], axis=-1)
+
+        # encode local features
+        h_local = list()
+        h = []
+        for idx, (resnet, resnet2, downsample) in enumerate(self.down_modules):
+            x = resnet(x, global_feature)
+            if idx == 0 and len(h_local) > 0:
+                x = x + h_local[0]
+            x = resnet2(x, global_feature)
+            h.append(x)
+            x = downsample(x)
+
+        for mid_module in self.mid_modules:
+            x = mid_module(x, global_feature)
+
+        for idx, (resnet, resnet2, upsample) in enumerate(self.up_modules):
+            x = torch.cat((x, h.pop()), dim=1)
+            x = resnet(x, global_feature)
+            if idx == len(self.up_modules) and len(h_local) > 0:
+                x = x + h_local[1]
+            x = resnet2(x, global_feature)
+            x = upsample(x)
+
+        x = self.final_conv(x)
+
+        x = einops.rearrange(x, "b t h -> b h t")
+        return x
diff --git a/model/rl/__init__.py b/model/rl/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/model/rl/gaussian_awr.py b/model/rl/gaussian_awr.py
new file mode 100644
index 0000000..6a48432
--- /dev/null
+++ b/model/rl/gaussian_awr.py
@@ -0,0 +1,30 @@
+"""
+Advantage-weighted regression (AWR) for Gaussian policy.
+
+"""
+
+import torch
+import logging
+from model.rl.gaussian_rwr import RWR_Gaussian
+
+log = logging.getLogger(__name__)
+
+
+class AWR_Gaussian(RWR_Gaussian):
+
+    def __init__(
+        self,
+        actor,
+        critic,
+        **kwargs,
+    ):
+        super().__init__(actor=actor, **kwargs)
+        self.critic = critic.to(self.device)
+
+    def loss_critic(self, obs, advantages):
+        # get advantage
+        adv = self.critic(obs)
+
+        # Update critic
+        loss_critic = torch.mean((adv - advantages) ** 2)
+        return loss_critic
diff --git a/model/rl/gaussian_ppo.py b/model/rl/gaussian_ppo.py
new file mode 100644
index 0000000..5d3cdeb
--- /dev/null
+++ b/model/rl/gaussian_ppo.py
@@ -0,0 +1,120 @@
+"""
+PPO for Gaussian policy.
+
+"""
+
+from typing import Optional
+import torch
+from model.rl.gaussian_vpg import VPG_Gaussian
+
+
+class PPO_Gaussian(VPG_Gaussian):
+
+    def __init__(
+        self,
+        clip_ploss_coef: float,
+        clip_vloss_coef: Optional[float] = None,
+        norm_adv: Optional[bool] = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Whether to normalize advantages within batch
+        self.norm_adv = norm_adv
+
+        # Clipping value for policy loss
+        self.clip_ploss_coef = clip_ploss_coef
+
+        # Clipping value for value loss
+        self.clip_vloss_coef = clip_vloss_coef
+
+    def loss(
+        self,
+        obs,
+        actions,
+        returns,
+        oldvalues,
+        advantages,
+        oldlogprobs,
+        use_bc_loss=False,
+    ):
+        """
+        PPO loss
+
+        obs: (B, obs_step, obs_dim)
+        actions: (B, horizon_step, action_dim)
+        returns: (B, )
+        values: (B, )
+        advantages: (B,)
+        oldlogprobs: (B, )
+        """
+        newlogprobs, entropy, std = self.get_logprobs(obs, actions)
+        newlogprobs = newlogprobs.clamp(min=-5, max=2)
+        oldlogprobs = oldlogprobs.clamp(min=-5, max=2)
+        entropy_loss = -entropy
+
+        # get ratio
+        logratio = newlogprobs - oldlogprobs
+        ratio = logratio.exp()
+
+        # get kl difference and whether value clipped
+        with torch.no_grad():
+            approx_kl = ((ratio - 1) - logratio).nanmean()
+            clipfrac = (
+                ((ratio - 1.0).abs() > self.clip_ploss_coef).float().mean().item()
+            )
+
+        # normalize advantages
+        if self.norm_adv:
+            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
+
+        # Policy loss with clipping
+        pg_loss1 = -advantages * ratio
+        pg_loss2 = -advantages * torch.clamp(
+            ratio, 1 - self.clip_ploss_coef, 1 + self.clip_ploss_coef
+        )
+        pg_loss = torch.max(pg_loss1, pg_loss2).mean()
+
+        # Value loss optionally with clipping
+        newvalues = self.critic(obs).view(-1)
+        if self.clip_vloss_coef is not None:
+            v_loss_unclipped = (newvalues - returns) ** 2
+            v_clipped = oldvalues + torch.clamp(
+                newvalues - oldvalues,
+                -self.clip_vloss_coef,
+                self.clip_vloss_coef,
+            )
+            v_loss_clipped = (v_clipped - returns) ** 2
+            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+            v_loss = 0.5 * v_loss_max.mean()
+        else:
+            v_loss = 0.5 * ((newvalues - returns) ** 2).mean()
+
+        bc_loss = 0.0
+        if use_bc_loss:
+            # See Eqn. 2 of https://arxiv.org/pdf/2403.03949.pdf
+            # Give a reward for maximizing probability of teacher policy's action with current policy.
+            # Actions are chosen along trajectory induced by current policy.
+
+            # Get counterfactual teacher actions
+            samples = self.forward(
+                cond=obs.float()
+                .unsqueeze(1)
+                .to(self.device),  # B x horizon=1 x obs_dim
+                deterministic=False,
+                use_base_policy=True,
+            )
+            # Get logprobs of teacher actions under this policy
+            bc_logprobs, _, _ = self.get_logprobs(obs, samples, use_base_policy=False)
+            bc_logprobs = bc_logprobs.clamp(min=-5, max=2)
+            bc_loss = -bc_logprobs.mean()
+        return (
+            pg_loss,
+            entropy_loss,
+            v_loss,
+            clipfrac,
+            approx_kl.item(),
+            ratio.mean().item(),
+            bc_loss,
+            std.item(),
+        )
diff --git a/model/rl/gaussian_rwr.py b/model/rl/gaussian_rwr.py
new file mode 100644
index 0000000..fd37d83
--- /dev/null
+++ b/model/rl/gaussian_rwr.py
@@ -0,0 +1,58 @@
+"""
+Reward-weighted regression (RWR) for Gaussian policy.
+
+"""
+
+import torch
+import logging
+from model.common.gaussian import GaussianModel
+import torch.distributions as D
+
+log = logging.getLogger(__name__)
+
+
+class RWR_Gaussian(GaussianModel):
+
+    def __init__(
+        self,
+        actor,
+        randn_clip_value=10,
+        **kwargs,
+    ):
+        super().__init__(network=actor, **kwargs)
+
+        # assign actor
+        self.actor = self.network
+
+        # Clip sampled randn (from standard deviation) such that the sampled action is not too far away from mean
+        self.randn_clip_value = randn_clip_value
+
+    # override
+    def loss(self, actions, obs, reward_weights):
+        cond = obs
+        B = cond.shape[0]
+        means, scales = self.network(cond)
+
+        dist = D.Normal(loc=means, scale=scales)
+        log_prob = dist.log_prob(actions.view(B, -1)).mean(-1)
+        log_prob = log_prob * reward_weights
+        log_prob = -log_prob.mean()
+        return log_prob
+
+    # override
+    @torch.no_grad()
+    def forward(self, cond, deterministic=False, **kwargs):
+        """
+        Args:
+            cond: (batch_size, horizon, obs_dim)
+
+        Return:
+            actions: (batch_size, horizon_steps, transition_dim)
+        """
+        B = cond.shape[0]
+        actions = super().forward(
+            cond=cond.view(B, -1),
+            deterministic=deterministic,
+            randn_clip_value=self.randn_clip_value,
+        )
+        return actions
diff --git a/model/rl/gaussian_vpg.py b/model/rl/gaussian_vpg.py
new file mode 100644
index 0000000..aecb71d
--- /dev/null
+++ b/model/rl/gaussian_vpg.py
@@ -0,0 +1,87 @@
+"""
+Policy gradient for Gaussian policy
+
+"""
+
+import torch
+from copy import deepcopy
+import logging
+from model.common.gaussian import GaussianModel
+
+
+class VPG_Gaussian(GaussianModel):
+
+    def __init__(
+        self,
+        actor,
+        critic,
+        cond_steps=1,
+        randn_clip_value=10,
+        network_path=None,
+        **kwargs,
+    ):
+        super().__init__(network=actor, **kwargs)
+        self.cond_steps = cond_steps
+        self.randn_clip_value = randn_clip_value
+
+        # Value function for obs - simple MLP
+        self.critic = critic.to(self.device)
+        if network_path is not None:
+            checkpoint = torch.load(
+                network_path, map_location=self.device, weights_only=True
+            )
+            self.load_state_dict(
+                checkpoint["model"],
+                strict=False,
+            )
+            logging.info("Loaded actor from %s", network_path)
+
+        # Re-name network to actor
+        self.actor_ft = actor
+
+        # Save a copy of original actor
+        self.actor = deepcopy(actor)
+        for param in self.actor.parameters():
+            param.requires_grad = False
+
+    def get_logprobs(
+        self,
+        cond,
+        actions,
+        use_base_policy=False,
+    ):
+        B, T, D = actions.shape
+        if not isinstance(cond, dict):
+            cond = cond.view(B, -1)
+        dist = self.forward_train(
+            cond,
+            deterministic=False,
+            network_override=self.actor if use_base_policy else None,
+        )
+        log_prob = dist.log_prob(actions.view(B, -1))
+        log_prob = log_prob.mean(-1)
+        entropy = dist.entropy().mean()
+        std = dist.scale.mean()
+        return log_prob, entropy, std
+
+    def loss(self, obs, actions, reward):
+        raise NotImplementedError
+
+    @torch.no_grad()
+    def forward(
+        self,
+        cond,
+        deterministic=False,
+        use_base_policy=False,
+    ):
+        if isinstance(cond, dict):
+            B = cond["state"].shape[0]
+        else:
+            B = cond.shape[0]
+            cond = cond.view(B, -1)
+        return super().forward(
+            cond=cond,
+            deterministic=deterministic,
+            randn_clip_value=self.randn_clip_value,
+            network_override=self.actor if use_base_policy else None,
+        )
diff --git a/model/rl/gmm_ppo.py b/model/rl/gmm_ppo.py
new file mode 100644
index 0000000..a92276a
--- /dev/null
+++ b/model/rl/gmm_ppo.py
@@ -0,0 +1,102 @@
+"""
+PPO for GMM policy.
+
+"""
+
+from typing import Optional
+import torch
+from model.rl.gmm_vpg import VPG_GMM
+
+
+class PPO_GMM(VPG_GMM):
+
+    def __init__(
+        self,
+        clip_ploss_coef: float,
+        clip_vloss_coef: Optional[float] = None,
+        norm_adv: Optional[bool] = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Whether to normalize advantages within batch
+        self.norm_adv = norm_adv
+
+        # Clipping value for policy loss
+        self.clip_ploss_coef = clip_ploss_coef
+
+        # Clipping value for value loss
+        self.clip_vloss_coef = clip_vloss_coef
+
+    def loss(
+        self,
+        obs,
+        actions,
+        returns,
+        oldvalues,
+        advantages,
+        oldlogprobs,
+        **kwargs,
+    ):
+        """
+        PPO loss
+
+        obs: (B, obs_step, obs_dim)
+        actions: (B, horizon_step, action_dim)
+        returns: (B, )
+        values: (B, )
+        advantages: (B,)
+        oldlogprobs: (B, )
+        """
+        newlogprobs, entropy, std = self.get_logprobs(obs, actions)
+        newlogprobs = newlogprobs.clamp(min=-5, max=2)
+        oldlogprobs = oldlogprobs.clamp(min=-5, max=2)
+        entropy_loss = -entropy.mean()
+
+        # get ratio
+        logratio = newlogprobs - oldlogprobs
+        ratio = logratio.exp()
+
+        # get kl difference and whether value clipped
+        with torch.no_grad():
+            approx_kl = ((ratio - 1) - logratio).nanmean()
+            clipfrac = (
+                ((ratio - 1.0).abs() > self.clip_ploss_coef).float().mean().item()
+            )
+
+        # normalize advantages
+        if self.norm_adv:
+            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
+
+        # Policy loss with clipping
+        pg_loss1 = -advantages * ratio
+        pg_loss2 = -advantages * torch.clamp(
+            ratio, 1 - self.clip_ploss_coef, 1 + self.clip_ploss_coef
+        )
+        pg_loss = torch.max(pg_loss1, pg_loss2).mean()
+
+        # Value loss optionally with clipping
+        newvalues = self.critic(obs).view(-1)
+        if self.clip_vloss_coef is not None:
+            v_loss_unclipped = (newvalues - returns) ** 2
+            v_clipped = oldvalues + torch.clamp(
+                newvalues - oldvalues,
+                -self.clip_vloss_coef,
+                self.clip_vloss_coef,
+            )
+            v_loss_clipped = (v_clipped - returns) ** 2
+            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+            v_loss = 0.5 * v_loss_max.mean()
+        else:
+            v_loss = 0.5 * ((newvalues - returns) ** 2).mean()
+        bc_loss = 0
+        return (
+            pg_loss,
+            entropy_loss,
+            v_loss,
+            clipfrac,
+            approx_kl.item(),
+            ratio.mean().item(),
+            bc_loss,
+            std.item(),
+        )
diff --git a/model/rl/gmm_vpg.py b/model/rl/gmm_vpg.py
new file mode 100644
index 0000000..fc70b79
--- /dev/null
+++ b/model/rl/gmm_vpg.py
@@ -0,0 +1,56 @@
+import torch
+import logging
+from model.common.gmm import GMMModel
+
+
+class VPG_GMM(GMMModel):
+    def __init__(
+        self,
+        actor,
+        critic,
+        cond_steps=1,
+        network_path=None,
+        **kwargs,
+    ):
+        super().__init__(network=actor, **kwargs)
+        self.cond_steps = cond_steps
+
+        # Re-name network to actor
+        self.actor_ft = actor
+
+        # Value function for obs - simple MLP
+        self.critic = critic.to(self.device)
+        if network_path is not None:
+            checkpoint = torch.load(
+                network_path, map_location=self.device, weights_only=True
+            )
+            self.load_state_dict(
+                checkpoint["model"],
+                strict=False,
+            )
+            logging.info("Loaded actor from %s", network_path)
+
+    def get_logprobs(
+        self,
+        cond,
+        actions,
+    ):
+        B, T, D = actions.shape
+        dist, entropy, std = self.forward_train(
+            cond.view(B, -1),
+            deterministic=False,
+        )
+        log_prob = dist.log_prob(actions.view(B, -1))
+        return log_prob, entropy, std
+
+    def loss(self, obs, chains, reward):
+        raise NotImplementedError
+
+    # override to diffuse over action only
+    @torch.no_grad()
+    def forward(self, cond, deterministic=False):
+        B = cond.shape[0]
+        return super().forward(
+            cond=cond.view(B, -1),
+            deterministic=deterministic,
+        )
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..39f222d
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,76 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "dppo"
+version = "0.1.0"
+description = "Fine-tuning diffusion policies with PPO."
+readme = "README.md"
+requires-python = ">=3.8"
+classifiers = [
+    "Programming Language :: Python :: 3",
+]
+dependencies = [
+    "av==12.3.0",
+    "einops==0.8.0",
+    "gdown==5.2.0",
+    "gym==0.22.0",
+    "hydra-core==1.3.2",
+    "imageio==2.35.1",
+    "matplotlib==3.7.5",
+    "omegaconf==2.3.0",
+    "pretty_errors==1.2.25",
+    "torch==2.4.0",
+    "tqdm==4.66.5",
+    "wandb==0.17.7"
+]
+
+[project.optional-dependencies]
+gym = [
+    "cython<3",
+    "d4rl",
+    "patchelf",
+]
+robomimic = [
+    "cython<3",
+    "d4rl",
+    "patchelf",
+    "robomimic @ git+https://github.com/ARISE-Initiative/robomimic.git",
+    "robosuite @ git+https://github.com/ARISE-Initiative/robosuite.git@v1.4.1",
+]
+d3il = [
+    "cython<3",
+    "d4rl",
+    "gin-config",
+    "patchelf",
+    "pin",
+]
+furniture = [
+    "cython<3",
+    "d4rl",
+    "ipdb==0.13.13",
+    "patchelf",
+]
+exact = [   # exact likelihood
+    "torchdiffeq",
+]
+all = [
+    "cython<3",
+    "d4rl",
+    "gin-config",
+    "ipdb==0.13.13",
+    "patchelf",
+    "pin",
+    "robomimic @ git+https://github.com/ARISE-Initiative/robomimic.git",
+    "robosuite @ git+https://github.com/ARISE-Initiative/robosuite.git@v1.4.1",
+]
+
+[project.urls]
+Homepage = "https://diffusion-ppo.github.io/"
+
+[tool.setuptools.packages.find]
+exclude = []
+
+[tool.wheel]
+exclude = []
diff --git a/script/dataset/README.md b/script/dataset/README.md
new file mode 100644
index 0000000..8d71e7b
--- /dev/null
+++ b/script/dataset/README.md
@@ -0,0 +1,3 @@
+## Data processing scripts
+
+These are some scripts used for processing the raw datasets from the benchmarks. We already pre-processed them and provide the final datasets. These scripts are for information only.
\ No newline at end of file
diff --git a/script/dataset/filter_d3il_avoid_data.py b/script/dataset/filter_d3il_avoid_data.py
new file mode 100644
index 0000000..76a2a16
--- /dev/null
+++ b/script/dataset/filter_d3il_avoid_data.py
@@ -0,0 +1,357 @@
+"""
+Filter avoid data based on modes.
+
+Trajectories are normalized with filtered data, not the original data.
+"""
+
+import os
+import numpy as np
+from tqdm import tqdm
+import pickle
+import random
+import matplotlib.pyplot as plt
+from copy import deepcopy
+
+from agent.dataset.d3il_dataset.avoiding_dataset import Avoiding_Dataset
+
+
+def make_dataset(
+    load_path,
+    save_dir,
+    save_name_prefix,
+    val_split,
+    desired_modes,
+    desired_mode_ratios,
+    required_modes,
+    avoid_modes,
+):
+    print("Desired modes:", desired_modes)
+    print("Required modes:", required_modes)
+    print("Avoid modes:", avoid_modes)
+    print("Desired mode ratios:", desired_mode_ratios)
+    demo_dataset = Avoiding_Dataset(
+        load_path,
+        action_dim=2,
+        obs_dim=4,
+        max_len_data=200,
+    )
+    # from avoiding env
+    level_distance = 0.18
+    obstacle_offset = 0.075
+    l1_ypos = -0.1
+    l2_ypos = -0.1 + level_distance
+    l3_ypos = -0.1 + 2 * level_distance
+    # goal_ypos = -0.1 + 2.5 * level_distance
+    l1_xpos = 0.5
+    l2_top_xpos = 0.5 - obstacle_offset
+    l2_bottom_xpos = 0.5 + obstacle_offset
+    l3_top_xpos = 0.5 - 2 * obstacle_offset
+    l3_mid_xpos = 0.5
+    l3_bottom_xpos = 0.5 + 2 * obstacle_offset
+
+    def check_mode(x):
+        r_x_pos = x[0]
+        r_y_pos = x[1]
+        mode_encoding = np.zeros((9))
+        if r_y_pos - 0.01 <= l1_ypos <= r_y_pos + 0.01:
+            if r_x_pos < l1_xpos:
+                mode_encoding[0] = 1
+            elif r_x_pos > l1_xpos:
+                mode_encoding[1] = 1
+
+        if r_y_pos - 0.01 <= l2_ypos <= r_y_pos + 0.01:
+            if r_x_pos < l2_top_xpos:
+                mode_encoding[2] = 1
+            elif l2_top_xpos < r_x_pos < l2_bottom_xpos:
+                mode_encoding[3] = 1
+            elif r_x_pos > l2_bottom_xpos:
+                mode_encoding[4] = 1
+
+        # if r_y_pos - 0.015 <= self.l3_ypos and (not self.l3_passed):
+        if r_y_pos >= l3_ypos:
+            if r_x_pos < l3_top_xpos:
+                mode_encoding[5] = 1
+            if l3_top_xpos < r_x_pos < l3_mid_xpos:
+                mode_encoding[6] = 1
+            elif l3_mid_xpos < r_x_pos < l3_bottom_xpos:
+                mode_encoding[7] = 1
+            elif r_x_pos > l3_top_xpos:
+                mode_encoding[8] = 1
+        return mode_encoding
+
+    # extract length of each trajectory in the file
+    full_traj_lengths = []
+    full_actions = demo_dataset.actions
+    full_obs = demo_dataset.observations
+    masks = demo_dataset.masks
+    action_dim = full_actions.shape[2]
+    obs_dim = full_obs.shape[2]
+    for ep in range(masks.shape[0]):
+        full_traj_lengths.append(int(masks[ep].sum().item()))
+    full_traj_lengths = np.array(full_traj_lengths)
+
+    # take the max and min of obs and action
+    obs_min = np.zeros((obs_dim))
+    obs_max = np.zeros((obs_dim))
+    action_min = np.zeros((action_dim))
+    action_max = np.zeros((action_dim))
+    chosen_indices = []
+    for i in tqdm(range(len(masks))):
+        T = full_traj_lengths[i]
+        obs_traj = full_obs[i, :T].numpy()
+        action_traj = full_actions[i, :T].numpy()
+
+        # check if trajectory pass through desired hole
+        flag_desired = False
+        flag_required = [False for _ in required_modes] if required_modes else [True]
+        flag_avoid = False
+        for ob in obs_traj:
+            modes = check_mode(ob)
+            if any(modes[desired] for desired in desired_modes):
+                desired_mode_idx = np.argmax(
+                    [modes[desired] for desired in desired_modes]
+                )
+                flag_desired = True
+            if any(modes[avoid] for avoid in avoid_modes):
+                flag_avoid = True
+                break
+            for j, required in enumerate(required_modes):
+                if modes[required]:
+                    flag_required[j] = True
+        if flag_avoid or not flag_desired or not all(flag_required):
+            continue
+        if desired_mode_ratios:
+            if random.random() > desired_mode_ratios[desired_mode_idx]:
+                continue
+        chosen_indices.append(i)
+
+        obs_min = np.min(np.vstack((obs_min, np.min(obs_traj, axis=0))), axis=0)
+        obs_max = np.max(np.vstack((obs_max, np.max(obs_traj, axis=0))), axis=0)
+        action_min = np.min(
+            np.vstack((action_min, np.min(action_traj, axis=0))), axis=0
+        )
+        action_max = np.max(
+            np.vstack((action_max, np.max(action_traj, axis=0))), axis=0
+        )
+    if len(chosen_indices) == 0:
+        raise ValueError("No data found for the desired/required modes")
+    chosen_indices = np.array(chosen_indices)
+    traj_lengths = full_traj_lengths[chosen_indices]
+    actions = demo_dataset.actions[chosen_indices]
+    obs = demo_dataset.observations[chosen_indices]
+    max_traj_length = np.max(traj_lengths)
+
+    # split indices in train and val
+    num_traj = len(traj_lengths)
+    num_train = int(num_traj * (1 - val_split))
+    train_indices = random.sample(range(num_traj), k=num_train)
+
+    logger.info("\n========== Basic Info ===========")
+    logger.info("total transitions: {}".format(np.sum(traj_lengths)))
+    logger.info("total trajectories: {}".format(len(traj_lengths)))
+    logger.info(
+        f"traj length mean/std: {np.mean(traj_lengths)}, {np.std(traj_lengths)}"
+    )
+    logger.info(f"traj length min/max: {np.min(traj_lengths)}, {np.max(traj_lengths)}")
+    logger.info(f"obs min: {obs_min}")
+    logger.info(f"obs max: {obs_max}")
+    logger.info(f"action min: {action_min}")
+    logger.info(f"action max: {action_max}")
+
+    # do over all indices
+    out_train = {}
+    keys = [
+        "observations",
+        "actions",
+        "rewards",
+    ]
+    total_timesteps = actions.shape[1]
+    out_train["observations"] = np.empty((0, total_timesteps, obs_dim))
+    out_train["actions"] = np.empty((0, total_timesteps, action_dim))
+    out_train["rewards"] = np.empty((0, total_timesteps))
+    out_train["traj_length"] = []
+    out_val = deepcopy(out_train)
+    for i in tqdm(range(len(traj_lengths))):
+        if i in train_indices:
+            out = out_train
+        else:
+            out = out_val
+        T = traj_lengths[i]
+        obs_traj = obs[i].numpy()
+        action_traj = actions[i].numpy()
+
+        # scale to [-1, 1] for both ob and action
+        obs_traj = 2 * (obs_traj - obs_min) / (obs_max - obs_min + 1e-6) - 1
+        action_traj = (
+            2 * (action_traj - action_min) / (action_max - action_min + 1e-6) - 1
+        )
+
+        # get episode length
+        traj_length = T
+        out["traj_length"].append(traj_length)
+
+        # extract
+        rewards = np.zeros(total_timesteps)  # no reward from d3il dataset
+        data_traj = {
+            "observations": obs_traj,
+            "actions": action_traj,
+            "rewards": rewards,
+        }
+        for key in keys:
+            traj = data_traj[key]
+            out[key] = np.vstack((out[key], traj[None]))
+
+    # plot all trajectories and save in a figure
+    def plot(out, name):
+        def get_obj_xy_list():
+            mid_pos = 0.5
+            offset = 0.075
+            first_level_y = -0.1
+            level_distance = 0.18
+            return [
+                [mid_pos, first_level_y],
+                [mid_pos - offset, first_level_y + level_distance],
+                [mid_pos + offset, first_level_y + level_distance],
+                [mid_pos - 2 * offset, first_level_y + 2 * level_distance],
+                [mid_pos, first_level_y + 2 * level_distance],
+                [mid_pos + 2 * offset, first_level_y + 2 * level_distance],
+            ]
+
+        pillar_xys = get_obj_xy_list()
+        fig = plt.figure()
+        all_trajs = out["observations"]  # num x timestep x obs
+        for traj, traj_length in zip(all_trajs, out["traj_length"]):
+            # unnormalize
+            traj = (traj + 1) / 2  # [-1, 1] -> [0, 1]
+            traj = traj * (obs_max - obs_min) + obs_min
+            plt.plot(
+                traj[:traj_length, 2], traj[:traj_length, 3], color=(0.3, 0.3, 0.3)
+            )
+        plt.axhline(y=0.4, color=np.array([31, 119, 180]) / 255, linestyle="-")
+        for xy in pillar_xys:
+            circle = plt.Circle(xy, 0.01, color=(0.0, 0.0, 0.0), fill=True)
+            plt.gca().add_patch(circle)
+        plt.xlabel("X pos")
+        plt.ylabel("Y pos")
+        plt.xlim([0.2, 0.8])
+        plt.ylim([-0.3, 0.5])
+        ax = plt.gca()
+        ax.set_aspect("equal", adjustable="box")
+        ax.set_facecolor("white")
+        plt.savefig(os.path.join(save_dir, name))
+        plt.close(fig)
+
+    plot(out_train, name="train-trajs.png")
+    plot(out_val, name="val-trajs.png")
+
+    # Save to np file
+    save_train_path = os.path.join(save_dir, save_name_prefix + "train.pkl")
+    save_val_path = os.path.join(save_dir, save_name_prefix + "val.pkl")
+    with open(save_train_path, "wb") as f:
+        pickle.dump(out_train, f)
+    with open(save_val_path, "wb") as f:
+        pickle.dump(out_val, f)
+    normalization_save_path = os.path.join(
+        save_dir, save_name_prefix + "normalization.npz"
+    )
+    np.savez(
+        normalization_save_path,
+        obs_min=obs_min,
+        obs_max=obs_max,
+        action_min=action_min,
+        action_max=action_max,
+    )
+
+    # debug
+    logger.info("\n========== Final ===========")
+    logger.info(
+        f"Train - Number of episodes and transitions: {len(out_train['traj_length'])}, {np.sum(out_train['traj_length'])}"
+    )
+    logger.info(
+        f"Val - Number of episodes and transitions: {len(out_val['traj_length'])}, {np.sum(out_val['traj_length'])}"
+    )
+    logger.info(
+        f"Train - Mean/Std trajectory length: {np.mean(out_train['traj_length'])}, {np.std(out_train['traj_length'])}"
+    )
+    logger.info(
+        f"Train - Max/Min trajectory length: {np.max(out_train['traj_length'])}, {np.min(out_train['traj_length'])}"
+    )
+    if val_split > 0:
+        logger.info(
+            f"Val - Mean/Std trajectory length: {np.mean(out_val['traj_length'])}, {np.std(out_val['traj_length'])}"
+        )
+        logger.info(
+            f"Val - Max/Min trajectory length: {np.max(out_val['traj_length'])}, {np.min(out_val['traj_length'])}"
+        )
+    for obs_dim_ind in range(obs_dim):
+        obs = out_train["observations"][:, :, obs_dim_ind]
+        logger.info(
+            f"Train - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}"
+        )
+    for action_dim_ind in range(action_dim):
+        action = out_train["actions"][:, :, action_dim_ind]
+        logger.info(
+            f"Train - Action dim {action_dim_ind+1} mean {np.mean(action)} std {np.std(action)} min {np.min(action)} max {np.max(action)}"
+        )
+    if val_split > 0:
+        for obs_dim_ind in range(obs_dim):
+            obs = out_val["observations"][:, :, obs_dim_ind]
+            logger.info(
+                f"Val - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}"
+            )
+        for action_dim_ind in range(action_dim):
+            action = out_val["actions"][:, :, action_dim_ind]
+            logger.info(
+                f"Val - Action dim {action_dim_ind+1} mean {np.mean(action)} std {np.std(action)} min {np.min(action)} max {np.max(action)}"
+            )
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--load_path", type=str, default=".")
+    parser.add_argument("--save_dir", type=str, default=".")
+    parser.add_argument("--save_name_prefix", type=str, default="")
+    parser.add_argument("--val_split", type=float, default="0.2")
+    parser.add_argument("--desired_modes", nargs="+", type=int)
+    parser.add_argument("--desired_mode_ratios", nargs="+", type=float, default=[])
+    parser.add_argument("--required_modes", nargs="+", type=int, default=[])
+    parser.add_argument("--avoid_modes", nargs="+", type=int, default=[])
+    args = parser.parse_args()
+    if len(args.desired_mode_ratios) > 0:
+        assert len(args.desired_modes) == len(
+            args.desired_mode_ratios
+        ), "Desired modes and desired mode ratios should have the same length"
+
+    os.makedirs(args.save_dir, exist_ok=True)
+
+    import logging
+    import datetime
+
+    os.makedirs(args.save_dir, exist_ok=True)
+    log_path = os.path.join(
+        args.save_dir,
+        args.save_name_prefix
+        + f"{datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}.log",
+    )
+    logger = logging.getLogger("get_D4RL_dataset")
+    logger.setLevel(logging.INFO)
+    file_handler = logging.FileHandler(log_path)
+    file_handler.setLevel(logging.INFO)  # Set the minimum level for this handler
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+
+    make_dataset(
+        args.load_path,
+        args.save_dir,
+        args.save_name_prefix,
+        args.val_split,
+        args.desired_modes,
+        args.desired_mode_ratios,
+        args.required_modes,
+        args.avoid_modes,
+    )
diff --git a/script/dataset/get_d4rl_dataset.py b/script/dataset/get_d4rl_dataset.py
new file mode 100644
index 0000000..83fc967
--- /dev/null
+++ b/script/dataset/get_d4rl_dataset.py
@@ -0,0 +1,247 @@
+"""
+Download D4RL dataset and save it into our custom format so it can be loaded for diffusion training.
+
+"""
+
+import os
+import logging
+import gym
+import random
+from copy import deepcopy
+import numpy as np
+from tqdm import tqdm
+import pickle
+
+import d4rl.gym_mujoco  # Import required to register environments
+
+
+def make_dataset(env_name, save_dir, save_name_prefix, val_split, logger):
+    # Create the environment
+    env = gym.make(env_name)
+
+    # d4rl abides by the OpenAI gym interface
+    env.reset()
+    env.step(env.action_space.sample())
+
+    # Each task is associated with a dataset
+    # dataset contains observations, actions, rewards, terminals, and infos
+    dataset = env.get_dataset()
+    logger.info("\n========== Basic Info ===========")
+    logger.info(f"Keys in the dataset: {dataset.keys()}")
+    logger.info(f"Observation shape: {dataset['observations'].shape}")
+    logger.info(f"Action shape: {dataset['actions'].shape}")
+    terminal_indices = np.argwhere(dataset["terminals"])[:, 0]
+    timeout_indices = np.argwhere(dataset["timeouts"])[:, 0]
+    obs_dim = dataset["observations"].shape[1]
+    action_dim = dataset["actions"].shape[1]
+    done_indices = np.concatenate([terminal_indices, timeout_indices])
+    done_indices = np.sort(done_indices)
+    traj_lengths = []
+    prev_index = 0
+    for i in tqdm(range(len(done_indices))):
+        # get episode length
+        cur_index = done_indices[i]
+        traj_lengths.append(cur_index - prev_index + 1)
+        prev_index = cur_index + 1
+    obs_min = np.min(dataset["observations"], axis=0)
+    obs_max = np.max(dataset["observations"], axis=0)
+    action_min = np.min(dataset["actions"], axis=0)
+    action_max = np.max(dataset["actions"], axis=0)
+    max_episode_steps = max(traj_lengths)
+    logger.info("total transitions: {}".format(np.sum(traj_lengths)))
+    logger.info("total trajectories: {}".format(len(traj_lengths)))
+    logger.info(
+        f"traj length mean/std: {np.mean(traj_lengths)}, {np.std(traj_lengths)}"
+    )
+    logger.info(f"traj length min/max: {np.min(traj_lengths)}, {np.max(traj_lengths)}")
+    logger.info(f"obs min: {obs_min}")
+    logger.info(f"obs max: {obs_max}")
+    logger.info(f"action min: {action_min}")
+    logger.info(f"action max: {action_max}")
+
+    # Subsample episodes by taking the first ones
+    if args.max_episodes > 0:
+        traj_lengths = traj_lengths[: args.max_episodes]
+        done_indices = done_indices[: args.max_episodes]
+        max_episode_steps = max(traj_lengths)
+
+    # split indices in train and val
+    num_traj = len(traj_lengths)
+    num_train = int(num_traj * (1 - val_split))
+    train_indices = random.sample(range(num_traj), k=num_train)
+
+    # do over all indices
+    out_train = {}
+    keys = [
+        "observations",
+        "actions",
+        "rewards",
+    ]
+    out_train["observations"] = np.empty(
+        (0, max_episode_steps, dataset["observations"].shape[-1])
+    )
+    out_train["actions"] = np.empty(
+        (0, max_episode_steps, dataset["actions"].shape[-1])
+    )
+    out_train["rewards"] = np.empty((0, max_episode_steps))
+    out_train["traj_length"] = []
+    out_val = deepcopy(out_train)
+    prev_index = 0
+    train_episode_reward_all = []
+    val_episode_reward_all = []
+    for i in tqdm(range(len(done_indices))):
+        if i in train_indices:
+            out = out_train
+            episode_reward_all = train_episode_reward_all
+        else:
+            out = out_val
+            episode_reward_all = val_episode_reward_all
+
+        # get episode length
+        cur_index = done_indices[i]
+        traj_length = cur_index - prev_index + 1
+
+        # Skip if the episode has no reward
+        if np.sum(dataset["rewards"][prev_index : cur_index + 1]) > 0:
+            out["traj_length"].append(traj_length)
+
+            # apply padding to make all episodes have the same max steps
+            for key in keys:
+                traj = dataset[key][prev_index : cur_index + 1]
+
+                # also scale
+                if key == "observations":
+                    traj = 2 * (traj - obs_min) / (obs_max - obs_min + 1e-6) - 1
+                elif key == "actions":
+                    traj = (
+                        2 * (traj - action_min) / (action_max - action_min + 1e-6) - 1
+                    )
+
+                if traj.ndim == 1:
+                    traj = np.pad(
+                        traj,
+                        (0, max_episode_steps - len(traj)),
+                        mode="constant",
+                        constant_values=0,
+                    )
+                else:
+                    traj = np.pad(
+                        traj,
+                        ((0, max_episode_steps - traj.shape[0]), (0, 0)),
+                        mode="constant",
+                        constant_values=0,
+                    )
+                out[key] = np.vstack((out[key], traj[None]))
+
+            # check reward
+            episode_reward_all.append(np.sum(out["rewards"][-1]))
+        else:
+            print(f"skipping {i} / {len(done_indices)}")
+
+        # update prev index
+        prev_index = cur_index + 1
+
+    # Save to np file
+    save_train_path = os.path.join(save_dir, save_name_prefix + "train.pkl")
+    save_val_path = os.path.join(save_dir, save_name_prefix + "val.pkl")
+    with open(save_train_path, "wb") as f:
+        pickle.dump(out_train, f)
+    with open(save_val_path, "wb") as f:
+        pickle.dump(out_val, f)
+    normalization_save_path = os.path.join(
+        save_dir, save_name_prefix + "normalization.npz"
+    )
+    np.savez(
+        normalization_save_path,
+        obs_min=obs_min,
+        obs_max=obs_max,
+        action_min=action_min,
+        action_max=action_max,
+    )
+
+    # debug
+    logger.info("\n========== Final ===========")
+    logger.info(
+        f"Train - Number of episodes and transitions: {len(out_train['traj_length'])}, {np.sum(out_train['traj_length'])}"
+    )
+    logger.info(
+        f"Val - Number of episodes and transitions: {len(out_val['traj_length'])}, {np.sum(out_val['traj_length'])}"
+    )
+    logger.info(
+        f"Train - Mean/Std trajectory length: {np.mean(out_train['traj_length'])}, {np.std(out_train['traj_length'])}"
+    )
+    logger.info(
+        f"Train - Max/Min trajectory length: {np.max(out_train['traj_length'])}, {np.min(out_train['traj_length'])}"
+    )
+    if val_split > 0:
+        logger.info(
+            f"Val - Mean/Std trajectory length: {np.mean(out_val['traj_length'])}, {np.std(out_val['traj_length'])}"
+        )
+        logger.info(
+            f"Val - Max/Min trajectory length: {np.max(out_val['traj_length'])}, {np.min(out_val['traj_length'])}"
+        )
+    logger.info(
+        f"Train - Mean/Std episode reward: {np.mean(train_episode_reward_all)},  {np.std(train_episode_reward_all)}"
+    )
+    if val_split > 0:
+        logger.info(
+            f"Val - Mean/Std episode reward: {np.mean(val_episode_reward_all)},  {np.std(val_episode_reward_all)}"
+        )
+    for obs_dim_ind in range(obs_dim):
+        obs = out_train["observations"][:, :, obs_dim_ind]
+        logger.info(
+            f"Train - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}"
+        )
+    for action_dim_ind in range(action_dim):
+        action = out_train["actions"][:, :, action_dim_ind]
+        logger.info(
+            f"Train - Action dim {action_dim_ind+1} mean {np.mean(action)} std {np.std(action)} min {np.min(action)} max {np.max(action)}"
+        )
+    if val_split > 0:
+        for obs_dim_ind in range(obs_dim):
+            obs = out_val["observations"][:, :, obs_dim_ind]
+            logger.info(
+                f"Val - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}"
+            )
+        for action_dim_ind in range(action_dim):
+            action = out_val["actions"][:, :, action_dim_ind]
+            logger.info(
+                f"Val - Action dim {action_dim_ind+1} mean {np.mean(action)} std {np.std(action)} min {np.min(action)} max {np.max(action)}"
+            )
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--env_name", type=str, default="hopper-medium-v2")
+    parser.add_argument("--save_dir", type=str, default=".")
+    parser.add_argument("--save_name_prefix", type=str, default="")
+    parser.add_argument("--val_split", type=float, default="0.2")
+    parser.add_argument("--max_episodes", type=int, default="-1")
+    args = parser.parse_args()
+
+    import datetime
+
+    # import logging.config
+    if args.max_episodes > 0:
+        args.save_name_prefix += f"max_episodes_{args.max_episodes}_"
+    os.makedirs(args.save_dir, exist_ok=True)
+    log_path = os.path.join(
+        args.save_dir,
+        args.save_name_prefix
+        + f"_{datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}.log",
+    )
+    logger = logging.getLogger("get_D4RL_dataset")
+    logger.setLevel(logging.INFO)
+    file_handler = logging.FileHandler(log_path)
+    file_handler.setLevel(logging.INFO)  # Set the minimum level for this handler
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+
+    make_dataset(
+        args.env_name, args.save_dir, args.save_name_prefix, args.val_split, logger
+    )
diff --git a/script/dataset/process_d3il_dataset.py b/script/dataset/process_d3il_dataset.py
new file mode 100644
index 0000000..bf2dac3
--- /dev/null
+++ b/script/dataset/process_d3il_dataset.py
@@ -0,0 +1,294 @@
+"""
+Process d3il dataset and save it into our custom format so it can be loaded for diffusion training.
+"""
+
+import os
+import numpy as np
+from tqdm import tqdm
+import pickle
+import random
+import matplotlib.pyplot as plt
+from copy import deepcopy
+
+from agent.dataset.d3il_dataset.aligning_dataset import Aligning_Dataset
+from agent.dataset.d3il_dataset.avoiding_dataset import Avoiding_Dataset
+from agent.dataset.d3il_dataset.pushing_dataset import Pushing_Dataset
+from agent.dataset.d3il_dataset.sorting_dataset import Sorting_Dataset
+from agent.dataset.d3il_dataset.stacking_dataset import Stacking_Dataset
+
+
+def make_dataset(load_path, save_dir, save_name_prefix, env_type, val_split):
+    if env_type == "align":
+        demo_dataset = Aligning_Dataset(
+            load_path,
+            action_dim=3,
+            obs_dim=20,
+            max_len_data=512,
+        )
+    elif env_type == "avoid":
+        demo_dataset = Avoiding_Dataset(
+            load_path,
+            action_dim=2,
+            obs_dim=4,
+            max_len_data=200,
+        )
+    elif env_type == "push":
+        demo_dataset = Pushing_Dataset(
+            load_path,
+            action_dim=2,
+            obs_dim=10,
+            max_len_data=512,
+        )
+    elif env_type == "sort":
+        # Can config number of boxes to be 2, 4, or 6.
+        # TODO: add other numbers of boxes
+        demo_dataset = Sorting_Dataset(
+            load_path,
+            action_dim=2,
+            obs_dim=10,
+            max_len_data=600,
+            num_boxes=2,
+        )
+    elif env_type == "stack":
+        demo_dataset = Stacking_Dataset(
+            load_path,
+            action_dim=8,
+            obs_dim=20,
+            max_len_data=1000,
+        )
+    else:
+        raise ValueError("Invalid dataset type.")
+    # extract length of each trajectory in the file
+    traj_lengths = []
+    actions = demo_dataset.actions
+    obs = demo_dataset.observations
+    masks = demo_dataset.masks
+    action_dim = actions.shape[2]
+    obs_dim = obs.shape[2]
+    for ep in range(masks.shape[0]):
+        traj_lengths.append(int(masks[ep].sum().item()))
+    traj_lengths = np.array(traj_lengths)
+    max_traj_length = np.max(traj_lengths)
+
+    # split indices in train and val
+    num_traj = len(traj_lengths)
+    num_train = int(num_traj * (1 - val_split))
+    train_indices = random.sample(range(num_traj), k=num_train)
+
+    # take the max and min of obs and action
+    obs_min = np.zeros((obs_dim))
+    obs_max = np.zeros((obs_dim))
+    action_min = np.zeros((action_dim))
+    action_max = np.zeros((action_dim))
+    for i in tqdm(range(len(traj_lengths))):
+        T = traj_lengths[i]
+        obs_traj = obs[i, :T].numpy()
+        action_traj = actions[i, :T].numpy()
+        obs_min = np.min(np.vstack((obs_min, np.min(obs_traj, axis=0))), axis=0)
+        obs_max = np.max(np.vstack((obs_max, np.max(obs_traj, axis=0))), axis=0)
+        action_min = np.min(
+            np.vstack((action_min, np.min(action_traj, axis=0))), axis=0
+        )
+        action_max = np.max(
+            np.vstack((action_max, np.max(action_traj, axis=0))), axis=0
+        )
+    logger.info("\n========== Basic Info ===========")
+    logger.info("total transitions: {}".format(np.sum(traj_lengths)))
+    logger.info("total trajectories: {}".format(len(traj_lengths)))
+    logger.info(
+        f"traj length mean/std: {np.mean(traj_lengths)}, {np.std(traj_lengths)}"
+    )
+    logger.info(f"traj length min/max: {np.min(traj_lengths)}, {np.max(traj_lengths)}")
+    logger.info(f"obs min: {obs_min}")
+    logger.info(f"obs max: {obs_max}")
+    logger.info(f"action min: {action_min}")
+    logger.info(f"action max: {action_max}")
+
+    # do over all indices
+    out_train = {}
+    keys = [
+        "observations",
+        "actions",
+        "rewards",
+    ]
+    total_timesteps = actions.shape[1]
+    out_train["observations"] = np.empty((0, total_timesteps, obs_dim))
+    out_train["actions"] = np.empty((0, total_timesteps, action_dim))
+    out_train["rewards"] = np.empty((0, total_timesteps))
+    out_train["traj_length"] = []
+    out_val = deepcopy(out_train)
+    for i in tqdm(range(len(traj_lengths))):
+        if i in train_indices:
+            out = out_train
+        else:
+            out = out_val
+
+        T = traj_lengths[i]
+        obs_traj = obs[i].numpy()
+        action_traj = actions[i].numpy()
+
+        # scale to [-1, 1] for both ob and action
+        obs_traj = 2 * (obs_traj - obs_min) / (obs_max - obs_min + 1e-6) - 1
+        action_traj = (
+            2 * (action_traj - action_min) / (action_max - action_min + 1e-6) - 1
+        )
+
+        # get episode length
+        traj_length = T
+        out["traj_length"].append(traj_length)
+
+        # extract
+        rewards = np.zeros(total_timesteps)  # no reward from d3il dataset
+        data_traj = {
+            "observations": obs_traj,
+            "actions": action_traj,
+            "rewards": rewards,
+        }
+        for key in keys:
+            traj = data_traj[key]
+            out[key] = np.vstack((out[key], traj[None]))
+
+    # plot all trajectories and save in a figure
+    def plot(out, name):
+        def get_obj_xy_list():
+            mid_pos = 0.5
+            offset = 0.075
+            first_level_y = -0.1
+            level_distance = 0.18
+            return [
+                [mid_pos, first_level_y],
+                [mid_pos - offset, first_level_y + level_distance],
+                [mid_pos + offset, first_level_y + level_distance],
+                [mid_pos - 2 * offset, first_level_y + 2 * level_distance],
+                [mid_pos, first_level_y + 2 * level_distance],
+                [mid_pos + 2 * offset, first_level_y + 2 * level_distance],
+            ]
+
+        pillar_xys = get_obj_xy_list()
+        fig = plt.figure()
+        all_trajs = out["observations"]  # num x timestep x obs
+        for traj, traj_length in zip(all_trajs, out["traj_length"]):
+            # unnormalize
+            traj = (traj + 1) / 2  # [-1, 1] -> [0, 1]
+            traj = traj * (obs_max - obs_min) + obs_min
+            plt.plot(
+                traj[:traj_length, 2], traj[:traj_length, 3], color=(0.3, 0.3, 0.3)
+            )
+        plt.axhline(y=0.4, color=np.array([31, 119, 180]) / 255, linestyle="-")
+        for xy in pillar_xys:
+            circle = plt.Circle(xy, 0.01, color=(0.0, 0.0, 0.0), fill=True)
+            plt.gca().add_patch(circle)
+        plt.xlabel("X pos")
+        plt.ylabel("Y pos")
+        plt.xlim([0.2, 0.8])
+        plt.ylim([-0.3, 0.5])
+        ax = plt.gca()
+        ax.set_aspect("equal", adjustable="box")
+        ax.set_facecolor("white")
+        plt.savefig(os.path.join(save_dir, name))
+        plt.close(fig)
+
+    plot(out_train, name="train-trajs.png")
+    plot(out_val, name="val-trajs.png")
+
+    # Save to np file
+    save_train_path = os.path.join(save_dir, save_name_prefix + "train.pkl")
+    save_val_path = os.path.join(save_dir, save_name_prefix + "val.pkl")
+    with open(save_train_path, "wb") as f:
+        pickle.dump(out_train, f)
+    with open(save_val_path, "wb") as f:
+        pickle.dump(out_val, f)
+    normalization_save_path = os.path.join(
+        save_dir, save_name_prefix + "normalization.npz"
+    )
+    np.savez(
+        normalization_save_path,
+        obs_min=obs_min,
+        obs_max=obs_max,
+        action_min=action_min,
+        action_max=action_max,
+    )
+
+    # debug
+    logger.info("\n========== Final ===========")
+    logger.info(
+        f"Train - Number of episodes and transitions: {len(out_train['traj_length'])}, {np.sum(out_train['traj_length'])}"
+    )
+    logger.info(
+        f"Val - Number of episodes and transitions: {len(out_val['traj_length'])}, {np.sum(out_val['traj_length'])}"
+    )
+    logger.info(
+        f"Train - Mean/Std trajectory length: {np.mean(out_train['traj_length'])}, {np.std(out_train['traj_length'])}"
+    )
+    logger.info(
+        f"Train - Max/Min trajectory length: {np.max(out_train['traj_length'])}, {np.min(out_train['traj_length'])}"
+    )
+    if val_split > 0:
+        logger.info(
+            f"Val - Mean/Std trajectory length: {np.mean(out_val['traj_length'])}, {np.std(out_val['traj_length'])}"
+        )
+        logger.info(
+            f"Val - Max/Min trajectory length: {np.max(out_val['traj_length'])}, {np.min(out_val['traj_length'])}"
+        )
+    for obs_dim_ind in range(obs_dim):
+        obs = out_train["observations"][:, :, obs_dim_ind]
+        logger.info(
+            f"Train - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}"
+        )
+    for action_dim_ind in range(action_dim):
+        action = out_train["actions"][:, :, action_dim_ind]
+        logger.info(
+            f"Train - Action dim {action_dim_ind+1} mean {np.mean(action)} std {np.std(action)} min {np.min(action)} max {np.max(action)}"
+        )
+    if val_split > 0:
+        for obs_dim_ind in range(obs_dim):
+            obs = out_val["observations"][:, :, obs_dim_ind]
+            logger.info(
+                f"Val - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}"
+            )
+        for action_dim_ind in range(action_dim):
+            action = out_val["actions"][:, :, action_dim_ind]
+            logger.info(
+                f"Val - Action dim {action_dim_ind+1} mean {np.mean(action)} std {np.std(action)} min {np.min(action)} max {np.max(action)}"
+            )
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--load_path", type=str, default=".")
+    parser.add_argument("--save_dir", type=str, default=".")
+    parser.add_argument("--save_name_prefix", type=str, default="")
+    parser.add_argument("--env_type", type=str, default="align")
+    parser.add_argument("--val_split", type=float, default="0.2")
+    args = parser.parse_args()
+
+    os.makedirs(args.save_dir, exist_ok=True)
+
+    import logging
+    import datetime
+
+    os.makedirs(args.save_dir, exist_ok=True)
+    log_path = os.path.join(
+        args.save_dir,
+        args.save_name_prefix
+        + f"_{datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}.log",
+    )
+    logger = logging.getLogger("get_D4RL_dataset")
+    logger.setLevel(logging.INFO)
+    file_handler = logging.FileHandler(log_path)
+    file_handler.setLevel(logging.INFO)  # Set the minimum level for this handler
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+
+    make_dataset(
+        args.load_path,
+        args.save_dir,
+        args.save_name_prefix,
+        args.env_type,
+        args.val_split,
+    )
diff --git a/script/dataset/process_robomimic_dataset.py b/script/dataset/process_robomimic_dataset.py
new file mode 100644
index 0000000..c0ccc5c
--- /dev/null
+++ b/script/dataset/process_robomimic_dataset.py
@@ -0,0 +1,421 @@
+"""
+Process robomimic dataset and save it into our custom format so it can be loaded for diffusion training.
+
+Using some code from robomimic/robomimic/scripts/get_dataset_info.py
+
+can-mh:
+    total transitions: 62756
+    total trajectories: 300
+    traj length mean: 209.18666666666667
+    traj length std: 114.42181532479817
+    traj length min: 98
+    traj length max: 1050
+    action min: -1.0
+    action max: 1.0
+
+    {
+        "env_name": "PickPlaceCan",
+        "env_version": "1.4.1",
+        "type": 1,
+        "env_kwargs": {
+            "has_renderer": false,
+            "has_offscreen_renderer": false,
+            "ignore_done": true,
+            "use_object_obs": true,
+            "use_camera_obs": false,
+            "control_freq": 20,
+            "controller_configs": {
+                "type": "OSC_POSE",
+                "input_max": 1,
+                "input_min": -1,
+                "output_max": [
+                    0.05,
+                    0.05,
+                    0.05,
+                    0.5,
+                    0.5,
+                    0.5
+                ],
+                "output_min": [
+                    -0.05,
+                    -0.05,
+                    -0.05,
+                    -0.5,
+                    -0.5,
+                    -0.5
+                ],
+                "kp": 150,
+                "damping": 1,
+                "impedance_mode": "fixed",
+                "kp_limits": [
+                    0,
+                    300
+                ],
+                "damping_limits": [
+                    0,
+                    10
+                ],
+                "position_limits": null,
+                "orientation_limits": null,
+                "uncouple_pos_ori": true,
+                "control_delta": true,
+                "interpolation": null,
+                "ramp_ratio": 0.2
+            },
+            "robots": [
+                "Panda"
+            ],
+            "camera_depths": false,
+            "camera_heights": 84,
+            "camera_widths": 84,
+            "reward_shaping": false
+        }
+    }
+
+robomimic dataset normalizes action to [-1, 1], observation roughly? to [-1, 1]. Seems sometimes the upper value is a bit larger than 1 (but within 1.1).
+
+"""
+
+import numpy as np
+from tqdm import tqdm
+import pickle
+
+try:
+    import h5py  # not included in pyproject.toml
+except:
+    print("Installing h5py")
+    os.system("pip install h5py")
+import os
+import random
+from copy import deepcopy
+import logging
+
+
+def make_dataset(
+    load_path,
+    save_dir,
+    save_name_prefix,
+    val_split,
+    normalize,
+):
+    # Load hdf5 file from load_path
+    with h5py.File(load_path, "r") as f:
+        # put demonstration list in increasing episode order
+        demos = sorted(list(f["data"].keys()))
+        inds = np.argsort([int(elem[5:]) for elem in demos])
+        demos = [demos[i] for i in inds]
+
+        if args.max_episodes > 0:
+            demos = demos[: args.max_episodes]
+
+        # From generate_paper_configs.py: default observation is eef pose, gripper finger position, and object information, all of which are low-dim.
+        low_dim_obs_names = [
+            "robot0_eef_pos",
+            "robot0_eef_quat",
+            "robot0_gripper_qpos",
+        ]
+        if "transport" in load_path:
+            low_dim_obs_names += [
+                "robot1_eef_pos",
+                "robot1_eef_quat",
+                "robot1_gripper_qpos",
+            ]
+        if args.cameras is None:  # state-only
+            low_dim_obs_names.append("object")
+        obs_dim = 0
+        for low_dim_obs_name in low_dim_obs_names:
+            dim = f["data/demo_0/obs/{}".format(low_dim_obs_name)].shape[1]
+            obs_dim += dim
+            logging.info(f"Using {low_dim_obs_name} with dim {dim} for observation")
+        action_dim = f["data/demo_0/actions"].shape[1]
+        logging.info(f"Total low-dim observation dim: {obs_dim}")
+        logging.info(f"Action dim: {action_dim}")
+
+        # get basic stats
+        traj_lengths = []
+        obs_min = np.zeros((obs_dim))
+        obs_max = np.zeros((obs_dim))
+        action_min = np.zeros((action_dim))
+        action_max = np.zeros((action_dim))
+        for ep in demos:
+            traj_lengths.append(f[f"data/{ep}/actions"].shape[0])
+            obs = np.hstack(
+                [
+                    f[f"data/{ep}/obs/{low_dim_obs_name}"][()]
+                    for low_dim_obs_name in low_dim_obs_names
+                ]
+            )
+            actions = f[f"data/{ep}/actions"]
+            obs_min = np.minimum(obs_min, np.min(obs, axis=0))
+            obs_max = np.maximum(obs_max, np.max(obs, axis=0))
+            action_min = np.minimum(action_min, np.min(actions, axis=0))
+            action_max = np.maximum(action_max, np.max(actions, axis=0))
+        traj_lengths = np.array(traj_lengths)
+        max_traj_length = np.max(traj_lengths)
+
+        # report statistics on the data
+        logging.info("===== Basic stats =====")
+        logging.info("total transitions: {}".format(np.sum(traj_lengths)))
+        logging.info("total trajectories: {}".format(traj_lengths.shape[0]))
+        logging.info(
+            f"traj length mean/std: {np.mean(traj_lengths)}, {np.std(traj_lengths)}"
+        )
+        logging.info(
+            f"traj length min/max: {np.min(traj_lengths)}, {np.max(traj_lengths)}"
+        )
+        logging.info(f"obs min: {obs_min}")
+        logging.info(f"obs max: {obs_max}")
+        logging.info(f"action min: {action_min}")
+        logging.info(f"action max: {action_max}")
+
+        # deal with images
+        if args.cameras is not None:
+            img_shapes = []
+            img_names = []  # not necessary but keep old implementation
+            for camera in args.cameras:
+                if f"{camera}_image" in f["data/demo_0/obs"]:
+                    img_shape = f["data/demo_0/obs/{}_image".format(camera)].shape[1:]
+                    img_shapes.append(img_shape)
+                    img_names.append(f"{camera}_image")
+            # ensure all images have the same height and width
+            assert all(
+                [
+                    img_shape[0] == img_shapes[0][0]
+                    and img_shape[1] == img_shapes[0][1]
+                    for img_shape in img_shapes
+                ]
+            )
+            combined_img_shape = (
+                img_shapes[0][0],
+                img_shapes[0][1],
+                sum([img_shape[2] for img_shape in img_shapes]),
+            )
+            logging.info(f"Image shapes: {img_shapes}")
+
+        # split indices in train and val
+        num_traj = len(traj_lengths)
+        num_train = int(num_traj * (1 - val_split))
+        train_indices = random.sample(range(num_traj), k=num_train)
+
+        # do over all indices
+        out_train = {}
+        keys = [
+            "observations",
+            "actions",
+            "rewards",
+        ]
+        if args.cameras is not None:
+            keys.append("images")
+        out_train["observations"] = np.empty((0, max_traj_length, obs_dim))
+        out_train["actions"] = np.empty((0, max_traj_length, action_dim))
+        out_train["rewards"] = np.empty((0, max_traj_length))
+        out_train["traj_length"] = []
+        if args.cameras is not None:
+            out_train["images"] = np.empty(
+                (
+                    0,
+                    max_traj_length,
+                    *combined_img_shape,
+                ),
+                dtype=np.uint8,
+            )
+        out_val = deepcopy(out_train)
+        train_episode_reward_all = []
+        val_episode_reward_all = []
+        for i in tqdm(range(len(demos))):
+            ep = demos[i]
+            if i in train_indices:
+                out = out_train
+            else:
+                out = out_val
+
+            # get episode length
+            traj_length = f[f"data/{ep}"].attrs["num_samples"]
+            out["traj_length"].append(traj_length)
+            # print("Episode:", i, "Trajectory length:", traj_length)
+
+            # extract
+            raw_actions = f[f"data/{ep}/actions"][()]
+            rewards = f[f"data/{ep}/rewards"][()]
+            raw_obs = np.hstack(
+                [
+                    f[f"data/{ep}/obs/{low_dim_obs_name}"][()]
+                    for low_dim_obs_name in low_dim_obs_names
+                ]
+            )  # not normalized
+
+            # scale to [-1, 1] for both ob and action
+            if normalize:
+                obs = 2 * (raw_obs - obs_min) / (obs_max - obs_min + 1e-6) - 1
+                actions = (
+                    2 * (raw_actions - action_min) / (action_max - action_min + 1e-6)
+                    - 1
+                )
+            else:
+                obs = raw_obs
+                actions = raw_actions
+
+            data_traj = {
+                "observations": obs,
+                "actions": actions,
+                "rewards": rewards,
+            }
+            if args.cameras is not None:  # no normalization
+                data_traj["images"] = np.concatenate(
+                    (
+                        [
+                            f["data/{}/obs/{}".format(ep, img_name)][()]
+                            for img_name in img_names
+                        ]
+                    ),
+                    axis=-1,
+                )
+
+            # apply padding to make all episodes have the same max steps
+            # later when we load this dataset, we will use the traj_length to slice the data
+            for key in keys:
+                traj = data_traj[key]
+                if traj.ndim == 1:
+                    pad_width = (0, max_traj_length - len(traj))
+                elif traj.ndim == 2:
+                    pad_width = ((0, max_traj_length - traj.shape[0]), (0, 0))
+                elif traj.ndim == 4:
+                    pad_width = (
+                        (0, max_traj_length - traj.shape[0]),
+                        (0, 0),
+                        (0, 0),
+                        (0, 0),
+                    )
+                else:
+                    raise ValueError("Unsupported dimension")
+                traj = np.pad(
+                    traj,
+                    pad_width,
+                    mode="constant",
+                    constant_values=0,
+                )
+                out[key] = np.vstack((out[key], traj[None]))
+
+            # check reward
+            if i in train_indices:
+                train_episode_reward_all.append(np.sum(data_traj["rewards"]))
+            else:
+                val_episode_reward_all.append(np.sum(data_traj["rewards"]))
+
+    # Save to np file
+    save_train_path = os.path.join(save_dir, save_name_prefix + "train.pkl")
+    save_val_path = os.path.join(save_dir, save_name_prefix + "val.pkl")
+    with open(save_train_path, "wb") as f:
+        pickle.dump(out_train, f)
+    with open(save_val_path, "wb") as f:
+        pickle.dump(out_val, f)
+    if normalize:
+        normalization_save_path = os.path.join(
+            save_dir, save_name_prefix + "normalization.npz"
+        )
+        np.savez(
+            normalization_save_path,
+            obs_min=obs_min,
+            obs_max=obs_max,
+            action_min=action_min,
+            action_max=action_max,
+        )
+
+    # debug
+    logging.info("\n========== Final ===========")
+    logging.info(
+        f"Train - Number of episodes and transitions: {len(out_train['traj_length'])}, {np.sum(out_train['traj_length'])}"
+    )
+    logging.info(
+        f"Val - Number of episodes and transitions: {len(out_val['traj_length'])}, {np.sum(out_val['traj_length'])}"
+    )
+    logging.info(
+        f"Train - Mean/Std trajectory length: {np.mean(out_train['traj_length'])}, {np.std(out_train['traj_length'])}"
+    )
+    logging.info(
+        f"Train - Max/Min trajectory length: {np.max(out_train['traj_length'])}, {np.min(out_train['traj_length'])}"
+    )
+    logging.info(
+        f"Train - Mean/Std episode reward: {np.mean(train_episode_reward_all)},  {np.std(train_episode_reward_all)}"
+    )
+    if val_split > 0:
+        logging.info(
+            f"Val - Mean/Std trajectory length: {np.mean(out_val['traj_length'])}, {np.std(out_val['traj_length'])}"
+        )
+        logging.info(
+            f"Val - Max/Min trajectory length: {np.max(out_val['traj_length'])}, {np.min(out_val['traj_length'])}"
+        )
+        logging.info(
+            f"Val - Mean/Std episode reward: {np.mean(val_episode_reward_all)},  {np.std(val_episode_reward_all)}"
+        )
+    for obs_dim_ind in range(obs_dim):
+        obs = out_train["observations"][:, :, obs_dim_ind]
+        logging.info(
+            f"Train - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}"
+        )
+    for action_dim_ind in range(action_dim):
+        action = out_train["actions"][:, :, action_dim_ind]
+        logging.info(
+            f"Train - Action dim {action_dim_ind+1} mean {np.mean(action)} std {np.std(action)} min {np.min(action)} max {np.max(action)}"
+        )
+    if val_split > 0:
+        for obs_dim_ind in range(obs_dim):
+            obs = out_val["observations"][:, :, obs_dim_ind]
+            logging.info(
+                f"Val - Obs dim {obs_dim_ind+1} mean {np.mean(obs)} std {np.std(obs)} min {np.min(obs)} max {np.max(obs)}"
+            )
+        for action_dim_ind in range(action_dim):
+            action = out_val["actions"][:, :, action_dim_ind]
+            logging.info(
+                f"Val - Action dim {action_dim_ind+1} mean {np.mean(action)} std {np.std(action)} min {np.min(action)} max {np.max(action)}"
+            )
+    # logging.info("Train - Observation shape:", out_train["observations"].shape)
+    # logging.info("Train - Action shape:", out_train["actions"].shape)
+    # logging.info("Train - Reward shape:", out_train["rewards"].shape)
+    # logging.info("Val - Observation shape:", out_val["observations"].shape)
+    # logging.info("Val - Action shape:", out_val["actions"].shape)
+    # logging.info("Val - Reward shape:", out_val["rewards"].shape)
+    # if use_img:
+    # logging.info("Image shapes:", img_shapes)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--load_path", type=str, default=".")
+    parser.add_argument("--save_dir", type=str, default=".")
+    parser.add_argument("--save_name_prefix", type=str, default="")
+    parser.add_argument("--val_split", type=float, default="0.2")
+    parser.add_argument("--max_episodes", type=int, default="-1")
+    parser.add_argument("--normalize", action="store_true")
+    parser.add_argument("--cameras", nargs="*", default=None)
+    args = parser.parse_args()
+
+    import datetime
+
+    if args.max_episodes > 0:
+        args.save_name_prefix += f"max_episodes_{args.max_episodes}_"
+
+    os.makedirs(args.save_dir, exist_ok=True)
+    log_path = os.path.join(
+        args.save_dir,
+        args.save_name_prefix
+        + f"_{datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}.log",
+    )
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(message)s",
+        handlers=[
+            logging.FileHandler(log_path, mode="w"),
+            logging.StreamHandler(),
+        ],
+    )
+
+    make_dataset(
+        args.load_path,
+        args.save_dir,
+        args.save_name_prefix,
+        args.val_split,
+        args.normalize,
+    )
diff --git a/script/download_url.py b/script/download_url.py
new file mode 100644
index 0000000..9838088
--- /dev/null
+++ b/script/download_url.py
@@ -0,0 +1,432 @@
+def get_dataset_download_url(cfg):
+    env = cfg.env
+    # Gym
+    if env == "hopper-medium-v2":
+        return "https://drive.google.com/drive/u/1/folders/18Ti-92XVq3sE24K096WAxjC_SCCngeHd"
+    elif env == "walker2d-medium-v2":
+        return "https://drive.google.com/drive/u/1/folders/1BJu8NklriunDHsDrLT6fEpcro3_2IPFf"
+    elif env == "halfcheetah-medium-v2":
+        return "https://drive.google.com/drive/u/1/folders/1Drel26tiuQ9oD3YNf1eyy0UVaf5SQj-U"
+    # D3IL
+    elif env == "avoid" and cfg.mode == "d56_r12":  # M1
+        return "https://drive.google.com/drive/u/1/folders/1ZAPvLQwv2y4Q98UDVKXFT4fvGF5yhD_o"
+    elif env == "avoid" and cfg.mode == "d57_r12":  # M2
+        return "https://drive.google.com/drive/u/1/folders/1wyJi1Zbnd6JNy4WGszHBH40A0bbl-vkd"
+    elif env == "avoid" and cfg.mode == "d58_r12":  # M3
+        return "https://drive.google.com/drive/u/1/folders/1mNXCIPnCO_FDBlEj95InA9eWJM2XcEEj"
+    # Robomimic
+    elif env == "lift" and "img" not in cfg.train_dataset_path:  # state
+        return "https://drive.google.com/drive/u/1/folders/1lbXgMKBTAiFdJqPZqWXpwjEyrVW16MBu"
+    elif env == "lift" and "img" in cfg.train_dataset_path:  # img
+        return "https://drive.google.com/drive/u/1/folders/1H-UncdzHx6wd5NWVzrQyftfGls7KGz1O"
+    elif env == "can" and "img" not in cfg.train_dataset_path:
+        return "https://drive.google.com/drive/u/1/folders/1J1qSvsDEf40jnMZY9W0r6ww--E3MdmK3"
+    elif env == "can" and "img" in cfg.train_dataset_path:
+        return "https://drive.google.com/drive/u/1/folders/1VGp_5xXXb1-GJutdSc6AZSzXNk-6_vRz"
+    elif env == "square" and "img" not in cfg.train_dataset_path:
+        return "https://drive.google.com/drive/u/1/folders/1mVVNOJ6wt2EXoapF7PKkcqxbsB9gvK-B"
+    elif env == "square" and "img" in cfg.train_dataset_path:
+        return "https://drive.google.com/drive/u/1/folders/1-aGqVeKLIzJCEst8p0ZTjfjkrXFfJLxa"
+    elif env == "transport" and "img" not in cfg.train_dataset_path:
+        return "https://drive.google.com/drive/u/1/folders/1EVHmFx-YdX4MEE1EjwduVayvaH9vvqvK"
+    elif env == "transport" and "img" in cfg.train_dataset_path:
+        return "https://drive.google.com/drive/u/1/folders/1cOkAZQmmETYEPFrnnX0EuD6mv0kUfMO2"
+    # Furniture-Bench
+    elif env == "one_leg_low_dim":
+        return "https://drive.google.com/drive/u/1/folders/1v4LG2D1fS8id5hqE7Jjt7MYNFUEBNyh4"
+    elif env == "one_leg_med_dim":
+        return "https://drive.google.com/drive/u/1/folders/1ohDuMSgCqGN1CSh1cI_8A0ia3DVHzj3w"
+    elif env == "lamp_low_dim":
+        return "https://drive.google.com/drive/u/1/folders/14MqDUmuNmTFcBtKw5gcvx7nuir0zmF7V"
+    elif env == "lamp_med_dim":
+        return "https://drive.google.com/drive/u/1/folders/1bhOoN0xet4ga0rOIvcRHUFoaCjPRNRLf"
+    elif env == "round_table_low_dim":
+        return "https://drive.google.com/drive/u/1/folders/15oF3qiqGzlT_98FDoTIVtGmLtHd5SIbi"
+    elif env == "round_table_med_dim":
+        return "https://drive.google.com/drive/u/1/folders/1U27xjdRrKlLC8E33o7jMFZ1HF5P_Soik"
+    # unknown
+    else:
+        raise ValueError(f"Unknown environment {env}")
+
+
+def get_normalization_download_url(cfg):
+    env = cfg.env_name
+    # Gym
+    if env == "hopper-medium-v2":
+        return "https://drive.google.com/file/d/1HHZ2X6r5io6hjG-MHVFFoPMJV2fJ3lis/view?usp=drive_link"
+    elif env == "walker2d-medium-v2":
+        return "https://drive.google.com/file/d/1NSX7t3DFKaBj5HNpv91Oo5h6oXTk0zoo/view?usp=drive_link"
+    elif env == "halfcheetah-medium-v2":
+        return "https://drive.google.com/file/d/1LlwCMfy1b5e8jSx99CV3lWhcrQWrI2Jm/view?usp=drive_link"
+    # D3IL
+    elif env == "avoiding-m5" and cfg.mode == "d56_r12":  # M1
+        return "https://drive.google.com/file/d/1PubKaPabbiSdWYpGmouDhYfXp4QwNHFG/view?usp=drive_link"
+    elif env == "avoiding-m5" and cfg.mode == "d57_r12":  # M2
+        return "https://drive.google.com/file/d/1Hoohw8buhsLzXoqivMA6IzKS5Izlj07_/view?usp=drive_link"
+    elif env == "avoiding-m5" and cfg.mode == "d58_r12":  # M3
+        return "https://drive.google.com/file/d/1qt7apV52C9Tflsc-A55J6uDMHzaFa1wN/view?usp=drive_link"
+    # Robomimic
+    elif env == "lift" and "img" not in cfg.normalization_path:  # state
+        return "https://drive.google.com/file/d/1d3WjwRds-7I5bBFpZuY27OT9ycb8r_QM/view?usp=drive_link"
+    elif env == "lift" and "img" in cfg.normalization_path:  # img
+        return "https://drive.google.com/file/d/15GnKDIK8VasvUHahcvEeK_uEs1J9i0ja/view?usp=drive_link"
+    elif env == "can" and "img" not in cfg.normalization_path:
+        return "https://drive.google.com/file/d/14FxHk9zQ-5ulAO26a6xrdvc-gRkL36FR/view?usp=drive_link"
+    elif env == "can" and "img" in cfg.normalization_path:
+        return "https://drive.google.com/file/d/1APAB6W10ECaNVVL72F0C2wC6oJhhbjWX/view?usp=drive_link"
+    elif env == "square" and "img" not in cfg.normalization_path:
+        return "https://drive.google.com/file/d/1FFMqWVv0145OJjbA_iglkWywmdK22Za-/view?usp=drive_link"
+    elif env == "square" and "img" in cfg.normalization_path:
+        return "https://drive.google.com/file/d/1jq5atfHdu-ZMQ8YaFjwcctbJESBYDgP8/view?usp=drive_link"
+    elif env == "transport" and "img" not in cfg.normalization_path:
+        return "https://drive.google.com/file/d/1EmC80gIgLoqQ8kRPH5r0mDVqX6NqHO3p/view?usp=drive_link"
+    elif env == "transport" and "img" in cfg.normalization_path:
+        return "https://drive.google.com/file/d/1LBgvIacNzbXCZXWKYanddqiotevG3hmA/view?usp=drive_link"
+    # Furniture-Bench
+    elif env == "one_leg_low_dim":
+        return "https://drive.google.com/file/d/1fbYxau8Z0tifeuu_06UKdRRz8zozxMUE/view?usp=drive_link"
+    elif env == "one_leg_med_dim":
+        return "https://drive.google.com/file/d/1bT-MIG99-uwSXL3VwCz6kK9Kg9t6a7BZ/view?usp=drive_link"
+    elif env == "lamp_low_dim":
+        return "https://drive.google.com/file/d/1KRyCcCT63-cikz6Q_McXM0iVYx9M2E_o/view?usp=drive_link"
+    elif env == "lamp_med_dim":
+        return "https://drive.google.com/file/d/1WBHKbA6BtSfF3qfRIkyuWga5T9X3fEvg/view?usp=drive_link"
+    elif env == "round_table_low_dim":
+        return "https://drive.google.com/file/d/1CwfNLI5KXEkl_jVgtAY7-xfq7jED-xc0/view?usp=drive_link"
+    elif env == "round_table_med_dim":
+        return "https://drive.google.com/file/d/1vNxzok6f6HROGxQR2eoThsN62hVjpayn/view?usp=drive_link"
+    # unknown
+    else:
+        raise ValueError(f"Unknown environment {env}")
+
+
+def get_checkpoint_download_url(cfg):
+    path = cfg.base_policy_path
+    ######################################
+    ####             Gym
+    ######################################
+    if (
+        "hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1uV3beg2YuBRh11t7jnRsyd_M9bsFGOkA/view?usp=drive_link"
+    elif (
+        "walker2d-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-06-12/checkpoint/state_3000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1StopetttozWba4l9u0VT_JW1rYe6kJx2/view?usp=drive_link"
+    elif (
+        "halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1o9ryyeZQAsaB4ffUTCJkIaGCi0frL3G4/view?usp=drive_link"
+    ######################################
+    ####             D3IL
+    ######################################
+    elif (
+        "avoid_d56_r12_pre_diffusion_mlp_ta4_td20/2024-07-06_22-50-07/checkpoint/state_10000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1JdEOG0KsCA9EX9zq09DE0xkTB4xy6DNp/view?usp=drive_link"
+    elif (
+        "avoid_d56_r12_pre_gaussian_mlp_ta4/2024-07-07_01-35-48/checkpoint/state_10000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/138wEi_rVV5HpcwgH6_3BlXQ1dhgZN05L/view?usp=drive_link"
+    elif (
+        "avoid_d56_r12_pre_gmm_mlp_ta4/2024-07-10_14-30-00/checkpoint/state_10000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1krvEINP6UBfnJG9bEge53h3MNJkTQXe_/view?usp=drive_link"
+    elif (
+        "avoid_d57_r12_pre_diffusion_mlp_ta4_td20/2024-07-07_13-12-09/checkpoint/state_15000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1wAmRHzRZ4O5z_ZWDhVg5JFZbugo4cHKA/view?usp=drive_link"
+    elif (
+        "avoid_d57_r12_pre_gaussian_mlp_ta4/2024-07-07_02-15-50/checkpoint/state_10000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1hf_647bJ0EMRhArsfxStSkhgkGaWGhmD/view?usp=drive_link"
+    elif (
+        "avoid_d57_r12_pre_gmm_mlp_ta4/2024-07-10_15-44-32/checkpoint/state_10000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1CE4AcNJp2UITIHpuLUwxp7cDH8Y8jsYC/view?usp=drive_link"
+    elif (
+        "avoid_d58_r12_pre_diffusion_mlp_ta4_td20/2024-07-07_13-54-54/checkpoint/state_15000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1w5X1lJZd0wI6E2XRdj839TqJ_9EziGRx/view?usp=drive_link"
+    elif (
+        "avoid_d58_r12_pre_gaussian_mlp_ta4/2024-07-07_13-11-49/checkpoint/state_10000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1YIVEN0Ykica9dj_DxxPrB4QJLgv-Ne0N/view?usp=drive_link"
+    elif (
+        "avoid_d58_r12_pre_gmm_mlp_ta4/2024-07-10_17-01-50/checkpoint/state_10000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/174tadrqjfxJdOgsMjNbg53goZtIkpW3f/view?usp=drive_link"
+    ######################################
+    ####             Robomimic-Lift
+    ######################################
+    elif (
+        "lift_pre_diffusion_unet_ta4_td20/2024-06-29_02-49-45/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1T-NGgBmT-UmcVWADygXj873IyWLewvsU/view?usp=drive_link"
+    elif (
+        "lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1Ngr-DNxoB9XNCZ2O-NF5p60NzmYlzmWG/view?usp=drive_link"
+    elif (
+        "lift_pre_diffusion_mlp_img_ta4_td100/2024-07-30_22-24-35/checkpoint/state_2500.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/19hqNicwKKKDrlS5UMr-FLRu51EAW8Z51/view?usp=drive_link"
+    elif (
+        "lift_pre_gaussian_mlp_ta4/2024-06-28_14-48-24/checkpoint/state_5000.pt" in path
+    ):
+        return "https://drive.google.com/file/d/157x5_XJy3ZyaPz_7Vr_opXAZ_mPvlmhp/view?usp=drive_link"
+    elif (
+        "lift_pre_gaussian_mlp_img_ta4/2024-07-28_23-00-48/checkpoint/state_500.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1Uae7K3Hv9XzaAGljG2fjsxYyOEgL03zv/view?usp=drive_link"
+    elif (
+        "lift_pre_gaussian_transformer_ta4/2024-06-28_14-49-23/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1Z_C8ureDDPXqpDUaRMVKBt3VgfZlIQdE/view?usp=drive_link"
+    elif "lift_pre_gmm_mlp_ta4/2024-06-28_15-30-32/checkpoint/state_5000.pt" in path:
+        return "https://drive.google.com/file/d/1wFvBoIOaCJVibqEIYSjJxAbxjGkH7RMk/view?usp=drive_link"
+    elif (
+        "lift_pre_gmm_transformer_ta4/2024-06-28_14-51-23/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1w_3WOXS51debWc1ShgaO2VQx49dj9ky2/view?usp=drive_link"
+    ######################################
+    ####             Robomimic-Can
+    ######################################
+    elif (
+        "can_pre_diffusion_unet_ta4_td20/2024-06-29_02-49-45/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1s346KCe2aar_tXX7u8rzjRF3kpwVpH5c/view?usp=drive_link"
+    elif (
+        "can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1L1ZLD1u1Y1YJmRLGzScXbQ02wGS-_cWo/view?usp=drive_link"
+    elif (
+        "can_pre_diffusion_mlp_img_ta4_td100/2024-07-30_22-23-55/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1siIKDGVHld3ZH8vDqgu9iE5H9KoaYq_Z/view?usp=drive_link"
+    elif (
+        "can_pre_gaussian_mlp_ta4/2024-06-28_13-31-00/checkpoint/state_5000.pt" in path
+    ):
+        return "https://drive.google.com/file/d/1bA-A0p0KnHwrVO3MqjuYdV3dZuuZICMy/view?usp=drive_link"
+    elif (
+        "can_pre_gaussian_mlp_img_ta4/2024-07-28_21-54-40/checkpoint/state_1000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/16vUUyVO9DvnDyPtSGiZx4iQP4JxuSAoS/view?usp=drive_link"
+    elif (
+        "can_pre_gaussian_transformer_ta4/2024-06-28_13-42-20/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1cGf7s8aS5grZsGRb5PYZiPogcPRr_lrf/view?usp=drive_link"
+    elif "can_pre_gmm_mlp_ta4/2024-06-28_13-32-19/checkpoint/state_5000.pt" in path:
+        return "https://drive.google.com/file/d/1KVx8-KICiIHstcjsvhPlQc_pZZ6RxXpd/view?usp=drive_link"
+    elif (
+        "can_pre_gmm_transformer_ta4/2024-06-28_13-43-21/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1xSgwGG40zdoO2DDSM79l0rMHeNmaifnq/view?usp=drive_link"
+    ######################################
+    ####             Robomimic-Square
+    ######################################
+    elif (
+        "square_pre_diffusion_unet_ta4_td20/2024-06-29_02-48-45/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/11IEgQe0LFI23hn1Cwf6Z_YfJdDilVc0z/view?usp=drive_link"
+    elif (
+        "square_pre_diffusion_mlp_ta4_td20/2024-07-10_01-46-16/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1lP9mNe2AxMigfOywcaHOOR7FxQ-KR_Ee/view?usp=drive_link"
+    elif (
+        "square_pre_diffusion_mlp_img_ta4_td100/2024-07-30_22-27-34/checkpoint/state_4000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1miud5SX41xjPoW8yqClRhaFx0Q7EWl3V/view?usp=drive_link"
+    elif (
+        "square_pre_gaussian_mlp_ta4/2024-06-28_15-02-32/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1rETiyXLz7YgYoHKwLEZa7dFabu4gfJR8/view?usp=drive_link"
+    elif (
+        "square_pre_gaussian_mlp_img_ta4/2024-07-30_18-44-32/checkpoint/state_4000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1myB6FOAmt6c6x3ScGKXRULgx826tZVS4/view?usp=drive_link"
+    elif (
+        "square_pre_gaussian_transformer_ta4/2024-06-28_15-02-39/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1JJQ7KbRWWBB09PLwNRriAUEAl9vjFeCW/view?usp=drive_link"
+    elif "square_pre_gmm_mlp_ta4/2024-06-28_15-03-08/checkpoint/state_5000.pt" in path:
+        return "https://drive.google.com/file/d/10ujnnOk2Pn-yjE7iW9-RYbApo_aKiNRq/view?usp=drive_link"
+    elif (
+        "square_pre_gmm_transformer_ta4/2024-06-28_15-03-15/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1LczXhgeNtQfqySsfGNbbviPrlLwyh-E3/view?usp=drive_link"
+    ######################################
+    ####             Robomimic-Transport
+    ######################################
+    elif (
+        "transport_pre_diffusion_unet_ta16_td20/2024-07-04_02-20-53/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1MNGT8j9x1uudugGUcia-xwP_7f7xVY4K/view?usp=drive_link"
+    elif (
+        "transport_pre_diffusion_mlp_ta8_td20/2024-07-08_11-18-59/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1240FmcDPg_VyXReEtePjBN4MML-OT21C/view?usp=drive_link"
+    elif (
+        "transport_pre_diffusion_mlp_img_ta8_td100/2024-07-30_22-30-06/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1_UmjNv27_w49XC_EO1WexgreJmC0SEdF/view?usp=drive_link"
+    elif (
+        "transport_pre_gaussian_mlp_ta8/2024-07-10_01-50-52/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1NOHDNHu1sTabxBd4DZrSrTyX9l1I2GuO/view?usp=drive_link"
+    elif (
+        "transport_pre_gaussian_mlp_img_ta8/2024-07-30_21-39-34/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1XYOIwOEgOVoUdMmxWuBRAcA5BOS777Kc/view?usp=drive_link"
+    elif (
+        "transport_pre_gaussian_transformer_ta8/2024-06-28_15-18-16/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1hnt9lX5bg82iFFsAk5FY3TktPXsAhUFU/view?usp=drive_link"
+    elif (
+        "transport_pre_gmm_mlp_ta8/2024-07-10_01-51-21/checkpoint/state_5000.pt" in path
+    ):
+        return "https://drive.google.com/file/d/1da9yLIu5ahq-ZgIIsG7wqehA5VopFTnt/view?usp=drive_link"
+    elif (
+        "transport_pre_gmm_transformer_ta8/2024-06-28_15-18-43/checkpoint/state_5000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1c0S7WX-U1Kn6n-wWZEQSR1alLPW2eZmi/view?usp=drive_link"
+    ######################################
+    ####             Furniture-One_leg
+    ######################################
+    elif (
+        "one_leg_low_dim_pre_diffusion_mlp_ta8_td100/2024-07-22_20-01-16/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1tP0i53EMwNyw_bS2lt9WtQDIAoTxn40g/view?usp=drive_link"
+    elif (
+        "one_leg_low_dim_pre_diffusion_unet_ta16_td100/2024-07-03_22-23-38/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/13nOr7EI79RqdRuoc-je0LoTX5Y-IaO9J/view?usp=drive_link"
+    elif (
+        "one_leg_low_dim_pre_gaussian_mlp_ta8/2024-06-26_23-43-02/checkpoint/state_3000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1DeHj-IEX3a2ZLXFlV0MYe1N_5qupod_w/view?usp=drive_link"
+    elif (
+        "one_leg_med_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-11/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1aEzObl4EkOBKSs0wI2MGmkslzIjZy2_L/view?usp=drive_link"
+    elif (
+        "one_leg_med_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-16-16/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1pSwp_IUDSQ15OszChCkrrTQYdvG3pHc1/view?usp=drive_link"
+    elif (
+        "one_leg_med_dim_pre_gaussian_mlp_ta8/2024-06-28_16-27-02/checkpoint/state_3000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1lC4PKRPn4tRWR9VW5fSfCu3MKnRvYqD6/view?usp=drive_link"
+    ######################################
+    ####             Furniture-Lamp
+    ######################################
+    elif (
+        "lamp_low_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-20/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1md5m4vpe5MmStu-fOk8snF-52BYGZpSc/view?usp=drive_link"
+    elif (
+        "lamp_low_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-16-48/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/103lUuvyKPvp97hzBYnUDAWgor6LsUf21/view?usp=drive_link"
+    elif (
+        "lamp_low_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-51/checkpoint/state_3000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1tK6NuZf4_xtTlZksIR9oQxiFEADFRVA9/view?usp=drive_link"
+    elif (
+        "lamp_med_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-20/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1yVUXYCK_vFhQ7mawxnCGxuHS6RgWWDjj/view?usp=drive_link"
+    elif (
+        "lamp_med_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-17-21/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1_qd47U50on-T1Pqojlfy7S3Cl6bLm_D1/view?usp=drive_link"
+    elif (
+        "lamp_med_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-56/checkpoint/state_3000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1d16HmHgidtXForqoN5QJg_e152XCaJM5/view?usp=drive_link"
+    ######################################
+    ####             Furniture-Round_table
+    ######################################
+    elif (
+        "round_table_low_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-26/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1iJyqJr84AtszGqPeSysN70A-SmP73mKu/view?usp=drive_link"
+    elif (
+        "round_table_low_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-19-48/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1F3RFgLcFemU-IDUrLCXMzmQXknC2ISgJ/view?usp=drive_link"
+    elif (
+        "round_table_low_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-51/checkpoint/state_3000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1eX2u0cvu_zveblrg2htFomAxSToJmHyL/view?usp=drive_link"
+    elif (
+        "round_table_med_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-29/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1DDvHEqb7WqVgT3_W6gYCNDQrvGyreatP/view?usp=drive_link"
+    elif (
+        "round_table_med_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-20-21/checkpoint/state_8000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1hzungyvt3Uc-XbrCzVghzdr59zDf6FlD/view?usp=drive_link"
+    elif (
+        "round_table_med_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-38/checkpoint/state_3000.pt"
+        in path
+    ):
+        return "https://drive.google.com/file/d/1LvELZ7A-whxKk1Oq7S8M9uBiSrWPAGQT/view?usp=drive_link"
+    # unknown --- this means the user trained own policy but specifies the wrong path
+    else:
+        return None
diff --git a/script/set_path.sh b/script/set_path.sh
new file mode 100755
index 0000000..5f4960d
--- /dev/null
+++ b/script/set_path.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+##################### Paths #####################
+
+# Set default paths
+DEFAULT_DATA_DIR="${PWD}/data"
+DEFAULT_LOG_DIR="${PWD}/log"
+
+# Prompt the user for input, allowing overrides
+read -p "Enter the desired data directory [default: ${DEFAULT_DATA_DIR}], leave empty to use default: " DATA_DIR
+DPPO_DATA_DIR=${DATA_DIR:-$DEFAULT_DATA_DIR}  # Use user input or default if input is empty
+
+read -p "Enter the desired logging directory [default: ${DEFAULT_LOG_DIR}], leave empty to use default: " LOG_DIR
+DPPO_LOG_DIR=${LOG_DIR:-$DEFAULT_LOG_DIR}  # Use user input or default if input is empty
+
+# Export to current session
+export DPPO_DATA_DIR="$DPPO_DATA_DIR"
+export DPPO_LOG_DIR="$DPPO_LOG_DIR"
+
+# Confirm the paths with the user
+echo "Data directory set to: $DPPO_DATA_DIR"
+echo "Log directory set to: $DPPO_LOG_DIR"
+
+# Append environment variables to .bashrc
+echo "export DPPO_DATA_DIR=\"$DPPO_DATA_DIR\"" >> ~/.bashrc
+echo "export DPPO_LOG_DIR=\"$DPPO_LOG_DIR\"" >> ~/.bashrc
+
+echo "Environment variables DPPO_DATA_DIR and DPPO_LOG_DIR added to .bashrc and applied to the current session."
+
+##################### WandB #####################
+
+# Prompt the user for input, allowing overrides
+read -p "Enter your WandB entity (username or team name), leave empty to skip: " ENTITY
+
+# Check if ENTITY is not empty
+if [ -n "$ENTITY" ]; then
+  # If ENTITY is not empty, set the environment variable
+  export DPPO_WANDB_ENTITY="$ENTITY"
+
+  # Confirm the entity with the user
+  echo "WandB entity set to: $DPPO_WANDB_ENTITY"
+
+  # Append environment variable to .bashrc
+  echo "export DPPO_WANDB_ENTITY=\"$ENTITY\"" >> ~/.bashrc
+  
+  echo "Environment variable DPPO_WANDB_ENTITY added to .bashrc and applied to the current session."
+else
+  # If ENTITY is empty, skip setting the environment variable
+  echo "No WandB entity provided. Please set wandb=null when running scripts to disable wandb logging and avoid error."
+fi
diff --git a/script/test_d3il_render.py b/script/test_d3il_render.py
new file mode 100644
index 0000000..bc1659c
--- /dev/null
+++ b/script/test_d3il_render.py
@@ -0,0 +1,34 @@
+"""
+Visualize Avoid environment from D3IL in MuJoCo GUI
+
+"""
+
+import gym
+import gym_avoiding
+import imageio
+
+# from gym_avoiding_env.gym_avoiding.envs.avoiding import ObstacleAvoidanceEnv
+# from envs.gym_avoiding_env.gym_avoiding.envs.avoiding import ObstacleAvoidanceEnv
+from gym.envs import make as make_
+import numpy as np
+
+# env = ObstacleAvoidanceEnv(render=False)
+env = make_("avoiding-v0", render=True)
+# env.start()   # no need to start() any more, already run in init() now
+env.reset()
+print(env.action_space)
+
+# video_writer = imageio.get_writer("test_d3il.mp4", fps=30)
+while 1:
+    obs, reward, done, info = env.step(np.array([0.02, 0.1]))
+    print("Reward:", reward)
+    # video_img = env.render(
+    #     mode="rgb_array",
+    #     # height=640,
+    #     # width=480,
+    #     # camera_name=self.render_camera_name,
+    # )
+    # video_writer.append_data(video_img)
+    if input("Press space to stop, or any other key to continue") == " ":
+        break
+# video_writer.close()
diff --git a/script/test_robomimic_render.py b/script/test_robomimic_render.py
new file mode 100644
index 0000000..613f5b4
--- /dev/null
+++ b/script/test_robomimic_render.py
@@ -0,0 +1,36 @@
+"""
+Test Robomimic rendering, no GUI
+
+"""
+
+import os
+import time
+from gym import spaces
+import robosuite as suite
+
+os.environ["MUJOCO_GL"] = "egl"
+if __name__ == "__main__":
+    env = suite.make(
+        env_name="TwoArmTransport",
+        robots=["Panda", "Panda"],
+        has_renderer=False,
+        has_offscreen_renderer=True,
+        use_camera_obs=True,
+        camera_heights=96,
+        camera_widths=96,
+        camera_names="shouldercamera0",
+        render_gpu_device_id=0,
+        horizon=20,
+    )
+    obs, done = env.reset(), False
+    print("Finished resetting!")
+    low, high = env.action_spec
+    action_space = spaces.Box(low=low, high=high)
+    steps, time_stamp = 0, time.time()
+    while True:
+        while not done:
+            obs, reward, done, info = env.step(action_space.sample())
+            steps += 1
+        obs, done = env.reset(), False
+        print(f"FPS: {steps / (time.time() - time_stamp)}")
+        steps, time_stamp = 0, time.time()
diff --git a/script/train.py b/script/train.py
new file mode 100644
index 0000000..4383921
--- /dev/null
+++ b/script/train.py
@@ -0,0 +1,91 @@
+"""
+Launcher for all experiments. Download pre-training data, normalization statistics, and pre-trained checkpoints if needed.
+
+"""
+
+import os
+import sys
+import pretty_errors
+import logging
+
+import math
+import hydra
+from omegaconf import OmegaConf
+import gdown
+from download_url import (
+    get_dataset_download_url,
+    get_normalization_download_url,
+    get_checkpoint_download_url,
+)
+
+# allows arbitrary python code execution in configs using the ${eval:''} resolver
+OmegaConf.register_new_resolver("eval", eval, replace=True)
+OmegaConf.register_new_resolver("round_up", math.ceil)
+OmegaConf.register_new_resolver("round_down", math.floor)
+
+# suppress d4rl import error
+os.environ["D4RL_SUPPRESS_IMPORT_ERROR"] = "1"
+
+# add logger
+log = logging.getLogger(__name__)
+
+# use line-buffering for both stdout and stderr
+sys.stdout = open(sys.stdout.fileno(), mode="w", buffering=1)
+sys.stderr = open(sys.stderr.fileno(), mode="w", buffering=1)
+
+
+@hydra.main(
+    version_base=None,
+    config_path=os.path.join(
+        os.getcwd(), "cfg"
+    ),  # possibly overwritten by --config-path
+)
+def main(cfg: OmegaConf):
+    # resolve immediately so all the ${now:} resolvers will use the same time.
+    OmegaConf.resolve(cfg)
+
+    # For pre-training: download dataset if needed
+    if "train_dataset_path" in cfg and not os.path.exists(cfg.train_dataset_path):
+        download_url = get_dataset_download_url(cfg)
+        download_target = os.path.dirname(cfg.train_dataset_path)
+        log.info(f"Downloading dataset from {download_url} to {download_target}")
+        gdown.download_folder(url=download_url, output=download_target)
+
+    # For for-tuning: download normalization if needed
+    if "normalization_path" in cfg and not os.path.exists(cfg.normalization_path):
+        download_url = get_normalization_download_url(cfg)
+        download_target = cfg.normalization_path
+        dir_name = os.path.dirname(download_target)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name)
+        log.info(
+            f"Downloading normalization statistics from {download_url} to {download_target}"
+        )
+        gdown.download(url=download_url, output=download_target, fuzzy=True)
+
+    # For for-tuning: download checkpoint if needed
+    if "base_policy_path" in cfg and not os.path.exists(cfg.base_policy_path):
+        download_url = get_checkpoint_download_url(cfg)
+        if download_url is None:
+            raise ValueError(
+                f"Unknown checkpoint path. Did you specify the correct path to the policy you trained?"
+            )
+        download_target = cfg.base_policy_path
+        dir_name = os.path.dirname(download_target)
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name)
+        log.info(f"Downloading checkpoint from {download_url} to {download_target}")
+        gdown.download(url=download_url, output=download_target, fuzzy=True)
+
+    # Deal with isaacgym needs to be imported before torch
+    if "env" in cfg and "env_type" in cfg.env and cfg.env.env_type == "furniture":
+        import furniture_bench
+
+    # run agent
+    cls = hydra.utils.get_class(cfg._target_)
+    agent = cls(cfg)
+    agent.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/util/__init__.py b/util/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/util/reward_scaling.py b/util/reward_scaling.py
new file mode 100644
index 0000000..f721b89
--- /dev/null
+++ b/util/reward_scaling.py
@@ -0,0 +1,87 @@
+"""
+To balance actor and critic losses, the rewards are divided through by the standard deviation of a rolling discounted sum of the rewards (without subtracting and re-adding the mean).
+
+Code is based on: https://github.com/openai/phasic-policy-gradient/blob/master/phasic_policy_gradient/reward_normalizer.py
+
+Reference: https://arxiv.org/pdf/2005.12729.pdf
+
+"""
+
+import numpy as np
+
+
+class RunningMeanStd:
+    def __init__(
+        self,
+        epsilon=1e-4,  # initial count (with mean=0 ,var=1)
+        shape=(),  # unbatched shape of data, shape[0] is the batch size
+    ):
+        super().__init__()
+        self.mean = np.zeros(shape)
+        self.var = np.ones(shape)
+        self.count = epsilon
+
+    def update(self, x):
+        batch_mean = np.mean(x, axis=0)
+        batch_var = np.var(x, axis=0)
+        batch_count = x.shape[0]
+        self.update_from_moments(batch_mean, batch_var, batch_count)
+
+    def update_from_moments(self, batch_mean, batch_var, batch_count):
+        delta = batch_mean - self.mean
+        tot_count = self.count + batch_count
+
+        self.mean = self.mean + delta * batch_count / tot_count
+        m_a = self.var * self.count
+        m_b = batch_var * batch_count
+        M2 = m_a + m_b + delta**2 * self.count * batch_count / tot_count
+        self.var = M2 / (tot_count - 1)
+        self.count = tot_count
+
+
+class RunningRewardScaler:
+    """
+    Pseudocode can be found in https://arxiv.org/pdf/1811.02553.pdf
+    section 9.3 (which is based on our Baselines code, haha)
+    Motivation is that we'd rather normalize the returns = sum of future rewards,
+    but we haven't seen the future yet. So we assume that the time-reversed rewards
+    have similar statistics to the rewards, and normalize the time-reversed rewards.
+    """
+
+    def __init__(self, num_envs, cliprew=10.0, gamma=0.99, epsilon=1e-8, per_env=False):
+        ret_rms_shape = (num_envs,) if per_env else ()
+        self.ret_rms = RunningMeanStd(shape=ret_rms_shape)
+        self.cliprew = cliprew
+        self.ret = np.zeros(num_envs)
+        self.gamma = gamma
+        self.epsilon = epsilon
+        self.per_env = per_env
+
+    def __call__(self, reward, first):
+        rets = backward_discounted_sum(
+            prevret=self.ret, reward=reward, first=first, gamma=self.gamma
+        )
+        self.ret = rets[:, -1]
+        self.ret_rms.update(rets if self.per_env else rets.reshape(-1))
+        return self.transform(reward)
+
+    def transform(self, reward):
+        return np.clip(
+            reward / np.sqrt(self.ret_rms.var + self.epsilon),
+            -self.cliprew,
+            self.cliprew,
+        )
+
+
+def backward_discounted_sum(
+    prevret,  # value predictions
+    reward,  # reward
+    first,  # mark beginning of episodes"
+    gamma,  # discount
+):
+    assert first.ndim == 2
+    _, nstep = reward.shape
+    ret = np.zeros_like(reward)
+    for t in range(nstep):
+        prevret = ret[:, t] = reward[:, t] + (1 - first[:, t]) * gamma * prevret
+    return ret
diff --git a/util/scheduler.py b/util/scheduler.py
new file mode 100644
index 0000000..73c1684
--- /dev/null
+++ b/util/scheduler.py
@@ -0,0 +1,147 @@
+"""
+MIT License
+
+Copyright (c) 2022 Naoki Katsura
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+# From https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup
+
+import math
+import torch
+from torch.optim.lr_scheduler import _LRScheduler
+
+
+class CosineAnnealingWarmupRestarts(_LRScheduler):
+    """
+    optimizer (Optimizer): Wrapped optimizer.
+    first_cycle_steps (int): First cycle step size.
+    cycle_mult(float): Cycle steps magnification. Default: -1.
+    max_lr(float): First cycle's max learning rate. Default: 0.1.
+    min_lr(float): Min learning rate. Default: 0.001.
+    warmup_steps(int): Linear warmup step size. Default: 0.
+    gamma(float): Decrease rate of max learning rate by cycle. Default: 1.
+    last_epoch (int): The index of last epoch. Default: -1.
+    """
+
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+        first_cycle_steps: int,
+        cycle_mult: float = 1.0,
+        max_lr: float = 0.1,
+        min_lr: float = 0.001,
+        warmup_steps: int = 0,
+        gamma: float = 1.0,
+        last_epoch: int = -1,
+    ):
+        assert warmup_steps < first_cycle_steps
+
+        self.first_cycle_steps = first_cycle_steps  # first cycle step size
+        self.cycle_mult = cycle_mult  # cycle steps magnification
+        self.base_max_lr = max_lr  # first max learning rate
+        self.max_lr = max_lr  # max learning rate in the current cycle
+        self.min_lr = min_lr  # min learning rate
+        self.warmup_steps = warmup_steps  # warmup step size
+        self.gamma = gamma  # decrease rate of max learning rate by cycle
+
+        self.cur_cycle_steps = first_cycle_steps  # first cycle step size
+        self.cycle = 0  # cycle count
+        self.step_in_cycle = last_epoch  # step size of the current cycle
+
+        super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch)
+
+        # set learning rate min_lr
+        self.init_lr()
+
+    def init_lr(self):
+        self.base_lrs = []
+        for param_group in self.optimizer.param_groups:
+            param_group["lr"] = self.min_lr
+            self.base_lrs.append(self.min_lr)
+
+    def get_lr(self):
+        if self.step_in_cycle == -1:
+            return self.base_lrs
+        elif self.step_in_cycle < self.warmup_steps:
+            return [
+                (self.max_lr - base_lr) * self.step_in_cycle / self.warmup_steps
+                + base_lr
+                for base_lr in self.base_lrs
+            ]
+        else:
+            return [
+                base_lr
+                + (self.max_lr - base_lr)
+                * (
+                    1
+                    + math.cos(
+                        math.pi
+                        * (self.step_in_cycle - self.warmup_steps)
+                        / (self.cur_cycle_steps - self.warmup_steps)
+                    )
+                )
+                / 2
+                for base_lr in self.base_lrs
+            ]
+
+    def step(self, epoch=None):
+        if epoch is None:
+            epoch = self.last_epoch + 1
+            self.step_in_cycle = self.step_in_cycle + 1
+            if self.step_in_cycle >= self.cur_cycle_steps:
+                self.cycle += 1
+                self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps
+                self.cur_cycle_steps = (
+                    int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult)
+                    + self.warmup_steps
+                )
+        else:
+            if epoch >= self.first_cycle_steps:
+                if self.cycle_mult == 1.0:
+                    self.step_in_cycle = epoch % self.first_cycle_steps
+                    self.cycle = epoch // self.first_cycle_steps
+                else:
+                    n = int(
+                        math.log(
+                            (
+                                epoch / self.first_cycle_steps * (self.cycle_mult - 1)
+                                + 1
+                            ),
+                            self.cycle_mult,
+                        )
+                    )
+                    self.cycle = n
+                    self.step_in_cycle = epoch - int(
+                        self.first_cycle_steps
+                        * (self.cycle_mult**n - 1)
+                        / (self.cycle_mult - 1)
+                    )
+                    self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (
+                        n
+                    )
+            else:
+                self.cur_cycle_steps = self.first_cycle_steps
+                self.step_in_cycle = epoch
+
+        self.max_lr = self.base_max_lr * (self.gamma**self.cycle)
+        self.last_epoch = math.floor(epoch)
+        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
+            param_group["lr"] = lr
diff --git a/util/timer.py b/util/timer.py
new file mode 100644
index 0000000..4a3ca35
--- /dev/null
+++ b/util/timer.py
@@ -0,0 +1,19 @@
+"""
+Simple timer from https://github.com/jannerm/diffuser/blob/main/diffuser/utils/timer.py
+
+"""
+
+import time
+
+
+class Timer:
+
+    def __init__(self):
+        self._start = time.time()
+
+    def __call__(self, reset=True):
+        now = time.time()
+        diff = now - self._start
+        if reset:
+            self._start = now
+        return diff