allenzren 2024-09-03 21:03:27 -04:00
commit 8293b0936b
282 changed files with 34664 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,146 @@
# Custom
result/
wandb/
.vscode/
data/
log/
logs/
checkpoints/
.github/
out/
err/
*.pkl
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
# env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
cfg/robomimic/*.sh
*.out

LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Intelligent Robot Motion Lab
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md Normal file

@@ -0,0 +1,189 @@
# Diffusion Policy Policy Optimization (DPPO)
[[Paper](https://arxiv.org/abs/2409.00588)]  [[Website](https://diffusion-ppo.github.io/)]
[Allen Z. Ren](https://allenzren.github.io/)<sup>1</sup>, [Justin Lidard](https://jlidard.github.io/)<sup>1</sup>, [Lars L. Ankile](https://ankile.com/)<sup>2,3</sup>, [Anthony Simeonov](https://anthonysimeonov.github.io/)<sup>3</sup><br>
[Pulkit Agrawal](https://people.csail.mit.edu/pulkitag/)<sup>3</sup>, [Anirudha Majumdar](https://mae.princeton.edu/people/faculty/majumdar)<sup>1</sup>, [Benjamin Burchfiel](http://www.benburchfiel.com/)<sup>4</sup>, [Hongkai Dai](https://hongkai-dai.github.io/)<sup>4</sup>, [Max Simchowitz](https://msimchowitz.github.io/)<sup>3,5</sup>
<sup>1</sup>Princeton University, <sup>2</sup>Harvard University, <sup>3</sup>Massachusetts Institute of Technology<br>
<sup>4</sup>Toyota Research Institute, <sup>5</sup>Carnegie Mellon University
<img src="https://github.com/diffusion-ppo/diffusion-ppo.github.io/blob/main/img/overview-full.png" alt="drawing" width="100%"/>
> DPPO is an algorithmic framework and set of best practices for fine-tuning diffusion-based policies in continuous control and robot learning tasks.
<!-- ## Release
* [08/30/24] [DPPO](https://diffusion-ppo.github.io/) codebase and technical whitepaper are released. -->
## Installation
1. Clone the repository
```console
git clone git@github.com:allenzren/dppo.git
cd dppo
```
2. Install core dependencies in a conda environment on a Linux machine with an Nvidia GPU (if you do not plan to use Furniture-Bench, a higher Python version such as 3.10 can be used instead).
```console
conda create -n dppo python=3.8 -y
conda activate dppo
pip install -e .
```
3. Install specific environment dependencies (Gym / Robomimic / D3IL / Furniture-Bench) or all dependencies
```console
pip install -e .[gym] # or [robomimic], [d3il], [furniture]
pip install -e .[all]
```
<!-- **Note**: Please do not set macros for robomimic and robosuite that the warnings suggest --- we will use some different global variables than the ones defined in macro.py -->
4. [Install MuJoCo for Gym and/or Robomimic](installation/install_mujoco.md). [Install D3IL](installation/install_d3il.md). [Install IsaacGym and Furniture-Bench](installation/install_furniture.md)
5. Set environment variables for the data and logging directories (defaults are `data/` and `log/`), and set the WandB entity (username or team name)
```
source script/set_path.sh
```
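
If you prefer to set the variables manually instead of sourcing the script, these are the variables the rest of this README refers to (shown here with placeholder values):
```console
export DPPO_DATA_DIR=/path/to/data
export DPPO_LOG_DIR=/path/to/log
export DPPO_WANDB_ENTITY=<your_wandb_entity>
```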
## Usage - Pre-training
**Note**: You may skip pre-training if you would like to use the default checkpoint (available for download) for fine-tuning.
<!-- ### Prepare pre-training data
First create a directory as the parent directory of the pre-training data and set the environment variable for it.
```console
export DPPO_DATA_DIR=/path/to/data -->
<!-- ``` -->
Pre-training data for all tasks are pre-processed and can be found [here](https://drive.google.com/drive/folders/1AXZvNQEKOrp0_jk1VLepKh_oHCg_9e3r?usp=drive_link). The pre-training script will automatically download the data (including normalization statistics) to the data directory.
<!-- The data path follows `${DPPO_DATA_DIR}/<benchmark>/<task>/train.npz`, e.g., `${DPPO_DATA_DIR}/gym/hopper-medium-v2/train.pkl`. -->
### Run pre-training with data
All the configs can be found under `cfg/<env>/pretrain/`. A new WandB project may be created based on `wandb.project` in the config file; set `wandb=null` in the command line to test without WandB logging.
<!-- To run pre-training, first set your WandB entity (username or team name) and the parent directory for logging as environment variables. -->
<!-- ```console
export DPPO_WANDB_ENTITY=<your_wandb_entity>
export DPPO_LOG_DIR=<your_prefered_logging_directory>
``` -->
```console
# Gym - hopper/walker2d/halfcheetah
python script/train.py --config-name=pre_diffusion_mlp \
--config-dir=cfg/gym/pretrain/hopper-medium-v2
# Robomimic - lift/can/square/transport
python script/train.py --config-name=pre_diffusion_mlp \
--config-dir=cfg/robomimic/pretrain/can
# D3IL - avoid_m1/m2/m3
python script/train.py --config-name=pre_diffusion_mlp \
--config-dir=cfg/d3il/pretrain/avoid_m1
# Furniture-Bench - one_leg/lamp/round_table_low/med
python script/train.py --config-name=pre_diffusion_mlp \
--config-dir=cfg/furniture/pretrain/one_leg_low
```
See [here](cfg/pretraining.md) for details of the experiments in the paper.
## Usage - Fine-tuning
<!-- ### Set up pre-trained policy -->
<!-- If you did not set the environment variables for pre-training, we need to set them here for fine-tuning.
```console
export DPPO_WANDB_ENTITY=<your_wandb_entity>
export DPPO_LOG_DIR=<your_prefered_logging_directory>
``` -->
<!-- First create a directory as the parent directory of the downloaded checkpoints and set the environment variable for it.
```console
export DPPO_LOG_DIR=/path/to/checkpoint
``` -->
Pre-trained policies used in the paper can be found [here](https://drive.google.com/drive/folders/1ZlFqmhxC4S8Xh1pzZ-fXYzS5-P8sfpiP?usp=drive_link). The fine-tuning script will automatically download the default checkpoint to the logging directory.
<!-- or you may manually download other ones (different epochs) or use your own pre-trained policy if you like. -->
<!-- e.g., `${DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-08-26_22-31-03_42/checkpoint/state_0.pt`. -->
<!-- The checkpoint path follows `${DPPO_LOG_DIR}/<benchmark>/<task>/.../<run>/checkpoint/state_<epoch>.pt`. -->
### Fine-tuning pre-trained policy
All the configs can be found under `cfg/<env>/finetune/`. A new WandB project may be created based on `wandb.project` in the config file; set `wandb=null` in the command line to test without WandB logging.
<!-- Running them will download the default pre-trained policy. -->
<!-- Running the script will download the default pre-trained policy checkpoint specified in the config (`base_policy_path`) automatically, as well as the normalization statistics, to `DPPO_LOG_DIR`. -->
```console
# Gym - hopper/walker2d/halfcheetah
python script/train.py --config-name=ft_ppo_diffusion_mlp \
--config-dir=cfg/gym/finetune/hopper-v2
# Robomimic - lift/can/square/transport
python script/train.py --config-name=ft_ppo_diffusion_mlp \
--config-dir=cfg/robomimic/finetune/can
# D3IL - avoid_m1/m2/m3
python script/train.py --config-name=ft_ppo_diffusion_mlp \
--config-dir=cfg/d3il/finetune/avoid_m1
# Furniture-Bench - one_leg/lamp/round_table_low/med
python script/train.py --config-name=ft_ppo_diffusion_mlp \
--config-dir=cfg/furniture/finetune/one_leg_low
```
**Note**: In Gym, Robomimic, and D3IL tasks, we run 40, 50, and 50 parallelized MuJoCo environments on CPU, respectively. If you would like to use fewer environments (given limited CPU threads, or GPU memory for rendering), you can, e.g., halve `env.n_envs` and double `train.n_steps`, so that the total number of environment steps collected in each iteration stays the same; see the example below. Furniture-Bench tasks run IsaacGym on a single GPU.
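For instance, running the Gym hopper task with 20 instead of the default 40 environments could look like the following (the doubled `train.n_steps` value is a placeholder; the default lives in the task config):
```console
python script/train.py --config-name=ft_ppo_diffusion_mlp \
    --config-dir=cfg/gym/finetune/hopper-v2 \
    env.n_envs=20 train.n_steps=<2x_default_n_steps>
```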
To fine-tune your own pre-trained policy instead, point `base_policy_path` to your own checkpoint, which is saved under `checkpoint/` of the pre-training directory. You can set `base_policy_path=<path>` in the command line when launching fine-tuning.
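As an illustration, using a checkpoint from your own pre-training run (the path pattern follows `<run>/checkpoint/state_<epoch>.pt`; substitute your actual run directory and epoch):
```console
python script/train.py --config-name=ft_ppo_diffusion_mlp \
    --config-dir=cfg/gym/finetune/hopper-v2 \
    base_policy_path=<your_pretraining_run>/checkpoint/state_<epoch>.pt
```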
<!-- **Note**: If you did not download the pre-training [data](https://drive.google.com/drive/folders/1AXZvNQEKOrp0_jk1VLepKh_oHCg_9e3r?usp=drive_link), you need to download the normalization statistics from it for fine-tuning, e.g., `${DPPO_DATA_DIR}/furniture/round_table_low/normalization.pkl`. -->
See [here](cfg/finetuning.md) for details of the experiments in the paper.
### Visualization
* Furniture-Bench tasks can be visualized in the GUI by specifying `env.specific.headless=False` and `env.n_envs=1` in fine-tuning configs.
* D3IL environments can be visualized in the GUI with `+env.render=True`, `env.n_envs=1`, and `train.render.num=1`. There is a basic script at `script/test_d3il_render.py`.
* Videos of trials in Robomimic tasks can be recorded by specifying `env.save_video=True`, `train.render.freq=<iterations>`, and `train.render.num=<num_video>` in fine-tuning configs; see the example command after this list.
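For example, recording a couple of videos every 10 iterations for the Robomimic `can` task might look like this (the frequency and count values are illustrative):
```console
python script/train.py --config-name=ft_ppo_diffusion_mlp \
    --config-dir=cfg/robomimic/finetune/can \
    env.save_video=True train.render.freq=10 train.render.num=2
```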
## DPPO implementation
Our diffusion implementation is mostly based on [Diffuser](https://github.com/jannerm/diffuser) and can be found at [`model/diffusion/diffusion.py`](model/diffusion/diffusion.py) and [`model/diffusion/diffusion_vpg.py`](model/diffusion/diffusion_vpg.py). PPO specifics are implemented at [`model/diffusion/diffusion_ppo.py`](model/diffusion/diffusion_ppo.py). The main training script is [`agent/finetune/train_ppo_diffusion_agent.py`](agent/finetune/train_ppo_diffusion_agent.py), which follows [CleanRL](https://github.com/vwxyzjn/cleanrl).
### Key configurations
* `denoising_steps`: number of denoising steps (should always be the same for pre-training and fine-tuning regardless of the fine-tuning scheme)
* `ft_denoising_steps`: number of fine-tuned denoising steps
* `horizon_steps`: predicted action chunk size (with MLP it should match `act_steps`, the executed action chunk size; with UNet they can differ, e.g., `horizon_steps=16` and `act_steps=8`)
* `model.gamma_denoising`: denoising discount factor
* `model.min_sampling_denoising_std`: <img src="https://latex.codecogs.com/gif.latex?\epsilon^\text{exp}_\text{min} "/>, minimum amount of noise when sampling at a denoising step
* `model.min_logprob_denoising_std`: <img src="https://latex.codecogs.com/gif.latex?\epsilon^\text{prob}_\text{min} "/>, minimum standard deviation when evaluating likelihood at a denoising step
* `model.clip_ploss_coef`: PPO clipping ratio
### DDIM fine-tuning
To use DDIM fine-tuning, set `denoising_steps=100` in pre-training, and for fine-tuning set `model.use_ddim=True`, `model.ddim_steps` to the desired total number of DDIM steps, and `ft_denoising_steps` to the desired number of fine-tuned DDIM steps. In our Furniture-Bench experiments we use `denoising_steps=100`, `model.ddim_steps=5`, and `ft_denoising_steps=5`.
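For the Furniture-Bench example above, these settings could be passed as command-line overrides when launching fine-tuning (they can equally be set directly in the config file; key names follow the list above):
```console
python script/train.py --config-name=ft_ppo_diffusion_mlp \
    --config-dir=cfg/furniture/finetune/one_leg_low \
    model.use_ddim=True model.ddim_steps=5 ft_denoising_steps=5
```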
## Adding your own dataset/environment
### Pre-training data
The pre-training script is at [`agent/pretrain/train_diffusion_agent.py`](agent/pretrain/train_diffusion_agent.py). The pre-training dataset [loader](agent/dataset/sequence.py) assumes a pickle file containing a dictionary of `observations`, `actions`, and `traj_length`, where `observations` and `actions` have shape `num_episode x max_episode_length x obs_dim/act_dim` and `traj_length` is a 1-D array of episode lengths. A pre-processing example can be found at [`script/process_robomimic_dataset.py`](script/process_robomimic_dataset.py).
**Note:** The current implementation does not support loading history observations (only using observation at the current timestep). If needed, you can modify [here](agent/dataset/sequence.py#L130-L131).
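A minimal sketch of producing a dataset in this layout (the dimensions, values, and output filename are placeholders; see [`script/process_robomimic_dataset.py`](script/process_robomimic_dataset.py) for a real pre-processing example):
```python
import pickle
import numpy as np

num_episode, max_episode_length = 10, 200
obs_dim, act_dim = 11, 3  # purely illustrative dimensions

# Episodes are padded to max_episode_length; traj_length records each valid length.
observations = np.zeros((num_episode, max_episode_length, obs_dim), dtype=np.float32)
actions = np.zeros((num_episode, max_episode_length, act_dim), dtype=np.float32)
traj_length = np.random.randint(50, max_episode_length + 1, size=num_episode)

for i, T in enumerate(traj_length):
    observations[i, :T] = np.random.randn(T, obs_dim)
    actions[i, :T] = np.random.randn(T, act_dim)

with open("train.pkl", "wb") as f:
    pickle.dump(
        {"observations": observations, "actions": actions, "traj_length": traj_length}, f
    )
```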
### Fine-tuning environment
We follow the Gym format for interacting with the environments. The vectorized environments are initialized at [make_async](env/gym_utils/__init__.py#L10) (called in the parent fine-tuning agent class [here](agent/finetune/train_agent.py#L38-L39)). The current implementation is not the cleanest as we tried to make it compatible with Gym, Robomimic, Furniture-Bench, and D3IL environments, but it should be easy to modify to allow using other environments. We use the [multi_step](env/gym_utils/wrapper/multi_step.py) wrapper for history observations (not currently used) and for executing an action chunk over multiple environment steps. We also use environment-specific wrappers such as [robomimic_lowdim](env/gym_utils/wrapper/robomimic_lowdim.py) and [furniture](env/gym_utils/wrapper/furniture.py) for observation/action normalization, etc. You can implement a new environment wrapper if needed; a generic sketch follows.
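As a generic illustration only (not one of the wrappers in this repository), a Gym-style observation wrapper that applies pre-computed normalization statistics could look like:
```python
import gym
import numpy as np

class NormalizeObsWrapper(gym.ObservationWrapper):
    """Hypothetical wrapper: normalize observations with pre-computed statistics."""

    def __init__(self, env, obs_mean, obs_std):
        super().__init__(env)
        self.obs_mean = np.asarray(obs_mean, dtype=np.float32)
        self.obs_std = np.asarray(obs_std, dtype=np.float32)

    def observation(self, obs):
        # Applied to every observation returned by reset() and step()
        return (obs - self.obs_mean) / (self.obs_std + 1e-6)
```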
## Known issues
* IsaacGym simulation can become unstable at times and lead to NaN observations in Furniture-Bench. The current env wrapper does not handle NaN observations.
## License
This repository is released under the MIT license. See [LICENSE](LICENSE).
## Acknowledgement
* [Diffuser, Janner et al.](https://github.com/jannerm/diffuser): general code base and DDPM implementation
* [Diffusion Policy, Chi et al.](https://github.com/real-stanford/diffusion_policy): general code base especially the env wrappers
* [CleanRL, Huang et al.](https://github.com/vwxyzjn/cleanrl): PPO implementation
* [IBRL, Hu et al.](https://github.com/hengyuan-hu/ibrl): ViT implementation
* [D3IL, Jia et al.](https://github.com/ALRhub/d3il): D3IL benchmark
* [Robomimic, Mandlekar et al.](https://github.com/ARISE-Initiative/robomimic): Robomimic benchmark
* [Furniture-Bench, Heo et al.](https://github.com/clvrai/furniture-bench): Furniture-Bench benchmark
* [AWR, Peng et al.](https://github.com/xbpeng/awr): DAWR baseline (modified from AWR)
* [DIPO, Yang et al.](https://github.com/BellmanTimeHut/DIPO): DIPO baseline
* [IDQL, Hansen-Estruch et al.](https://github.com/philippe-eecs/IDQL): IDQL baseline
* [DQL, Wang et al.](https://github.com/Zhendong-Wang/Diffusion-Policies-for-Offline-RL): DQL baseline
* [QSM, Psenka et al.](https://www.michaelpsenka.io/qsm/): QSM baseline
* [Score SDE, Song et al.](https://github.com/yang-song/score_sde_pytorch/): diffusion exact likelihood

agent/dataset/buffer.py Normal file

@@ -0,0 +1,110 @@
"""
Pre-training data loader. Modified from https://github.com/jannerm/diffuser/blob/main/diffuser/datasets/buffer.py
"""
import numpy as np
import torch
def atleast_2d(x):
if isinstance(x, torch.Tensor):
while x.dim() < 2:
x = x.unsqueeze(-1)
return x
else:
while x.ndim < 2:
x = np.expand_dims(x, axis=-1)
return x
class StitchedBuffer:
def __init__(
self,
sum_of_path_lengths,
device="cpu",
):
self.sum_of_path_lengths = sum_of_path_lengths
if device == "cpu":
self._dict = {
"path_lengths": np.zeros(sum_of_path_lengths, dtype=int),
}
else:
self._dict = {
"path_lengths": torch.zeros(sum_of_path_lengths, dtype=int).to(device),
}
self._count = 0
self.sum_of_path_lengths = sum_of_path_lengths
self.device = device
def __repr__(self):
return "Fields:\n" + "\n".join(
f" {key}: {val.shape}" for key, val in self.items()
)
def __getitem__(self, key):
return self._dict[key]
def __setitem__(self, key, val):
self._dict[key] = val
self._add_attributes()
@property
def n_episodes(self):
return self._count
@property
def n_steps(self):
return sum(self["path_lengths"])
def _add_keys(self, path):
if hasattr(self, "keys"):
return
self.keys = list(path.keys())
def _add_attributes(self):
"""
can access fields with `buffer.observations`
instead of `buffer['observations']`
"""
for key, val in self._dict.items():
setattr(self, key, val)
def items(self):
return {k: v for k, v in self._dict.items() if k != "path_lengths"}.items()
def _allocate(self, key, array):
assert key not in self._dict
dim = array.shape[1:] # skip batch dimension
shape = (self.sum_of_path_lengths, *dim)
if self.device == "cpu":
self._dict[key] = np.zeros(shape, dtype=np.float32)
else:
self._dict[key] = torch.zeros(shape, dtype=torch.float32).to(self.device)
# print(f'[ utils/mujoco ] Allocated {key} with size {shape}')
def add_path(self, path):
path_length = len(path["observations"])
# assert path_length <= self.sum_of_path_lengths
## if first path added, set keys based on contents
self._add_keys(path)
## add tracked keys in path
for key in self.keys:
array = atleast_2d(path[key])
if key not in self._dict:
self._allocate(key, array)
self._dict[key][self._count : self._count + path_length] = array
## record path length
self._dict["path_lengths"][
self._count : self._count + path_length
] = path_length
## increment path counter
self._count += path_length
def finalize(self):
self._add_attributes()
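
A hypothetical usage sketch of `StitchedBuffer` (not part of the commit; the array sizes are made up):
```python
import numpy as np

# Allocate a buffer sized to the total number of steps across all paths.
buffer = StitchedBuffer(sum_of_path_lengths=200)
path = {
    "observations": np.random.randn(100, 11).astype(np.float32),
    "actions": np.random.randn(100, 3).astype(np.float32),
}
buffer.add_path(path)  # keys are registered and arrays allocated on the first call
buffer.finalize()      # exposes fields as attributes, e.g. buffer.observations
print(buffer.observations.shape)  # (200, 11)
```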

@@ -0,0 +1,18 @@
import os
FRAMEWORK_DIR = os.path.dirname(__file__)
def sim_framework_path(*args) -> str:
"""
Abstraction from os.path.join()
Builds absolute paths from relative path strings with SIM_FRAMEWORK/ as root.
If args already contains an absolute path, it is used as root for the subsequent joins
Args:
*args:
Returns:
absolute path
"""
return os.path.abspath(os.path.join(FRAMEWORK_DIR, *args))

@@ -0,0 +1,342 @@
import random
from typing import Optional, Callable, Any
import logging
import os
import glob
try:
import cv2 # not included in pyproject.toml
except:
print("Installing cv2")
os.system("pip install opencv-python")
import torch
import pickle
import numpy as np
from tqdm import tqdm
from agent.dataset.d3il_dataset.base_dataset import TrajectoryDataset
from agent.dataset.d3il_dataset import sim_framework_path
class Aligning_Dataset(TrajectoryDataset):
def __init__(
self,
data_directory: os.PathLike,
device="cpu",
obs_dim: int = 20,
action_dim: int = 2,
max_len_data: int = 256,
window_size: int = 1,
):
super().__init__(
data_directory=data_directory,
device=device,
obs_dim=obs_dim,
action_dim=action_dim,
max_len_data=max_len_data,
window_size=window_size,
)
logging.info("Loading Robot Push Dataset")
inputs = []
actions = []
masks = []
# data_dir = sim_framework_path(data_directory)
# state_files = glob.glob(data_dir + "/env*")
rp_data_dir = sim_framework_path("data/aligning/all_data/state")
state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
for file in state_files:
with open(os.path.join(rp_data_dir, file), "rb") as f:
env_state = pickle.load(f)
# lengths.append(len(env_state['robot']['des_c_pos']))
zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
zero_action = np.zeros(
(1, self.max_len_data, self.action_dim), dtype=np.float32
)
zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
# robot and box positions
robot_des_pos = env_state["robot"]["des_c_pos"]
robot_c_pos = env_state["robot"]["c_pos"]
push_box_pos = env_state["push-box"]["pos"]
push_box_quat = env_state["push-box"]["quat"]
target_box_pos = env_state["target-box"]["pos"]
target_box_quat = env_state["target-box"]["quat"]
# target_box_pos = np.zeros(push_box_pos.shape)
# target_box_quat = np.zeros(push_box_quat.shape)
# target_box_pos[:] = push_box_pos[-1:]
# target_box_quat[:] = push_box_quat[-1:]
input_state = np.concatenate(
(
robot_des_pos,
robot_c_pos,
push_box_pos,
push_box_quat,
target_box_pos,
target_box_quat,
),
axis=-1,
)
vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
valid_len = len(input_state) - 1
zero_obs[0, :valid_len, :] = input_state[:-1]
zero_action[0, :valid_len, :] = vel_state
zero_mask[0, :valid_len] = 1
inputs.append(zero_obs)
actions.append(zero_action)
masks.append(zero_mask)
# shape: B, T, n
self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
self.num_data = len(self.observations)
self.slices = self.get_slices()
def get_slices(self):
slices = []
min_seq_length = np.inf
for i in range(self.num_data):
T = self.get_seq_length(i)
min_seq_length = min(T, min_seq_length)
if T - self.window_size < 0:
print(
f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
)
else:
slices += [
(i, start, start + self.window_size)
for start in range(T - self.window_size + 1)
] # slice indices follow convention [start, end)
return slices
def get_seq_length(self, idx):
return int(self.masks[idx].sum().item())
def get_all_actions(self):
result = []
# mask out invalid actions
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.actions[i, :T, :])
return torch.cat(result, dim=0)
def get_all_observations(self):
result = []
# mask out invalid observations
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.observations[i, :T, :])
return torch.cat(result, dim=0)
def __len__(self):
return len(self.slices)
def __getitem__(self, idx):
i, start, end = self.slices[idx]
obs = self.observations[i, start:end]
act = self.actions[i, start:end]
mask = self.masks[i, start:end]
return obs, act, mask
class Aligning_Img_Dataset(TrajectoryDataset):
def __init__(
self,
data_directory: os.PathLike,
device="cpu",
obs_dim: int = 20,
action_dim: int = 2,
max_len_data: int = 256,
window_size: int = 1,
):
super().__init__(
data_directory=data_directory,
device=device,
obs_dim=obs_dim,
action_dim=action_dim,
max_len_data=max_len_data,
window_size=window_size,
)
logging.info("Loading Robot Push Dataset")
inputs = []
actions = []
masks = []
data_dir = sim_framework_path("environments/dataset/data/aligning/all_data")
state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
bp_cam_imgs = []
inhand_cam_imgs = []
for file in tqdm(state_files[:3]):
with open(os.path.join(data_dir, "state", file), "rb") as f:
env_state = pickle.load(f)
# lengths.append(len(env_state['robot']['des_c_pos']))
zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
zero_action = np.zeros(
(1, self.max_len_data, self.action_dim), dtype=np.float32
)
zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
# robot and box positions
robot_des_pos = env_state["robot"]["des_c_pos"]
robot_c_pos = env_state["robot"]["c_pos"]
file_name = os.path.basename(file).split(".")[0]
###############################################################
bp_images = []
bp_imgs = glob.glob(data_dir + "/images/bp-cam/" + file_name + "/*")
bp_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
for img in bp_imgs:
image = cv2.imread(img).astype(np.float32)
image = image.transpose((2, 0, 1)) / 255.0
image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
bp_images.append(image)
bp_images = torch.concatenate(bp_images, dim=0)
################################################################
inhand_imgs = glob.glob(data_dir + "/images/inhand-cam/" + file_name + "/*")
inhand_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
inhand_images = []
for img in inhand_imgs:
image = cv2.imread(img).astype(np.float32)
image = image.transpose((2, 0, 1)) / 255.0
image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
inhand_images.append(image)
inhand_images = torch.concatenate(inhand_images, dim=0)
##################################################################
# push_box_pos = env_state['push-box']['pos']
# push_box_quat = env_state['push-box']['quat']
#
# target_box_pos = env_state['target-box']['pos']
# target_box_quat = env_state['target-box']['quat']
# target_box_pos = np.zeros(push_box_pos.shape)
# target_box_quat = np.zeros(push_box_quat.shape)
# target_box_pos[:] = push_box_pos[-1:]
# target_box_quat[:] = push_box_quat[-1:]
# input_state = np.concatenate((robot_des_pos), axis=-1)
vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
valid_len = len(vel_state)
zero_obs[0, :valid_len, :] = robot_des_pos[:-1]
zero_action[0, :valid_len, :] = vel_state
zero_mask[0, :valid_len] = 1
bp_cam_imgs.append(bp_images)
inhand_cam_imgs.append(inhand_images)
inputs.append(zero_obs)
actions.append(zero_action)
masks.append(zero_mask)
self.bp_cam_imgs = bp_cam_imgs
self.inhand_cam_imgs = inhand_cam_imgs
# shape: B, T, n
self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
self.num_data = len(self.observations)
self.slices = self.get_slices()
def get_slices(self):
slices = []
min_seq_length = np.inf
for i in range(self.num_data):
T = self.get_seq_length(i)
min_seq_length = min(T, min_seq_length)
if T - self.window_size < 0:
print(
f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
)
else:
slices += [
(i, start, start + self.window_size)
for start in range(T - self.window_size + 1)
] # slice indices follow convention [start, end)
return slices
def get_seq_length(self, idx):
return int(self.masks[idx].sum().item())
def get_all_actions(self):
result = []
# mask out invalid actions
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.actions[i, :T, :])
return torch.cat(result, dim=0)
def get_all_observations(self):
result = []
# mask out invalid observations
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.observations[i, :T, :])
return torch.cat(result, dim=0)
def __len__(self):
return len(self.slices)
def __getitem__(self, idx):
i, start, end = self.slices[idx]
obs = self.observations[i, start:end]
act = self.actions[i, start:end]
mask = self.masks[i, start:end]
bp_imgs = self.bp_cam_imgs[i][start:end]
inhand_imgs = self.inhand_cam_imgs[i][start:end]
return bp_imgs, inhand_imgs, obs, act, mask

@@ -0,0 +1,126 @@
import logging
import os
import torch
import pickle
import numpy as np
from agent.dataset.d3il_dataset.base_dataset import TrajectoryDataset
class Avoiding_Dataset(TrajectoryDataset):
def __init__(
self,
data_directory: os.PathLike,
device="cpu",
obs_dim: int = 20,
action_dim: int = 2,
max_len_data: int = 256,
window_size: int = 1,
):
super().__init__(
data_directory=data_directory,
device=device,
obs_dim=obs_dim,
action_dim=action_dim,
max_len_data=max_len_data,
window_size=window_size,
)
logging.info("Loading Sorting Dataset")
inputs = []
actions = []
masks = []
data_dir = data_directory
state_files = os.listdir(data_dir)
for file in state_files:
with open(os.path.join(data_dir, file), "rb") as f:
env_state = pickle.load(f)
zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
zero_action = np.zeros(
(1, self.max_len_data, self.action_dim), dtype=np.float32
)
zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
            # robot and box position
robot_des_pos = env_state["robot"]["des_c_pos"][:, :2]
robot_c_pos = env_state["robot"]["c_pos"][:, :2]
input_state = np.concatenate((robot_des_pos, robot_c_pos), axis=-1)
vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
valid_len = len(vel_state)
zero_obs[0, :valid_len, :] = input_state[:-1]
zero_action[0, :valid_len, :] = vel_state
zero_mask[0, :valid_len] = 1
inputs.append(zero_obs)
actions.append(zero_action)
masks.append(zero_mask)
# shape: B, T, n
self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
self.num_data = len(self.observations)
self.slices = self.get_slices()
def get_slices(self):
slices = []
min_seq_length = np.inf
for i in range(self.num_data):
T = self.get_seq_length(i)
min_seq_length = min(T, min_seq_length)
if T - self.window_size < 0:
print(
f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
)
else:
slices += [
(i, start, start + self.window_size)
for start in range(T - self.window_size + 1)
] # slice indices follow convention [start, end)
return slices
def get_seq_length(self, idx):
return int(self.masks[idx].sum().item())
def get_all_actions(self):
result = []
# mask out invalid actions
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.actions[i, :T, :])
return torch.cat(result, dim=0)
def get_all_observations(self):
result = []
# mask out invalid observations
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.observations[i, :T, :])
return torch.cat(result, dim=0)
def __len__(self):
return len(self.slices)
def __getitem__(self, idx):
i, start, end = self.slices[idx]
obs = self.observations[i, start:end]
act = self.actions[i, start:end]
mask = self.masks[i, start:end]
return obs, act, mask

@@ -0,0 +1,54 @@
import abc
import os
from torch.utils.data import Dataset
class TrajectoryDataset(Dataset, abc.ABC):
"""
A dataset containing trajectories.
TrajectoryDataset[i] returns: (observations, actions, mask)
observations: Tensor[T, ...], T frames of observations
actions: Tensor[T, ...], T frames of actions
mask: Tensor[T]: 0: invalid; 1: valid
"""
def __init__(
self,
data_directory: os.PathLike,
device="cpu",
obs_dim: int = 20,
action_dim: int = 2,
max_len_data: int = 256,
window_size: int = 1,
):
self.data_directory = data_directory
self.device = device
self.max_len_data = max_len_data
self.action_dim = action_dim
self.obs_dim = obs_dim
self.window_size = window_size
@abc.abstractmethod
def get_seq_length(self, idx):
"""
Returns the length of the idx-th trajectory.
"""
raise NotImplementedError
@abc.abstractmethod
def get_all_actions(self):
"""
Returns all actions from all trajectories, concatenated on dim 0 (time).
"""
raise NotImplementedError
@abc.abstractmethod
def get_all_observations(self):
"""
        Returns all observations from all trajectories, concatenated on dim 0 (time).
"""
raise NotImplementedError

@@ -0,0 +1,350 @@
import itertools
import numpy as np
"""
From OpenAI Gym. Please see there under mujoco/Robots.
"""
# For testing whether a number is close to zero
_FLOAT_EPS = np.finfo(np.float64).eps
_EPS4 = _FLOAT_EPS * 4.0
def get_quaternion_error(curr_quat, des_quat):
"""
Calculates the difference between the current quaternion and the desired quaternion.
See Siciliano textbook page 140 Eq 3.91
:param curr_quat: current quaternion
:param des_quat: desired quaternion
:return: difference between current quaternion and desired quaternion
"""
quatError = np.zeros((3,))
quatError[0] = (
curr_quat[0] * des_quat[1]
- des_quat[0] * curr_quat[1]
- curr_quat[3] * des_quat[2]
+ curr_quat[2] * des_quat[3]
)
quatError[1] = (
curr_quat[0] * des_quat[2]
- des_quat[0] * curr_quat[2]
+ curr_quat[3] * des_quat[1]
- curr_quat[1] * des_quat[3]
)
quatError[2] = (
curr_quat[0] * des_quat[3]
- des_quat[0] * curr_quat[3]
- curr_quat[2] * des_quat[1]
+ curr_quat[1] * des_quat[2]
)
return quatError
def euler2mat(euler):
"""Convert Euler Angles to Rotation Matrix. See rotation.py for notes"""
euler = np.asarray(euler, dtype=np.float64)
assert euler.shape[-1] == 3, "Invalid shaped euler {}".format(euler)
ai, aj, ak = -euler[..., 2], -euler[..., 1], -euler[..., 0]
si, sj, sk = np.sin(ai), np.sin(aj), np.sin(ak)
ci, cj, ck = np.cos(ai), np.cos(aj), np.cos(ak)
cc, cs = ci * ck, ci * sk
sc, ss = si * ck, si * sk
mat = np.empty(euler.shape[:-1] + (3, 3), dtype=np.float64)
mat[..., 2, 2] = cj * ck
mat[..., 2, 1] = sj * sc - cs
mat[..., 2, 0] = sj * cc + ss
mat[..., 1, 2] = cj * sk
mat[..., 1, 1] = sj * ss + cc
mat[..., 1, 0] = sj * cs - sc
mat[..., 0, 2] = -sj
mat[..., 0, 1] = cj * si
mat[..., 0, 0] = cj * ci
return mat
def euler2quat(euler):
"""Convert Euler Angles to Quaternions. See rotation.py for notes"""
euler = np.asarray(euler, dtype=np.float64)
assert euler.shape[-1] == 3, "Invalid shape euler {}".format(euler)
ai, aj, ak = euler[..., 2] / 2, -euler[..., 1] / 2, euler[..., 0] / 2
si, sj, sk = np.sin(ai), np.sin(aj), np.sin(ak)
ci, cj, ck = np.cos(ai), np.cos(aj), np.cos(ak)
cc, cs = ci * ck, ci * sk
sc, ss = si * ck, si * sk
quat = np.empty(euler.shape[:-1] + (4,), dtype=np.float64)
quat[..., 0] = cj * cc + sj * ss
quat[..., 3] = cj * sc - sj * cs
quat[..., 2] = -(cj * ss + sj * cc)
quat[..., 1] = cj * cs - sj * sc
return quat
def mat2euler(mat):
"""Convert Rotation Matrix to Euler Angles. See rotation.py for notes"""
mat = np.asarray(mat, dtype=np.float64)
assert mat.shape[-2:] == (3, 3), "Invalid shape matrix {}".format(mat)
cy = np.sqrt(mat[..., 2, 2] * mat[..., 2, 2] + mat[..., 1, 2] * mat[..., 1, 2])
condition = cy > _EPS4
euler = np.empty(mat.shape[:-1], dtype=np.float64)
euler[..., 2] = np.where(
condition,
-np.arctan2(mat[..., 0, 1], mat[..., 0, 0]),
-np.arctan2(-mat[..., 1, 0], mat[..., 1, 1]),
)
euler[..., 1] = np.where(
condition, -np.arctan2(-mat[..., 0, 2], cy), -np.arctan2(-mat[..., 0, 2], cy)
)
euler[..., 0] = np.where(
condition, -np.arctan2(mat[..., 1, 2], mat[..., 2, 2]), 0.0
)
return euler
def mat2quat(mat):
"""Convert Rotation Matrix to Quaternion. See rotation.py for notes"""
mat = np.asarray(mat, dtype=np.float64)
assert mat.shape[-2:] == (3, 3), "Invalid shape matrix {}".format(mat)
Qxx, Qyx, Qzx = mat[..., 0, 0], mat[..., 0, 1], mat[..., 0, 2]
Qxy, Qyy, Qzy = mat[..., 1, 0], mat[..., 1, 1], mat[..., 1, 2]
Qxz, Qyz, Qzz = mat[..., 2, 0], mat[..., 2, 1], mat[..., 2, 2]
# Fill only lower half of symmetric matrix
K = np.zeros(mat.shape[:-2] + (4, 4), dtype=np.float64)
K[..., 0, 0] = Qxx - Qyy - Qzz
K[..., 1, 0] = Qyx + Qxy
K[..., 1, 1] = Qyy - Qxx - Qzz
K[..., 2, 0] = Qzx + Qxz
K[..., 2, 1] = Qzy + Qyz
K[..., 2, 2] = Qzz - Qxx - Qyy
K[..., 3, 0] = Qyz - Qzy
K[..., 3, 1] = Qzx - Qxz
K[..., 3, 2] = Qxy - Qyx
K[..., 3, 3] = Qxx + Qyy + Qzz
K /= 3.0
# TODO: vectorize this -- probably could be made faster
q = np.empty(K.shape[:-2] + (4,))
it = np.nditer(q[..., 0], flags=["multi_index"])
while not it.finished:
# Use Hermitian eigenvectors, values for speed
vals, vecs = np.linalg.eigh(K[it.multi_index])
# Select largest eigenvector, reorder to w,x,y,z quaternion
q[it.multi_index] = vecs[[3, 0, 1, 2], np.argmax(vals)]
# Prefer quaternion with positive w
# (q * -1 corresponds to same rotation as q)
if q[it.multi_index][0] < 0:
q[it.multi_index] *= -1
it.iternext()
return q
def quat2euler(quat):
"""Convert Quaternion to Euler Angles. See rotation.py for notes"""
return mat2euler(quat2mat(quat))
def subtract_euler(e1, e2):
assert e1.shape == e2.shape
assert e1.shape[-1] == 3
q1 = euler2quat(e1)
q2 = euler2quat(e2)
q_diff = quat_mul(q1, quat_conjugate(q2))
return quat2euler(q_diff)
def quat2mat(quat):
"""Convert Quaternion to Euler Angles. See rotation.py for notes"""
quat = np.asarray(quat, dtype=np.float64)
assert quat.shape[-1] == 4, "Invalid shape quat {}".format(quat)
w, x, y, z = quat[..., 0], quat[..., 1], quat[..., 2], quat[..., 3]
Nq = np.sum(quat * quat, axis=-1)
s = 2.0 / Nq
X, Y, Z = x * s, y * s, z * s
wX, wY, wZ = w * X, w * Y, w * Z
xX, xY, xZ = x * X, x * Y, x * Z
yY, yZ, zZ = y * Y, y * Z, z * Z
mat = np.empty(quat.shape[:-1] + (3, 3), dtype=np.float64)
mat[..., 0, 0] = 1.0 - (yY + zZ)
mat[..., 0, 1] = xY - wZ
mat[..., 0, 2] = xZ + wY
mat[..., 1, 0] = xY + wZ
mat[..., 1, 1] = 1.0 - (xX + zZ)
mat[..., 1, 2] = yZ - wX
mat[..., 2, 0] = xZ - wY
mat[..., 2, 1] = yZ + wX
mat[..., 2, 2] = 1.0 - (xX + yY)
return np.where((Nq > _FLOAT_EPS)[..., np.newaxis, np.newaxis], mat, np.eye(3))
def quat_conjugate(q):
inv_q = -q
inv_q[..., 0] *= -1
return inv_q
def quat_mul(q0, q1):
assert q0.shape == q1.shape
assert q0.shape[-1] == 4
assert q1.shape[-1] == 4
w0 = q0[..., 0]
x0 = q0[..., 1]
y0 = q0[..., 2]
z0 = q0[..., 3]
w1 = q1[..., 0]
x1 = q1[..., 1]
y1 = q1[..., 2]
z1 = q1[..., 3]
w = w0 * w1 - x0 * x1 - y0 * y1 - z0 * z1
x = w0 * x1 + x0 * w1 + y0 * z1 - z0 * y1
y = w0 * y1 + y0 * w1 + z0 * x1 - x0 * z1
z = w0 * z1 + z0 * w1 + x0 * y1 - y0 * x1
q = np.array([w, x, y, z])
if q.ndim == 2:
q = q.swapaxes(0, 1)
assert q.shape == q0.shape
return q
def quat_rot_vec(q, v0):
q_v0 = np.array([0, v0[0], v0[1], v0[2]])
q_v = quat_mul(q, quat_mul(q_v0, quat_conjugate(q)))
v = q_v[1:]
return v
def quat_identity():
return np.array([1, 0, 0, 0])
def quat2axisangle(quat):
theta = 0
axis = np.array([0, 0, 1])
sin_theta = np.linalg.norm(quat[1:])
if sin_theta > 0.0001:
theta = 2 * np.arcsin(sin_theta)
theta *= 1 if quat[0] >= 0 else -1
axis = quat[1:] / sin_theta
return axis, theta
def euler2point_euler(euler):
_euler = euler.copy()
if len(_euler.shape) < 2:
_euler = np.expand_dims(_euler, 0)
assert _euler.shape[1] == 3
_euler_sin = np.sin(_euler)
_euler_cos = np.cos(_euler)
return np.concatenate([_euler_sin, _euler_cos], axis=-1)
def point_euler2euler(euler):
_euler = euler.copy()
if len(_euler.shape) < 2:
_euler = np.expand_dims(_euler, 0)
assert _euler.shape[1] == 6
angle = np.arctan(_euler[..., :3] / _euler[..., 3:])
angle[_euler[..., 3:] < 0] += np.pi
return angle
def quat2point_quat(quat):
# Should be in qw, qx, qy, qz
_quat = quat.copy()
if len(_quat.shape) < 2:
_quat = np.expand_dims(_quat, 0)
assert _quat.shape[1] == 4
angle = np.arccos(_quat[:, [0]]) * 2
xyz = _quat[:, 1:]
xyz[np.squeeze(np.abs(np.sin(angle / 2))) >= 1e-5] = (xyz / np.sin(angle / 2))[
np.squeeze(np.abs(np.sin(angle / 2))) >= 1e-5
]
return np.concatenate([np.sin(angle), np.cos(angle), xyz], axis=-1)
def point_quat2quat(quat):
_quat = quat.copy()
if len(_quat.shape) < 2:
_quat = np.expand_dims(_quat, 0)
assert _quat.shape[1] == 5
angle = np.arctan(_quat[:, [0]] / _quat[:, [1]])
qw = np.cos(angle / 2)
qxyz = _quat[:, 2:]
qxyz[np.squeeze(np.abs(np.sin(angle / 2))) >= 1e-5] = (qxyz * np.sin(angle / 2))[
np.squeeze(np.abs(np.sin(angle / 2))) >= 1e-5
]
return np.concatenate([qw, qxyz], axis=-1)
def normalize_angles(angles):
"""Puts angles in [-pi, pi] range."""
angles = angles.copy()
if angles.size > 0:
angles = (angles + np.pi) % (2 * np.pi) - np.pi
assert -np.pi - 1e-6 <= angles.min() and angles.max() <= np.pi + 1e-6
return angles
def round_to_straight_angles(angles):
"""Returns closest angle modulo 90 degrees"""
angles = np.round(angles / (np.pi / 2)) * (np.pi / 2)
return normalize_angles(angles)
def get_parallel_rotations():
mult90 = [0, np.pi / 2, -np.pi / 2, np.pi]
parallel_rotations = []
for euler in itertools.product(mult90, repeat=3):
canonical = mat2euler(euler2mat(euler))
canonical = np.round(canonical / (np.pi / 2))
if canonical[0] == -2:
canonical[0] = 2
if canonical[2] == -2:
canonical[2] = 2
canonical *= np.pi / 2
if all([(canonical != rot).any() for rot in parallel_rotations]):
parallel_rotations += [canonical]
assert len(parallel_rotations) == 24
return parallel_rotations
def posRotMat2TFMat(pos, rot_mat):
"""Converts a position and a 3x3 rotation matrix to a 4x4 transformation matrix"""
t_mat = np.eye(4)
t_mat[:3, :3] = rot_mat
t_mat[:3, 3] = np.array(pos)
return t_mat
def mat2posQuat(mat):
"""Converts a 4x4 rotation matrix to a position and a quaternion"""
pos = mat[:3, 3]
quat = mat2quat(mat[:3, :3])
return pos, quat
def wxyz_to_xyzw(quat):
"""Converts WXYZ Quaternions to XYZW Quaternions"""
return np.roll(quat, -1)
def xyzw_to_wxyz(quat):
"""Converts XYZW Quaternions to WXYZ Quaternions"""
return np.roll(quat, 1)

@@ -0,0 +1,161 @@
import logging
import os
import torch
import pickle
import numpy as np
from agent.dataset.d3il_dataset.base_dataset import TrajectoryDataset
from agent.dataset.d3il_dataset import sim_framework_path
from .geo_transform import quat2euler
class Pushing_Dataset(TrajectoryDataset):
def __init__(
self,
data_directory: os.PathLike,
device="cpu",
obs_dim: int = 20,
action_dim: int = 2,
max_len_data: int = 256,
window_size: int = 1,
):
super().__init__(
data_directory=data_directory,
device=device,
obs_dim=obs_dim,
action_dim=action_dim,
max_len_data=max_len_data,
window_size=window_size,
)
logging.info("Loading Block Push Dataset")
inputs = []
actions = []
masks = []
# for root, dirs, files in os.walk(self.data_directory):
#
# for mode_dir in dirs:
# state_files = glob.glob(os.path.join(root, mode_dir) + "/env*")
# data_dir = os.path.join(sim_framework_path(data_directory), "local")
# data_dir = sim_framework_path(data_directory)
# state_files = glob.glob(data_dir + "/env*")
bp_data_dir = sim_framework_path("data/pushing/all_data")
state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
for file in state_files:
with open(os.path.join(bp_data_dir, file), "rb") as f:
env_state = pickle.load(f)
# lengths.append(len(env_state['robot']['des_c_pos']))
zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
zero_action = np.zeros(
(1, self.max_len_data, self.action_dim), dtype=np.float32
)
zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
# robot and box positions
robot_des_pos = env_state["robot"]["des_c_pos"][:, :2]
robot_c_pos = env_state["robot"]["c_pos"][:, :2]
red_box_pos = env_state["red-box"]["pos"][:, :2]
red_box_quat = np.tan(quat2euler(env_state["red-box"]["quat"])[:, -1:])
green_box_pos = env_state["green-box"]["pos"][:, :2]
green_box_quat = np.tan(quat2euler(env_state["green-box"]["quat"])[:, -1:])
red_target_pos = env_state["red-target"]["pos"][:, :2]
green_target_pos = env_state["green-target"]["pos"][:, :2]
input_state = np.concatenate(
(
robot_des_pos,
robot_c_pos,
red_box_pos,
red_box_quat,
green_box_pos,
green_box_quat,
),
axis=-1,
)
vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
valid_len = len(input_state) - 1
zero_obs[0, :valid_len, :] = input_state[:-1]
zero_action[0, :valid_len, :] = vel_state
zero_mask[0, :valid_len] = 1
inputs.append(zero_obs)
actions.append(zero_action)
masks.append(zero_mask)
# shape: B, T, n
self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
self.num_data = len(self.observations)
self.slices = self.get_slices()
def get_slices(self):
slices = []
min_seq_length = np.inf
for i in range(self.num_data):
T = self.get_seq_length(i)
min_seq_length = min(T, min_seq_length)
if T - self.window_size < 0:
print(
f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
)
else:
slices += [
(i, start, start + self.window_size)
for start in range(T - self.window_size + 1)
] # slice indices follow convention [start, end)
return slices
def get_seq_length(self, idx):
return int(self.masks[idx].sum().item())
def get_all_actions(self):
result = []
# mask out invalid actions
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.actions[i, :T, :])
return torch.cat(result, dim=0)
def get_all_observations(self):
result = []
# mask out invalid observations
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.observations[i, :T, :])
return torch.cat(result, dim=0)
def __len__(self):
return len(self.slices)
def __getitem__(self, idx):
i, start, end = self.slices[idx]
obs = self.observations[i, start:end]
act = self.actions[i, start:end]
mask = self.masks[i, start:end]
return obs, act, mask

@@ -0,0 +1,491 @@
import logging
import os
import glob
import torch
import pickle
import numpy as np
try:
import cv2 # not included in pyproject.toml
except:
print("Installing cv2")
os.system("pip install opencv-python")
from tqdm import tqdm
from agent.dataset.d3il_dataset.base_dataset import TrajectoryDataset
from agent.dataset.d3il_dataset import sim_framework_path
from .geo_transform import quat2euler
class Sorting_Dataset(TrajectoryDataset):
def __init__(
self,
data_directory: os.PathLike,
device="cpu",
obs_dim: int = 20,
action_dim: int = 2,
max_len_data: int = 256,
window_size: int = 1,
num_boxes: int = 2,
):
super().__init__(
data_directory=data_directory,
device=device,
obs_dim=obs_dim,
action_dim=action_dim,
max_len_data=max_len_data,
window_size=window_size,
)
logging.info("Loading Sorting Dataset")
inputs = []
actions = []
masks = []
# for root, dirs, files in os.walk(self.data_directory):
#
# for mode_dir in dirs:
# state_files = glob.glob(os.path.join(root, mode_dir) + "/env*")
# data_dir = os.path.join(sim_framework_path(data_directory), "local")
# data_dir = sim_framework_path(data_directory)
# state_files = glob.glob(data_dir + "/env*")
# random.seed(0)
# data_dir = sim_framework_path(data_directory)
# state_files = os.listdir(data_dir)
#
# random.shuffle(state_files)
#
# if data == "train":
# env_state_files = state_files[50:]
# elif data == "eval":
# env_state_files = state_files[:50]
# else:
# assert False, "wrong data type"
if num_boxes == 2:
data_dir = sim_framework_path("data/sorting/2_boxes/state")
elif num_boxes == 4:
data_dir = sim_framework_path("data/sorting/4_boxes/state")
elif num_boxes == 6:
data_dir = sim_framework_path("data/sorting/6_boxes/state")
else:
assert False, "check num boxes"
state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
for file in state_files:
with open(os.path.join(data_dir, file), "rb") as f:
env_state = pickle.load(f)
# lengths.append(len(env_state['robot']['des_c_pos']))
zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
zero_action = np.zeros(
(1, self.max_len_data, self.action_dim), dtype=np.float32
)
zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
            # robot and box position
robot_des_pos = env_state["robot"]["des_c_pos"][:, :2]
robot_c_pos = env_state["robot"]["c_pos"][:, :2]
if num_boxes == 2:
red_box1_pos = env_state["red-box1"]["pos"][:, :2]
red_box1_quat = np.tan(
quat2euler(env_state["red-box1"]["quat"])[:, -1:]
)
blue_box1_pos = env_state["blue-box1"]["pos"][:, :2]
blue_box1_quat = np.tan(
quat2euler(env_state["blue-box1"]["quat"])[:, -1:]
)
input_state = np.concatenate(
(
robot_des_pos,
robot_c_pos,
red_box1_pos,
red_box1_quat,
blue_box1_pos,
blue_box1_quat,
),
axis=-1,
)
elif num_boxes == 4:
red_box1_pos = env_state["red-box1"]["pos"][:, :2]
red_box1_quat = np.tan(
quat2euler(env_state["red-box1"]["quat"])[:, -1:]
)
red_box2_pos = env_state["red-box2"]["pos"][:, :2]
red_box2_quat = np.tan(
quat2euler(env_state["red-box2"]["quat"])[:, -1:]
)
blue_box1_pos = env_state["blue-box1"]["pos"][:, :2]
blue_box1_quat = np.tan(
quat2euler(env_state["blue-box1"]["quat"])[:, -1:]
)
blue_box2_pos = env_state["blue-box2"]["pos"][:, :2]
blue_box2_quat = np.tan(
quat2euler(env_state["blue-box2"]["quat"])[:, -1:]
)
input_state = np.concatenate(
(
robot_des_pos,
robot_c_pos,
red_box1_pos,
red_box1_quat,
red_box2_pos,
red_box2_quat,
blue_box1_pos,
blue_box1_quat,
blue_box2_pos,
blue_box2_quat,
),
axis=-1,
)
elif num_boxes == 6:
red_box1_pos = env_state["red-box1"]["pos"][:, :2]
red_box1_quat = np.tan(
quat2euler(env_state["red-box1"]["quat"])[:, -1:]
)
red_box2_pos = env_state["red-box2"]["pos"][:, :2]
red_box2_quat = np.tan(
quat2euler(env_state["red-box2"]["quat"])[:, -1:]
)
red_box3_pos = env_state["red-box3"]["pos"][:, :2]
red_box3_quat = np.tan(
quat2euler(env_state["red-box3"]["quat"])[:, -1:]
)
blue_box1_pos = env_state["blue-box1"]["pos"][:, :2]
blue_box1_quat = np.tan(
quat2euler(env_state["blue-box1"]["quat"])[:, -1:]
)
blue_box2_pos = env_state["blue-box2"]["pos"][:, :2]
blue_box2_quat = np.tan(
quat2euler(env_state["blue-box2"]["quat"])[:, -1:]
)
blue_box3_pos = env_state["blue-box3"]["pos"][:, :2]
blue_box3_quat = np.tan(
quat2euler(env_state["blue-box3"]["quat"])[:, -1:]
)
input_state = np.concatenate(
(
robot_des_pos,
robot_c_pos,
red_box1_pos,
red_box1_quat,
red_box2_pos,
red_box2_quat,
red_box3_pos,
red_box3_quat,
blue_box1_pos,
blue_box1_quat,
blue_box2_pos,
blue_box2_quat,
blue_box3_pos,
blue_box3_quat,
),
axis=-1,
)
else:
assert False, "check num boxes"
vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
valid_len = len(vel_state)
zero_obs[0, :valid_len, :] = input_state[:-1]
zero_action[0, :valid_len, :] = vel_state
zero_mask[0, :valid_len] = 1
inputs.append(zero_obs)
actions.append(zero_action)
masks.append(zero_mask)
# shape: B, T, n
self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
self.num_data = len(self.observations)
self.slices = self.get_slices()
def get_slices(self):
slices = []
min_seq_length = np.inf
for i in range(self.num_data):
T = self.get_seq_length(i)
min_seq_length = min(T, min_seq_length)
if T - self.window_size < 0:
print(
f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
)
else:
slices += [
(i, start, start + self.window_size)
for start in range(T - self.window_size + 1)
] # slice indices follow convention [start, end)
return slices
def get_seq_length(self, idx):
return int(self.masks[idx].sum().item())
def get_all_actions(self):
result = []
# mask out invalid actions
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.actions[i, :T, :])
return torch.cat(result, dim=0)
def get_all_observations(self):
result = []
# mask out invalid observations
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.observations[i, :T, :])
return torch.cat(result, dim=0)
def __len__(self):
return len(self.slices)
def __getitem__(self, idx):
i, start, end = self.slices[idx]
obs = self.observations[i, start:end]
act = self.actions[i, start:end]
mask = self.masks[i, start:end]
return obs, act, mask
class Sorting_Img_Dataset(TrajectoryDataset):
def __init__(
self,
data_directory: os.PathLike,
device="cpu",
obs_dim: int = 20,
action_dim: int = 2,
max_len_data: int = 256,
window_size: int = 1,
num_boxes: int = 2,
):
super().__init__(
data_directory=data_directory,
device=device,
obs_dim=obs_dim,
action_dim=action_dim,
max_len_data=max_len_data,
window_size=window_size,
)
logging.info("Loading Sorting Dataset")
inputs = []
actions = []
masks = []
# for root, dirs, files in os.walk(self.data_directory):
#
# for mode_dir in dirs:
# state_files = glob.glob(os.path.join(root, mode_dir) + "/env*")
# data_dir = os.path.join(sim_framework_path(data_directory), "local")
# data_dir = sim_framework_path(data_directory)
# state_files = glob.glob(data_dir + "/env*")
# random.seed(0)
#
# data_dir = sim_framework_path(data_directory)
# state_files = glob.glob(data_dir + '/state/*')
#
# random.shuffle(state_files)
#
# if data == "train":
# env_state_files = state_files[30:]
# elif data == "eval":
# env_state_files = state_files[:30]
# else:
# assert False, "wrong data type"
if num_boxes == 2:
data_dir = sim_framework_path("environments/dataset/data/sorting/2_boxes/")
elif num_boxes == 4:
data_dir = sim_framework_path("environments/dataset/data/sorting/4_boxes/")
elif num_boxes == 6:
data_dir = sim_framework_path("environments/dataset/data/sorting/6_boxes/")
else:
assert False, "check num boxes"
state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
bp_cam_imgs = []
inhand_cam_imgs = []
for file in tqdm(state_files[:100]):
with open(os.path.join(data_dir, "state", file), "rb") as f:
env_state = pickle.load(f)
# lengths.append(len(env_state['robot']['des_c_pos']))
zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
zero_action = np.zeros(
(1, self.max_len_data, self.action_dim), dtype=np.float32
)
zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
            # robot and box position
robot_des_pos = env_state["robot"]["des_c_pos"][:, :2]
robot_c_pos = env_state["robot"]["c_pos"][:, :2]
file_name = os.path.basename(file).split(".")[0]
###############################################################
bp_images = []
bp_imgs = glob.glob(data_dir + "/images/bp-cam/" + file_name + "/*")
bp_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
for img in bp_imgs:
image = cv2.imread(img).astype(np.float32)
image = image.transpose((2, 0, 1)) / 255.0
image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
bp_images.append(image)
bp_images = torch.concatenate(bp_images, dim=0)
################################################################
inhand_imgs = glob.glob(data_dir + "/images/inhand-cam/" + file_name + "/*")
inhand_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
inhand_images = []
for img in inhand_imgs:
image = cv2.imread(img).astype(np.float32)
image = image.transpose((2, 0, 1)) / 255.0
image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
inhand_images.append(image)
inhand_images = torch.concatenate(inhand_images, dim=0)
##################################################################
# input_state = np.concatenate((robot_des_pos, robot_c_pos), axis=-1)
vel_state = robot_des_pos[1:] - robot_des_pos[:-1]
valid_len = len(vel_state)
zero_obs[0, :valid_len, :] = robot_des_pos[:-1]
zero_action[0, :valid_len, :] = vel_state
zero_mask[0, :valid_len] = 1
bp_cam_imgs.append(bp_images)
inhand_cam_imgs.append(inhand_images)
inputs.append(zero_obs)
actions.append(zero_action)
masks.append(zero_mask)
self.bp_cam_imgs = bp_cam_imgs
self.inhand_cam_imgs = inhand_cam_imgs
# shape: B, T, n
self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
self.num_data = len(self.actions)
self.slices = self.get_slices()
def get_slices(self):
slices = []
min_seq_length = np.inf
for i in range(self.num_data):
T = self.get_seq_length(i)
min_seq_length = min(T, min_seq_length)
if T - self.window_size < 0:
print(
f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
)
else:
slices += [
(i, start, start + self.window_size)
for start in range(T - self.window_size + 1)
] # slice indices follow convention [start, end)
return slices
def get_seq_length(self, idx):
return int(self.masks[idx].sum().item())
def get_all_actions(self):
result = []
# mask out invalid actions
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.actions[i, :T, :])
return torch.cat(result, dim=0)
def get_all_observations(self):
result = []
# mask out invalid observations
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.observations[i, :T, :])
return torch.cat(result, dim=0)
def __len__(self):
return len(self.slices)
def __getitem__(self, idx):
i, start, end = self.slices[idx]
obs = self.observations[i, start:end]
act = self.actions[i, start:end]
mask = self.masks[i, start:end]
bp_imgs = self.bp_cam_imgs[i][start:end]
inhand_imgs = self.inhand_cam_imgs[i][start:end]
# bp_imgs = np.zeros((self.window_size, 3, 96, 96), dtype=np.float32)
# inhand_imgs = np.zeros((self.window_size, 3, 96, 96), dtype=np.float32)
#
# for num_frame, img_file in enumerate(bp_img_files):
# image = cv2.imread(img_file).astype(np.float32)
# bp_imgs[num_frame] = image.transpose((2, 0, 1)) / 255.
#
# for num_frame, img_file in enumerate(inhand_img_files):
# image = cv2.imread(img_file).astype(np.float32)
# inhand_imgs[num_frame] = image.transpose((2, 0, 1)) / 255.
#
# bp_imgs = torch.from_numpy(bp_imgs).to(self.device).float()
# inhand_imgs = torch.from_numpy(inhand_imgs).to(self.device).float()
return bp_imgs, inhand_imgs, obs, act, mask

View File

@ -0,0 +1,398 @@
import logging
import os
import glob
try:
import cv2  # not included in pyproject.toml
except ImportError:
print("Installing cv2")
os.system("pip install opencv-python")
import cv2  # retry the import after installing
import torch
import pickle
import numpy as np
from tqdm import tqdm
from agent.dataset.d3il_dataset.base_dataset import TrajectoryDataset
from agent.dataset.d3il_dataset import sim_framework_path
from .geo_transform import quat2euler
class Stacking_Dataset(TrajectoryDataset):
def __init__(
self,
data_directory: os.PathLike,
# data='train',
device="cpu",
obs_dim: int = 20,
action_dim: int = 2,
max_len_data: int = 256,
window_size: int = 1,
):
super().__init__(
data_directory=data_directory,
device=device,
obs_dim=obs_dim,
action_dim=action_dim,
max_len_data=max_len_data,
window_size=window_size,
)
logging.info("Loading CubeStacking Dataset")
inputs = []
actions = []
masks = []
# for root, dirs, files in os.walk(self.data_directory):
#
# for mode_dir in dirs:
# state_files = glob.glob(os.path.join(root, mode_dir) + "/env*")
# data_dir = os.path.join(sim_framework_path(data_directory), "local")
# data_dir = sim_framework_path(data_directory)
# state_files = glob.glob(data_dir + "/env*")
# bp_data_dir = sim_framework_path("environments/dataset/data/stacking/all_data_new")
# state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
# bp_data_dir = sim_framework_path("environments/dataset/data/stacking/single_test")
# state_files = os.listdir(bp_data_dir)
# random.seed(0)
#
# data_dir = sim_framework_path(data_directory)
# state_files = os.listdir(data_dir)
#
# random.shuffle(state_files)
#
# if data == "train":
# env_state_files = state_files[20:]
# elif data == "eval":
# env_state_files = state_files[:20]
# else:
# assert False, "wrong data type"
data_dir = sim_framework_path("data/stacking/all_data")
state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
for file in state_files:
with open(os.path.join(data_dir, file), "rb") as f:
env_state = pickle.load(f)
# lengths.append(len(env_state['robot']['des_c_pos']))
zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
zero_action = np.zeros(
(1, self.max_len_data, self.action_dim), dtype=np.float32
)
zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
# robot and box positions
robot_des_j_pos = env_state["robot"]["des_j_pos"]
robot_des_j_vel = env_state["robot"]["des_j_vel"]
robot_des_c_pos = env_state["robot"]["des_c_pos"]
robot_des_quat = env_state["robot"]["des_c_quat"]
robot_c_pos = env_state["robot"]["c_pos"]
robot_c_quat = env_state["robot"]["c_quat"]
robot_j_pos = env_state["robot"]["j_pos"]
robot_j_vel = env_state["robot"]["j_vel"]
robot_gripper = np.expand_dims(env_state["robot"]["gripper_width"], -1)
# pred_gripper = np.zeros(robot_gripper.shape, dtype=np.float32)
# pred_gripper[robot_gripper > 0.075] = 1
sim_steps = np.expand_dims(np.arange(len(robot_des_j_pos)), -1)
red_box_pos = env_state["red-box"]["pos"]
red_box_quat = np.tan(quat2euler(env_state["red-box"]["quat"])[:, -1:])
# red_box_quat = np.concatenate((np.sin(red_box_quat), np.cos(red_box_quat)), axis=-1)
green_box_pos = env_state["green-box"]["pos"]
green_box_quat = np.tan(quat2euler(env_state["green-box"]["quat"])[:, -1:])
# green_box_quat = np.concatenate((np.sin(green_box_quat), np.cos(green_box_quat)), axis=-1)
blue_box_pos = env_state["blue-box"]["pos"]
blue_box_quat = np.tan(quat2euler(env_state["blue-box"]["quat"])[:, -1:])
# blue_box_quat = np.concatenate((np.sin(blue_box_quat), np.cos(blue_box_quat)), axis=-1)
# target_box_pos = env_state['target-box']['pos'] #- robot_c_pos
# input_state = np.concatenate((robot_des_c_pos, robot_des_quat, pred_gripper, red_box_pos, red_box_quat), axis=-1)
# input_state = np.concatenate((robot_des_j_pos, robot_gripper, blue_box_pos, blue_box_quat), axis=-1)
input_state = np.concatenate(
(
robot_des_j_pos,
robot_gripper,
red_box_pos,
red_box_quat,
green_box_pos,
green_box_quat,
blue_box_pos,
blue_box_quat,
),
axis=-1,
)
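# Resulting observation layout (matches the default obs_dim=20, assuming the
# 7-DoF arm used in D3IL): 7 desired joint positions + 1 gripper width
# + [pos(3) + yaw(1)] for each of the red, green, and blue boxes.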
# input_state = np.concatenate((robot_des_j_pos, robot_des_j_vel, robot_c_pos, robot_c_quat, green_box_pos, green_box_quat,
# target_box_pos), axis=-1)
vel_state = robot_des_j_pos[1:] - robot_des_j_pos[:-1]
valid_len = len(input_state) - 1
zero_obs[0, :valid_len, :] = input_state[:-1]
zero_action[0, :valid_len, :] = np.concatenate(
(vel_state, robot_gripper[1:]), axis=-1
)
zero_mask[0, :valid_len] = 1
inputs.append(zero_obs)
actions.append(zero_action)
masks.append(zero_mask)
# shape: B, T, n
self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
self.num_data = len(self.observations)
self.slices = self.get_slices()
def get_slices(self):
slices = []
min_seq_length = np.inf
for i in range(self.num_data):
T = self.get_seq_length(i)
min_seq_length = min(T, min_seq_length)
if T - self.window_size < 0:
print(
f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
)
else:
slices += [
(i, start, start + self.window_size)
for start in range(T - self.window_size + 1)
] # slice indices follow convention [start, end)
return slices
def get_seq_length(self, idx):
return int(self.masks[idx].sum().item())
def get_all_actions(self):
result = []
# mask out invalid actions
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.actions[i, :T, :])
return torch.cat(result, dim=0)
def get_all_observations(self):
result = []
# mask out invalid observations
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.observations[i, :T, :])
return torch.cat(result, dim=0)
def __len__(self):
return len(self.slices)
def __getitem__(self, idx):
i, start, end = self.slices[idx]
obs = self.observations[i, start:end]
act = self.actions[i, start:end]
mask = self.masks[i, start:end]
return obs, act, mask
class Stacking_Img_Dataset(TrajectoryDataset):
def __init__(
self,
data_directory: os.PathLike,
# data='train',
device="cpu",
obs_dim: int = 20,
action_dim: int = 2,
max_len_data: int = 256,
window_size: int = 1,
):
super().__init__(
data_directory=data_directory,
device=device,
obs_dim=obs_dim,
action_dim=action_dim,
max_len_data=max_len_data,
window_size=window_size,
)
logging.info("Loading CubeStacking Dataset")
inputs = []
actions = []
masks = []
# TODO: set data_dir here (root of the stacking image dataset); it is used below for the state pickles and camera frames but is not defined yet
state_files = np.load(sim_framework_path(data_directory), allow_pickle=True)
bp_cam_imgs = []
inhand_cam_imgs = []
for file in tqdm(state_files):
with open(os.path.join(data_dir, "state", file), "rb") as f:
env_state = pickle.load(f)
# lengths.append(len(env_state['robot']['des_c_pos']))
zero_obs = np.zeros((1, self.max_len_data, self.obs_dim), dtype=np.float32)
zero_action = np.zeros(
(1, self.max_len_data, self.action_dim), dtype=np.float32
)
zero_mask = np.zeros((1, self.max_len_data), dtype=np.float32)
# robot and box positions
robot_des_j_pos = env_state["robot"]["des_j_pos"]
robot_des_j_vel = env_state["robot"]["des_j_vel"]
robot_des_c_pos = env_state["robot"]["des_c_pos"]
robot_des_quat = env_state["robot"]["des_c_quat"]
robot_c_pos = env_state["robot"]["c_pos"]
robot_c_quat = env_state["robot"]["c_quat"]
robot_j_pos = env_state["robot"]["j_pos"]
robot_j_vel = env_state["robot"]["j_vel"]
robot_gripper = np.expand_dims(env_state["robot"]["gripper_width"], -1)
# pred_gripper = np.zeros(robot_gripper.shape, dtype=np.float32)
# pred_gripper[robot_gripper > 0.075] = 1
file_name = os.path.basename(file).split(".")[0]
###############################################################
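# Load the third-person ("bp") camera frames for this trajectory: read each
# frame with OpenCV (BGR, HWC), rescale to [0, 1], convert to CHW, and stack
# into a (T, 3, H, W) tensor on the target device.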
bp_images = []
bp_imgs = glob.glob(data_dir + "/images/bp-cam/" + file_name + "/*")
bp_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
for img in bp_imgs:
image = cv2.imread(img).astype(np.float32)
image = image.transpose((2, 0, 1)) / 255.0
image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
bp_images.append(image)
bp_images = torch.concatenate(bp_images, dim=0)
################################################################
inhand_imgs = glob.glob(data_dir + "/images/inhand-cam/" + file_name + "/*")
inhand_imgs.sort(key=lambda x: int(os.path.basename(x).split(".")[0]))
inhand_images = []
for img in inhand_imgs:
image = cv2.imread(img).astype(np.float32)
image = image.transpose((2, 0, 1)) / 255.0
image = torch.from_numpy(image).to(self.device).float().unsqueeze(0)
inhand_images.append(image)
inhand_images = torch.concatenate(inhand_images, dim=0)
##################################################################
input_state = np.concatenate((robot_des_j_pos, robot_gripper), axis=-1)
vel_state = robot_des_j_pos[1:] - robot_des_j_pos[:-1]
valid_len = len(input_state) - 1
zero_obs[0, :valid_len, :] = input_state[:-1]
zero_action[0, :valid_len, :] = np.concatenate(
(vel_state, robot_gripper[1:]), axis=-1
)
zero_mask[0, :valid_len] = 1
bp_cam_imgs.append(bp_images)
inhand_cam_imgs.append(inhand_images)
inputs.append(zero_obs)
actions.append(zero_action)
masks.append(zero_mask)
self.bp_cam_imgs = bp_cam_imgs
self.inhand_cam_imgs = inhand_cam_imgs
# shape: B, T, n
self.observations = torch.from_numpy(np.concatenate(inputs)).to(device).float()
self.actions = torch.from_numpy(np.concatenate(actions)).to(device).float()
self.masks = torch.from_numpy(np.concatenate(masks)).to(device).float()
self.num_data = len(self.observations)
self.slices = self.get_slices()
def get_slices(self):
slices = []
min_seq_length = np.inf
for i in range(self.num_data):
T = self.get_seq_length(i)
min_seq_length = min(T, min_seq_length)
if T - self.window_size < 0:
print(
f"Ignored short sequence #{i}: len={T}, window={self.window_size}"
)
else:
slices += [
(i, start, start + self.window_size)
for start in range(T - self.window_size + 1)
] # slice indices follow convention [start, end)
return slices
def get_seq_length(self, idx):
return int(self.masks[idx].sum().item())
def get_all_actions(self):
result = []
# mask out invalid actions
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.actions[i, :T, :])
return torch.cat(result, dim=0)
def get_all_observations(self):
result = []
# mask out invalid observations
for i in range(len(self.masks)):
T = int(self.masks[i].sum().item())
result.append(self.observations[i, :T, :])
return torch.cat(result, dim=0)
def __len__(self):
return len(self.slices)
def __getitem__(self, idx):
i, start, end = self.slices[idx]
obs = self.observations[i, start:end]
act = self.actions[i, start:end]
mask = self.masks[i, start:end]
bp_imgs = self.bp_cam_imgs[i][start:end]
inhand_imgs = self.inhand_cam_imgs[i][start:end]
return bp_imgs, inhand_imgs, obs, act, mask

162
agent/dataset/sequence.py Normal file
View File

@ -0,0 +1,162 @@
"""
Pre-training data loader. Modified from https://github.com/jannerm/diffuser/blob/main/diffuser/datasets/sequence.py
TODO: implement history observation
No normalization is applied here --- we always normalize the data when pre-processing it with a different script, and the normalization info is also used in RL fine-tuning.
"""
from collections import namedtuple
from tqdm import tqdm
import numpy as np
import torch
import logging
import pickle
import random
log = logging.getLogger(__name__)
from .buffer import StitchedBuffer
Batch = namedtuple("Batch", "trajectories conditions")
ValueBatch = namedtuple("ValueBatch", "trajectories conditions values")
class StitchedSequenceDataset(torch.utils.data.Dataset):
"""
Dataset to efficiently load and sample trajectories. Stitches episodes together in the time dimension to avoid excessive zero padding. Episode ID's are used to index unique trajectories.
Returns a dictionary with values of shape: [sum_e(T_e), *D] where T_e is traj length of episode e and D is
(tuple of) dimension of observation, action, images, etc.
Example:
Observations: [----------traj 1----------][---------traj 2----------] ... [---------traj N----------]
Episode IDs: [---------- 1 ----------][---------- 2 ---------] ... [---------- N ---------]
"""
def __init__(
self,
dataset_path,
horizon_steps=64,
cond_steps=1,
max_n_episodes=10000,
use_img=False,
device="cpu",
):
self.horizon_steps = horizon_steps
self.cond_steps = cond_steps
self.device = device
# Load dataset to device specified
if dataset_path.endswith(".npz"):
dataset = np.load(dataset_path, allow_pickle=True)
else:
with open(dataset_path, "rb") as f:
dataset = pickle.load(f)
num_episodes = dataset["observations"].shape[0]
# Get the sum total of the valid trajectories' lengths
traj_lengths = dataset["traj_length"]
sum_of_path_lengths = np.sum(traj_lengths)
self.sum_of_path_lengths = sum_of_path_lengths
fields = StitchedBuffer(sum_of_path_lengths, device)
for i in tqdm(
range(min(max_n_episodes, num_episodes)), desc="Loading trajectories"
):
traj_length = traj_lengths[i]
episode = {
"observations": dataset["observations"][i][:traj_length],
"actions": dataset["actions"][i][:traj_length],
"episode_ids": i * np.ones(traj_length),
}
if use_img:
episode["images"] = dataset["images"][i][:traj_length]
for key, val in episode.items():
if device == "cpu":
episode[key] = val
else:
# if None array, save as empty tensor
if np.all(np.equal(episode[key], None)):
episode[key] = torch.empty(episode[key].shape).to(device)
else:
if key == "images":
episode[key] = torch.tensor(val, dtype=torch.uint8).to(
device
)
# (T, H, W, C) -> (T, C, H, W)
episode[key] = episode[key].permute(0, 3, 1, 2)
else:
episode[key] = torch.tensor(val, dtype=torch.float32).to(
device
)
fields.add_path(episode)
fields.finalize()
self.indices = self.make_indices(traj_lengths, horizon_steps)
self.obs_dim = fields.observations.shape[-1]
self.action_dim = fields.actions.shape[-1]
self.fields = fields
self.n_episodes = fields.n_episodes
self.path_lengths = fields.path_lengths
self.traj_lengths = traj_lengths
self.use_img = use_img
log.info(fields)
def make_indices(self, traj_lengths, horizon_steps):
"""
makes indices for sampling from dataset;
each index maps to a datapoint
"""
indices = []
cur_traj_index = 0
for traj_length in traj_lengths:
max_start = cur_traj_index + traj_length - horizon_steps + 1
indices += list(range(cur_traj_index, max_start))
cur_traj_index += traj_length
return indices
def set_train_val_split(self, train_split):
num_train = int(len(self.indices) * train_split)
train_indices = random.sample(self.indices, num_train)
val_indices = [ind for ind in self.indices if ind not in train_indices]
self.indices = train_indices
return val_indices
def set_indices(self, indices):
self.indices = indices
def get_conditions(self, observations, images=None):
"""
condition on current observation for planning. Take into account the number of conditioning steps.
"""
if images is not None:
return {
1 - self.cond_steps: {"state": observations[0], "rgb": images[0]}
} # TODO: allow obs history, -1, -2, ...
else:
return {1 - self.cond_steps: observations[0]}
def __len__(self):
return len(self.indices)
def __getitem__(self, idx, eps=1e-4):
raise NotImplementedError("Get item defined in subclass.")
class StitchedActionSequenceDataset(StitchedSequenceDataset):
"""Only use action trajectory, and then obs_cond for current observation"""
def __getitem__(self, idx):
start = self.indices[idx]
end = start + self.horizon_steps
observations = self.fields.observations[start:end]
actions = self.fields.actions[start:end]
images = None
if self.use_img:
images = self.fields.images[start:end]
conditions = self.get_conditions(observations, images)
batch = Batch(actions, conditions)
return batch

View File

@ -0,0 +1,170 @@
"""
Parent fine-tuning agent class.
"""
import os
import numpy as np
from omegaconf import OmegaConf
import torch
import hydra
import logging
import wandb
import random
log = logging.getLogger(__name__)
from env.gym_utils import make_async
class TrainAgent:
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
self.device = cfg.device
self.seed = cfg.get("seed", 42)
random.seed(self.seed)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
# Wandb
self.use_wandb = cfg.wandb is not None
if cfg.wandb is not None:
wandb.init(
entity=cfg.wandb.entity,
project=cfg.wandb.project,
name=cfg.wandb.run,
config=OmegaConf.to_container(cfg, resolve=True),
)
# Make vectorized env
self.env_name = cfg.env.name
env_type = cfg.env.get("env_type", None)
self.venv = make_async(
cfg.env.name,
env_type=env_type,
num_envs=cfg.env.n_envs,
asynchronous=True,
max_episode_steps=cfg.env.max_episode_steps,
wrappers=cfg.env.get("wrappers", None),
robomimic_env_cfg_path=cfg.get("robomimic_env_cfg_path", None),
shape_meta=cfg.get("shape_meta", None),
use_image_obs=cfg.env.get("use_image_obs", False),
render=cfg.env.get("render", False),
render_offscreen=cfg.env.get("save_video", False),
obs_dim=cfg.obs_dim,
action_dim=cfg.action_dim,
**cfg.env.specific if "specific" in cfg.env else {},
)
if not env_type == "furniture":
self.venv.seed(
[self.seed + i for i in range(cfg.env.n_envs)]
) # otherwise parallel envs might have the same initial states!
# isaacgym environments do not need seeding
self.n_envs = cfg.env.n_envs
self.n_cond_step = cfg.cond_steps
self.obs_dim = cfg.obs_dim
self.action_dim = cfg.action_dim
self.act_steps = cfg.act_steps
self.horizon_steps = cfg.horizon_steps
self.max_episode_steps = cfg.env.max_episode_steps
self.reset_at_iteration = cfg.env.get("reset_at_iteration", True)
self.save_full_observations = cfg.env.get("save_full_observations", False)
self.furniture_sparse_reward = (
cfg.env.specific.get("sparse_reward", False)
if "specific" in cfg.env
else False
) # furniture specific, for best reward calculation
# Batch size for gradient update
self.batch_size: int = cfg.train.batch_size
# Build model and load checkpoint
self.model = hydra.utils.instantiate(cfg.model)
# Training params
self.itr = 0
self.n_train_itr = cfg.train.n_train_itr
self.val_freq = cfg.train.val_freq
self.force_train = cfg.train.get("force_train", False)
self.n_steps = cfg.train.n_steps
self.best_reward_threshold_for_success = (
len(self.venv.pairs_to_assemble)
if env_type == "furniture"
else cfg.env.best_reward_threshold_for_success
)
self.max_grad_norm = cfg.train.get("max_grad_norm", None)
# Logging, rendering, checkpoints
self.logdir = cfg.logdir
self.render_dir = os.path.join(self.logdir, "render")
self.checkpoint_dir = os.path.join(self.logdir, "checkpoint")
self.result_path = os.path.join(self.logdir, "result.pkl")
os.makedirs(self.render_dir, exist_ok=True)
os.makedirs(self.checkpoint_dir, exist_ok=True)
self.save_trajs = cfg.train.get("save_trajs", False)
self.log_freq = cfg.train.get("log_freq", 1)
self.save_model_freq = cfg.train.save_model_freq
self.render_freq = cfg.train.render.freq
self.n_render = cfg.train.render.num
self.render_video = cfg.env.get("save_video", False)
assert self.n_render <= self.n_envs, "n_render must be <= n_envs"
assert not (
self.n_render <= 0 and self.render_video
), "Need to set n_render > 0 if saving video"
self.traj_plotter = (
hydra.utils.instantiate(cfg.train.plotter)
if "plotter" in cfg.train
else None
)
def run(self):
pass
def save_model(self):
"""
saves model to disk; no ema
"""
data = {
"itr": self.itr,
"model": self.model.state_dict(),
}
savepath = os.path.join(self.checkpoint_dir, f"state_{self.itr}.pt")
torch.save(data, savepath)
log.info(f"Saved model to {savepath}")
def load(self, itr):
"""
loads model from disk
"""
loadpath = os.path.join(self.checkpoint_dir, f"state_{itr}.pt")
data = torch.load(loadpath, weights_only=True)
self.itr = data["itr"]
self.model.load_state_dict(data["model"])
def reset_env_all(self, verbose=False, options_venv=None, **kwargs):
if options_venv is None:
options_venv = [
{k: v for k, v in kwargs.items()} for _ in range(self.n_envs)
]
obs_venv = self.venv.reset_arg(options_list=options_venv)
# convert to OrderedDict if obs_venv is a list of dict
if isinstance(obs_venv, list):
obs_venv = {
key: np.stack([obs_venv[i][key] for i in range(self.n_envs)])
for key in obs_venv[0].keys()
}
if verbose:
for index in range(self.n_envs):
logging.info(
f"<-- Reset environment {index} with options {options_venv[index]}"
)
return obs_venv
def reset_env(self, env_ind, verbose=False):
task = {}
obs = self.venv.reset_one_arg(env_ind=env_ind, options=task)
if verbose:
logging.info(f"<-- Reset environment {env_ind} with task {task}")
return obs

View File

@ -0,0 +1,389 @@
"""
Advantage-weighted regression (AWR) for diffusion policy.
Advantage = discounted-reward-to-go - V(s)
"""
import os
import pickle
import einops
import numpy as np
import torch
import logging
import wandb
from copy import deepcopy
log = logging.getLogger(__name__)
from util.timer import Timer
from collections import deque
from agent.finetune.train_agent import TrainAgent
from util.scheduler import CosineAnnealingWarmupRestarts
def td_values(
states,
rewards,
dones,
state_values,
gamma=0.99,
alpha=0.95,
lam=0.95,
):
"""
Gives a list of TD estimates for a given list of samples from an RL environment.
The TD(λ) estimator is used for this computation.
:param states: The visited states.
:param rewards: The rewards received at each step.
:param dones: The "final state?" flags for each step.
:param state_values: The currently estimated state values.
:return: The TD estimates.
"""
sample_count = len(states)
tds = np.zeros_like(state_values, dtype=np.float32)
dones[-1] = 1
next_value = 1 - dones[-1]
val = 0.0
for i in range(sample_count - 1, -1, -1):
# next_value = 0.0 if dones[i] else state_values[i + 1]
# get next_value for vectorized
if i < sample_count - 1:
next_value = state_values[i + 1]
next_value = next_value * (1 - dones[i])
state_value = state_values[i]
error = rewards[i] + gamma * next_value - state_value
val = alpha * error + gamma * lam * (1 - dones[i]) * val
tds[i] = val + state_value
return tds
class TrainAWRDiffusionAgent(TrainAgent):
def __init__(self, cfg):
super().__init__(cfg)
self.logprob_batch_size = cfg.train.get("logprob_batch_size", 10000)
# note the discount factor gamma here is applied to reward every act_steps, instead of every env step
self.gamma = cfg.train.gamma
# Warm-up period for critic before actor updates
self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
# Optimizer
self.actor_optimizer = torch.optim.AdamW(
self.model.actor.parameters(),
lr=cfg.train.actor_lr,
weight_decay=cfg.train.actor_weight_decay,
)
self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
self.actor_optimizer,
first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.actor_lr,
min_lr=cfg.train.actor_lr_scheduler.min_lr,
warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
gamma=1.0,
)
self.critic_optimizer = torch.optim.AdamW(
self.model.critic.parameters(),
lr=cfg.train.critic_lr,
weight_decay=cfg.train.critic_weight_decay,
)
self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
self.critic_optimizer,
first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.critic_lr,
min_lr=cfg.train.critic_lr_scheduler.min_lr,
warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
gamma=1.0,
)
# Buffer size
self.buffer_size = cfg.train.buffer_size
# Temperature beta for the exponential advantage weights
self.beta = cfg.train.beta
# Max weight for AWR
self.max_adv_weight = cfg.train.max_adv_weight
# Scaling reward
self.scale_reward_factor = cfg.train.scale_reward_factor
# Updates
self.replay_ratio = cfg.train.replay_ratio
self.critic_update_ratio = cfg.train.critic_update_ratio
def run(self):
# make a FIFO replay buffer for obs, action, and reward
obs_buffer = deque(maxlen=self.buffer_size)
action_buffer = deque(maxlen=self.buffer_size)
reward_buffer = deque(maxlen=self.buffer_size)
done_buffer = deque(maxlen=self.buffer_size)
first_buffer = deque(maxlen=self.buffer_size)
# Start training loop
timer = Timer()
run_results = []
done_venv = np.zeros((1, self.n_envs))
last_itr_eval = False  # ensure the reset check below is defined on the first iteration
while self.itr < self.n_train_itr:
# Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
for env_ind in range(self.n_render):
options_venv[env_ind]["video_path"] = os.path.join(
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
)
# Define train or eval - all envs restart
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
self.model.eval() if eval_mode else self.model.train()
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
# Reset env at the beginning of an iteration
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
last_itr_eval = eval_mode
reward_trajs = np.empty((0, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
if step % 10 == 0:
print(f"Processed step {step} of {self.n_steps}")
# Select action
with torch.no_grad():
samples = (
self.model(
cond=torch.from_numpy(prev_obs_venv)
.float()
.to(self.device),
deterministic=eval_mode,
)
.cpu()
.numpy()
) # n_env x horizon x act
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
# add to buffer
obs_buffer.append(prev_obs_venv)
action_buffer.append(action_venv)
reward_buffer.append(reward_venv * self.scale_reward_factor)
done_buffer.append(done_venv)
first_buffer.append(firsts_trajs[step])
firsts_trajs[step + 1] = done_venv
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
for i in range(len(env_steps) - 1):
start = env_steps[i]
end = env_steps[i + 1]
if end - start > 1:
episodes_start_end.append((env_ind, start, end - 1))
if len(episodes_start_end) > 0:
reward_trajs_split = [
reward_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
num_episode_finished = len(reward_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
episode_best_reward = np.array(
[
np.max(reward_traj) / self.act_steps
for reward_traj in reward_trajs_split
]
)
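# reward_venv sums reward over the act_steps env sub-steps of one action
# chunk, so dividing the best chunk reward by act_steps recovers an
# (approximate) per-env-step value before comparing to the success threshold.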
avg_episode_reward = np.mean(episode_reward)
avg_best_reward = np.mean(episode_best_reward)
success_rate = np.mean(
episode_best_reward >= self.best_reward_threshold_for_success
)
else:
episode_reward = np.array([])
num_episode_finished = 0
avg_episode_reward = 0
avg_best_reward = 0
success_rate = 0
log.info("[WARNING] No episode completed within the iteration!")
# Update
if not eval_mode:
obs_trajs = np.array(deepcopy(obs_buffer))
reward_trajs = np.array(deepcopy(reward_buffer))
dones_trajs = np.array(deepcopy(done_buffer))
obs_t = einops.rearrange(
torch.from_numpy(obs_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
values_t = np.array(self.model.critic(obs_t).detach().cpu().numpy())
values_trajs = values_t.reshape(-1, self.n_envs)
td_trajs = td_values(obs_trajs, reward_trajs, dones_trajs, values_trajs)
# flatten
obs_trajs = einops.rearrange(
obs_trajs,
"s e h d -> (s e) h d",
)
td_trajs = einops.rearrange(
td_trajs,
"s e -> (s e)",
)
# Update policy and critic
num_batch = int(
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
)
for _ in range(num_batch // self.critic_update_ratio):
# Sample batch
inds = np.random.choice(len(obs_trajs), self.batch_size)
obs_b = torch.from_numpy(obs_trajs[inds]).float().to(self.device)
td_b = torch.from_numpy(td_trajs[inds]).float().to(self.device)
# Update critic
loss_critic = self.model.loss_critic(obs_b, td_b)
self.critic_optimizer.zero_grad()
loss_critic.backward()
self.critic_optimizer.step()
obs_trajs = np.array(deepcopy(obs_buffer))
samples_trajs = np.array(deepcopy(action_buffer))
reward_trajs = np.array(deepcopy(reward_buffer))
dones_trajs = np.array(deepcopy(done_buffer))
obs_t = einops.rearrange(
torch.from_numpy(obs_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
values_t = np.array(self.model.critic(obs_t).detach().cpu().numpy())
values_trajs = values_t.reshape(-1, self.n_envs)
td_trajs = td_values(obs_trajs, reward_trajs, dones_trajs, values_trajs)
advantages_trajs = td_trajs - values_trajs
# flatten
obs_trajs = einops.rearrange(
obs_trajs,
"s e h d -> (s e) h d",
)
samples_trajs = einops.rearrange(
samples_trajs,
"s e h d -> (s e) h d",
)
advantages_trajs = einops.rearrange(
advantages_trajs,
"s e -> (s e)",
)
for _ in range(num_batch):
# Sample batch
inds = np.random.choice(len(obs_trajs), self.batch_size)
obs_b = torch.from_numpy(obs_trajs[inds]).float().to(self.device)
actions_b = (
torch.from_numpy(samples_trajs[inds]).float().to(self.device)
)
advantages_b = (
torch.from_numpy(advantages_trajs[inds]).float().to(self.device)
)
advantages_b = (advantages_b - advantages_b.mean()) / (
advantages_b.std() + 1e-6
)
advantages_b_scaled = torch.exp(self.beta * advantages_b)
advantages_b_scaled.clamp_(max=self.max_adv_weight)
# Update policy with collected trajectories
loss = self.model.loss(
actions_b,
obs_b,
advantages_b_scaled.detach(),
)
self.actor_optimizer.zero_grad()
loss.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor.parameters(), self.max_grad_norm
)
self.actor_optimizer.step()
# Update lr
self.actor_lr_scheduler.step()
self.critic_lr_scheduler.step()
# Save model
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
self.save_model()
# Log loss and save metrics
run_results.append(
{
"itr": self.itr,
}
)
if self.itr % self.log_freq == 0:
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"success rate - eval": success_rate,
"avg episode reward - eval": avg_episode_reward,
"avg best reward - eval": avg_best_reward,
"num episode - eval": num_episode_finished,
},
step=self.itr,
commit=False,
)
run_results[-1]["eval_success_rate"] = success_rate
run_results[-1]["eval_episode_reward"] = avg_episode_reward
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"loss - critic": loss_critic,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["loss_critic"] = loss_critic
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = timer()
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,358 @@
"""
Model-free online RL with DIffusion POlicy (DIPO)
Applies action gradient to perturb actions towards maximizer of Q-function.
a_t <- a_t + \eta * \nabla_a Q(s, a_t)
"""
import os
import pickle
import numpy as np
import torch
import logging
import wandb
log = logging.getLogger(__name__)
from util.timer import Timer
from collections import deque
from agent.finetune.train_agent import TrainAgent
from util.scheduler import CosineAnnealingWarmupRestarts
class TrainDIPODiffusionAgent(TrainAgent):
def __init__(self, cfg):
super().__init__(cfg)
# note the discount factor gamma here is applied to reward every act_steps, instead of every env step
self.gamma = cfg.train.gamma
# Warm-up period for critic before actor updates
self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
# Optimizer
self.actor_optimizer = torch.optim.AdamW(
self.model.actor.parameters(),
lr=cfg.train.actor_lr,
weight_decay=cfg.train.actor_weight_decay,
)
# use cosine scheduler with linear warmup
self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
self.actor_optimizer,
first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.actor_lr,
min_lr=cfg.train.actor_lr_scheduler.min_lr,
warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
gamma=1.0,
)
self.critic_optimizer = torch.optim.AdamW(
self.model.critic.parameters(),
lr=cfg.train.critic_lr,
weight_decay=cfg.train.critic_weight_decay,
)
self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
self.critic_optimizer,
first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.critic_lr,
min_lr=cfg.train.critic_lr_scheduler.min_lr,
warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
gamma=1.0,
)
# Buffer size
self.buffer_size = cfg.train.buffer_size
# Step size (Adam learning rate) for the action-gradient perturbation
self.eta = cfg.train.eta
# Updates
self.replay_ratio = cfg.train.replay_ratio
# Scaling reward
self.scale_reward_factor = cfg.train.scale_reward_factor
# Number of action-gradient steps applied per actor update
self.action_gradient_steps = cfg.train.action_gradient_steps
def run(self):
# make a FIFO replay buffer for obs, action, and reward
obs_buffer = deque(maxlen=self.buffer_size)
next_obs_buffer = deque(maxlen=self.buffer_size)
action_buffer = deque(maxlen=self.buffer_size)
reward_buffer = deque(maxlen=self.buffer_size)
done_buffer = deque(maxlen=self.buffer_size)
first_buffer = deque(maxlen=self.buffer_size)
# Start training loop
timer = Timer()
run_results = []
done_venv = np.zeros((1, self.n_envs))
last_itr_eval = False  # ensure the reset check below is defined on the first iteration
while self.itr < self.n_train_itr:
# Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
for env_ind in range(self.n_render):
options_venv[env_ind]["video_path"] = os.path.join(
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
)
# Define train or eval - all envs restart
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
self.model.eval() if eval_mode else self.model.train()
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
# Reset env at the beginning of an iteration
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
last_itr_eval = eval_mode
reward_trajs = np.empty((0, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
if step % 10 == 0:
print(f"Processed step {step} of {self.n_steps}")
# Select action
with torch.no_grad():
samples = (
self.model(
cond=torch.from_numpy(prev_obs_venv)
.float()
.to(self.device),
deterministic=eval_mode,
)
.cpu()
.numpy()
) # n_env x horizon x act
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
# add to buffer
for i in range(self.n_envs):
obs_buffer.append(prev_obs_venv[i])
next_obs_buffer.append(obs_venv[i])
action_buffer.append(action_venv[i])
reward_buffer.append(reward_venv[i] * self.scale_reward_factor)
done_buffer.append(done_venv[i])
first_buffer.append(firsts_trajs[step])
firsts_trajs[step + 1] = done_venv
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
for i in range(len(env_steps) - 1):
start = env_steps[i]
end = env_steps[i + 1]
if end - start > 1:
episodes_start_end.append((env_ind, start, end - 1))
if len(episodes_start_end) > 0:
reward_trajs_split = [
reward_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
num_episode_finished = len(reward_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
episode_best_reward = np.array(
[
np.max(reward_traj) / self.act_steps
for reward_traj in reward_trajs_split
]
)
avg_episode_reward = np.mean(episode_reward)
avg_best_reward = np.mean(episode_best_reward)
success_rate = np.mean(
episode_best_reward >= self.best_reward_threshold_for_success
)
else:
episode_reward = np.array([])
num_episode_finished = 0
avg_episode_reward = 0
avg_best_reward = 0
success_rate = 0
log.info("[WARNING] No episode completed within the iteration!")
if not eval_mode:
num_batch = self.replay_ratio
# Critic learning
for _ in range(num_batch):
# Sample batch
inds = np.random.choice(len(obs_buffer), self.batch_size)
obs_b = (
torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
.float()
.to(self.device)
)
next_obs_b = (
torch.from_numpy(
np.vstack([next_obs_buffer[i][None] for i in inds])
)
.float()
.to(self.device)
)
actions_b = (
torch.from_numpy(
np.vstack([action_buffer[i][None] for i in inds])
)
.float()
.to(self.device)
)
rewards_b = (
torch.from_numpy(np.vstack([reward_buffer[i] for i in inds]))
.float()
.to(self.device)
)
dones_b = (
torch.from_numpy(np.vstack([done_buffer[i] for i in inds]))
.float()
.to(self.device)
)
# Update critic
loss_critic = self.model.loss_critic(
obs_b, next_obs_b, actions_b, rewards_b, dones_b, self.gamma
)
self.critic_optimizer.zero_grad()
loss_critic.backward()
self.critic_optimizer.step()
# Actor learning
for _ in range(num_batch):
# Sample batch
inds = np.random.choice(len(obs_buffer), self.batch_size)
obs_b = (
torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
.float()
.to(self.device)
)
actions_b = (
torch.from_numpy(
np.vstack([action_buffer[i][None] for i in inds])
)
.float()
.to(self.device)
)
# Replace actions in buffer with guided actions
guided_action_list = []
# get Q-perturbed actions by optimizing
actions_flat = actions_b.reshape(actions_b.shape[0], -1)
actions_optim = torch.optim.Adam(
[actions_flat], lr=self.eta, eps=1e-5
)
for _ in range(self.action_gradient_steps):
actions_flat.requires_grad_(True)
q_values_1, q_values_2 = self.model.critic(obs_b, actions_flat)
q_values = torch.min(q_values_1, q_values_2)
action_opt_loss = -q_values.sum()
actions_optim.zero_grad()
action_opt_loss.backward(torch.ones_like(action_opt_loss))
# get the perturbed action
actions_optim.step()
actions_flat.requires_grad_(False)
actions_flat.clamp_(-1.0, 1.0)
guided_action = actions_flat.detach()
guided_action = guided_action.reshape(
guided_action.shape[0], -1, self.action_dim
)
guided_action_list.append(guided_action)
guided_action_stacked = torch.cat(guided_action_list, 0)
# Add to buffer (need separate indices since we're working with a limited subset)
for i, i_buf in enumerate(inds):
action_buffer[i_buf] = (
guided_action_stacked[i].detach().cpu().numpy()
)
# Update policy with collected trajectories
loss = self.model.loss(guided_action.detach(), {0: obs_b})
self.actor_optimizer.zero_grad()
loss.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor.parameters(), self.max_grad_norm
)
self.actor_optimizer.step()
# Update lr
self.actor_lr_scheduler.step()
self.critic_lr_scheduler.step()
# Save model
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
self.save_model()
# Log loss and save metrics
run_results.append(
{
"itr": self.itr,
}
)
if self.itr % self.log_freq == 0:
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"success rate - eval": success_rate,
"avg episode reward - eval": avg_episode_reward,
"avg best reward - eval": avg_best_reward,
"num episode - eval": num_episode_finished,
},
step=self.itr,
commit=False,
)
run_results[-1]["eval_success_rate"] = success_rate
run_results[-1]["eval_episode_reward"] = avg_episode_reward
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"loss - critic": loss_critic,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["loss_critic"] = loss_critic
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = timer()
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,317 @@
"""
Diffusion Q-Learning (DQL)
Learns a critic Q-function and backprops the expected Q-value to train the actor
pi = argmin L_d(\theta) - \alpha * E[Q(s, a)]
L_d is demonstration loss for regularization
"""
import os
import pickle
import numpy as np
import torch
import logging
import wandb
log = logging.getLogger(__name__)
from util.timer import Timer
from collections import deque
from agent.finetune.train_agent import TrainAgent
from util.scheduler import CosineAnnealingWarmupRestarts
class TrainDQLDiffusionAgent(TrainAgent):
def __init__(self, cfg):
super().__init__(cfg)
# note the discount factor gamma here is applied to reward every act_steps, instead of every env step
self.gamma = cfg.train.gamma
# Warm-up period for critic before actor updates
self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
# Optimizer
self.actor_optimizer = torch.optim.AdamW(
self.model.actor.parameters(),
lr=cfg.train.actor_lr,
weight_decay=cfg.train.actor_weight_decay,
)
# use cosine scheduler with linear warmup
self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
self.actor_optimizer,
first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.actor_lr,
min_lr=cfg.train.actor_lr_scheduler.min_lr,
warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
gamma=1.0,
)
self.critic_optimizer = torch.optim.AdamW(
self.model.critic.parameters(),
lr=cfg.train.critic_lr,
weight_decay=cfg.train.critic_weight_decay,
)
self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
self.critic_optimizer,
first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.critic_lr,
min_lr=cfg.train.critic_lr_scheduler.min_lr,
warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
gamma=1.0,
)
# Buffer size
self.buffer_size = cfg.train.buffer_size
# Perturbation scale
self.eta = cfg.train.eta
# Reward factor - scale down mujoco reward for better critic training
self.scale_reward_factor = cfg.train.scale_reward_factor
# Updates
self.replay_ratio = cfg.train.replay_ratio
def run(self):
# make a FIFO replay buffer for obs, action, and reward
obs_buffer = deque(maxlen=self.buffer_size)
next_obs_buffer = deque(maxlen=self.buffer_size)
action_buffer = deque(maxlen=self.buffer_size)
reward_buffer = deque(maxlen=self.buffer_size)
done_buffer = deque(maxlen=self.buffer_size)
first_buffer = deque(maxlen=self.buffer_size)
# Start training loop
timer = Timer()
run_results = []
done_venv = np.zeros((1, self.n_envs))
last_itr_eval = False  # ensure the reset check below is defined on the first iteration
while self.itr < self.n_train_itr:
# Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
for env_ind in range(self.n_render):
options_venv[env_ind]["video_path"] = os.path.join(
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
)
# Define train or eval - all envs restart
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
self.model.eval() if eval_mode else self.model.train()
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
# Reset env at the beginning of an iteration
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
last_itr_eval = eval_mode
reward_trajs = np.empty((0, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
if step % 10 == 0:
print(f"Processed step {step} of {self.n_steps}")
# Select action
with torch.no_grad():
samples = (
self.model(
cond=torch.from_numpy(prev_obs_venv)
.float()
.to(self.device),
deterministic=eval_mode,
)
.cpu()
.numpy()
) # n_env x horizon x act
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
# add to buffer
for i in range(self.n_envs):
obs_buffer.append(prev_obs_venv[i])
next_obs_buffer.append(obs_venv[i])
action_buffer.append(action_venv[i])
reward_buffer.append(reward_venv[i] * self.scale_reward_factor)
done_buffer.append(done_venv[i])
first_buffer.append(firsts_trajs[step])
firsts_trajs[step + 1] = done_venv
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
for i in range(len(env_steps) - 1):
start = env_steps[i]
end = env_steps[i + 1]
if end - start > 1:
episodes_start_end.append((env_ind, start, end - 1))
if len(episodes_start_end) > 0:
reward_trajs_split = [
reward_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
num_episode_finished = len(reward_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
episode_best_reward = np.array(
[
np.max(reward_traj) / self.act_steps
for reward_traj in reward_trajs_split
]
)
avg_episode_reward = np.mean(episode_reward)
avg_best_reward = np.mean(episode_best_reward)
success_rate = np.mean(
episode_best_reward >= self.best_reward_threshold_for_success
)
else:
episode_reward = np.array([])
num_episode_finished = 0
avg_episode_reward = 0
avg_best_reward = 0
success_rate = 0
log.info("[WARNING] No episode completed within the iteration!")
if not eval_mode:
num_batch = self.replay_ratio
# Critic learning
for _ in range(num_batch):
# Sample batch
inds = np.random.choice(len(obs_buffer), self.batch_size)
obs_b = (
torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
.float()
.to(self.device)
)
next_obs_b = (
torch.from_numpy(
np.vstack([next_obs_buffer[i][None] for i in inds])
)
.float()
.to(self.device)
)
actions_b = (
torch.from_numpy(
np.vstack([action_buffer[i][None] for i in inds])
)
.float()
.to(self.device)
)
rewards_b = (
torch.from_numpy(np.vstack([reward_buffer[i] for i in inds]))
.float()
.to(self.device)
)
dones_b = (
torch.from_numpy(np.vstack([done_buffer[i] for i in inds]))
.float()
.to(self.device)
)
# Update critic
loss_critic = self.model.loss_critic(
obs_b, next_obs_b, actions_b, rewards_b, dones_b, self.gamma
)
self.critic_optimizer.zero_grad()
loss_critic.backward()
self.critic_optimizer.step()
# get the new action and q values
samples = self.model.forward_train(
cond=obs_b.to(self.device),
deterministic=eval_mode,
)
output_venv = samples # n_env x horizon x act
action_venv = output_venv[:, : self.act_steps, : self.action_dim]
actions_flat_b = action_venv.reshape(action_venv.shape[0], -1)
q_values_b = self.model.critic(obs_b, actions_flat_b)
q1_new_action, q2_new_action = q_values_b
# Update policy with collected trajectories
self.actor_optimizer.zero_grad()
actor_loss = self.model.loss_actor(
obs_b, actions_b, q1_new_action, q2_new_action, self.eta
)
actor_loss.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor.parameters(), self.max_grad_norm
)
self.actor_optimizer.step()
loss = actor_loss
# Update lr
self.actor_lr_scheduler.step()
self.critic_lr_scheduler.step()
# Save model
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
self.save_model()
# Log loss and save metrics
run_results.append(
{
"itr": self.itr,
}
)
if self.itr % self.log_freq == 0:
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"success rate - eval": success_rate,
"avg episode reward - eval": avg_episode_reward,
"avg best reward - eval": avg_best_reward,
"num episode - eval": num_episode_finished,
},
step=self.itr,
commit=False,
)
run_results[-1]["eval_success_rate"] = success_rate
run_results[-1]["eval_episode_reward"] = avg_episode_reward
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"loss - critic": loss_critic,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["loss_critic"] = loss_critic
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = timer()
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,347 @@
"""
Implicit diffusion Q-learning (IDQL) trainer for diffusion policy.
"""
import os
import pickle
import einops
import numpy as np
import torch
import logging
import wandb
from copy import deepcopy
import random
log = logging.getLogger(__name__)
from util.timer import Timer
from collections import deque
from agent.finetune.train_agent import TrainAgent
from util.scheduler import CosineAnnealingWarmupRestarts
class TrainIDQLDiffusionAgent(TrainAgent):
def __init__(self, cfg):
super().__init__(cfg)
# note the discount factor gamma here is applied to reward every act_steps, instead of every env step
self.gamma = cfg.train.gamma
# Warm-up period for critic before actor updates
self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
# Optimizer
self.actor_optimizer = torch.optim.AdamW(
self.model.actor.parameters(),
lr=cfg.train.actor_lr,
weight_decay=cfg.train.actor_weight_decay,
)
self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
self.actor_optimizer,
first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.actor_lr,
min_lr=cfg.train.actor_lr_scheduler.min_lr,
warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
gamma=1.0,
)
self.critic_q_optimizer = torch.optim.AdamW(
self.model.critic_q.parameters(),
lr=cfg.train.critic_lr,
weight_decay=cfg.train.critic_weight_decay,
)
self.critic_v_optimizer = torch.optim.AdamW(
self.model.critic_v.parameters(),
lr=cfg.train.critic_lr,
weight_decay=cfg.train.critic_weight_decay,
)
self.critic_v_lr_scheduler = CosineAnnealingWarmupRestarts(
self.critic_v_optimizer,
first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.critic_lr,
min_lr=cfg.train.critic_lr_scheduler.min_lr,
warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
gamma=1.0,
)
self.critic_q_lr_scheduler = CosineAnnealingWarmupRestarts(
self.critic_q_optimizer,
first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.critic_lr,
min_lr=cfg.train.critic_lr_scheduler.min_lr,
warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
gamma=1.0,
)
# Buffer size
self.buffer_size = cfg.train.buffer_size
# Actor params
self.use_expectile_exploration = cfg.train.use_expectile_exploration
# Scaling reward
self.scale_reward_factor = cfg.train.scale_reward_factor
# Updates
self.replay_ratio = cfg.train.replay_ratio
self.critic_tau = cfg.train.critic_tau
# Whether to use deterministic mode when sampling at eval
self.eval_deterministic = cfg.train.get("eval_deterministic", False)
# Sampling
self.num_sample = cfg.train.eval_sample_num
def run(self):
# make a FIFO replay buffer for obs, action, and reward
obs_buffer = deque(maxlen=self.buffer_size)
action_buffer = deque(maxlen=self.buffer_size)
next_obs_buffer = deque(maxlen=self.buffer_size)
reward_buffer = deque(maxlen=self.buffer_size)
done_buffer = deque(maxlen=self.buffer_size)
first_buffer = deque(maxlen=self.buffer_size)
# Start training loop
timer = Timer()
run_results = []
done_venv = np.zeros((1, self.n_envs))
last_itr_eval = False  # ensure the reset check below is defined on the first iteration
while self.itr < self.n_train_itr:
# Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
for env_ind in range(self.n_render):
options_venv[env_ind]["video_path"] = os.path.join(
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
)
# Define train or eval - all envs restart
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
self.model.eval() if eval_mode else self.model.train()
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
# Reset env at the beginning of an iteration
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
last_itr_eval = eval_mode
reward_trajs = np.empty((0, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
if step % 10 == 0:
print(f"Processed step {step} of {self.n_steps}")
# Select action
with torch.no_grad():
samples = (
self.model(
cond=torch.from_numpy(prev_obs_venv)
.float()
.to(self.device),
deterministic=eval_mode and self.eval_deterministic,
num_sample=self.num_sample,
use_expectile_exploration=self.use_expectile_exploration,
)
.cpu()
.numpy()
) # n_env x horizon x act
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
# add to buffer
obs_buffer.append(prev_obs_venv)
action_buffer.append(action_venv)
next_obs_buffer.append(obs_venv)
reward_buffer.append(reward_venv * self.scale_reward_factor)
done_buffer.append(done_venv)
first_buffer.append(firsts_trajs[step])
firsts_trajs[step + 1] = done_venv
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
for i in range(len(env_steps) - 1):
start = env_steps[i]
end = env_steps[i + 1]
if end - start > 1:
episodes_start_end.append((env_ind, start, end - 1))
if len(episodes_start_end) > 0:
reward_trajs_split = [
reward_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
num_episode_finished = len(reward_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
episode_best_reward = np.array(
[
np.max(reward_traj) / self.act_steps
for reward_traj in reward_trajs_split
]
)
avg_episode_reward = np.mean(episode_reward)
avg_best_reward = np.mean(episode_best_reward)
success_rate = np.mean(
episode_best_reward >= self.best_reward_threshold_for_success
)
else:
episode_reward = np.array([])
num_episode_finished = 0
avg_episode_reward = 0
avg_best_reward = 0
success_rate = 0
log.info("[WARNING] No episode completed within the iteration!")
# Update
if not eval_mode:
obs_trajs = np.array(deepcopy(obs_buffer))
action_trajs = np.array(deepcopy(action_buffer))
next_obs_trajs = np.array(deepcopy(next_obs_buffer))
reward_trajs = np.array(deepcopy(reward_buffer))
done_trajs = np.array(deepcopy(done_buffer))
first_trajs = np.array(deepcopy(first_buffer))
# flatten
obs_trajs = einops.rearrange(
obs_trajs,
"s e h d -> (s e) h d",
)
next_obs_trajs = einops.rearrange(
next_obs_trajs,
"s e h d -> (s e) h d",
)
action_trajs = einops.rearrange(
action_trajs,
"s e h d -> (s e) h d",
)
reward_trajs = reward_trajs.reshape(-1)
done_trajs = done_trajs.reshape(-1)
first_trajs = first_trajs.reshape(-1)
num_batch = int(
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
)
for _ in range(num_batch):
# Sample batch
inds = np.random.choice(len(obs_trajs), self.batch_size)
obs_b = torch.from_numpy(obs_trajs[inds]).float().to(self.device)
next_obs_b = (
torch.from_numpy(next_obs_trajs[inds]).float().to(self.device)
)
actions_b = (
torch.from_numpy(action_trajs[inds]).float().to(self.device)
)
reward_b = (
torch.from_numpy(reward_trajs[inds]).float().to(self.device)
)
done_b = torch.from_numpy(done_trajs[inds]).float().to(self.device)
# update critic value function
critic_loss_v = self.model.loss_critic_v(obs_b, actions_b)
self.critic_v_optimizer.zero_grad()
critic_loss_v.backward()
self.critic_v_optimizer.step()
# update critic q function
critic_loss_q = self.model.loss_critic_q(
obs_b, next_obs_b, actions_b, reward_b, done_b, self.gamma
)
self.critic_q_optimizer.zero_grad()
critic_loss_q.backward()
self.critic_q_optimizer.step()
# update target q function
self.model.update_target_critic(self.critic_tau)
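# update_target_critic presumably performs a soft (Polyak) update,
# target_params <- tau * params + (1 - tau) * target_params,
# so a small critic_tau makes the target network change slowly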
loss_critic = critic_loss_q.detach() + critic_loss_v.detach()
# Update policy with collected trajectories - no weighting
loss = self.model.loss(
actions_b,
obs_b,
)
self.actor_optimizer.zero_grad()
loss.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor.parameters(), self.max_grad_norm
)
self.actor_optimizer.step()
# Update lr
self.actor_lr_scheduler.step()
self.critic_v_lr_scheduler.step()
self.critic_q_lr_scheduler.step()
# Save model
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
self.save_model()
# Log loss and save metrics
run_results.append(
{
"itr": self.itr,
}
)
if self.itr % self.log_freq == 0:
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"success rate - eval": success_rate,
"avg episode reward - eval": avg_episode_reward,
"avg best reward - eval": avg_best_reward,
"num episode - eval": num_episode_finished,
},
step=self.itr,
commit=False,
)
run_results[-1]["eval_success_rate"] = success_rate
run_results[-1]["eval_episode_reward"] = avg_episode_reward
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"loss - critic": loss_critic,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["loss_critic"] = loss_critic
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = timer()
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,112 @@
"""
Parent PPO fine-tuning agent class.
"""
from typing import Optional
import torch
import logging
from util.scheduler import CosineAnnealingWarmupRestarts
log = logging.getLogger(__name__)
from agent.finetune.train_agent import TrainAgent
from util.reward_scaling import RunningRewardScaler
class TrainPPOAgent(TrainAgent):
def __init__(self, cfg):
super().__init__(cfg)
# Batch size for logprob calculations after an iteration --- prevents running out of memory when using a single large batch
self.logprob_batch_size = cfg.train.get("logprob_batch_size", 10000)
assert (
self.logprob_batch_size % self.n_envs == 0
), "logprob_batch_size must be divisible by n_envs"
# Note: the discount factor gamma here is applied to the reward every act_steps, instead of every env step
self.gamma = cfg.train.gamma
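# Illustrative: with act_steps=4 and gamma=0.99 (hypothetical numbers), the
# effective per-env-step discount is roughly 0.99 ** (1 / 4) ~= 0.9975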
# Warm-up period for critic updates before actor updates begin
self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
# Optimizer
self.actor_optimizer = torch.optim.AdamW(
self.model.actor_ft.parameters(),
lr=cfg.train.actor_lr,
weight_decay=cfg.train.actor_weight_decay,
)
# use cosine scheduler with linear warmup
self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
self.actor_optimizer,
first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.actor_lr,
min_lr=cfg.train.actor_lr_scheduler.min_lr,
warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
gamma=1.0,
)
self.critic_optimizer = torch.optim.AdamW(
self.model.critic.parameters(),
lr=cfg.train.critic_lr,
weight_decay=cfg.train.critic_weight_decay,
)
self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
self.critic_optimizer,
first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.critic_lr,
min_lr=cfg.train.critic_lr_scheduler.min_lr,
warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
gamma=1.0,
)
# Generalized advantage estimation
self.gae_lambda: float = cfg.train.get("gae_lambda", 0.95)
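# gae_lambda interpolates between one-step TD advantages (lambda=0) and
# full Monte Carlo returns minus the value baseline (lambda=1)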
# If specified, stop gradient updates once the approximate KL divergence exceeds it
self.target_kl: Optional[float] = cfg.train.target_kl
# Number of times the collected data is used in gradient update
self.update_epochs: int = cfg.train.update_epochs
# Entropy loss coefficient
self.ent_coef: float = cfg.train.get("ent_coef", 0)
# Value loss coefficient
self.vf_coef: float = cfg.train.get("vf_coef", 0)
# Whether to use running reward scaling
self.reward_scale_running: bool = cfg.train.reward_scale_running
if self.reward_scale_running:
self.running_reward_scaler = RunningRewardScaler(self.n_envs)
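# RunningRewardScaler presumably normalizes rewards by a running estimate of the
# return scale, tracked per environment with episode starts as boundaries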
# Scaling reward with constant
self.reward_scale_const: float = cfg.train.get("reward_scale_const", 1)
# Whether to regularize fine-tuning with a behavior-cloning (BC) loss toward the base policy
self.use_bc_loss: bool = cfg.train.get("use_bc_loss", False)
self.bc_loss_coeff: float = cfg.train.get("bc_loss_coeff", 0)
def reset_actor_optimizer(self):
"""Not used anywhere currently"""
new_optimizer = torch.optim.AdamW(
self.model.actor_ft.parameters(),
lr=self.cfg.train.actor_lr,
weight_decay=self.cfg.train.actor_weight_decay,
)
new_optimizer.load_state_dict(self.actor_optimizer.state_dict())
self.actor_optimizer = new_optimizer
new_scheduler = CosineAnnealingWarmupRestarts(
self.actor_optimizer,
first_cycle_steps=self.cfg.train.actor_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=self.cfg.train.actor_lr,
min_lr=self.cfg.train.actor_lr_scheduler.min_lr,
warmup_steps=self.cfg.train.actor_lr_scheduler.warmup_steps,
gamma=1.0,
)
new_scheduler.load_state_dict(self.actor_lr_scheduler.state_dict())
self.actor_lr_scheduler = new_scheduler
log.info("Reset actor optimizer")

View File

@ -0,0 +1,452 @@
"""
DPPO fine-tuning.
"""
import os
import pickle
import einops
import numpy as np
import torch
import logging
import wandb
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.finetune.train_ppo_agent import TrainPPOAgent
from util.scheduler import CosineAnnealingWarmupRestarts
class TrainPPODiffusionAgent(TrainPPOAgent):
def __init__(self, cfg):
super().__init__(cfg)
# Reward horizon --- always set to act_steps for now
self.reward_horizon = cfg.get("reward_horizon", self.act_steps)
# Eta - between DDIM (=0 for eval) and DDPM (=1 for training)
self.learn_eta = self.model.learn_eta
if self.learn_eta:
self.eta_update_interval = cfg.train.eta_update_interval
self.eta_optimizer = torch.optim.AdamW(
self.model.eta.parameters(),
lr=cfg.train.eta_lr,
weight_decay=cfg.train.eta_weight_decay,
)
self.eta_lr_scheduler = CosineAnnealingWarmupRestarts(
self.eta_optimizer,
first_cycle_steps=cfg.train.eta_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.eta_lr,
min_lr=cfg.train.eta_lr_scheduler.min_lr,
warmup_steps=cfg.train.eta_lr_scheduler.warmup_steps,
gamma=1.0,
)
def run(self):
# Start training loop
timer = Timer()
run_results = []
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
# Prepare video paths for each env --- only applies to the first set of episodes if resets are allowed within an iteration and each iteration has multiple episodes from one env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
for env_ind in range(self.n_render):
options_venv[env_ind]["video_path"] = os.path.join(
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
)
# Define train or eval - all envs restart
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
self.model.eval() if eval_mode else self.model.train()
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
dones_trajs = np.empty((0, self.n_envs))
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv  # if done at the end of last iteration, then the envs are just reset
)
last_itr_eval = eval_mode
# Holder
obs_trajs = np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
chains_trajs = np.empty(
(
0,
self.n_envs,
self.model.ft_denoising_steps + 1,
self.horizon_steps,
self.action_dim,
)
)
reward_trajs = np.empty((0, self.n_envs))
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
obs_full_trajs = np.vstack(
(obs_full_trajs, prev_obs_venv[None].squeeze(2))
) # remove cond_step dim
# Collect a set of trajectories from env
for step in range(self.n_steps):
if step % 10 == 0:
print(f"Processed step {step} of {self.n_steps}")
# Select action
with torch.no_grad():
samples = self.model(
cond=torch.from_numpy(prev_obs_venv).float().to(self.device),
deterministic=eval_mode,
return_chain=True,
)
output_venv = (
samples.trajectories.cpu().numpy()
) # n_env x horizon x act
chains_venv = (
samples.chains.cpu().numpy()
) # n_env x denoising x horizon x act
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
)
if self.save_full_observations:
obs_full_venv = np.vstack(
[info["full_obs"][None] for info in info_venv]
) # n_envs x n_act_steps x obs_dim
obs_full_trajs = np.vstack(
(obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
)
obs_trajs = np.vstack((obs_trajs, prev_obs_venv[None]))
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
dones_trajs = np.vstack((dones_trajs, done_venv[None]))
firsts_trajs[step + 1] = done_venv
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
for i in range(len(env_steps) - 1):
start = env_steps[i]
end = env_steps[i + 1]
if end - start > 1:
episodes_start_end.append((env_ind, start, end - 1))
if len(episodes_start_end) > 0:
reward_trajs_split = [
reward_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
num_episode_finished = len(reward_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
if (
self.furniture_sparse_reward
): # only for furniture tasks, where reward only occurs in one env step
episode_best_reward = episode_reward
else:
episode_best_reward = np.array(
[
np.max(reward_traj) / self.act_steps
for reward_traj in reward_trajs_split
]
)
avg_episode_reward = np.mean(episode_reward)
avg_best_reward = np.mean(episode_best_reward)
success_rate = np.mean(
episode_best_reward >= self.best_reward_threshold_for_success
)
else:
episode_reward = np.array([])
num_episode_finished = 0
avg_episode_reward = 0
avg_best_reward = 0
success_rate = 0
log.info("[WARNING] No episode completed within the iteration!")
# Update models
if not eval_mode:
with torch.no_grad():
# Calculate value and logprobs - split into batches to prevent out of memory
obs_t = einops.rearrange(
torch.from_numpy(obs_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
obs_ts = torch.split(obs_t, self.logprob_batch_size, dim=0)
values_trajs = np.empty((0, self.n_envs))
for obs in obs_ts:
values = self.model.critic(obs).cpu().numpy().flatten()
values_trajs = np.vstack(
(values_trajs, values.reshape(-1, self.n_envs))
)
chains_t = einops.rearrange(
torch.from_numpy(chains_trajs).float().to(self.device),
"s e t h d -> (s e) t h d",
)
chains_ts = torch.split(chains_t, self.logprob_batch_size, dim=0)
logprobs_trajs = np.empty(
(
0,
self.model.ft_denoising_steps,
self.horizon_steps,
self.action_dim,
)
)
for obs, chains in zip(obs_ts, chains_ts):
logprobs = self.model.get_logprobs(obs, chains).cpu().numpy()
logprobs_trajs = np.vstack(
(
logprobs_trajs,
logprobs.reshape(-1, *logprobs_trajs.shape[1:]),
)
)
# normalize reward with running variance if specified
if self.reward_scale_running:
reward_trajs_transpose = self.running_reward_scaler(
reward=reward_trajs.T, first=firsts_trajs[:-1].T
)
reward_trajs = reward_trajs_transpose.T
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
obs_venv_ts = torch.from_numpy(obs_venv).float().to(self.device)
with torch.no_grad():
next_value = (
self.model.critic(obs_venv_ts).reshape(1, -1).cpu().numpy()
)
advantages_trajs = np.zeros_like(reward_trajs)
lastgaelam = 0
for t in reversed(range(self.n_steps)):
if t == self.n_steps - 1:
nextnonterminal = 1.0 - done_venv
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones_trajs[t + 1]
nextvalues = values_trajs[t + 1]
# delta = r + gamma*V(st+1) - V(st)
delta = (
reward_trajs[t] * self.reward_scale_const
+ self.gamma * nextvalues * nextnonterminal
- values_trajs[t]
)
# A = delta_t + gamma*lambda*delta_{t+1} + ...
advantages_trajs[t] = lastgaelam = (
delta
+ self.gamma
* self.gae_lambda
* nextnonterminal
* lastgaelam
)
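# The backward recursion above implements
# A_t = delta_t + gamma * lambda * (1 - done_{t+1}) * A_{t+1},
# i.e. A_t = sum_{l>=0} (gamma * lambda)^l * delta_{t+l} within an episode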
returns_trajs = advantages_trajs + values_trajs
# k for environment step
obs_k = einops.rearrange(
torch.tensor(obs_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
chains_k = einops.rearrange(
torch.tensor(chains_trajs).float().to(self.device),
"s e t h d -> (s e) t h d",
)
returns_k = (
torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
)
values_k = (
torch.tensor(values_trajs).float().to(self.device).reshape(-1)
)
advantages_k = (
torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
)
logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
# Update policy and critic
total_steps = self.n_steps * self.n_envs
inds_k = np.arange(total_steps)
clipfracs = []
for update_epoch in range(self.update_epochs):
# for each epoch, go through all data in batches
flag_break = False
np.random.shuffle(inds_k)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
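# Illustrative: total_steps=20000 and batch_size=8192 (hypothetical numbers) give
# 2 minibatches per epoch; the remaining 20000 - 2 * 8192 = 3616 samples are dropped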
for batch in range(num_batch):
start = batch * self.batch_size
end = start + self.batch_size
inds_b = inds_k[start:end] # b for batch
obs_b = obs_k[inds_b]
chains_b = chains_k[inds_b]
returns_b = returns_k[inds_b]
values_b = values_k[inds_b]
advantages_b = advantages_k[inds_b]
logprobs_b = logprobs_k[inds_b]
# get loss
(
pg_loss,
entropy_loss,
v_loss,
clipfrac,
approx_kl,
ratio,
bc_loss,
eta,
) = self.model.loss(
obs_b,
chains_b,
returns_b,
values_b,
advantages_b,
logprobs_b,
use_bc_loss=self.use_bc_loss,
reward_horizon=self.reward_horizon,
)
loss = (
pg_loss
+ entropy_loss * self.ent_coef
+ v_loss * self.vf_coef
+ bc_loss * self.bc_loss_coeff
)
clipfracs += [clipfrac]
# update policy and critic
self.actor_optimizer.zero_grad()
self.critic_optimizer.zero_grad()
if self.learn_eta:
self.eta_optimizer.zero_grad()
loss.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor_ft.parameters(), self.max_grad_norm
)
self.actor_optimizer.step()
if self.learn_eta and batch % self.eta_update_interval == 0:
self.eta_optimizer.step()
self.critic_optimizer.step()
log.info(
f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}"
)
# Stop gradient updates if the approximate KL divergence exceeds the target
if self.target_kl is not None and approx_kl > self.target_kl:
flag_break = True
break
if flag_break:
break
# Explained variation of future rewards using value function
y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy()
var_y = np.var(y_true)
explained_var = (
np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
)
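# explained_var = 1 - Var[returns - values] / Var[returns]:
# 1 means the critic predicts returns perfectly, while <= 0 means it does no
# better than always predicting the mean return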
# Plot state trajectories in D3IL
if (
self.itr % self.render_freq == 0
and self.n_render > 0
and self.traj_plotter is not None
):
self.traj_plotter(
obs_full_trajs=obs_full_trajs,
n_render=self.n_render,
max_episode_steps=self.max_episode_steps,
render_dir=self.render_dir,
itr=self.itr,
)
# Update lr, min_sampling_std
if self.itr >= self.n_critic_warmup_itr:
self.actor_lr_scheduler.step()
if self.learn_eta:
self.eta_lr_scheduler.step()
self.critic_lr_scheduler.step()
self.model.step()
diffusion_min_sampling_std = self.model.get_min_sampling_denoising_std()
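# model.step() presumably anneals the minimum denoising sampling std
# (the exploration noise floor); its current value is logged below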
# Save model
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
self.save_model()
# Log loss and save metrics
run_results.append(
{
"itr": self.itr,
}
)
if self.save_trajs:
run_results[-1]["obs_full_trajs"] = obs_full_trajs
run_results[-1]["obs_trajs"] = obs_trajs
run_results[-1]["chains_trajs"] = chains_trajs
run_results[-1]["reward_trajs"] = reward_trajs
if self.itr % self.log_freq == 0:
time = timer()
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"success rate - eval": success_rate,
"avg episode reward - eval": avg_episode_reward,
"avg best reward - eval": avg_best_reward,
"num episode - eval": num_episode_finished,
},
step=self.itr,
commit=False,
)
run_results[-1]["eval_success_rate"] = success_rate
run_results[-1]["eval_episode_reward"] = avg_episode_reward
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"pg loss": pg_loss,
"value loss": v_loss,
"bc loss": bc_loss,
"eta": eta,
"approx kl": approx_kl,
"ratio": ratio,
"clipfrac": np.mean(clipfracs),
"explained variance": explained_var,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
"diffusion - min sampling std": diffusion_min_sampling_std,
"actor lr": self.actor_optimizer.param_groups[0]["lr"],
"critic lr": self.critic_optimizer.param_groups[0][
"lr"
],
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["pg_loss"] = pg_loss
run_results[-1]["value_loss"] = v_loss
run_results[-1]["bc_loss"] = bc_loss
run_results[-1]["eta"] = eta
run_results[-1]["approx_kl"] = approx_kl
run_results[-1]["ratio"] = ratio
run_results[-1]["clip_frac"] = np.mean(clipfracs)
run_results[-1]["explained_variance"] = explained_var
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,468 @@
"""
DPPO fine-tuning for pixel observations.
"""
import os
import pickle
import einops
import numpy as np
import torch
import logging
import wandb
import math
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.finetune.train_ppo_diffusion_agent import TrainPPODiffusionAgent
from model.common.modules import RandomShiftsAug
class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
def __init__(self, cfg):
super().__init__(cfg)
# Image randomization
self.augment = cfg.train.augment
if self.augment:
self.aug = RandomShiftsAug(pad=4)
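# RandomShiftsAug(pad=4) presumably applies DrQ-style augmentation: pad each image
# by 4 pixels on every side, then take a random crop back to the original size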
# Set obs dim - we will save the different obs in batch in a dict
shape_meta = cfg.shape_meta
self.obs_dims = {k: shape_meta.obs[k]["shape"] for k in shape_meta.obs.keys()}
# Gradient accumulation to deal with large GPU RAM usage
self.grad_accumulate = cfg.train.grad_accumulate
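# Gradients are accumulated over grad_accumulate minibatches before each optimizer
# step (see the update loop below), so the effective batch size is
# batch_size * grad_accumulate; note the loss is not divided by grad_accumulate,
# so accumulated gradients are summed rather than averaged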
def run(self):
# Start training loop
timer = Timer()
run_results = []
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
# Prepare video paths for each env --- only applies to the first set of episodes if resets are allowed within an iteration and each iteration has multiple episodes from one env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
for env_ind in range(self.n_render):
options_venv[env_ind]["video_path"] = os.path.join(
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
)
# Define train or eval - all envs restart
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
self.model.eval() if eval_mode else self.model.train()
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
dones_trajs = np.empty((0, self.n_envs))
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv  # if done at the end of last iteration, then the envs are just reset
)
last_itr_eval = eval_mode
# Holder
obs_trajs = {
k: np.empty((0, self.n_envs, self.n_cond_step, *self.obs_dims[k]))
for k in self.obs_dims
}
chains_trajs = np.empty(
(
0,
self.n_envs,
self.model.ft_denoising_steps + 1,
self.horizon_steps,
self.action_dim,
)
)
reward_trajs = np.empty((0, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
if step % 10 == 0:
print(f"Processed step {step} of {self.n_steps}")
# Select action
with torch.no_grad():
cond = {
key: torch.from_numpy(prev_obs_venv[key])
.float()
.to(self.device)
for key in self.obs_dims.keys()
} # batch each type of obs and put into dict
samples = self.model(
cond=cond,
deterministic=eval_mode,
return_chain=True,
)
output_venv = (
samples.trajectories.cpu().numpy()
) # n_env x horizon x act
chains_venv = (
samples.chains.cpu().numpy()
) # n_env x denoising x horizon x act
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
)
for k in obs_trajs.keys():
obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
dones_trajs = np.vstack((dones_trajs, done_venv[None]))
firsts_trajs[step + 1] = done_venv
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
for i in range(len(env_steps) - 1):
start = env_steps[i]
end = env_steps[i + 1]
if end - start > 1:
episodes_start_end.append((env_ind, start, end - 1))
if len(episodes_start_end) > 0:
reward_trajs_split = [
reward_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
num_episode_finished = len(reward_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
episode_best_reward = np.array(
[
np.max(reward_traj) / self.act_steps
for reward_traj in reward_trajs_split
]
)
avg_episode_reward = np.mean(episode_reward)
avg_best_reward = np.mean(episode_best_reward)
success_rate = np.mean(
episode_best_reward >= self.best_reward_threshold_for_success
)
else:
episode_reward = np.array([])
num_episode_finished = 0
avg_episode_reward = 0
avg_best_reward = 0
success_rate = 0
log.info("[WARNING] No episode completed within the iteration!")
# Update
if not eval_mode:
with torch.no_grad():
# apply image randomization
obs_trajs["rgb"] = (
torch.from_numpy(obs_trajs["rgb"]).float().to(self.device)
)
obs_trajs["state"] = (
torch.from_numpy(obs_trajs["state"]).float().to(self.device)
)
if self.augment:
rgb = einops.rearrange(
obs_trajs["rgb"],
"s e t c h w -> (s e t) c h w",
)
rgb = self.aug(rgb)
obs_trajs["rgb"] = einops.rearrange(
rgb,
"(s e t) c h w -> s e t c h w",
s=self.n_steps,
e=self.n_envs,
)
# Calculate value and logprobs - split into batches to prevent out of memory
num_split = math.ceil(
self.n_envs * self.n_steps / self.logprob_batch_size
)
obs_ts = [{} for _ in range(num_split)]
for k in obs_trajs.keys():
obs_k = einops.rearrange(
obs_trajs[k],
"s e ... -> (s e) ...",
)
obs_ts_k = torch.split(obs_k, self.logprob_batch_size, dim=0)
for i, obs_t in enumerate(obs_ts_k):
obs_ts[i][k] = obs_t
values_trajs = np.empty((0, self.n_envs))
for obs in obs_ts:
values = (
self.model.critic(obs, no_augment=True)
.cpu()
.numpy()
.flatten()
)
values_trajs = np.vstack(
(values_trajs, values.reshape(-1, self.n_envs))
)
chains_t = einops.rearrange(
torch.from_numpy(chains_trajs).float().to(self.device),
"s e t h d -> (s e) t h d",
)
chains_ts = torch.split(chains_t, self.logprob_batch_size, dim=0)
logprobs_trajs = np.empty(
(
0,
self.model.ft_denoising_steps,
self.horizon_steps,
self.action_dim,
)
)
for obs, chains in zip(obs_ts, chains_ts):
logprobs = self.model.get_logprobs(obs, chains).cpu().numpy()
logprobs_trajs = np.vstack(
(
logprobs_trajs,
logprobs.reshape(-1, *logprobs_trajs.shape[1:]),
)
)
# normalize reward with running variance if specified
if self.reward_scale_running:
reward_trajs_transpose = self.running_reward_scaler(
reward=reward_trajs.T, first=firsts_trajs[:-1].T
)
reward_trajs = reward_trajs_transpose.T
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
obs_venv_ts = {
key: torch.from_numpy(obs_venv[key]).float().to(self.device)
for key in self.obs_dims.keys()
}
with torch.no_grad():
next_value = (
self.model.critic(obs_venv_ts, no_augment=True)
.reshape(1, -1)
.cpu()
.numpy()
)
advantages_trajs = np.zeros_like(reward_trajs)
lastgaelam = 0
for t in reversed(range(self.n_steps)):
if t == self.n_steps - 1:
nextnonterminal = 1.0 - done_venv
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones_trajs[t + 1]
nextvalues = values_trajs[t + 1]
# delta = r + gamma*V(st+1) - V(st)
delta = (
reward_trajs[t] * self.reward_scale_const
+ self.gamma * nextvalues * nextnonterminal
- values_trajs[t]
)
# A = delta_t + gamma*lambda*delta_{t+1} + ...
advantages_trajs[t] = lastgaelam = (
delta
+ self.gamma
* self.gae_lambda
* nextnonterminal
* lastgaelam
)
returns_trajs = advantages_trajs + values_trajs
# k for environment step
obs_k = {
k: einops.rearrange(
obs_trajs[k],
"s e ... -> (s e) ...",
)
for k in obs_trajs.keys()
}
chains_k = einops.rearrange(
torch.tensor(chains_trajs).float().to(self.device),
"s e t h d -> (s e) t h d",
)
returns_k = (
torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
)
values_k = (
torch.tensor(values_trajs).float().to(self.device).reshape(-1)
)
advantages_k = (
torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
)
logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
# Update policy and critic
total_steps = self.n_steps * self.n_envs
inds_k = np.arange(total_steps)
clipfracs = []
for update_epoch in range(self.update_epochs):
# for each epoch, go through all data in batches
flag_break = False
np.random.shuffle(inds_k)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
for batch in range(num_batch):
start = batch * self.batch_size
end = start + self.batch_size
inds_b = inds_k[start:end] # b for batch
obs_b = {k: obs_k[k][inds_b] for k in obs_k.keys()}
chains_b = chains_k[inds_b]
returns_b = returns_k[inds_b]
values_b = values_k[inds_b]
advantages_b = advantages_k[inds_b]
logprobs_b = logprobs_k[inds_b]
# get loss
(
pg_loss,
entropy_loss,
v_loss,
clipfrac,
approx_kl,
ratio,
bc_loss,
eta,
) = self.model.loss(
obs_b,
chains_b,
returns_b,
values_b,
advantages_b,
logprobs_b,
use_bc_loss=self.use_bc_loss,
reward_horizon=self.reward_horizon,
)
loss = (
pg_loss
+ entropy_loss * self.ent_coef
+ v_loss * self.vf_coef
+ bc_loss * self.bc_loss_coeff
)
clipfracs += [clipfrac]
# update policy and critic
loss.backward()
if (batch + 1) % self.grad_accumulate == 0:
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor_ft.parameters(),
self.max_grad_norm,
)
self.actor_optimizer.step()
if (
self.learn_eta
and batch % self.eta_update_interval == 0
):
self.eta_optimizer.step()
self.critic_optimizer.step()
self.actor_optimizer.zero_grad()
self.critic_optimizer.zero_grad()
if self.learn_eta:
self.eta_optimizer.zero_grad()
log.info(f"run grad update at batch {batch}")
log.info(
f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}"
)
# Stop gradient updates if the approximate KL divergence exceeds the target
if (
self.target_kl is not None
and approx_kl > self.target_kl
and self.itr >= self.n_critic_warmup_itr
):
flag_break = True
break
if flag_break:
break
# Explained variation of future rewards using value function
y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy()
var_y = np.var(y_true)
explained_var = (
np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
)
# Update lr
if self.itr >= self.n_critic_warmup_itr:
self.actor_lr_scheduler.step()
if self.learn_eta:
self.eta_lr_scheduler.step()
self.critic_lr_scheduler.step()
self.model.step()
diffusion_min_sampling_std = self.model.get_min_sampling_denoising_std()
# Save model
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
self.save_model()
# Log loss and save metrics
run_results.append(
{
"itr": self.itr,
}
)
if self.itr % self.log_freq == 0:
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"success rate - eval": success_rate,
"avg episode reward - eval": avg_episode_reward,
"avg best reward - eval": avg_best_reward,
"num episode - eval": num_episode_finished,
},
step=self.itr,
commit=False,
)
run_results[-1]["eval_success_rate"] = success_rate
run_results[-1]["eval_episode_reward"] = avg_episode_reward
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{timer():8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"pg loss": pg_loss,
"value loss": v_loss,
"bc loss": bc_loss,
"eta": eta,
"approx kl": approx_kl,
"ratio": ratio,
"clipfrac": np.mean(clipfracs),
"explained variance": explained_var,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
"diffusion - min sampling std": diffusion_min_sampling_std,
"actor lr": self.actor_optimizer.param_groups[0]["lr"],
"critic lr": self.critic_optimizer.param_groups[0][
"lr"
],
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["pg_loss"] = pg_loss
run_results[-1]["value_loss"] = v_loss
run_results[-1]["bc_loss"] = bc_loss
run_results[-1]["eta"] = eta
run_results[-1]["approx_kl"] = approx_kl
run_results[-1]["ratio"] = ratio
run_results[-1]["clip_frac"] = np.mean(clipfracs)
run_results[-1]["explained_variance"] = explained_var
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = timer()
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,405 @@
"""
Use diffusion exact likelihood for policy gradient.
"""
import os
import pickle
import einops
import numpy as np
import torch
import logging
import wandb
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.finetune.train_ppo_diffusion_agent import TrainPPODiffusionAgent
class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
def __init__(self, cfg):
super().__init__(cfg)
def run(self):
"""
For exact likelihood, we do not need to save the chains.
"""
# Start training loop
timer = Timer()
run_results = []
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
# Prepare video paths for each env --- only applies to the first set of episodes if resets are allowed within an iteration and each iteration has multiple episodes from one env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
for env_ind in range(self.n_render):
options_venv[env_ind]["video_path"] = os.path.join(
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
)
# Define train or eval - all envs restart
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
self.model.eval() if eval_mode else self.model.train()
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
dones_trajs = np.empty((0, self.n_envs))
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv  # if done at the end of last iteration, then the envs are just reset
)
last_itr_eval = eval_mode
# Holder
obs_trajs = np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
samples_trajs = np.empty(
(
0,
self.n_envs,
self.horizon_steps,
self.action_dim,
)
)
chains_trajs = np.empty(
(
0,
self.n_envs,
self.model.ft_denoising_steps + 1,
self.horizon_steps,
self.action_dim,
)
)
reward_trajs = np.empty((0, self.n_envs))
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
obs_full_trajs = np.vstack(
(obs_full_trajs, prev_obs_venv[None].squeeze(2))
) # remove cond_step dim
# Collect a set of trajectories from env
for step in range(self.n_steps):
if step % 10 == 0:
print(f"Processed step {step} of {self.n_steps}")
# Select action
with torch.no_grad():
samples = self.model(
cond=torch.from_numpy(prev_obs_venv).float().to(self.device),
deterministic=eval_mode,
return_chain=True,
)
output_venv = (
samples.trajectories.cpu().numpy()
) # n_env x horizon x act
chains_venv = (
samples.chains.cpu().numpy()
) # n_env x denoising x horizon x act
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
)
if self.save_full_observations:
obs_full_venv = np.vstack(
[info["full_obs"][None] for info in info_venv]
) # n_envs x n_act_steps x obs_dim
obs_full_trajs = np.vstack(
(obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
)
obs_trajs = np.vstack((obs_trajs, prev_obs_venv[None]))
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
dones_trajs = np.vstack((dones_trajs, done_venv[None]))
firsts_trajs[step + 1] = done_venv
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
for i in range(len(env_steps) - 1):
start = env_steps[i]
end = env_steps[i + 1]
if end - start > 1:
episodes_start_end.append((env_ind, start, end - 1))
if len(episodes_start_end) > 0:
reward_trajs_split = [
reward_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
num_episode_finished = len(reward_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
episode_best_reward = np.array(
[
np.max(reward_traj) / self.act_steps
for reward_traj in reward_trajs_split
]
)
avg_episode_reward = np.mean(episode_reward)
avg_best_reward = np.mean(episode_best_reward)
success_rate = np.mean(
episode_best_reward >= self.best_reward_threshold_for_success
)
else:
episode_reward = np.array([])
num_episode_finished = 0
avg_episode_reward = 0
avg_best_reward = 0
success_rate = 0
log.info("[WARNING] No episode completed within the iteration!")
# Update
if not eval_mode:
with torch.no_grad():
# Calculate value and logprobs - split into batches to prevent out of memory
obs_t = einops.rearrange(
torch.from_numpy(obs_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
obs_ts = torch.split(obs_t, self.logprob_batch_size, dim=0)
values_trajs = np.empty((0, self.n_envs))
for obs in obs_ts:
values = self.model.critic(obs).cpu().numpy().flatten()
values_trajs = np.vstack(
(values_trajs, values.reshape(-1, self.n_envs))
)
samples_t = einops.rearrange(
torch.from_numpy(samples_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
samples_ts = torch.split(samples_t, self.logprob_batch_size, dim=0)
logprobs_trajs = np.empty((0))
for obs, samples in zip(obs_ts, samples_ts):
logprobs = (
self.model.get_exact_logprobs(obs, samples).cpu().numpy()
)
logprobs_trajs = np.concatenate((logprobs_trajs, logprobs))
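# get_exact_logprobs presumably evaluates the diffusion policy's exact action
# log-likelihood (e.g., via the probability-flow ODE), yielding a single scalar
# per sampled action instead of per-denoising-step chain logprobs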
# normalize reward with running variance if specified
if self.reward_scale_running:
reward_trajs_transpose = self.running_reward_scaler(
reward=reward_trajs.T, first=firsts_trajs[:-1].T
)
reward_trajs = reward_trajs_transpose.T
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
obs_venv_ts = torch.from_numpy(obs_venv).float().to(self.device)
with torch.no_grad():
next_value = (
self.model.critic(obs_venv_ts).reshape(1, -1).cpu().numpy()
)
advantages_trajs = np.zeros_like(reward_trajs)
lastgaelam = 0
for t in reversed(range(self.n_steps)):
if t == self.n_steps - 1:
nextnonterminal = 1.0 - done_venv
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones_trajs[t + 1]
nextvalues = values_trajs[t + 1]
# delta = r + gamma*V(st+1) - V(st)
delta = (
reward_trajs[t] * self.reward_scale_const
+ self.gamma * nextvalues * nextnonterminal
- values_trajs[t]
)
# A = delta_t + gamma*lambda*delta_{t+1} + ...
advantages_trajs[t] = lastgaelam = (
delta
+ self.gamma
* self.gae_lambda
* nextnonterminal
* lastgaelam
)
returns_trajs = advantages_trajs + values_trajs
# k for environment step
obs_k = einops.rearrange(
torch.tensor(obs_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
samples_k = einops.rearrange(
torch.tensor(samples_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
returns_k = (
torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
)
values_k = (
torch.tensor(values_trajs).float().to(self.device).reshape(-1)
)
advantages_k = (
torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
)
logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
# Update policy and critic
total_steps = self.n_steps * self.n_envs
inds_k = np.arange(total_steps)
clipfracs = []
for update_epoch in range(self.update_epochs):
# for each epoch, go through all data in batches
flag_break = False
np.random.shuffle(inds_k)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
for batch in range(num_batch):
start = batch * self.batch_size
end = start + self.batch_size
inds_b = inds_k[start:end] # b for batch
obs_b = obs_k[inds_b]
samples_b = samples_k[inds_b]
returns_b = returns_k[inds_b]
values_b = values_k[inds_b]
advantages_b = advantages_k[inds_b]
logprobs_b = logprobs_k[inds_b]
# get loss
(
pg_loss,
v_loss,
clipfrac,
approx_kl,
ratio,
bc_loss,
) = self.model.loss(
obs_b,
samples_b,
returns_b,
values_b,
advantages_b,
logprobs_b,
use_bc_loss=self.use_bc_loss,
reward_horizon=self.reward_horizon,
)
loss = (
pg_loss
+ v_loss * self.vf_coef
+ bc_loss * self.bc_loss_coeff
)
clipfracs += [clipfrac]
# update policy and critic
self.actor_optimizer.zero_grad()
self.critic_optimizer.zero_grad()
loss.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor_ft.parameters(), self.max_grad_norm
)
self.actor_optimizer.step()
self.critic_optimizer.step()
log.info(
f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}"
)
# Stop gradient updates if the approximate KL divergence exceeds the target
if self.target_kl is not None and approx_kl > self.target_kl:
flag_break = True
break
if flag_break:
break
# Explained variation of future rewards using value function
y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy()
var_y = np.var(y_true)
explained_var = (
np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
)
# Plot state trajectories
if (
self.itr % self.render_freq == 0
and self.n_render > 0
and self.traj_plotter is not None
):
self.traj_plotter(
obs_full_trajs=obs_full_trajs,
n_render=self.n_render,
max_episode_steps=self.max_episode_steps,
render_dir=self.render_dir,
itr=self.itr,
)
# Update lr
if self.itr >= self.n_critic_warmup_itr:
self.actor_lr_scheduler.step()
self.critic_lr_scheduler.step()
# Save model
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
self.save_model()
# Log loss and save metrics
run_results.append(
{
"itr": self.itr,
}
)
if self.save_trajs:
run_results[-1]["obs_full_trajs"] = obs_full_trajs
run_results[-1]["obs_trajs"] = obs_trajs
run_results[-1]["action_trajs"] = samples_trajs
run_results[-1]["chains_trajs"] = chains_trajs
run_results[-1]["reward_trajs"] = reward_trajs
if self.itr % self.log_freq == 0:
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"success rate - eval": success_rate,
"avg episode reward - eval": avg_episode_reward,
"avg best reward - eval": avg_best_reward,
"num episode - eval": num_episode_finished,
},
step=self.itr,
commit=False,
)
run_results[-1]["eval_success_rate"] = success_rate
run_results[-1]["eval_episode_reward"] = avg_episode_reward
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{timer():8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"pg loss": pg_loss,
"value loss": v_loss,
"approx kl": approx_kl,
"ratio": ratio,
"clipfrac": np.mean(clipfracs),
"explained variance": explained_var,
"avg episode reward - train": avg_episode_reward,
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["pg_loss"] = pg_loss
run_results[-1]["value_loss"] = v_loss
run_results[-1]["approx_kl"] = approx_kl
run_results[-1]["ratio"] = ratio
run_results[-1]["clip_frac"] = np.mean(clipfracs)
run_results[-1]["explained_variance"] = explained_var
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = timer()
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,404 @@
"""
PPO training for Gaussian/GMM policy.
"""
import os
import pickle
import einops
import numpy as np
import torch
import logging
import wandb
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.finetune.train_ppo_agent import TrainPPOAgent
class TrainPPOGaussianAgent(TrainPPOAgent):
def __init__(self, cfg):
super().__init__(cfg)
def run(self):
# Start training loop
timer = Timer()
run_results = []
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
# Prepare video paths for each env --- only applies to the first set of episodes if resets are allowed within an iteration and each iteration has multiple episodes from one env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
for env_ind in range(self.n_render):
options_venv[env_ind]["video_path"] = os.path.join(
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
)
# Define train or eval - all envs restart
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
self.model.eval() if eval_mode else self.model.train()
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
dones_trajs = np.empty((0, self.n_envs))
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv  # if done at the end of last iteration, then the envs are just reset
)
last_itr_eval = eval_mode
# Holder
obs_trajs = np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
samples_trajs = np.empty(
(
0,
self.n_envs,
self.horizon_steps,
self.action_dim,
)
)
reward_trajs = np.empty((0, self.n_envs))
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
obs_full_trajs = np.vstack(
(obs_full_trajs, prev_obs_venv[None].squeeze(2))
) # remove cond_step dim
# Collect a set of trajectories from env
for step in range(self.n_steps):
if step % 10 == 0:
print(f"Processed step {step} of {self.n_steps}")
# Select action
with torch.no_grad():
samples = self.model(
cond=torch.from_numpy(prev_obs_venv).float().to(self.device),
deterministic=eval_mode,
)
output_venv = samples.cpu().numpy()
action_venv = output_venv[:, : self.act_steps, : self.action_dim]
obs_trajs = np.vstack((obs_trajs, prev_obs_venv[None]))
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
)
if self.save_full_observations:
obs_full_venv = np.vstack(
[info["full_obs"][None] for info in info_venv]
) # n_envs x n_act_steps x obs_dim
obs_full_trajs = np.vstack(
(obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
dones_trajs = np.vstack((dones_trajs, done_venv[None]))
firsts_trajs[step + 1] = done_venv
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
for i in range(len(env_steps) - 1):
start = env_steps[i]
end = env_steps[i + 1]
if end - start > 1:
episodes_start_end.append((env_ind, start, end - 1))
if len(episodes_start_end) > 0:
reward_trajs_split = [
reward_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
num_episode_finished = len(reward_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
if (
self.furniture_sparse_reward
): # only for furniture tasks, where reward only occurs in one env step
episode_best_reward = episode_reward
else:
episode_best_reward = np.array(
[
np.max(reward_traj) / self.act_steps
for reward_traj in reward_trajs_split
]
)
avg_episode_reward = np.mean(episode_reward)
avg_best_reward = np.mean(episode_best_reward)
success_rate = np.mean(
episode_best_reward >= self.best_reward_threshold_for_success
)
else:
episode_reward = np.array([])
num_episode_finished = 0
avg_episode_reward = 0
avg_best_reward = 0
success_rate = 0
log.info("[WARNING] No episode completed within the iteration!")
# Update
if not eval_mode:
with torch.no_grad():
# Calculate value and logprobs - split into batches to prevent out of memory
obs_t = einops.rearrange(
torch.from_numpy(obs_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
obs_ts = torch.split(obs_t, self.logprob_batch_size, dim=0)
values_trajs = np.empty((0, self.n_envs))
for obs in obs_ts:
values = self.model.critic(obs).cpu().numpy().flatten()
values_trajs = np.vstack(
(values_trajs, values.reshape(-1, self.n_envs))
)
samples_t = einops.rearrange(
torch.from_numpy(samples_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
samples_ts = torch.split(samples_t, self.logprob_batch_size, dim=0)
logprobs_trajs = np.empty((0))
for obs_t, samples_t in zip(obs_ts, samples_ts):
logprobs = (
self.model.get_logprobs(obs_t, samples_t)[0].cpu().numpy()
)
logprobs_trajs = np.concatenate(
(
logprobs_trajs,
logprobs.reshape(-1),
)
)
# normalize reward with running variance if specified
if self.reward_scale_running:
reward_trajs_transpose = self.running_reward_scaler(
reward=reward_trajs.T, first=firsts_trajs[:-1].T
)
reward_trajs = reward_trajs_transpose.T
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
obs_venv_ts = torch.from_numpy(obs_venv).float().to(self.device)
with torch.no_grad():
next_value = (
self.model.critic(obs_venv_ts).reshape(1, -1).cpu().numpy()
)
advantages_trajs = np.zeros_like(reward_trajs)
lastgaelam = 0
for t in reversed(range(self.n_steps)):
if t == self.n_steps - 1:
nextnonterminal = 1.0 - done_venv
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones_trajs[t + 1]
nextvalues = values_trajs[t + 1]
# delta = r + gamma*V(st+1) - V(st)
delta = (
reward_trajs[t] * self.reward_scale_const
+ self.gamma * nextvalues * nextnonterminal
- values_trajs[t]
)
# A = delta_t + gamma*lambda*delta_{t+1} + ...
advantages_trajs[t] = lastgaelam = (
delta
+ self.gamma
* self.gae_lambda
* nextnonterminal
* lastgaelam
)
returns_trajs = advantages_trajs + values_trajs
# k for environment step
obs_k = einops.rearrange(
torch.tensor(obs_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
samples_k = einops.rearrange(
torch.tensor(samples_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
returns_k = (
torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
)
values_k = (
torch.tensor(values_trajs).float().to(self.device).reshape(-1)
)
advantages_k = (
torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
)
logprobs_k = (
torch.tensor(logprobs_trajs).float().to(self.device).reshape(-1)
)
# Update policy and critic
total_steps = self.n_steps * self.n_envs
inds_k = np.arange(total_steps)
clipfracs = []
for update_epoch in range(self.update_epochs):
# for each epoch, go through all data in batches
flag_break = False
np.random.shuffle(inds_k)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
for batch in range(num_batch):
start = batch * self.batch_size
end = start + self.batch_size
inds_b = inds_k[start:end] # b for batch
obs_b = obs_k[inds_b]
samples_b = samples_k[inds_b]
returns_b = returns_k[inds_b]
values_b = values_k[inds_b]
advantages_b = advantages_k[inds_b]
logprobs_b = logprobs_k[inds_b]
# get loss
(
pg_loss,
entropy_loss,
v_loss,
clipfrac,
approx_kl,
ratio,
bc_loss,
std,
) = self.model.loss(
obs_b,
samples_b,
returns_b,
values_b,
advantages_b,
logprobs_b,
use_bc_loss=self.use_bc_loss,
)
loss = (
pg_loss
+ entropy_loss * self.ent_coef
+ v_loss * self.vf_coef
+ bc_loss * self.bc_loss_coeff
)
clipfracs += [clipfrac]
# update policy and critic
self.actor_optimizer.zero_grad()
self.critic_optimizer.zero_grad()
loss.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor_ft.parameters(), self.max_grad_norm
)
self.actor_optimizer.step()
self.critic_optimizer.step()
log.info(
f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}"
)
# Stop gradient updates if the approximate KL divergence exceeds the target
if self.target_kl is not None and approx_kl > self.target_kl:
flag_break = True
break
if flag_break:
break
# Explained variation of future rewards using value function
y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy()
var_y = np.var(y_true)
explained_var = (
np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
)
# Plot state trajectories
if (
self.itr % self.render_freq == 0
and self.n_render > 0
and self.traj_plotter is not None
):
self.traj_plotter(
obs_full_trajs=obs_full_trajs,
n_render=self.n_render,
max_episode_steps=self.max_episode_steps,
render_dir=self.render_dir,
itr=self.itr,
)
# Update lr
if self.itr >= self.n_critic_warmup_itr:
self.actor_lr_scheduler.step()
self.critic_lr_scheduler.step()
# Save model
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
self.save_model()
# Log loss and save metrics
run_results.append(
{
"itr": self.itr,
}
)
if self.save_trajs:
run_results[-1]["obs_full_trajs"] = obs_full_trajs
run_results[-1]["obs_trajs"] = obs_trajs
run_results[-1]["action_trajs"] = samples_trajs
run_results[-1]["reward_trajs"] = reward_trajs
if self.itr % self.log_freq == 0:
time = timer()
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"success rate - eval": success_rate,
"avg episode reward - eval": avg_episode_reward,
"avg best reward - eval": avg_best_reward,
"num episode - eval": num_episode_finished,
},
step=self.itr,
commit=False,
)
run_results[-1]["eval_success_rate"] = success_rate
run_results[-1]["eval_episode_reward"] = avg_episode_reward
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | ent {-entropy_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"pg loss": pg_loss,
"value loss": v_loss,
"entropy": -entropy_loss,
"std": std,
"approx kl": approx_kl,
"ratio": ratio,
"clipfrac": np.mean(clipfracs),
"explained variance": explained_var,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["pg_loss"] = pg_loss
run_results[-1]["value_loss"] = v_loss
run_results[-1]["entropy_loss"] = entropy_loss
run_results[-1]["approx_kl"] = approx_kl
run_results[-1]["ratio"] = ratio
run_results[-1]["clip_frac"] = np.mean(clipfracs)
run_results[-1]["explained_variance"] = explained_var
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = time
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1

View File

@ -0,0 +1,443 @@
"""
PPO training for Gaussian/GMM policy with pixel observations.
"""
import os
import pickle
import einops
import numpy as np
import torch
import logging
import wandb
import math
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.finetune.train_ppo_gaussian_agent import TrainPPOGaussianAgent
from model.common.modules import RandomShiftsAug
class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
def __init__(self, cfg):
super().__init__(cfg)
# Image randomization
self.augment = cfg.train.augment
if self.augment:
self.aug = RandomShiftsAug(pad=4)
# Set obs dim - we will save the different obs in batch in a dict
shape_meta = cfg.shape_meta
self.obs_dims = {k: shape_meta.obs[k]["shape"] for k in shape_meta.obs.keys()}
# Gradient accumulation to deal with large GPU RAM usage
self.grad_accumulate = cfg.train.grad_accumulate
def run(self):
# Start training loop
timer = Timer()
run_results = []
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
# Prepare video paths for each env --- only applies to the first set of episodes if resets are allowed within an iteration and each iteration has multiple episodes from one env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
for env_ind in range(self.n_render):
options_venv[env_ind]["video_path"] = os.path.join(
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
)
# Define train or eval - all envs restart
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
self.model.eval() if eval_mode else self.model.train()
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
dones_trajs = np.empty((0, self.n_envs))
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv  # if done at the end of last iteration, then the envs are just reset
)
last_itr_eval = eval_mode
# Holder
obs_trajs = {
k: np.empty((0, self.n_envs, self.n_cond_step, *self.obs_dims[k]))
for k in self.obs_dims
}
samples_trajs = np.empty(
(
0,
self.n_envs,
self.horizon_steps,
self.action_dim,
)
)
reward_trajs = np.empty((0, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
if step % 10 == 0:
print(f"Processed step {step} of {self.n_steps}")
# Select action
with torch.no_grad():
cond = {
key: torch.from_numpy(prev_obs_venv[key])
.float()
.to(self.device)
for key in self.obs_dims.keys()
} # batch each type of obs and put into dict
samples = self.model(
cond=cond,
deterministic=eval_mode,
)
output_venv = samples.cpu().numpy()
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
)
for k in obs_trajs.keys():
obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
dones_trajs = np.vstack((dones_trajs, done_venv[None]))
firsts_trajs[step + 1] = done_venv
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
for i in range(len(env_steps) - 1):
start = env_steps[i]
end = env_steps[i + 1]
if end - start > 1:
episodes_start_end.append((env_ind, start, end - 1))
if len(episodes_start_end) > 0:
reward_trajs_split = [
reward_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
num_episode_finished = len(reward_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
episode_best_reward = np.array(
[
np.max(reward_traj) / self.act_steps
for reward_traj in reward_trajs_split
]
)
avg_episode_reward = np.mean(episode_reward)
avg_best_reward = np.mean(episode_best_reward)
success_rate = np.mean(
episode_best_reward >= self.best_reward_threshold_for_success
)
else:
episode_reward = np.array([])
num_episode_finished = 0
avg_episode_reward = 0
avg_best_reward = 0
success_rate = 0
log.info("[WARNING] No episode completed within the iteration!")
# Update
if not eval_mode:
with torch.no_grad():
# apply image randomization
obs_trajs["rgb"] = (
torch.from_numpy(obs_trajs["rgb"]).float().to(self.device)
)
obs_trajs["state"] = (
torch.from_numpy(obs_trajs["state"]).float().to(self.device)
)
if self.augment:
rgb = einops.rearrange(
obs_trajs["rgb"],
"s e t c h w -> (s e t) c h w",
)
rgb = self.aug(rgb)
obs_trajs["rgb"] = einops.rearrange(
rgb,
"(s e t) c h w -> s e t c h w",
s=self.n_steps,
e=self.n_envs,
)
# Calculate value and logprobs - split into batches to prevent out of memory
num_split = math.ceil(
self.n_envs * self.n_steps / self.logprob_batch_size
)
obs_ts = [{} for _ in range(num_split)]
for k in obs_trajs.keys():
obs_k = einops.rearrange(
obs_trajs[k],
"s e ... -> (s e) ...",
)
obs_ts_k = torch.split(obs_k, self.logprob_batch_size, dim=0)
for i, obs_t in enumerate(obs_ts_k):
obs_ts[i][k] = obs_t
values_trajs = np.empty((0, self.n_envs))
for obs in obs_ts:
values = (
self.model.critic(obs, no_augment=True)
.cpu()
.numpy()
.flatten()
)
values_trajs = np.vstack(
(values_trajs, values.reshape(-1, self.n_envs))
)
samples_t = einops.rearrange(
torch.from_numpy(samples_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
samples_ts = torch.split(samples_t, self.logprob_batch_size, dim=0)
logprobs_trajs = np.empty((0))
for obs_t, samples_t in zip(obs_ts, samples_ts):
logprobs = (
self.model.get_logprobs(obs_t, samples_t)[0].cpu().numpy()
)
logprobs_trajs = np.concatenate(
(
logprobs_trajs,
logprobs.reshape(-1),
)
)
# normalize reward with running variance if specified
if self.reward_scale_running:
reward_trajs_transpose = self.running_reward_scaler(
reward=reward_trajs.T, first=firsts_trajs[:-1].T
)
reward_trajs = reward_trajs_transpose.T
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
obs_venv_ts = {
key: torch.from_numpy(obs_venv[key]).float().to(self.device)
for key in self.obs_dims.keys()
}
with torch.no_grad():
next_value = (
self.model.critic(obs_venv_ts, no_augment=True)
.reshape(1, -1)
.cpu()
.numpy()
)
advantages_trajs = np.zeros_like(reward_trajs)
lastgaelam = 0
for t in reversed(range(self.n_steps)):
if t == self.n_steps - 1:
nextnonterminal = 1.0 - done_venv
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones_trajs[t + 1]
nextvalues = values_trajs[t + 1]
# delta = r + gamma*V(st+1) - V(st)
delta = (
reward_trajs[t] * self.reward_scale_const
+ self.gamma * nextvalues * nextnonterminal
- values_trajs[t]
)
# A = delta_t + gamma*lambda*delta_{t+1} + ...
advantages_trajs[t] = lastgaelam = (
delta
+ self.gamma
* self.gae_lambda
* nextnonterminal
* lastgaelam
)
returns_trajs = advantages_trajs + values_trajs
# k for environment step
obs_k = {
k: einops.rearrange(
obs_trajs[k],
"s e ... -> (s e) ...",
)
for k in obs_trajs.keys()
}
samples_k = einops.rearrange(
torch.tensor(samples_trajs).float().to(self.device),
"s e h d -> (s e) h d",
)
returns_k = (
torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
)
values_k = (
torch.tensor(values_trajs).float().to(self.device).reshape(-1)
)
advantages_k = (
torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
)
logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
# Update policy and critic
total_steps = self.n_steps * self.n_envs
inds_k = np.arange(total_steps)
clipfracs = []
for update_epoch in range(self.update_epochs):
# for each epoch, go through all data in batches
flag_break = False
np.random.shuffle(inds_k)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
for batch in range(num_batch):
start = batch * self.batch_size
end = start + self.batch_size
inds_b = inds_k[start:end] # b for batch
obs_b = {k: obs_k[k][inds_b] for k in obs_k.keys()}
samples_b = samples_k[inds_b]
returns_b = returns_k[inds_b]
values_b = values_k[inds_b]
advantages_b = advantages_k[inds_b]
logprobs_b = logprobs_k[inds_b]
# get loss
(
pg_loss,
entropy_loss,
v_loss,
clipfrac,
approx_kl,
ratio,
bc_loss,
std,
) = self.model.loss(
obs_b,
samples_b,
returns_b,
values_b,
advantages_b,
logprobs_b,
use_bc_loss=self.use_bc_loss,
)
loss = (
pg_loss
+ entropy_loss * self.ent_coef
+ v_loss * self.vf_coef
+ bc_loss * self.bc_loss_coeff
)
clipfracs += [clipfrac]
# update policy and critic
loss.backward()
if (batch + 1) % self.grad_accumulate == 0:
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor_ft.parameters(),
self.max_grad_norm,
)
self.actor_optimizer.step()
self.critic_optimizer.step()
self.actor_optimizer.zero_grad()
self.critic_optimizer.zero_grad()
log.info(f"run grad update at batch {batch}")
log.info(
f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}"
)
# Stop gradient update if KL difference reaches target
if (
self.target_kl is not None
and approx_kl > self.target_kl
and self.itr >= self.n_critic_warmup_itr
):
flag_break = True
break
if flag_break:
break
# Explained variance of future rewards using the value function
y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy()
var_y = np.var(y_true)
explained_var = (
np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
)
# Update lr
if self.itr >= self.n_critic_warmup_itr:
self.actor_lr_scheduler.step()
self.critic_lr_scheduler.step()
# Save model
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
self.save_model()
# Log loss and save metrics
run_results.append(
{
"itr": self.itr,
}
)
if self.itr % self.log_freq == 0:
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"success rate - eval": success_rate,
"avg episode reward - eval": avg_episode_reward,
"avg best reward - eval": avg_best_reward,
"num episode - eval": num_episode_finished,
},
step=self.itr,
commit=False,
)
run_results[-1]["eval_success_rate"] = success_rate
run_results[-1]["eval_episode_reward"] = avg_episode_reward
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{timer():8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"pg loss": pg_loss,
"value loss": v_loss,
"bc loss": bc_loss,
"std": std,
"approx kl": approx_kl,
"ratio": ratio,
"clipfrac": np.mean(clipfracs),
"explained variance": explained_var,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
"actor lr": self.actor_optimizer.param_groups[0]["lr"],
"critic lr": self.critic_optimizer.param_groups[0][
"lr"
],
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["pg_loss"] = pg_loss
run_results[-1]["value_loss"] = v_loss
run_results[-1]["bc_loss"] = bc_loss
run_results[-1]["std"] = std
run_results[-1]["approx_kl"] = approx_kl
run_results[-1]["ratio"] = ratio
run_results[-1]["clip_frac"] = np.mean(clipfracs)
run_results[-1]["explained_variance"] = explained_var
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = timer()
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1
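
The advantage computation above is standard Generalized Advantage Estimation. Below is a self-contained numpy sketch of the same backward recursion for a single environment (textbook done-indexing; the rollout storage above may differ by one index in how dones are stored), useful for sanity-checking the loop:

import numpy as np

def gae_advantages(rewards, values, next_value, dones, gamma=0.99, gae_lambda=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    # A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
    T = len(rewards)
    advantages = np.zeros(T)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nextvalue = next_value if t == T - 1 else values[t + 1]
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * nextvalue * nonterminal - values[t]
        advantages[t] = lastgaelam = delta + gamma * gae_lambda * nonterminal * lastgaelam
    return advantages, advantages + values  # advantages, returns

adv, ret = gae_advantages(
    rewards=np.ones(5), values=np.zeros(5), next_value=0.0, dones=np.array([0.0, 0, 0, 0, 1])
)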

View File

@ -0,0 +1,316 @@
"""
QSM (Q-Score Matching) for diffusion policy.
"""
import os
import pickle
import einops
import numpy as np
import torch
import logging
import wandb
from copy import deepcopy
log = logging.getLogger(__name__)
from util.timer import Timer
from collections import deque
from agent.finetune.train_agent import TrainAgent
from util.scheduler import CosineAnnealingWarmupRestarts
class TrainQSMDiffusionAgent(TrainAgent):
def __init__(self, cfg):
super().__init__(cfg)
# Note: the discount factor gamma here is applied to the reward every act_steps, not every env step
self.gamma = cfg.train.gamma
# Warm-up period for the critic before actor updates
self.n_critic_warmup_itr = cfg.train.n_critic_warmup_itr
# Optimizer
self.actor_optimizer = torch.optim.AdamW(
self.model.actor.parameters(),
lr=cfg.train.actor_lr,
weight_decay=cfg.train.actor_weight_decay,
)
self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
self.actor_optimizer,
first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.actor_lr,
min_lr=cfg.train.actor_lr_scheduler.min_lr,
warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
gamma=1.0,
)
self.critic_optimizer = torch.optim.AdamW(
self.model.critic_q.parameters(),
lr=cfg.train.critic_lr,
weight_decay=cfg.train.critic_weight_decay,
)
self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
self.critic_optimizer,
first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.critic_lr,
min_lr=cfg.train.critic_lr_scheduler.min_lr,
warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
gamma=1.0,
)
# Buffer size
self.buffer_size = cfg.train.buffer_size
# Scaling reward
self.scale_reward_factor = cfg.train.scale_reward_factor
# Updates
self.replay_ratio = cfg.train.replay_ratio
self.critic_tau = cfg.train.critic_tau
self.q_grad_coeff = cfg.train.q_grad_coeff
def run(self):
# make a FIFO replay buffer for obs, action, and reward
obs_buffer = deque(maxlen=self.buffer_size)
action_buffer = deque(maxlen=self.buffer_size)
next_obs_buffer = deque(maxlen=self.buffer_size)
reward_buffer = deque(maxlen=self.buffer_size)
done_buffer = deque(maxlen=self.buffer_size)
first_buffer = deque(maxlen=self.buffer_size)
# Start training loop
timer = Timer()
run_results = []
done_venv = np.zeros((1, self.n_envs))
last_itr_eval = False  # initialized here so the first-iteration reset check below does not fail
while self.itr < self.n_train_itr:
# Prepare video paths for each env --- only applies to the first set of episodes if resets are allowed within an iteration and each iteration collects multiple episodes per env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
for env_ind in range(self.n_render):
options_venv[env_ind]["video_path"] = os.path.join(
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
)
# Define train or eval - all envs restart
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
self.model.eval() if eval_mode else self.model.train()
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
# Reset env at the beginning of an iteration
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
last_itr_eval = eval_mode
reward_trajs = np.empty((0, self.n_envs))
# Collect a set of trajectories from env
for step in range(self.n_steps):
if step % 10 == 0:
print(f"Processed step {step} of {self.n_steps}")
# Select action
with torch.no_grad():
samples = (
self.model(
cond=torch.from_numpy(prev_obs_venv)
.float()
.to(self.device),
deterministic=eval_mode,
)
.cpu()
.numpy()
) # n_env x horizon x act
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
# add to buffer
obs_buffer.append(prev_obs_venv)
action_buffer.append(action_venv)
next_obs_buffer.append(obs_venv)
reward_buffer.append(reward_venv * self.scale_reward_factor)
done_buffer.append(done_venv)
first_buffer.append(firsts_trajs[step])
firsts_trajs[step + 1] = done_venv
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
for i in range(len(env_steps) - 1):
start = env_steps[i]
end = env_steps[i + 1]
if end - start > 1:
episodes_start_end.append((env_ind, start, end - 1))
if len(episodes_start_end) > 0:
reward_trajs_split = [
reward_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
num_episode_finished = len(reward_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
episode_best_reward = np.array(
[
np.max(reward_traj) / self.act_steps
for reward_traj in reward_trajs_split
]
)
avg_episode_reward = np.mean(episode_reward)
avg_best_reward = np.mean(episode_best_reward)
success_rate = np.mean(
episode_best_reward >= self.best_reward_threshold_for_success
)
else:
episode_reward = np.array([])
num_episode_finished = 0
avg_episode_reward = 0
avg_best_reward = 0
success_rate = 0
log.info("[WARNING] No episode completed within the iteration!")
# Update
if not eval_mode:
obs_trajs = np.array(deepcopy(obs_buffer))
action_trajs = np.array(deepcopy(action_buffer))
next_obs_trajs = np.array(deepcopy(next_obs_buffer))
reward_trajs = np.array(deepcopy(reward_buffer))
done_trajs = np.array(deepcopy(done_buffer))
first_trajs = np.array(deepcopy(first_buffer))
# flatten
obs_trajs = einops.rearrange(
obs_trajs,
"s e h d -> (s e) h d",
)
next_obs_trajs = einops.rearrange(
next_obs_trajs,
"s e h d -> (s e) h d",
)
action_trajs = einops.rearrange(
action_trajs,
"s e h d -> (s e) h d",
)
reward_trajs = reward_trajs.reshape(-1)
done_trajs = done_trajs.reshape(-1)
first_trajs = first_trajs.reshape(-1)
num_batch = int(
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
)
for _ in range(num_batch):
# Sample batch
inds = np.random.choice(len(obs_trajs), self.batch_size)
obs_b = torch.from_numpy(obs_trajs[inds]).float().to(self.device)
next_obs_b = (
torch.from_numpy(next_obs_trajs[inds]).float().to(self.device)
)
actions_b = (
torch.from_numpy(action_trajs[inds]).float().to(self.device)
)
reward_b = (
torch.from_numpy(reward_trajs[inds]).float().to(self.device)
)
done_b = torch.from_numpy(done_trajs[inds]).float().to(self.device)
# update critic q function
critic_loss = self.model.loss_critic(
obs_b, next_obs_b, actions_b, reward_b, done_b, self.gamma
)
self.critic_optimizer.zero_grad()
critic_loss.backward()
self.critic_optimizer.step()
# update target q function
self.model.update_target_critic(self.critic_tau)
loss_critic = critic_loss.detach()
# Update policy with collected trajectories
loss = self.model.loss_actor(
obs_b,
actions_b,
self.q_grad_coeff,
)
self.actor_optimizer.zero_grad()
loss.backward()
if self.itr >= self.n_critic_warmup_itr:
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.actor.parameters(), self.max_grad_norm
)
self.actor_optimizer.step()
# Update lr
self.actor_lr_scheduler.step()
self.critic_lr_scheduler.step()
# Save model
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
self.save_model()
# Log loss and save metrics
run_results.append(
{
"itr": self.itr,
}
)
if self.itr % self.log_freq == 0:
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"success rate - eval": success_rate,
"avg episode reward - eval": avg_episode_reward,
"avg best reward - eval": avg_best_reward,
"num episode - eval": num_episode_finished,
},
step=self.itr,
commit=False,
)
run_results[-1]["eval_success_rate"] = success_rate
run_results[-1]["eval_episode_reward"] = avg_episode_reward
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"loss - critic": loss_critic,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["loss_critic"] = loss_critic
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = timer()
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1
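
The update_target_critic(self.critic_tau) call above is defined on the model and not shown in this excerpt; a common choice is a Polyak (soft) update of the target Q-network. A minimal sketch under that assumption, with tau as the mixing weight toward the online network:

import torch

@torch.no_grad()
def soft_update(target_net: torch.nn.Module, online_net: torch.nn.Module, tau: float):
    # target <- (1 - tau) * target + tau * online
    for p_targ, p_online in zip(target_net.parameters(), online_net.parameters()):
        p_targ.data.mul_(1.0 - tau).add_(p_online.data, alpha=tau)

critic = torch.nn.Linear(4, 1)
target_critic = torch.nn.Linear(4, 1)
soft_update(target_critic, critic, tau=0.005)  # tau value chosen only for illustration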

View File

@ -0,0 +1,301 @@
"""
Reward-weighted regression (RWR) for diffusion policy.
"""
import os
import pickle
import numpy as np
import torch
import logging
import wandb
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.finetune.train_agent import TrainAgent
from util.scheduler import CosineAnnealingWarmupRestarts
class TrainRWRDiffusionAgent(TrainAgent):
def __init__(self, cfg):
super().__init__(cfg)
# Note: the discount factor gamma here is applied to the reward every act_steps, not every env step
self.gamma = cfg.train.gamma
# Build optimizer
self.optimizer = torch.optim.AdamW(
self.model.parameters(),
lr=cfg.train.lr,
weight_decay=cfg.train.weight_decay,
)
self.lr_scheduler = CosineAnnealingWarmupRestarts(
self.optimizer,
first_cycle_steps=cfg.train.lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.lr,
min_lr=cfg.train.lr_scheduler.min_lr,
warmup_steps=cfg.train.lr_scheduler.warmup_steps,
gamma=1.0,
)
# Reward exponential
self.beta = cfg.train.beta
# Max weight for AWR
self.max_reward_weight = cfg.train.max_reward_weight
# Updates
self.update_epochs = cfg.train.update_epochs
def run(self):
# Start training loop
timer = Timer()
run_results = []
done_venv = np.zeros((1, self.n_envs))
last_itr_eval = False  # initialized here so the first-iteration reset check below does not fail
while self.itr < self.n_train_itr:
# Prepare video paths for each env --- only applies to the first set of episodes if resets are allowed within an iteration and each iteration collects multiple episodes per env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
for env_ind in range(self.n_render):
options_venv[env_ind]["video_path"] = os.path.join(
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
)
# Define train or eval - all envs restart
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
self.model.eval() if eval_mode else self.model.train()
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
# Reset env at the beginning of an iteration
if self.reset_at_iteration or eval_mode or last_itr_eval:
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
firsts_trajs[0] = 1
else:
firsts_trajs[0] = (
done_venv # if done at the end of last iteration, then the envs are just reset
)
last_itr_eval = eval_mode
reward_trajs = np.empty((0, self.n_envs))
# Holders
obs_trajs = np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
samples_trajs = np.empty(
(
0,
self.n_envs,
self.horizon_steps,
self.action_dim,
)
)
# Collect a set of trajectories from env
for step in range(self.n_steps):
if step % 10 == 0:
print(f"Processed step {step} of {self.n_steps}")
# Select action
with torch.no_grad():
samples = (
self.model(
cond=torch.from_numpy(prev_obs_venv)
.float()
.to(self.device),
deterministic=eval_mode,
)
.cpu()
.numpy()
) # n_env x horizon x act
action_venv = samples[:, : self.act_steps]
obs_trajs = np.vstack((obs_trajs, prev_obs_venv[None]))
samples_trajs = np.vstack((samples_trajs, samples[None]))
# Apply multi-step action
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
action_venv
)
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
firsts_trajs[step + 1] = done_venv
prev_obs_venv = obs_venv
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
episodes_start_end = []
for env_ind in range(self.n_envs):
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
for i in range(len(env_steps) - 1):
start = env_steps[i]
end = env_steps[i + 1]
if end - start > 1:
episodes_start_end.append((env_ind, start, end - 1))
if len(episodes_start_end) > 0:
# Compute transitions for completed trajectories
obs_trajs_split = [
obs_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
samples_trajs_split = [
samples_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
reward_trajs_split = [
reward_trajs[start : end + 1, env_ind]
for env_ind, start, end in episodes_start_end
]
num_episode_finished = len(reward_trajs_split)
# Compute episode returns
discounted_reward_trajs_split = [
[
self.gamma**t * r
for t, r in zip(
list(range(end - start + 1)),
reward_trajs[start : end + 1, env_ind],
)
]
for env_ind, start, end in episodes_start_end
]
returns_trajs_split = [
np.cumsum(y[::-1])[::-1] for y in discounted_reward_trajs_split
]
returns_trajs_split = np.concatenate(returns_trajs_split)
episode_reward = np.array(
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
)
episode_best_reward = np.array(
[
np.max(reward_traj) / self.act_steps
for reward_traj in reward_trajs_split
]
)
avg_episode_reward = np.mean(episode_reward)
avg_best_reward = np.mean(episode_best_reward)
success_rate = np.mean(
episode_best_reward >= self.best_reward_threshold_for_success
)
else:
episode_reward = np.array([])
num_episode_finished = 0
avg_episode_reward = 0
avg_best_reward = 0
success_rate = 0
log.info("[WARNING] No episode completed within the iteration!")
# Update
# Skip the update if no episode finished --- the per-episode splits above would otherwise be empty
if not eval_mode and num_episode_finished > 0:
# Tensorize data and put them to device
# k for environment step
obs_k = (
torch.tensor(np.concatenate(obs_trajs_split))
.float()
.to(self.device)
)
samples_k = (
torch.tensor(np.concatenate(samples_trajs_split))
.float()
.to(self.device)
)
# Normalize returns
returns_trajs_split = (
returns_trajs_split - np.mean(returns_trajs_split)
) / (returns_trajs_split.std() + 1e-3)
rewards_k = (
torch.tensor(returns_trajs_split)
.float()
.to(self.device)
.reshape(-1)
)
rewards_k_scaled = torch.exp(self.beta * rewards_k)
rewards_k_scaled.clamp_(max=self.max_reward_weight)
# rewards_k_scaled = rewards_k_scaled / rewards_k_scaled.mean()
# Update policy
total_steps = len(rewards_k_scaled)
inds_k = np.arange(total_steps)
for _ in range(self.update_epochs):
# for each epoch, go through all data in batches
np.random.shuffle(inds_k)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
for batch in range(num_batch):
start = batch * self.batch_size
end = start + self.batch_size
inds_b = inds_k[start:end] # b for batch
obs_b = obs_k[inds_b]
samples_b = samples_k[inds_b]
rewards_b = rewards_k_scaled[inds_b]
# Update policy with collected trajectories
loss = self.model.loss(
samples_b,
obs_b,
rewards_b,
)
self.optimizer.zero_grad()
loss.backward()
if self.max_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(
self.model.parameters(), self.max_grad_norm
)
self.optimizer.step()
# Update lr
self.lr_scheduler.step()
# Save model
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
self.save_model()
# Log loss and save metrics
run_results.append(
{
"itr": self.itr,
}
)
if self.itr % self.log_freq == 0:
if eval_mode:
log.info(
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
)
if self.use_wandb:
wandb.log(
{
"success rate - eval": success_rate,
"avg episode reward - eval": avg_episode_reward,
"avg best reward - eval": avg_best_reward,
"num episode - eval": num_episode_finished,
},
step=self.itr,
commit=False,
)
run_results[-1]["eval_success_rate"] = success_rate
run_results[-1]["eval_episode_reward"] = avg_episode_reward
run_results[-1]["eval_best_reward"] = avg_best_reward
else:
log.info(
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{timer():8.4f}"
)
if self.use_wandb:
wandb.log(
{
"loss": loss,
"avg episode reward - train": avg_episode_reward,
"num episode - train": num_episode_finished,
},
step=self.itr,
commit=True,
)
run_results[-1]["loss"] = loss
run_results[-1]["train_episode_reward"] = avg_episode_reward
run_results[-1]["time"] = timer()
with open(self.result_path, "wb") as f:
pickle.dump(run_results, f)
self.itr += 1
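
The weighting used in the update above (normalize the discounted returns, exponentiate with beta, clamp at max_reward_weight) is the core of RWR. A small numpy sketch of the same transform with made-up values for the returns, beta, and the clamp:

import numpy as np

returns = np.array([0.0, 1.0, 2.0, 5.0])
beta, max_weight = 10.0, 100.0  # assumed values; the agent reads these from cfg.train
norm = (returns - returns.mean()) / (returns.std() + 1e-3)
weights = np.clip(np.exp(beta * norm), None, max_weight)
print(weights)  # higher-return samples receive (clamped) exponentially larger regression weights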

View File

@ -0,0 +1,168 @@
"""
Parent pre-training agent class.
"""
import os
import random
import numpy as np
from omegaconf import OmegaConf
import torch
import hydra
import logging
import wandb
from copy import deepcopy
log = logging.getLogger(__name__)
from util.scheduler import CosineAnnealingWarmupRestarts
DEVICE = "cuda:0"
def to_device(x, device=DEVICE):
if torch.is_tensor(x):
return x.to(device)
elif type(x) is dict:
return {k: to_device(v, device) for k, v in x.items()}
else:
print(f"Unrecognized type in `to_device`: {type(x)}")
def batch_to_device(batch, device="cuda:0"):
vals = [to_device(getattr(batch, field), device) for field in batch._fields]
return type(batch)(*vals)
class EMA:
"""
Exponential moving average (EMA)
"""
def __init__(self, cfg):
super().__init__()
self.beta = cfg.decay
def update_model_average(self, ma_model, current_model):
for current_params, ma_params in zip(
current_model.parameters(), ma_model.parameters()
):
old_weight, up_weight = ma_params.data, current_params.data
ma_params.data = self.update_average(old_weight, up_weight)
def update_average(self, old, new):
if old is None:
return new
return old * self.beta + (1 - self.beta) * new
class PreTrainAgent:
def __init__(self, cfg):
super().__init__()
self.seed = cfg.get("seed", 42)
random.seed(self.seed)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
# Wandb
self.use_wandb = cfg.wandb is not None
if cfg.wandb is not None:
wandb.init(
entity=cfg.wandb.entity,
project=cfg.wandb.project,
name=cfg.wandb.run,
config=OmegaConf.to_container(cfg, resolve=True),
)
# Build model
self.model = hydra.utils.instantiate(cfg.model)
self.ema = EMA(cfg.ema)
self.ema_model = deepcopy(self.model)
# Training params
self.n_epochs = cfg.train.n_epochs
self.batch_size = cfg.train.batch_size
self.update_ema_freq = cfg.train.update_ema_freq
self.epoch_start_ema = cfg.train.epoch_start_ema
self.val_freq = cfg.train.get("val_freq", 100)
# Logging, checkpoints
self.logdir = cfg.logdir
self.checkpoint_dir = os.path.join(self.logdir, "checkpoint")
os.makedirs(self.checkpoint_dir, exist_ok=True)
self.log_freq = cfg.train.get("log_freq", 1)
self.save_model_freq = cfg.train.save_model_freq
# Build dataset
self.dataset_train = hydra.utils.instantiate(cfg.train_dataset)
self.dataloader_train = torch.utils.data.DataLoader(
self.dataset_train,
batch_size=self.batch_size,
num_workers=4 if self.dataset_train.device == "cpu" else 0,
shuffle=True,
pin_memory=True if self.dataset_train.device == "cpu" else False,
)
self.dataloader_val = None
if "train_split" in cfg.train and cfg.train.train_split < 1:
val_indices = self.dataset_train.set_train_val_split(cfg.train.train_split)
self.dataset_val = deepcopy(self.dataset_train)
self.dataset_val.set_indices(val_indices)
self.dataloader_val = torch.utils.data.DataLoader(
self.dataset_val,
batch_size=self.batch_size,
num_workers=4 if self.dataset_val.device == "cpu" else 0,
shuffle=True,
pin_memory=True if self.dataset_val.device == "cpu" else False,
)
self.optimizer = torch.optim.AdamW(
self.model.parameters(),
lr=cfg.train.learning_rate,
weight_decay=cfg.train.weight_decay,
)
self.lr_scheduler = CosineAnnealingWarmupRestarts(
self.optimizer,
first_cycle_steps=cfg.train.lr_scheduler.first_cycle_steps,
cycle_mult=1.0,
max_lr=cfg.train.learning_rate,
min_lr=cfg.train.lr_scheduler.min_lr,
warmup_steps=cfg.train.lr_scheduler.warmup_steps,
gamma=1.0,
)
self.reset_parameters()
def run(self):
raise NotImplementedError
def reset_parameters(self):
self.ema_model.load_state_dict(self.model.state_dict())
def step_ema(self):
if self.epoch < self.epoch_start_ema:
self.reset_parameters()
return
self.ema.update_model_average(self.ema_model, self.model)
def save_model(self):
"""
Saves model and EMA weights to disk.
"""
data = {
"epoch": self.epoch,
"model": self.model.state_dict(),
"ema": self.ema_model.state_dict(),
}
savepath = os.path.join(self.checkpoint_dir, f"state_{self.epoch}.pt")
torch.save(data, savepath)
log.info(f"Saved model to {savepath}")
def load(self, epoch):
"""
Loads model and EMA weights from disk.
"""
loadpath = os.path.join(self.checkpoint_dir, f"state_{epoch}.pt")
data = torch.load(loadpath, weights_only=True)
self.epoch = data["epoch"]
self.model.load_state_dict(data["model"])
self.ema_model.load_state_dict(data["ema"])
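
The checkpoints written by save_model above are plain dicts with "epoch", "model", and "ema" entries, and they are what the fine-tuning configs later in this commit point to via base_policy_path. A minimal sketch of inspecting one (the filename is an assumed example):

import torch

ckpt = torch.load("state_10000.pt", map_location="cpu", weights_only=True)
print(ckpt["epoch"])
print(sorted(ckpt.keys()))  # ['ema', 'epoch', 'model']
# ckpt["model"] and ckpt["ema"] are state_dicts loadable via nn.Module.load_state_dict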

View File

@ -0,0 +1,83 @@
"""
Pre-training diffusion policy
"""
import logging
import wandb
import numpy as np
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.pretrain.train_agent import PreTrainAgent, batch_to_device
class TrainDiffusionAgent(PreTrainAgent):
def __init__(self, cfg):
super().__init__(cfg)
def run(self):
timer = Timer()
self.epoch = 1
for _ in range(self.n_epochs):
# train
loss_train_epoch = []
for batch_train in self.dataloader_train:
if self.dataset_train.device == "cpu":
batch_train = batch_to_device(batch_train)
self.model.train()
loss_train = self.model.loss(*batch_train)
loss_train.backward()
loss_train_epoch.append(loss_train.item())
self.optimizer.step()
self.optimizer.zero_grad()
loss_train = np.mean(loss_train_epoch)
# validate
loss_val_epoch = []
if self.dataloader_val is not None and self.epoch % self.val_freq == 0:
self.model.eval()
for batch_val in self.dataloader_val:
if self.dataset_val.device == "cpu":
batch_val = batch_to_device(batch_val)
loss_val, infos_val = self.model.loss(*batch_val)
loss_val_epoch.append(loss_val.item())
self.model.train()
loss_val = np.mean(loss_val_epoch) if len(loss_val_epoch) > 0 else None
# update lr
self.lr_scheduler.step()
# update ema
if self.epoch % self.update_ema_freq == 0:
self.step_ema()
# save model
if self.epoch % self.save_model_freq == 0 or self.epoch == self.n_epochs:
self.save_model()
# log loss
if self.epoch % self.log_freq == 0:
log.info(
f"{self.epoch}: train loss {loss_train:8.4f} | t:{timer():8.4f}"
)
if self.use_wandb:
if loss_val is not None:
wandb.log(
{"loss - val": loss_val}, step=self.epoch, commit=False
)
wandb.log(
{
"loss - train": loss_train,
},
step=self.epoch,
commit=True,
)
# count
self.epoch += 1

View File

@ -0,0 +1,99 @@
"""
Pre-training Gaussian/GMM policy
"""
import logging
import wandb
import numpy as np
log = logging.getLogger(__name__)
from util.timer import Timer
from agent.pretrain.train_agent import PreTrainAgent, batch_to_device
class TrainGaussianAgent(PreTrainAgent):
def __init__(self, cfg):
super().__init__(cfg)
# Entropy bonus - not used right now since using fixed_std
self.ent_coef = cfg.train.get("ent_coef", 0)
def run(self):
timer = Timer()
self.epoch = 1
for _ in range(self.n_epochs):
# train
loss_train_epoch = []
ent_train_epoch = []
for batch_train in self.dataloader_train:
if self.dataset_train.device == "cpu":
batch_train = batch_to_device(batch_train)
self.model.train()
loss_train, infos_train = self.model.loss(
*batch_train,
ent_coef=self.ent_coef,
)
loss_train.backward()
loss_train_epoch.append(loss_train.item())
ent_train_epoch.append(infos_train["entropy"].item())
self.optimizer.step()
self.optimizer.zero_grad()
loss_train = np.mean(loss_train_epoch)
ent_train = np.mean(ent_train_epoch)
# validate
loss_val_epoch = []
if self.dataloader_val is not None and self.epoch % self.val_freq == 0:
self.model.eval()
for batch_val in self.dataloader_val:
if self.dataset_val.device == "cpu":
batch_val = batch_to_device(batch_val)
loss_val, infos_val = self.model.loss(
*batch_val,
ent_coef=self.ent_coef,
)
loss_val_epoch.append(loss_val.item())
self.model.train()
loss_val = np.mean(loss_val_epoch) if len(loss_val_epoch) > 0 else None
# update lr
self.lr_scheduler.step()
# update ema
if self.epoch % self.update_ema_freq == 0:
self.step_ema()
# save model
if self.epoch % self.save_model_freq == 0 or self.epoch == self.n_epochs:
self.save_model()
# log loss
if self.epoch % self.log_freq == 0:
infos_str = " | ".join(
[f"{key}: {val:8.4f}" for key, val in infos_train.items()]
)
log.info(
f"{self.epoch}: train loss {loss_train:8.4f} | {infos_str} | t:{timer():8.4f}"
)
if self.use_wandb:
if loss_val is not None:
wandb.log(
{"loss - val": loss_val}, step=self.epoch, commit=False
)
wandb.log(
{
"loss - train": loss_train,
"entropy - train": ent_train,
},
step=self.epoch,
commit=True,
)
# count
self.epoch += 1

View File

@ -0,0 +1,119 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_m1_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m1/avoid_d56_r12_pre_diffusion_mlp_ta4_td20/2024-07-06_22-50-07/checkpoint/state_10000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/normalization.npz
seed: 42
device: cuda:0
env_name: avoiding-m5
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 50
name: ${env_name}
max_episode_steps: 100
reset_at_iteration: True
save_video: False
best_reward_threshold_for_success: 2
save_full_observations: True
wrappers:
d3il_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
pass_full_observations: ${env.save_full_observations}
reset_within_step: False
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env_name}-m1-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 51
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.99
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 100 # no eval, always train mode
force_train: True
render:
freq: 1
num: 10
plotter:
_target_: env.plot_traj.TrajPlotter
env_type: avoid
normalization_path: ${normalization_path}
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.95
clip_ploss_coef: 0.1
clip_ploss_coef_base: 0.1
clip_ploss_coef_rate: 1
randn_clip_value: 3
min_sampling_denoising_std: 0.1
min_logprob_denoising_std: 0.1
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}
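
The ${eval:'...'} interpolations used here (e.g. train.n_steps and train.batch_size) rely on a custom OmegaConf resolver being registered before the config is composed; the entry point is not part of this excerpt, so the exact registration is an assumption. A minimal sketch of the usual pattern:

from omegaconf import OmegaConf

# Register once at program start so "${eval:...}" expressions can be resolved
OmegaConf.register_new_resolver("eval", eval, replace=True)

cfg = OmegaConf.create(
    {
        "max_episode_steps": 100,
        "act_steps": 4,
        "n_steps": "${eval:'round(${max_episode_steps} / ${act_steps})'}",
    }
)
print(cfg.n_steps)  # 25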

View File

@ -0,0 +1,105 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_m1_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m1/avoid_d56_r12_pre_gaussian_mlp_ta4/2024-07-07_01-35-48/checkpoint/state_10000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/normalization.npz
seed: 42
device: cuda:0
env_name: avoiding-m5
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 50
name: ${env_name}
max_episode_steps: 100
reset_at_iteration: True
save_video: False
best_reward_threshold_for_success: 2
save_full_observations: True
wrappers:
d3il_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
pass_full_observations: ${env.save_full_observations}
reset_within_step: False
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env_name}-m1-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 51
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.99
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 100 # no eval, always train mode
force_train: True
render:
freq: 1
num: 10
plotter:
_target_: env.plot_traj.TrajPlotter
env_type: avoid
normalization_path: ${normalization_path}
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gaussian_ppo.PPO_Gaussian
clip_ploss_coef: 0.1
randn_clip_value: 3
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: True
fixed_std: 0.1
learn_fixed_std: False
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]
residual_style: True
obs_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,106 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_m1_gmm_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m1/avoid_d56_r12_pre_gmm_mlp_ta4/2024-07-10_14-30-00/checkpoint/state_10000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/normalization.npz
seed: 42
device: cuda:0
env_name: avoiding-m5
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
num_modes: 5
env:
n_envs: 50
name: ${env_name}
max_episode_steps: 100
reset_at_iteration: True
save_video: False
best_reward_threshold_for_success: 2
save_full_observations: True
wrappers:
d3il_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
pass_full_observations: ${env.save_full_observations}
reset_within_step: False
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env_name}-m1-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 51
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.99
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 100 # no eval, always train mode
force_train: True
render:
freq: 1
num: 10
plotter:
_target_: env.plot_traj.TrajPlotter
env_type: avoid
normalization_path: ${normalization_path}
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gmm_ppo.PPO_GMM
clip_ploss_coef: 0.1
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gmm.GMM_MLP
mlp_dims: [256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: False
fixed_std: 0.1
learn_fixed_std: False
num_modes: ${num_modes}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]
residual_style: True
obs_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,119 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_m2_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m2/avoid_d57_r12_pre_diffusion_mlp_ta4_td20/2024-07-07_13-12-09/checkpoint/state_15000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/normalization.npz
seed: 42
device: cuda:0
env_name: avoiding-m5
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 50
name: ${env_name}
max_episode_steps: 100
reset_at_iteration: True
save_video: False
best_reward_threshold_for_success: 2
save_full_observations: True
wrappers:
d3il_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
pass_full_observations: ${env.save_full_observations}
reset_within_step: False
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env_name}-m2-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 51
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.99
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 100 # no eval, always train mode
force_train: True
render:
freq: 1
num: 10
plotter:
_target_: env.plot_traj.TrajPlotter
env_type: avoid
normalization_path: ${normalization_path}
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.95
clip_ploss_coef: 0.1
clip_ploss_coef_base: 0.1
clip_ploss_coef_rate: 1
randn_clip_value: 3
min_sampling_denoising_std: 0.1
min_logprob_denoising_std: 0.1
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,105 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_m2_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m2/avoid_d57_r12_pre_gaussian_mlp_ta4/2024-07-07_02-15-50/checkpoint/state_10000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/normalization.npz
seed: 42
device: cuda:0
env_name: avoiding-m5
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 50
name: ${env_name}
max_episode_steps: 100
reset_at_iteration: True
save_video: False
best_reward_threshold_for_success: 2
save_full_observations: True
wrappers:
d3il_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
pass_full_observations: ${env.save_full_observations}
reset_within_step: False
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env_name}-m2-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 51
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.99
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 100 # no eval, always train mode
force_train: True
render:
freq: 1
num: 10
plotter:
_target_: env.plot_traj.TrajPlotter
env_type: avoid
normalization_path: ${normalization_path}
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gaussian_ppo.PPO_Gaussian
clip_ploss_coef: 0.1
randn_clip_value: 3
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: True
fixed_std: 0.1
learn_fixed_std: False
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]
residual_style: True
obs_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,106 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_m2_gmm_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m2/avoid_d57_r12_pre_gmm_mlp_ta4/2024-07-10_15-44-32/checkpoint/state_10000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/normalization.npz
seed: 42
device: cuda:0
env_name: avoiding-m5
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
num_modes: 5
env:
n_envs: 50
name: ${env_name}
max_episode_steps: 100
reset_at_iteration: True
save_video: False
best_reward_threshold_for_success: 2
save_full_observations: True
wrappers:
d3il_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
pass_full_observations: ${env.save_full_observations}
reset_within_step: False
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env_name}-m2-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 51
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.99
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 100 # no eval, always train mode
force_train: True
render:
freq: 1
num: 10
plotter:
_target_: env.plot_traj.TrajPlotter
env_type: avoid
normalization_path: ${normalization_path}
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gmm_ppo.PPO_GMM
clip_ploss_coef: 0.1
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gmm.GMM_MLP
mlp_dims: [256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: False
fixed_std: 0.1
learn_fixed_std: False
num_modes: ${num_modes}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]
residual_style: True
obs_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,119 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_m3_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m3/avoid_d58_r12_pre_diffusion_mlp_ta4_td20/2024-07-07_13-54-54/checkpoint/state_15000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/normalization.npz
seed: 42
device: cuda:0
env_name: avoiding-m5
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 50
name: ${env_name}
max_episode_steps: 100
reset_at_iteration: True
save_video: False
best_reward_threshold_for_success: 2
save_full_observations: True
wrappers:
d3il_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
pass_full_observations: ${env.save_full_observations}
reset_within_step: False
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env_name}-m3-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 51
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.99
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 100 # no eval, always train mode
force_train: True
render:
freq: 1
num: 10
plotter:
_target_: env.plot_traj.TrajPlotter
env_type: avoid
normalization_path: ${normalization_path}
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.95
clip_ploss_coef: 0.1
clip_ploss_coef_base: 0.1
clip_ploss_coef_rate: 1
randn_clip_value: 3
min_sampling_denoising_std: 0.1
min_logprob_denoising_std: 0.1
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,105 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_m3_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m3/avoid_d58_r12_pre_gaussian_mlp_ta4/2024-07-07_13-11-49/checkpoint/state_10000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/normalization.npz
seed: 42
device: cuda:0
env_name: avoiding-m5
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 50
name: ${env_name}
max_episode_steps: 100
reset_at_iteration: True
save_video: False
best_reward_threshold_for_success: 2
save_full_observations: True
wrappers:
d3il_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
pass_full_observations: ${env.save_full_observations}
reset_within_step: False
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env_name}-m3-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 51
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.99
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 100 # no eval, always train mode
force_train: True
render:
freq: 1
num: 10
plotter:
_target_: env.plot_traj.TrajPlotter
env_type: avoid
normalization_path: ${normalization_path}
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gaussian_ppo.PPO_Gaussian
clip_ploss_coef: 0.1
randn_clip_value: 3
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: True
fixed_std: 0.1
learn_fixed_std: False
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]
residual_style: True
obs_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,106 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_m3_gmm_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/m3/avoid_d58_r12_pre_gmm_mlp_ta4/2024-07-10_17-01-50/checkpoint/state_10000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/normalization.npz
seed: 42
device: cuda:0
env_name: avoiding-m5
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 4
act_steps: 4
num_modes: 5
env:
n_envs: 50
name: ${env_name}
max_episode_steps: 100
reset_at_iteration: True
save_video: False
best_reward_threshold_for_success: 2
save_full_observations: True
wrappers:
d3il_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
pass_full_observations: ${env.save_full_observations}
reset_within_step: False
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env_name}-m3-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 51
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.99
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 100
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 100 # no eval, always train mode
force_train: True
render:
freq: 1
num: 10
plotter:
_target_: env.plot_traj.TrajPlotter
env_type: avoid
normalization_path: ${normalization_path}
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: ${eval:'round(${train.n_steps} * ${env.n_envs} / 2)'}
update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gmm_ppo.PPO_GMM
clip_ploss_coef: 0.1
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gmm.GMM_MLP
mlp_dims: [256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: False
fixed_std: 0.1
learn_fixed_std: False
num_modes: ${num_modes}
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
mlp_dims: [256, 256, 256]
residual_style: True
obs_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,70 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: avoid_m1_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/train.pkl
seed: 42
device: cuda:0
env: avoid
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
horizon_steps: 4
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 15000
batch_size: 16
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 15000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,62 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: avoid_m1_pre_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/train.pkl
seed: 42
device: cuda:0
env: avoid
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 10000
batch_size: 16
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gaussian.GaussianModel
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: True
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,64 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: avoid_m1_pre_gmm_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/train.pkl
seed: 42
device: cuda:0
env: avoid
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
num_modes: 5
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 10000
batch_size: 32
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gmm.GMMModel
network:
_target_: model.common.mlp_gmm.GMM_MLP
mlp_dims: [256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: False
fixed_std: 0.1
num_modes: ${num_modes}
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,70 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: avoid_m2_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/train.pkl
seed: 42
device: cuda:0
env: avoid
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
horizon_steps: 4
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 15000
batch_size: 16
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 15000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,62 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: avoid_m2_pre_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/train.pkl
seed: 42
device: cuda:0
env: avoid
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 10000
batch_size: 16
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gaussian.GaussianModel
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: True
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,64 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: avoid_m2_pre_gmm_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m2/train.pkl
seed: 42
device: cuda:0
env: avoid
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
num_modes: 5
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 10000
batch_size: 32
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gmm.GMMModel
network:
_target_: model.common.mlp_gmm.GMM_MLP
mlp_dims: [256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: False
fixed_std: 0.1
num_modes: ${num_modes}
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,70 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: avoid_m3_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/train.pkl
seed: 42
device: cuda:0
env: avoid
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
denoising_steps: 20
horizon_steps: 4
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 15000
batch_size: 16
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 15000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,62 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: avoid_m3_pre_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/train.pkl
seed: 42
device: cuda:0
env: avoid
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 10000
batch_size: 16
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gaussian.GaussianModel
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [256, 256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: True
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,64 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: avoid_m3_pre_gmm_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/d3il-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m3/train.pkl
seed: 42
device: cuda:0
env: avoid
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
obs_dim: 4
action_dim: 2
transition_dim: ${action_dim}
horizon_steps: 4
cond_steps: 1
num_modes: 5
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: d3il-${env}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 10000
batch_size: 32
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gmm.GMMModel
network:
_target_: model.common.mlp_gmm.GMM_MLP
mlp_dims: [256, 256] # smaller MLP for less overfitting
activation_type: ReLU
residual_style: False
fixed_std: 0.1
num_modes: ${num_modes}
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

26
cfg/finetuning.md Normal file
View File

@ -0,0 +1,26 @@
## Fine-tuning experiments
### Comparing diffusion-based RL algorithms (Sec. 5.1)
Gym configs are under `cfg/gym/finetune/<env_name>/`, and the naming follows `ft_<alg_name>_diffusion_mlp`, e.g., `ft_awr_diffusion_mlp`. `alg_name` is one of `rwr`, `awr`, `dipo`, `idql`, `dql`, `qsm`, `ppo` (DPPO), `ppo_exact` (exact likelihood). They share the same pre-trained checkpoint in each env.
Robomimic configs are under `cfg/robomimic/finetune/<env_name>/`, and the naming follows the same convention.
<!-- **Note**: For *Can* and *Lift* in Robomimic, we use earlier checkpoints from pre-training (epoch 5000) so there is more room for fine-tuning improvement. For comparing policy parameterizations, we use the final checkpoints for all tasks (epoch 8000). -->
### Comparing policy parameterizations (Sec. 5.2, 5.3)
Robomimic configs are under `cfg/robomimic/finetune/<env_name>/`, and the naming follows `ft_ppo_<diffusion/gaussian/gmm>_<mlp/unet/transformer>_<img?>`. For pixel experiments, we choose pre-trained checkpoints such that the pre-training performance is similar between DPPO and Gaussian baseline.
**Note**: For *Can* and *Lift* in Robomimic with DPPO, you need to manually download the final checkpoints (epoch 8000). The default ones in the configs are from epoch 5000 (leaving more room for fine-tuning improvement) and are used for comparing diffusion-based RL algorithms.
Furniture-Bench configs are under `cfg/furniture/finetune/<env_name>/`, and the naming follows `ft_<diffusion/gaussian>_<mlp/unet>`. In the paper we did not show the results of `ft_diffusion_mlp`. Running IsaacGym for the first time may take a while as it sets up the meshes. If you encounter an error about `libpython`, see the instructions [here](installation/install_furniture.md#L27).
### D3IL (Sec. 6)
D3IL configs are under `cfg/d3il/finetune/avoid_<mode>/`, and the naming follows `ft_ppo_<diffusion/gaussian/gmm>_mlp`. The number of fine-tuned denoising steps can be specified with `ft_denoising_steps`.
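To sanity-check a config or an override such as `ft_denoising_steps` without launching a full run, the file can be composed directly with Hydra's Python API. The sketch below is illustrative, not the repo's launcher: it assumes hydra-core >= 1.2, a local checkout with `cfg/` in the working directory, and it re-registers the custom `eval` resolver the configs rely on (which the repo's own entry script is assumed to register).
```python
# Illustrative only: compose a D3IL fine-tuning config and apply overrides.
import os

from hydra import compose, initialize_config_dir
from omegaconf import OmegaConf

# The configs use a custom `eval` resolver; register a plain Python eval locally
# so interpolations like ${eval:'round(...)'} can resolve.
OmegaConf.register_new_resolver("eval", eval, replace=True)

config_dir = os.path.abspath("cfg/d3il/finetune/avoid_m1")  # assumed checkout layout
with initialize_config_dir(config_dir=config_dir, version_base=None):
    cfg = compose(
        config_name="ft_ppo_diffusion_mlp",
        overrides=["ft_denoising_steps=10", "seed=0"],
    )

print(cfg.ft_denoising_steps)                       # 10
print(OmegaConf.to_yaml(cfg.train, resolve=False))  # training hyperparameters
```
The same `key=value` overrides can be passed on the command line when launching the actual fine-tuning script.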
### Training from scratch (App. B.2)
`ppo_diffusion_mlp` and `ppo_gaussian_mlp` under `cfg/gym/finetune/<env_name>` are for training a DPPO or Gaussian policy from scratch.
### Comparing to exact likelihood policy gradient (App. B.5)
`ft_ppo_exact_diffusion_mlp` under `cfg/gym/finetune/hopper-v2`, `cfg/gym/finetune/halfcheetah-v2`, and `cfg/robomimic/finetune/can` are for training the diffusion policy gradient with exact likelihood. The `torchdiffeq` package needs to be installed first with `pip install -e .[exact]`.

View File

@ -0,0 +1,121 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_low_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-20/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 8
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: lamp
randomness: low
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
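# Note: n_steps resolves to round(1000 / 8) = 125, so each iteration collects
# 125 * 1000 = 125000 transitions; batch_size is a fixed 8800 here rather than
# derived from n_steps as in the D3IL configs.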
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
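# With use_ddim=True, sampling uses ft_denoising_steps (= 5) DDIM steps rather than
# the full 100 DDPM steps from pre-training, and these are the steps being fine-tuned.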
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,123 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_low_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-16-48/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 16
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: lamp
randomness: low
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
groupnorm_eps: 1e-4
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,109 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_low_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-51/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: lamp
randomness: low
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gaussian_ppo.PPO_Gaussian
clip_ploss_coef: 0.01
randn_clip_value: 3
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims:
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
activation_type: ReLU
residual_style: True
fixed_std: 0.04
learn_fixed_std: True
std_min: 0.01
std_max: 0.2
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,121 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_med_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-20/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 8
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: lamp
randomness: med
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,122 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_med_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-17-21/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 16
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: lamp
randomness: med
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,109 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/lamp/lamp_med_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-56/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: lamp
randomness: med
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gaussian_ppo.PPO_Gaussian
clip_ploss_coef: 0.01
randn_clip_value: 3
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims:
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
activation_type: ReLU
residual_style: True
fixed_std: 0.04
learn_fixed_std: True
std_min: 0.01
std_max: 0.2
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,121 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_low_dim_pre_diffusion_mlp_ta8_td100/2024-07-22_20-01-16/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 8
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 700
best_reward_threshold_for_success: 1
specific:
headless: true
furniture: one_leg
randomness: low
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,123 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_low_dim_pre_diffusion_unet_ta16_td100/2024-07-03_22-23-38/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 16
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 700
best_reward_threshold_for_success: 1
specific:
headless: true
furniture: one_leg
randomness: low
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
groupnorm_eps: 1e-4
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,109 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_low_dim_pre_gaussian_mlp_ta8/2024-06-26_23-43-02/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 700
best_reward_threshold_for_success: 1
specific:
headless: true
furniture: one_leg
randomness: low
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gaussian_ppo.PPO_Gaussian
clip_ploss_coef: 0.01
randn_clip_value: 3
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims:
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
activation_type: ReLU
residual_style: True
fixed_std: 0.04
learn_fixed_std: True
std_min: 0.01
std_max: 0.2
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,121 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_med_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-11/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 8
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 700
best_reward_threshold_for_success: 1
specific:
headless: true
furniture: one_leg
randomness: med
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,123 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_med_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-16-16/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 16
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 700
best_reward_threshold_for_success: 1
specific:
headless: true
furniture: one_leg
randomness: med
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
groupnorm_eps: 1e-4
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,109 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_med_dim_pre_gaussian_mlp_ta8/2024-06-28_16-27-02/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 700
best_reward_threshold_for_success: 1
specific:
headless: true
furniture: one_leg
randomness: med
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gaussian_ppo.PPO_Gaussian
clip_ploss_coef: 0.01
randn_clip_value: 3
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims:
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
activation_type: ReLU
residual_style: True
fixed_std: 0.04
learn_fixed_std: True
std_min: 0.01
std_max: 0.2
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,121 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_low_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-26/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 8
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: round_table
randomness: low
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,123 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_low_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-19-48/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 16
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: round_table
randomness: low
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
groupnorm_eps: 1e-4
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,109 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_low_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-51/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: round_table
randomness: low
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gaussian_ppo.PPO_Gaussian
clip_ploss_coef: 0.01
randn_clip_value: 3
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims:
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
activation_type: ReLU
residual_style: True
fixed_std: 0.04
learn_fixed_std: True
std_min: 0.01
std_max: 0.2
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,121 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_med_dim_pre_diffusion_mlp_ta8_td100/2024-07-23_01-28-29/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 8
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: round_table
randomness: med
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}
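
The `clip_ploss_coef*` entries above parameterize a PPO-style clipped policy loss applied over the fine-tuned denoising steps. As a reference point, here is the standard clipped surrogate in generic form; it is not claimed to be the exact loss inside `PPODiffusion`, only the textbook objective the coefficient feeds into.

```python
import torch

def clipped_policy_loss(new_logprob, old_logprob, advantage, eps=0.001):
    """Standard PPO clipped surrogate; eps plays the role of clip_ploss_coef."""
    ratio = torch.exp(new_logprob - old_logprob)
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * advantage
    return -torch.min(unclipped, clipped).mean()
```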

View File

@ -0,0 +1,122 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ft_diffusion_unet_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_med_dim_pre_diffusion_unet_ta16_td100/2024-07-04_02-20-21/checkpoint/state_8000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
ft_denoising_steps: 5
cond_steps: 1
horizon_steps: 16
act_steps: 8
use_ddim: True
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: round_table
randomness: med
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.9
clip_ploss_coef: 0.001
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.04
#
use_ddim: ${use_ddim}
ddim_steps: ${ft_denoising_steps}
learn_eta: False
eta:
base_eta: 1
input_dim: ${obs_dim}
mlp_dims: [256, 256]
action_dim: ${action_dim}
min_eta: 0.1
max_eta: 1.0
_target_: model.diffusion.eta.EtaFixed
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,109 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_ft_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/round_table/round_table_med_dim_pre_gaussian_mlp_ta8/2024-06-28_16-26-38/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
seed: 42
device: cuda:0
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 8
act_steps: 8
env:
n_envs: 1000
name: ${env_name}
env_type: furniture
max_episode_steps: 1000
best_reward_threshold_for_success: 2
specific:
headless: true
furniture: round_table
randomness: med
normalization_path: ${normalization_path}
act_steps: ${act_steps}
sparse_reward: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${env.specific.furniture}-${env.specific.randomness}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 1
n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
gamma: 0.999
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-6
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 50
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 8800
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gaussian_ppo.PPO_Gaussian
clip_ploss_coef: 0.01
randn_clip_value: 3
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims:
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
- 1024
activation_type: ReLU
residual_style: True
fixed_std: 0.04
learn_fixed_std: True
std_min: 0.01
std_max: 0.2
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [512, 512, 512]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,72 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: lamp
randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}
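
Each pretraining file above is a Hydra config whose root `_target_` names the agent class to build. A minimal entry-point sketch is shown below; the `config_path`/`config_name` values and the assumption that the agent constructor takes the whole config (and exposes a `run()` method) are illustrative, not taken from the repository. Registering the `eval` resolver shown earlier would also be needed before interpolations such as `cond_dim` resolve.

```python
# Illustrative entry point; config_path, config_name, and the agent API are assumptions.
import hydra
from omegaconf import OmegaConf

@hydra.main(version_base=None, config_path="cfg/furniture/pretrain", config_name="pre_diffusion_mlp")
def main(cfg):
    print(OmegaConf.to_yaml(cfg, resolve=False))         # inspect the composed config
    agent_cls = hydra.utils.get_class(cfg["_target_"])   # e.g. TrainDiffusionAgent
    agent = agent_cls(cfg)   # assumed: constructor takes the full config
    agent.run()              # assumed: training-loop entry method

if __name__ == "__main__":
    main()
```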

View File

@ -0,0 +1,74 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: lamp
randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
groupnorm_eps: 1e-4 # not important
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,64 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: lamp
randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 3000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gaussian.GaussianModel
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
use_layernorm: False # worse with layernorm
activation_type: ReLU # worse with Mish
residual_style: True
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}
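
The `ema: decay: 0.995` block above keeps an exponential moving average of the network weights during pretraining. A generic sketch of the standard update rule follows (an assumed textbook form, not necessarily the repository's EMA helper).

```python
import torch

@torch.no_grad()
def ema_update(ema_model, model, decay=0.995):
    """Standard EMA step: ema <- decay * ema + (1 - decay) * online weights."""
    for p_ema, p in zip(ema_model.parameters(), model.parameters()):
        p_ema.mul_(decay).add_(p, alpha=1.0 - decay)
```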

View File

@ -0,0 +1,72 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: lamp
randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,73 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: lamp
randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}
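
The `lr_scheduler` blocks (`first_cycle_steps`, `warmup_steps`, `min_lr`) describe a warmup-then-cosine learning-rate schedule. The hand-rolled sketch below illustrates the shape such a schedule takes; it is not the scheduler class used by the repository, only an assumed equivalent.

```python
import math

def warmup_cosine_lr(step, base_lr=1e-4, min_lr=1e-5, warmup_steps=100, first_cycle_steps=10000):
    """Linear warmup to base_lr, then cosine decay toward min_lr over one cycle."""
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps
    t = min((step - warmup_steps) / max(1, first_cycle_steps - warmup_steps), 1.0)
    return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * t))
```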

View File

@ -0,0 +1,64 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: lamp
randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 3000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gaussian.GaussianModel
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
use_layernorm: False # worse with layernorm
activation_type: ReLU # worse with Mish
residual_style: True
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,72 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: one_leg
randomness: low
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,74 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: one_leg
randomness: low
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
groupnorm_eps: 1e-4 # not important
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,64 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: one_leg
randomness: low
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 3000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gaussian.GaussianModel
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
use_layernorm: False # worse with layernorm
activation_type: ReLU # worse with Mish
residual_style: True
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}
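
The `train_dataset` block points at `agent.dataset.sequence.StitchedActionSequenceDataset` with the same `horizon_steps`/`cond_steps` as the policy. Assuming it follows the standard `torch.utils.data.Dataset` interface and accepts exactly the keyword arguments listed in the config, a minimal usage sketch looks like this (the dataset path is a placeholder):

```python
# Sketch only: assumes StitchedActionSequenceDataset is a standard torch Dataset
# taking the constructor kwargs shown in the config above.
from agent.dataset.sequence import StitchedActionSequenceDataset

dataset = StitchedActionSequenceDataset(
    dataset_path="/path/to/furniture/one_leg_low/train.pkl",  # placeholder path
    horizon_steps=8,
    cond_steps=1,
    device="cuda:0",
)
print(len(dataset))   # number of stitched (conditioning window, action chunk) samples
sample = dataset[0]   # one training sample, per the horizon/cond settings above
```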

View File

@ -0,0 +1,72 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: one_leg
randomness: med
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,73 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: one_leg
randomness: med
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,64 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: one_leg
randomness: med
env: ${task}_${randomness}_dim
obs_dim: 58
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 10000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gaussian.GaussianModel
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
use_layernorm: False # worse with layernorm
activation_type: ReLU # worse with Mish
residual_style: True
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,72 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: round_table
randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,73 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: round_table
randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,64 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: round_table
randomness: low
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 3000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gaussian.GaussianModel
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
use_layernorm: False # worse with layernorm
activation_type: ReLU # worse with Mish
residual_style: True
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,72 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: round_table
randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 32
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
cond_mlp_dims: [512, 64]
use_layernorm: True # needed for larger MLP
residual_style: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,73 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
name: ${env}_pre_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: round_table
randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
denoising_steps: 100
horizon_steps: 16
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 8000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.diffusion.diffusion.DiffusionModel
predict_epsilon: True
denoised_clip_value: 1.0
network:
_target_: model.diffusion.unet.Unet1D
diffusion_step_embed_dim: 16
dim: 64
dim_mults: [1, 2, 4]
kernel_size: 5
n_groups: 8
smaller_encoder: False
cond_predict_scale: True
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
cond_steps: ${cond_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,64 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/furniture/${task}_${randomness}/train.pkl
seed: 42
device: cuda:0
task: round_table
randomness: med
env: ${task}_${randomness}_dim
obs_dim: 44
action_dim: 10
transition_dim: ${action_dim}
horizon_steps: 8
cond_steps: 1
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: furniture-${task}-${randomness}-pretrain
run: ${now:%H-%M-%S}_${name}
train:
n_epochs: 3000
batch_size: 256
learning_rate: 1e-4
weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 100
min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
model:
_target_: model.common.gaussian.GaussianModel
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024]
use_layernorm: False # worse with layernorm
activation_type: ReLU # worse with Mish
residual_style: True
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
device: ${device}
ema:
decay: 0.995
train_dataset:
_target_: agent.dataset.sequence.StitchedActionSequenceDataset
dataset_path: ${train_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}

View File

@ -0,0 +1,102 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 0
n_steps: 500
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# AWR specific
scale_reward_factor: 0.01
max_adv_weight: 100
beta: 10
buffer_size: 5000
batch_size: 256
replay_ratio: 64
critic_update_ratio: 4
model:
_target_: model.diffusion.diffusion_awr.AWRDiffusion
# Sampling HPs
min_sampling_denoising_std: 0.10
randn_clip_value: 3
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
cond_dim: ${obs_dim}
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,103 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 5
n_steps: 500
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# DIPO specific
scale_reward_factor: 0.01
eta: 0.0001
action_gradient_steps: 10
buffer_size: 400000
batch_size: 5000
replay_ratio: 64
model:
_target_: model.diffusion.diffusion_dipo.DIPODiffusion
# Sampling HPs
min_sampling_denoising_std: 0.10
randn_clip_value: 3
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
cond_dim: ${obs_dim}
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
critic:
_target_: model.common.critic.CriticObsAct
action_dim: ${action_dim}
action_steps: ${act_steps}
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,102 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 5
n_steps: 500
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# DQL specific
scale_reward_factor: 0.01
eta: 1.0
buffer_size: 400000
batch_size: 5000
replay_ratio: 64
model:
_target_: model.diffusion.diffusion_dql.DQLDiffusion
# Sampling HPs
min_sampling_denoising_std: 0.10
randn_clip_value: 3
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
cond_dim: ${obs_dim}
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
critic:
_target_: model.common.critic.CriticObsAct
action_dim: ${action_dim}
action_steps: ${act_steps}
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,111 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 5
n_steps: 500
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# IDQL specific
scale_reward_factor: 0.01
eval_deterministic: True
eval_sample_num: 20 # how many samples to score during eval
critic_tau: 0.001 # rate of target q network update
use_expectile_exploration: True
buffer_size: 5000
batch_size: 512
replay_ratio: 16
model:
_target_: model.diffusion.diffusion_idql.IDQLDiffusion
# Sampling HPs
min_sampling_denoising_std: 0.10
randn_clip_value: 3
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
cond_dim: ${obs_dim}
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
critic_q:
_target_: model.common.critic.CriticObsAct
action_dim: ${action_dim}
action_steps: ${act_steps}
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
critic_v:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
device: ${device}
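
IDQL-style methods typically fit the value critic (`critic_v` above) by expectile regression against the Q critic, which is what `use_expectile_exploration` hints at. A generic expectile-loss sketch follows; the expectile parameter `tau` is not one of the config keys above and is an assumed illustration, not the repository's exact code.

```python
import torch

def expectile_loss(q_value, v_value, tau=0.7):
    """Asymmetric L2 (expectile) loss used in IQL/IDQL-style value fitting."""
    diff = q_value - v_value
    weight = torch.abs(tau - (diff < 0).float())
    return (weight * diff.pow(2)).mean()
```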

View File

@ -0,0 +1,110 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 0
n_steps: 500
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 5000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.99
clip_ploss_coef: 0.01
clip_ploss_coef_base: 0.01
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.1
min_logprob_denoising_std: 0.1
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}
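
`gae_lambda: 0.95` together with `gamma: 0.99` parameterizes generalized advantage estimation over the rollout buffer. A generic single-environment sketch of the backward GAE recursion follows (standard GAE, not the repository's vectorized implementation).

```python
import numpy as np

def compute_gae(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
    """Backward recursion: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}.

    `values` may contain one extra bootstrap value for the final state.
    """
    advantages = np.zeros(len(rewards), dtype=np.float64)
    last_adv = 0.0
    for t in reversed(range(len(rewards))):
        next_value = values[t + 1] if t + 1 < len(values) else 0.0
        delta = rewards[t] + gamma * next_value * (1.0 - dones[t]) - values[t]
        last_adv = delta + gamma * gae_lambda * (1.0 - dones[t]) * last_adv
        advantages[t] = last_adv
    return advantages
```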

View File

@ -0,0 +1,118 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_exact_diffusion_agent.TrainPPOExactDiffusionAgent
name: ${env_name}_ppo_exact_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune-exact
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 0
n_steps: 500
gamma: 0.99
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 5000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo_exact.PPOExactDiffusion
sde:
_target_: model.diffusion.sde_lib.VPSDE # VP for DDPM
N: ${denoising_steps}
rtol: 1e-4
atol: 1e-4
sde_hutchinson_type: Rademacher
sde_step_size: 0.05
sde_method: euler
sde_num_epsilon: 2
sde_min_beta: 1e-10
sde_probability_flow: True
#
gamma_denoising: 0.99
clip_ploss_coef: 0.01
min_sampling_denoising_std: 0.1
min_logprob_denoising_std: 0.1
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}

View File

@ -0,0 +1,103 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 5
n_steps: 500
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# QSM specific
scale_reward_factor: 0.01
q_grad_coeff: 50
critic_tau: 0.005 # rate of target q network update
buffer_size: 5000
batch_size: 256
replay_ratio: 32
model:
_target_: model.diffusion.diffusion_qsm.QSMDiffusion
# Sampling HPs
min_sampling_denoising_std: 0.10
randn_clip_value: 3
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
cond_dim: ${obs_dim}
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
critic:
_target_: model.common.critic.CriticObsAct
action_dim: ${action_dim}
action_steps: ${act_steps}
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
device: ${device}
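
`critic_tau: 0.005` is the rate at which the target Q-network tracks the online critic, and `q_grad_coeff: 50` scales the action-gradient of Q that Q-score matching pushes the actor's predicted denoising score toward; the exact loss lives in `model.diffusion.diffusion_qsm`. A generic Polyak (soft) target update of the kind `critic_tau` usually parameterizes, as an illustrative sketch:

import torch

@torch.no_grad()
def soft_update(target_net: torch.nn.Module, online_net: torch.nn.Module, tau: float = 0.005):
    # Polyak averaging after each critic gradient step:
    # target <- tau * online + (1 - tau) * target
    for p_target, p in zip(target_net.parameters(), online_net.parameters()):
        p_target.mul_(1.0 - tau).add_(p, alpha=tau)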


@@ -0,0 +1,87 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 0
n_steps: 500
gamma: 0.99
lr: 1e-4
weight_decay: 0
lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# RWR specific
max_reward_weight: 100
beta: 10
batch_size: 1000
update_epochs: 16
model:
_target_: model.diffusion.diffusion_rwr.RWRDiffusion
# Sampling HPs
min_sampling_denoising_std: 0.1
randn_clip_value: 3
#
network_path: ${base_policy_path}
network:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
cond_dim: ${obs_dim}
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
device: ${device}
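
Reward-weighted regression turns fine-tuning into weighted behavior cloning on the policy's own rollouts: each sample's denoising loss is multiplied by an exponential weight in the reward, capped by `max_reward_weight: 100` so a single lucky trajectory cannot dominate. One common formulation, sketched with illustrative names (whether the repo standardizes returns and places `beta` exactly this way is an assumption):

import numpy as np

def rwr_weights(returns: np.ndarray, beta: float = 10.0, max_reward_weight: float = 100.0) -> np.ndarray:
    # Standardize returns within the batch, then exponentiate with temperature beta.
    adv = (returns - returns.mean()) / (returns.std() + 1e-8)
    return np.minimum(np.exp(beta * adv), max_reward_weight)

# The diffusion network is then trained with its usual denoising loss,
# with each sample's loss term multiplied by its weight.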


@@ -0,0 +1,108 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_nopre_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 20
cond_steps: 1
horizon_steps: 1
act_steps: 1
env:
n_envs: 10
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-scratch
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 0
n_steps: 1000
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 1000
update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.99
clip_ploss_coef: 0.1
clip_ploss_coef_base: 0.1
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.1
min_logprob_denoising_std: 0.1
#
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
cond_dim: ${obs_dim}
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}
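
With no pretrained checkpoint to start from, `reward_scale_running: True` keeps value targets in a sane range by dividing rewards by a running estimate of their scale before GAE, with `reward_scale_const` as a fixed multiplier on top. Below is a minimal running scaler in the usual Welford style, an assumption about how such a flag is typically implemented (often the statistic is tracked over discounted returns rather than raw rewards):

class RunningRewardScaler:
    # Tracks a running mean/variance and divides incoming rewards by the running std.
    def __init__(self, epsilon: float = 1e-8):
        self.mean, self.var, self.count = 0.0, 1.0, epsilon

    def update(self, x: float) -> None:
        self.count += 1
        delta = x - self.mean
        self.mean += delta / self.count
        self.var += (delta * (x - self.mean) - self.var) / self.count

    def __call__(self, reward: float, reward_scale_const: float = 1.0) -> float:
        self.update(reward)
        return reward_scale_const * reward / (self.var ** 0.5 + 1e-8)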


@@ -0,0 +1,94 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
name: ${env_name}_nopre_ppo_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
transition_dim: ${action_dim}
cond_steps: 1
horizon_steps: 1
act_steps: 1
env:
n_envs: 10
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-scratch
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 0
n_steps: 1000
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 10000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 1000
update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.rl.gaussian_ppo.PPO_Gaussian
clip_ploss_coef: 0.1
randn_clip_value: 3
#
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
device: ${device}
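
The Gaussian baseline uses the standard PPO clipped surrogate with `clip_ploss_coef: 0.1`; the critic loss is commonly added with weight `vf_coef: 0.5`, and `target_kl: 1` serves as the usual early-stopping threshold on the approximate KL. A self-contained sketch of the policy term (tensor names are illustrative):

import torch

def ppo_policy_loss(logp_new, logp_old, advantages, clip_coef: float = 0.1):
    # Clipped surrogate: keep the updated policy close to the one that collected the data.
    ratio = torch.exp(logp_new - logp_old)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_coef, 1.0 + clip_coef) * advantages
    return -torch.min(unclipped, clipped).mean()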


@@ -0,0 +1,102 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 0
n_steps: 500
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# AWR specific
scale_reward_factor: 0.01
max_adv_weight: 100
beta: 10
buffer_size: 5000
batch_size: 256
replay_ratio: 64
critic_update_ratio: 4
model:
_target_: model.diffusion.diffusion_awr.AWRDiffusion
# Sampling HPs
min_sampling_denoising_std: 0.10
randn_clip_value: 3
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
cond_dim: ${obs_dim}
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
device: ${device}
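
AWR is the same exponential-weighting idea as RWR above, but the weight comes from a critic-based advantage computed on replay-buffer transitions, with `scale_reward_factor: 0.01` shrinking raw rewards and `max_adv_weight: 100` capping the weight. A hedged sketch of one standard variant (the exact advantage estimator and the placement of `beta` are assumptions):

import torch

@torch.no_grad()
def awr_weights(rewards, values, next_values, dones,
                gamma: float = 0.99, beta: float = 10.0,
                max_adv_weight: float = 100.0, scale_reward_factor: float = 0.01):
    # One-step TD advantage from the learned critic on scaled rewards.
    adv = scale_reward_factor * rewards + gamma * (1.0 - dones) * next_values - values
    adv = (adv - adv.mean()) / (adv.std() + 1e-8)   # standardize within the batch
    # Exponential advantage weighting, capped so outliers cannot dominate.
    return torch.clamp(torch.exp(beta * adv), max=max_adv_weight)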


@@ -0,0 +1,103 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 5
n_steps: 500
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# DIPO specific
scale_reward_factor: 0.01
eta: 0.0001
action_gradient_steps: 10
buffer_size: 400000
batch_size: 5000
replay_ratio: 64
model:
_target_: model.diffusion.diffusion_dipo.DIPODiffusion
# Sampling HPs
min_sampling_denoising_std: 0.10
randn_clip_value: 3
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
cond_dim: ${obs_dim}
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
critic:
_target_: model.common.critic.CriticObsAct
action_dim: ${action_dim}
action_steps: ${act_steps}
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
device: ${device}
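
DIPO's distinguishing step is the "action gradient": actions stored in the replay buffer are pushed uphill on the learned Q-function for `action_gradient_steps: 10` steps with step size `eta: 0.0001`, and the diffusion actor is then regressed onto the improved actions. A hedged sketch of that inner loop (function and tensor names are assumptions):

import torch

def improve_actions(q_net, obs, actions, eta: float = 1e-4, action_gradient_steps: int = 10):
    # Gradient ascent on Q(s, a) with respect to the action only; the critic is frozen here.
    actions = actions.clone().requires_grad_(True)
    for _ in range(action_gradient_steps):
        q = q_net(obs, actions).sum()
        (grad,) = torch.autograd.grad(q, actions)
        actions = (actions + eta * grad).detach().requires_grad_(True)
    # The diffusion actor's denoising loss then targets these improved actions.
    return actions.detach()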


@@ -0,0 +1,102 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 5
n_steps: 500
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# DQL specific
scale_reward_factor: 0.01
eta: 1.0
buffer_size: 400000
batch_size: 5000
replay_ratio: 64
model:
_target_: model.diffusion.diffusion_dql.DQLDiffusion
# Sampling HPs
min_sampling_denoising_std: 0.10
randn_clip_value: 3
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
cond_dim: ${obs_dim}
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
critic:
_target_: model.common.critic.CriticObsAct
action_dim: ${action_dim}
action_steps: ${act_steps}
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
device: ${device}
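
Diffusion Q-Learning keeps the denoising (behavior-cloning) loss on buffer actions and adds a Q-maximization term on actions sampled from the current policy, weighted by `eta: 1.0`. A hedged sketch of the combined actor objective; `actor.loss` and `actor.sample` are placeholder method names, not this repo's API:

def dql_actor_loss(actor, q_net, obs, buffer_actions, eta: float = 1.0):
    # 1) stay close to the data: diffusion denoising loss on replay-buffer actions
    bc_loss = actor.loss(buffer_actions, obs)        # placeholder API
    # 2) be greedy w.r.t. the critic: raise Q of freshly sampled actions
    sampled_actions = actor.sample(obs)              # placeholder API
    q_loss = -q_net(obs, sampled_actions).mean()
    return bc_loss + eta * q_loss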


@@ -0,0 +1,111 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
transition_dim: ${action_dim}
denoising_steps: 20
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 5
n_steps: 500
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# IDQL specific
scale_reward_factor: 0.01
eval_deterministic: True
eval_sample_num: 20 # how many samples to score during eval
critic_tau: 0.001 # rate of target q network update
use_expectile_exploration: True
buffer_size: 5000
batch_size: 512
replay_ratio: 16
model:
_target_: model.diffusion.diffusion_idql.IDQLDiffusion
# Sampling HPs
min_sampling_denoising_std: 0.10
randn_clip_value: 3
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
cond_dim: ${obs_dim}
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
critic_q:
_target_: model.common.critic.CriticObsAct
action_dim: ${action_dim}
action_steps: ${act_steps}
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
critic_v:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
transition_dim: ${transition_dim}
denoising_steps: ${denoising_steps}
device: ${device}
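
IDQL treats the diffusion actor as a proposal distribution and lets the critics do the selection: the V-critic is trained by expectile regression, and at evaluation `eval_sample_num: 20` candidate actions are scored per state, with `eval_deterministic: True` taking the top-scoring one. A hedged sketch with a placeholder `actor.sample`; the advantage scoring and the stochastic branch (one plausible reading of `use_expectile_exploration`) are assumptions:

import torch

@torch.no_grad()
def idql_select_action(actor, critic_q, critic_v, obs, num_samples: int = 20, deterministic: bool = True):
    # Sample candidate action sequences from the diffusion policy, num_samples per state.
    obs_rep = obs.repeat_interleave(num_samples, dim=0)
    candidates = actor.sample(obs_rep)                                    # placeholder API
    adv = (critic_q(obs_rep, candidates) - critic_v(obs_rep)).view(obs.shape[0], num_samples)
    if deterministic:
        idx = adv.argmax(dim=1)                                           # greedy pick at eval time
    else:
        idx = torch.multinomial(torch.softmax(adv, dim=1), 1).squeeze(1)  # stochastic pick for exploration
    candidates = candidates.view(obs.shape[0], num_samples, *candidates.shape[1:])
    return candidates[torch.arange(obs.shape[0]), idx]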


@@ -0,0 +1,110 @@
defaults:
- _self_
hydra:
run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
transition_dim: ${action_dim}
denoising_steps: 20
ft_denoising_steps: 10
cond_steps: 1
horizon_steps: 4
act_steps: 4
env:
n_envs: 40
name: ${env_name}
max_episode_steps: 1000
reset_at_iteration: False
save_video: False
best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
multi_step:
n_obs_steps: ${cond_steps}
n_action_steps: ${act_steps}
max_episode_steps: ${env.max_episode_steps}
reset_within_step: True
wandb:
entity: ${oc.env:DPPO_WANDB_ENTITY}
project: gym-${env_name}-finetune
run: ${now:%H-%M-%S}_${name}
train:
n_train_itr: 1000
n_critic_warmup_itr: 0
n_steps: 500
gamma: 0.99
actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
val_freq: 10
render:
freq: 1
num: 0
# PPO specific
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 5000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
gamma_denoising: 0.99
clip_ploss_coef: 0.01
clip_ploss_coef_base: 0.01
clip_ploss_coef_rate: 3
randn_clip_value: 3
min_sampling_denoising_std: 0.1
min_logprob_denoising_std: 0.1
#
network_path: ${base_policy_path}
actor:
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
time_dim: 16
mlp_dims: [512, 512, 512]
activation_type: ReLU
residual_style: True
cond_dim: ${obs_dim}
horizon_steps: ${horizon_steps}
transition_dim: ${transition_dim}
critic:
_target_: model.common.critic.CriticObs
obs_dim: ${obs_dim}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
ft_denoising_steps: ${ft_denoising_steps}
transition_dim: ${transition_dim}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
cond_steps: ${cond_steps}
denoising_steps: ${denoising_steps}
device: ${device}
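
All of the PPO-style configs above share the advantage estimator set by `gamma: 0.99` and `gae_lambda: 0.95`. A self-contained GAE sketch over one rollout of `n_steps` transitions across `n_envs` parallel environments (array names are illustrative):

import torch

@torch.no_grad()
def compute_gae(rewards, values, dones, last_value, gamma: float = 0.99, gae_lambda: float = 0.95):
    # rewards, values, dones: (n_steps, n_envs); dones[t] = 1 if the episode ended after step t.
    # last_value: (n_envs,) critic value of the final observation, used to bootstrap the last step.
    n_steps = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    last_gae = torch.zeros_like(last_value)
    for t in reversed(range(n_steps)):
        next_value = last_value if t == n_steps - 1 else values[t + 1]
        next_nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * next_nonterminal - values[t]
        last_gae = delta + gamma * gae_lambda * next_nonterminal * last_gae
        advantages[t] = last_gae
    returns = advantages + values   # regression targets for the critic
    return advantages, returns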

Some files were not shown because too many files have changed in this diff.