import os

import numpy as np
from gym import utils
from gym.envs.mujoco import MujocoEnv


class ReacherEnv(MujocoEnv, utils.EzPickle):
    """
    More general version of the gym mujoco Reacher environment, with a
    configurable number of links and an optional sparse reward that is
    only granted at the end of the episode.
    """

    def __init__(self, sparse: bool = False, n_links: int = 5, ctrl_cost_weight: float = 1.0):
        utils.EzPickle.__init__(self, sparse=sparse, n_links=n_links, ctrl_cost_weight=ctrl_cost_weight)

        self._steps = 0
        self.n_links = n_links
        self.ctrl_cost_weight = ctrl_cost_weight
        self.sparse = sparse
        # The sparse reward is only given once per episode, so it is scaled up
        # to stay comparable to the dense reward accumulated over 200 steps.
        self.reward_weight = 1 if not sparse else 200

        file_name = f'reacher_{n_links}links.xml'
        MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", file_name), 2)

    def step(self, action):
        self._steps += 1

        # In the sparse setting, the distance and velocity rewards are only
        # granted on the final (200th) step of the episode.
        is_reward = not self.sparse or self._steps == 200

        reward_dist = 0.0
        angular_vel = 0.0
        if is_reward:
            reward_dist = self.distance_reward()
            angular_vel = self.velocity_reward()

        reward_ctrl = -self.ctrl_cost_weight * np.square(action).sum()

        reward = reward_dist + reward_ctrl + angular_vel
        self.do_simulation(action, self.frame_skip)
        ob = self._get_obs()
        done = False

        infos = dict(
            reward_dist=reward_dist,
            reward_ctrl=reward_ctrl,
            velocity=angular_vel,
            end_effector=self.get_body_com("fingertip").copy(),
            goal=self.goal if hasattr(self, "goal") else None,
        )

        return ob, reward, done, infos

    def distance_reward(self):
        # Negative Euclidean distance between end effector and target,
        # scaled by the (sparse-dependent) reward weight.
        vec = self.get_body_com("fingertip") - self.get_body_com("target")
        return -self.reward_weight * np.linalg.norm(vec)

    def velocity_reward(self):
        # Penalize squared joint velocities at the end of a sparse episode;
        # unused (always 0.0) in the dense setting.
        return -10 * np.square(self.data.qvel.flat[:self.n_links]).sum() if self.sparse else 0.0

    def viewer_setup(self):
        self.viewer.cam.trackbodyid = 0

    def reset_model(self):
        qpos = (
            # self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) +
            self.init_qpos.copy()
        )
        # Rejection-sample a goal inside the reachable disk of radius n_links / 10.
        while True:
            # full space
            self.goal = self.np_random.uniform(low=-self.n_links / 10, high=self.n_links / 10, size=2)
            # I Quadrant
            # self.goal = self.np_random.uniform(low=0, high=self.n_links / 10, size=2)
            # II Quadrant
            # self.goal = self.np_random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2)
            # II + III Quadrant
            # self.goal = self.np_random.uniform(low=-self.n_links / 10, high=[0, self.n_links / 10], size=2)
            # I + II Quadrant
            # self.goal = self.np_random.uniform(low=[-self.n_links / 10, 0], high=self.n_links / 10, size=2)
            if np.linalg.norm(self.goal) < self.n_links / 10:
                break

        qpos[-2:] = self.goal
        qvel = (
            # self.np_random.uniform(low=-0.005, high=0.005, size=self.model.nv) +
            self.init_qvel.copy()
        )
        qvel[-2:] = 0
        self.set_state(qpos, qvel)
        self._steps = 0

        return self._get_obs()

    def _get_obs(self):
        theta = self.data.qpos.flat[:self.n_links]
        target = self.get_body_com("target")
        return np.concatenate([
            np.cos(theta),
            np.sin(theta),
            target[:2],  # x-y of goal position
            self.data.qvel.flat[:self.n_links],  # angular velocity
            self.get_body_com("fingertip") - target,  # goal distance
        ])
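

# Minimal usage sketch, not part of the original module: it assumes the
# accompanying `assets/reacher_5links.xml` model file exists and that this
# file can be run directly. It constructs the environment and rolls out one
# 200-step episode with random actions, using the classic gym 4-tuple API.
if __name__ == "__main__":
    env = ReacherEnv(sparse=True, n_links=5)
    obs = env.reset()
    total_reward = 0.0
    for _ in range(200):
        action = env.action_space.sample()  # random control torques
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    print(f"episode return: {total_reward:.3f}")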