updates

commit 13a292f0e0
parent c81378b9e7
@@ -37,7 +37,7 @@ class HoleReacher(gym.Env):
         self.start_vel = np.zeros(self.num_links)
         self.weight_matrix_scale = 50  # for the holereacher, the dmp weights become quite large compared to the values of the goal attractor. this scaling is to ensure they are on similar scale for the optimizer

-        self._dt = 0.01
+        self.dt = 0.01
         self.time_limit = 2

         action_bound = np.pi * np.ones((self.num_links,))
@@ -82,9 +82,9 @@ class HoleReacher(gym.Env):
         a single step with an action in joint velocity space
         """
         vel = action
-        acc = (vel - self._angle_velocity) / self._dt
+        acc = (vel - self._angle_velocity) / self.dt
         self._angle_velocity = vel
-        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity
+        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity

         self._update_joints()

@@ -113,7 +113,7 @@ class HoleReacher(gym.Env):

         self._steps += 1

-        done = self._steps * self._dt > self.time_limit or self._is_collided
+        done = self._steps * self.dt > self.time_limit or self._is_collided

         return self._get_obs().copy(), reward, done, info

@@ -26,7 +26,7 @@ def make_env(rank, seed=0):
                             num_dof=5,
                             num_basis=5,
                             duration=2,
-                            dt=env._dt,
+                            dt=env.dt,
                             learn_goal=True)
        env.seed(seed + rank)
        return env
@@ -73,7 +73,7 @@ class AlrMujocoEnv(gym.Env):
         # observation, _reward, done, _info = self.step(action)
         # assert not done

-        observation = self.reset()
+        observation = self._get_obs()  # TODO: is calling get_obs enough? should we call reset, or even step?

         self._set_observation_space(observation)

@@ -82,14 +82,14 @@ class AlrMujocoEnv(gym.Env):
     @property
     def current_pos(self):
         """
-        By default returns the joint positions of all simulated objects. May be overriden in subclass.
+        By default returns the joint positions of all simulated objects. May be overridden in subclass.
         """
         return self.sim.data.qpos

     @property
     def current_vel(self):
         """
-        By default returns the joint velocities of all simulated objects. May be overriden in subclass.
+        By default returns the joint velocities of all simulated objects. May be overridden in subclass.
         """
         return self.sim.data.qvel

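These properties are the intended extension point: a subclass that should only expose part of the simulation state can override them. A minimal sketch of such an override (the 7-joint slice is illustrative and not part of this commit):

class MyRobotEnv(alr_mujoco_env.AlrMujocoEnv):
    @property
    def current_pos(self):
        # expose only the robot's own joints instead of every simulated object
        return self.sim.data.qpos[0:7].copy()

    @property
    def current_vel(self):
        return self.sim.data.qvel[0:7].copy()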
@@ -125,10 +125,15 @@ class AlrMujocoEnv(gym.Env):
     # methods to override:
     # ----------------------------

+    def _get_obs(self):
+        """Returns the observation.
+        """
+        raise NotImplementedError()
+
     def configure(self, *args, **kwargs):
         """
         Helper method to set certain environment properties such as contexts in contextual environments since reset()
-        doesn't take arguments. Should be called before/after reset(). TODO: before or after?
+        doesn't take arguments. Should be called before reset().
         """
         pass

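The new _get_obs() hook together with the clarified configure() contract implies the following call order for contextual environments. A hedged usage sketch (the environment class and the context value are placeholders):

env = SomeContextualAlrEnv()      # any subclass that implements _get_obs()
env.configure(context)            # set the context first; reset() takes no arguments
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())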
@@ -1,12 +1,12 @@
-from gym.envs.mujoco import mujoco_env
 from gym import utils
 import os
 import numpy as np
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
+from alr_envs.mujoco import alr_mujoco_env
+from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
 import mujoco_py


-class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
+class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
     def __init__(self, ):
         self._steps = 0

@@ -21,8 +21,12 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
         self._q_pos = []

         utils.EzPickle.__init__(self)
-        mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", "ball-in-a-cup_base.xml"),
-                                      frame_skip=4)
+        alr_mujoco_env.AlrMujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", "ball-in-a-cup_base.xml"),
+                                             n_substeps=4)

+    def configure(self, context):
+        self.context = context
+        self.reward_function.reset(context)
+
     def reset_model(self):
         start_pos = self.init_qpos.copy()
@@ -30,24 +34,8 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
         start_vel = np.zeros_like(start_pos)
         self.set_state(start_pos, start_vel)
         self._steps = 0
-        self.reward_function.reset()
         self._q_pos = []

-    def do_simulation(self, ctrl, n_frames):
-        self.sim.data.ctrl[:] = ctrl
-        for _ in range(n_frames):
-            try:
-                self.sim.step()
-            except mujoco_py.builder.MujocoException as e:
-                # print("Error in simulation: " + str(e))
-                # error = True
-                # Copy the current torque as if it would have been applied until the end of the trajectory
-                # for i in range(k + 1, sim_time):
-                #     torques.append(trq)
-                return True
-
-        return False
-
     def step(self, a):
         # Apply gravity compensation
         if not np.all(self.sim.data.qfrc_applied[:7] == self.sim.data.qfrc_bias[:7]):
@@ -98,6 +86,7 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):

 if __name__ == "__main__":
     env = ALRBallInACupEnv()
+    env.configure(None)
     env.reset()
     for i in range(2000):
         # objective.load_result("/tmp/cma")
@@ -26,9 +26,9 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.dists_final = None
         self.costs = None

-        self.reset()
+        self.reset(None)

-    def reset(self):
+    def reset(self, context):
         self.ball_traj = np.zeros(shape=(self.sim_time, 3))
         self.dists = []
         self.dists_final = []
@@ -51,11 +51,12 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
         self.ball_traj[step, :] = ball_pos

-        if self.check_collision(sim):
-            return -1000, False, True
-
         action_cost = np.sum(np.square(action))

+        if self.check_collision(sim):
+            reward = - 1e-5 * action_cost - 1000
+            return reward, False, True
+
         if step == self.sim_time - 1:
             min_dist = np.min(self.dists)
             dist_final = self.dists_final[-1]
@@ -1,21 +1,16 @@
 from alr_envs.mujoco import alr_mujoco_env
-from gym import utils, spaces
+from gym import utils
 import os
 import numpy as np
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward


 class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
-    def __init__(self, reward_function=None):
+    def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
         self._steps = 0

         self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
                                      "biac_base" + ".xml")

-        self.sim_time = 8  # seconds
-        self.sim_steps = int(self.sim_time / (0.0005 * 4))  # circular dependency.. sim.dt <-> mujocoenv init <-> reward fct
-        self.reward_function = reward_function(self.sim_steps)
-
         self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
         self.start_vel = np.zeros(7)

@@ -34,8 +29,15 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         utils.EzPickle.__init__(self)
         alr_mujoco_env.AlrMujocoEnv.__init__(self,
                                              self.xml_path,
-                                             apply_gravity_comp=True,
-                                             n_substeps=4)
+                                             apply_gravity_comp=apply_gravity_comp,
+                                             n_substeps=n_substeps)

+        self.sim_time = 8  # seconds
+        self.sim_steps = int(self.sim_time / self.dt)
+        if reward_function is None:
+            from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
+            reward_function = BallInACupReward
+        self.reward_function = reward_function(self.sim_steps)
+
     @property
     def current_pos(self):
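With these defaults the reward function no longer has to be passed in, and sim_steps is derived from self.dt only after the base class has loaded the model, which removes the hard-coded 0.0005 * 4 timestep and the circular dependency mentioned in the deleted comment. A rough construction sketch under those assumptions:

# default: simple reward, 4 substeps, gravity compensation enabled
env = ALRBallInACupEnv()

# or with an explicit reward class; it must accept the number of simulation steps
env = ALRBallInACupEnv(n_substeps=4, reward_function=BallInACupReward)
env.configure(None)   # configure before reset, as in the __main__ block below
env.reset()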
@@ -47,6 +49,7 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):

     def configure(self, context):
         self.context = context
+        self.reward_function.reset(context)

     def reset_model(self):
         init_pos_all = self.init_qpos.copy()
@@ -56,7 +59,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         goal_final_id = self.sim.model._site_name2id["cup_goal_final"]

         self._steps = 0
-        self.reward_function.reset()
         self._q_pos = []
         self._q_vel = []

@@ -65,38 +67,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):

         self.set_state(start_pos, init_vel)

-        # Reset the system
-        # self.sim.data.qpos[:] = init_pos_all
-        # self.sim.data.qvel[:] = init_vel
-        # self.sim.data.qpos[0:7] = init_pos_robot
-        #
-        # self.sim.step()
-        #
-        # self.sim.data.qpos[:] = init_pos_all
-        # self.sim.data.qvel[:] = init_vel
-        # self.sim.data.qpos[0:7] = init_pos_robot
-        # self.sim.data.body_xpos[ball_id, :] = np.copy(self.sim.data.site_xpos[goal_final_id, :]) - np.array([0., 0., 0.329])
-        #
-        # # Stabilize the system around the initial position
-        # for i in range(0, 500):
-        #     self.sim.data.qpos[7:] = 0.
-        #     self.sim.data.qvel[7:] = 0.
-        #     # self.sim.data.qpos[7] = -0.2
-        #     cur_pos = self.sim.data.qpos[0:7].copy()
-        #     cur_vel = self.sim.data.qvel[0:7].copy()
-        #     trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel)
-        #     self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy()
-        #     self.sim.step()
-        #     self.render()
-        #
-        # for i in range(0, 500):
-        #     cur_pos = self.sim.data.qpos[0:7].copy()
-        #     cur_vel = self.sim.data.qvel[0:7].copy()
-        #     trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel)
-        #     self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy()
-        #     self.sim.step()
-        #     self.render()
-
         return self._get_obs()

     def step(self, a):
@@ -154,7 +124,10 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):


 if __name__ == "__main__":
+    from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
+
     env = ALRBallInACupEnv(reward_function=BallInACupReward)
+    env.configure(None)
     env.reset()
     env.render()
     for i in range(4000):
@@ -7,9 +7,54 @@ import multiprocessing as mp
 import sys


+def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
+    assert shared_memory is None
+    env = env_fn()
+    parent_pipe.close()
+    try:
+        while True:
+            command, data = pipe.recv()
+            if command == 'reset':
+                observation = env.reset()
+                pipe.send((observation, True))
+            elif command == 'step':
+                observation, reward, done, info = env.step(data)
+                if done:
+                    observation = env.reset()
+                pipe.send(((observation, reward, done, info), True))
+            elif command == 'rollout':
+                rewards = []
+                infos = []
+                for p, c in zip(*data):
+                    reward, info = env.rollout(p, c)
+                    rewards.append(reward)
+                    infos.append(info)
+                pipe.send(((rewards, infos), (True, ) * len(rewards)))
+            elif command == 'seed':
+                env.seed(data)
+                pipe.send((None, True))
+            elif command == 'close':
+                env.close()
+                pipe.send((None, True))
+                break
+            elif command == 'idle':
+                pipe.send((None, True))
+            elif command == '_check_observation_space':
+                pipe.send((data == env.observation_space, True))
+            else:
+                raise RuntimeError('Received unknown command `{0}`. Must '
+                                   'be one of {`reset`, `step`, `seed`, `close`, '
+                                   '`_check_observation_space`}.'.format(command))
+    except (KeyboardInterrupt, Exception):
+        error_queue.put((index,) + sys.exc_info()[:2])
+        pipe.send((None, False))
+    finally:
+        env.close()
+
+
 class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
     def __init__(self, env_fns, n_samples, observation_space=None, action_space=None,
-                 shared_memory=True, copy=True, context=None, daemon=True, worker=None):
+                 shared_memory=False, copy=True, context="spawn", daemon=True, worker=_worker):
         super(DmpAsyncVectorEnv, self).__init__(env_fns,
                                                 observation_space=observation_space,
                                                 action_space=action_space,
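The new 'rollout' command expects data to be a pair of sequences (parameter sets, contexts); the worker zips them, feeds each pair to env.rollout, and answers with ((rewards, infos), successes). A minimal sketch of that exchange on a single worker connection (illustrative only, not a public API):

# parent side: one chunk of DMP parameters and matching contexts per worker process
pipe.send(('rollout', (params_chunk, contexts_chunk)))
(rewards, infos), successes = pipe.recv()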
@@ -91,7 +136,7 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
         self._raise_if_errors(successes)
         self._state = AsyncState.DEFAULT

-        observations_list, rewards, dones, infos = [_flatten_list(r) for r in zip(*results)]
+        rewards, infos = [_flatten_list(r) for r in zip(*results)]

         # for now, we ignore the observations and only return the rewards

@@ -109,55 +154,6 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
         return self.rollout_wait()


-def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
-    assert shared_memory is None
-    env = env_fn()
-    parent_pipe.close()
-    try:
-        while True:
-            command, data = pipe.recv()
-            if command == 'reset':
-                observation = env.reset()
-                pipe.send((observation, True))
-            elif command == 'step':
-                observation, reward, done, info = env.step(data)
-                if done:
-                    observation = env.reset()
-                pipe.send(((observation, reward, done, info), True))
-            elif command == 'rollout':
-                observations = []
-                rewards = []
-                dones = []
-                infos = []
-                for p, c in zip(*data):
-                    observation, reward, done, info = env.rollout(p, c)
-                    observations.append(observation)
-                    rewards.append(reward)
-                    dones.append(done)
-                    infos.append(info)
-                pipe.send(((observations, rewards, dones, infos), (True, ) * len(rewards)))
-            elif command == 'seed':
-                env.seed(data)
-                pipe.send((None, True))
-            elif command == 'close':
-                env.close()
-                pipe.send((None, True))
-                break
-            elif command == 'idle':
-                pipe.send((None, True))
-            elif command == '_check_observation_space':
-                pipe.send((data == env.observation_space, True))
-            else:
-                raise RuntimeError('Received unknown command `{0}`. Must '
-                                   'be one of {`reset`, `step`, `seed`, `close`, '
-                                   '`_check_observation_space`}.'.format(command))
-    except (KeyboardInterrupt, Exception):
-        error_queue.put((index,) + sys.exc_info()[:2])
-        pipe.send((None, False))
-    finally:
-        env.close()
-
-
 def _flatten_obs(obs):
     assert isinstance(obs, (list, tuple))
     assert len(obs) > 0
@@ -69,15 +69,11 @@ class DmpEnvWrapper(gym.Wrapper):

     def __call__(self, params, contexts=None):
         params = np.atleast_2d(params)
-        observations = []
         rewards = []
-        dones = []
         infos = []
         for p, c in zip(params, contexts):
-            observation, reward, done, info = self.rollout(p, c)
-            observations.append(observation)
+            reward, info = self.rollout(p, c)
             rewards.append(reward)
-            dones.append(done)
             infos.append(info)

         return np.array(rewards), infos
@@ -116,9 +112,8 @@ class DmpEnvWrapper(gym.Wrapper):
         rews = []
         infos = []

+        self.env.configure(context)
         self.env.reset()
-        if context is not None:
-            self.env.configure(context)

         for t, pos_vel in enumerate(zip(trajectory, velocity)):
             ac = self.policy.get_action(pos_vel[0], pos_vel[1])
@@ -132,4 +127,4 @@ class DmpEnvWrapper(gym.Wrapper):

         reward = np.sum(rews)

-        return obs, reward, done, info
+        return reward, info
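Since rollout() now returns only (reward, info), the wrapper's __call__ (see the @@ -69,15 +69,11 @@ hunk above) collects exactly those two values per parameter set. A short usage sketch, assuming env is a DmpEnvWrapper built as in the example scripts; the parameter dimensions are illustrative:

params = 50 * np.random.randn(2, 30)      # one row of DMP weights (plus goal) per rollout
contexts = [None, None]                   # contexts must be iterable alongside params
rewards, infos = env(params, contexts)    # rewards: np.ndarray of shape (2,)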
@@ -1,4 +1,4 @@
-from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapperVel
+from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper
 from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv, _worker
 from alr_envs.classic_control.hole_reacher import HoleReacher
 import numpy as np
@@ -16,21 +16,25 @@ if __name__ == "__main__":
        :param rank: (int) index of the subprocess
        """
        def _init():
-            env = HoleReacher(num_links=5,
+            _env = HoleReacher(num_links=5,
                              allow_self_collision=False,
                              allow_wall_collision=False,
                              hole_width=0.15,
                              hole_depth=1,
                              hole_x=1)

-            env = DmpEnvWrapperVel(env,
-                                   num_dof=5,
+            _env = DmpEnvWrapper(_env,
+                                 num_dof=5,
                                  num_basis=5,
                                  duration=2,
-                                  dt=env._dt,
-                                  learn_goal=True)
-            env.seed(seed + rank)
-            return env
+                                 dt=_env.dt,
+                                 learn_goal=True,
+                                 alpha_phase=2,
+                                 start_pos=_env.start_pos,
+                                 policy_type="velocity"
+                                 )
+            _env.seed(seed + rank)
+            return _env
        return _init

    n_samples = 4
@@ -45,6 +49,6 @@ if __name__ == "__main__":
    params = np.hstack([50 * np.random.randn(n_samples, 25), np.tile(np.array([np.pi/2, -np.pi/4, -np.pi/4, -np.pi/4, -np.pi/4]), [n_samples, 1])])

    # env.reset()
-    out = env.rollout(params)
+    out = env(params)

    print(out)
@@ -1,7 +1,6 @@
 from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper
 from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv, _worker
 from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
 import numpy as np


@@ -17,7 +16,7 @@ if __name__ == "__main__":
        :param rank: (int) index of the subprocess
        """
        def _init():
-            _env = ALRBallInACupEnv(BallInACupReward)
+            _env = ALRBallInACupEnv()

            _env = DmpEnvWrapper(_env,
                                 num_dof=3,
@@ -29,7 +28,6 @@ if __name__ == "__main__":
                                 start_pos=_env.start_pos[1::2],
                                 final_pos=_env.start_pos[1::2],
                                 policy_type="motor"
-                                 # contextual=True
                                 )
            _env.seed(seed + rank)
            return _env