updates

parent: c81378b9e7
commit: 13a292f0e0
@@ -37,7 +37,7 @@ class HoleReacher(gym.Env):
         self.start_vel = np.zeros(self.num_links)
         self.weight_matrix_scale = 50 # for the holereacher, the dmp weights become quite large compared to the values of the goal attractor. this scaling is to ensure they are on similar scale for the optimizer
 
-        self._dt = 0.01
+        self.dt = 0.01
         self.time_limit = 2
 
         action_bound = np.pi * np.ones((self.num_links,))
@@ -82,9 +82,9 @@ class HoleReacher(gym.Env):
         a single step with an action in joint velocity space
         """
         vel = action
-        acc = (vel - self._angle_velocity) / self._dt
+        acc = (vel - self._angle_velocity) / self.dt
         self._angle_velocity = vel
-        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity
+        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
 
         self._update_joints()
 
@@ -113,7 +113,7 @@ class HoleReacher(gym.Env):
 
         self._steps += 1
 
-        done = self._steps * self._dt > self.time_limit or self._is_collided
+        done = self._steps * self.dt > self.time_limit or self._is_collided
 
         return self._get_obs().copy(), reward, done, info
 
@@ -26,7 +26,7 @@ def make_env(rank, seed=0):
                               num_dof=5,
                               num_basis=5,
                               duration=2,
-                              dt=env._dt,
+                              dt=env.dt,
                               learn_goal=True)
         env.seed(seed + rank)
         return env
@@ -73,7 +73,7 @@ class AlrMujocoEnv(gym.Env):
         # observation, _reward, done, _info = self.step(action)
         # assert not done
 
-        observation = self.reset()
+        observation = self._get_obs() # TODO: is calling get_obs enough? should we call reset, or even step?
 
         self._set_observation_space(observation)
 
@@ -82,14 +82,14 @@ class AlrMujocoEnv(gym.Env):
     @property
     def current_pos(self):
         """
-        By default returns the joint positions of all simulated objects. May be overriden in subclass.
+        By default returns the joint positions of all simulated objects. May be overridden in subclass.
         """
         return self.sim.data.qpos
 
     @property
     def current_vel(self):
         """
-        By default returns the joint velocities of all simulated objects. May be overriden in subclass.
+        By default returns the joint velocities of all simulated objects. May be overridden in subclass.
         """
         return self.sim.data.qvel
 
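For orientation, the two properties above are the hook a task-specific subclass overrides when the controller should only see a subset of the simulated joints. A minimal sketch of such an override, assuming the AlrMujocoEnv base class from this file (the class name and the 7-joint slice are illustrative, not part of this commit):

from alr_envs.mujoco import alr_mujoco_env


class MyArmEnv(alr_mujoco_env.AlrMujocoEnv):  # hypothetical subclass, for illustration only
    @property
    def current_pos(self):
        # expose only the first 7 joint positions (e.g. the robot arm) to the controller
        return self.sim.data.qpos[0:7].copy()

    @property
    def current_vel(self):
        # expose only the first 7 joint velocities
        return self.sim.data.qvel[0:7].copy()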
@@ -125,10 +125,15 @@ class AlrMujocoEnv(gym.Env):
     # methods to override:
     # ----------------------------
 
+    def _get_obs(self):
+        """Returns the observation.
+        """
+        raise NotImplementedError()
+
     def configure(self, *args, **kwargs):
         """
         Helper method to set certain environment properties such as contexts in contextual environments since reset()
-        doesn't take arguments. Should be called before/after reset(). TODO: before or after?
+        doesn't take arguments. Should be called before reset().
         """
         pass
 
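The docstring change above pins down the contract: configure() has to run before reset(), so that reset_model() can already rely on whatever the context set up (for instance the reward function, as in the ball-in-a-cup environments below). A minimal usage sketch following that contract, mirroring the __main__ blocks further down (a None context simply means an uncontextual rollout):

env = ALRBallInACupEnv()
env.configure(None)   # set the context first (None = no specific context)
obs = env.reset()     # reset afterwards, as required by the contract above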
@@ -1,12 +1,12 @@
-from gym.envs.mujoco import mujoco_env
 from gym import utils
 import os
 import numpy as np
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
+from alr_envs.mujoco import alr_mujoco_env
+from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
 import mujoco_py
 
 
-class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
+class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
     def __init__(self, ):
         self._steps = 0
 
@@ -21,8 +21,12 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
         self._q_pos = []
 
         utils.EzPickle.__init__(self)
-        mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", "ball-in-a-cup_base.xml"),
-                                      frame_skip=4)
+        alr_mujoco_env.AlrMujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", "ball-in-a-cup_base.xml"),
+                                             n_substeps=4)
 
+    def configure(self, context):
+        self.context = context
+        self.reward_function.reset(context)
+
     def reset_model(self):
         start_pos = self.init_qpos.copy()
@@ -30,24 +34,8 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
         start_vel = np.zeros_like(start_pos)
         self.set_state(start_pos, start_vel)
         self._steps = 0
-        self.reward_function.reset()
         self._q_pos = []
 
-    def do_simulation(self, ctrl, n_frames):
-        self.sim.data.ctrl[:] = ctrl
-        for _ in range(n_frames):
-            try:
-                self.sim.step()
-            except mujoco_py.builder.MujocoException as e:
-                # print("Error in simulation: " + str(e))
-                # error = True
-                # Copy the current torque as if it would have been applied until the end of the trajectory
-                # for i in range(k + 1, sim_time):
-                #     torques.append(trq)
-                return True
-
-        return False
-
     def step(self, a):
         # Apply gravity compensation
         if not np.all(self.sim.data.qfrc_applied[:7] == self.sim.data.qfrc_bias[:7]):
@@ -98,6 +86,7 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
 
 if __name__ == "__main__":
     env = ALRBallInACupEnv()
+    env.configure(None)
     env.reset()
     for i in range(2000):
         # objective.load_result("/tmp/cma")
@@ -26,9 +26,9 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.dists_final = None
         self.costs = None
 
-        self.reset()
+        self.reset(None)
 
-    def reset(self):
+    def reset(self, context):
         self.ball_traj = np.zeros(shape=(self.sim_time, 3))
         self.dists = []
         self.dists_final = []
@@ -51,11 +51,12 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
         self.ball_traj[step, :] = ball_pos
 
-        if self.check_collision(sim):
-            return -1000, False, True
-
         action_cost = np.sum(np.square(action))
 
+        if self.check_collision(sim):
+            reward = - 1e-5 * action_cost - 1000
+            return reward, False, True
+
         if step == self.sim_time - 1:
             min_dist = np.min(self.dists)
             dist_final = self.dists_final[-1]
@@ -1,21 +1,16 @@
 from alr_envs.mujoco import alr_mujoco_env
-from gym import utils, spaces
+from gym import utils
 import os
 import numpy as np
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
 
 
 class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
-    def __init__(self, reward_function=None):
+    def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
         self._steps = 0
 
         self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
                                      "biac_base" + ".xml")
 
-        self.sim_time = 8 # seconds
-        self.sim_steps = int(self.sim_time / (0.0005 * 4)) # circular dependency.. sim.dt <-> mujocoenv init <-> reward fct
-        self.reward_function = reward_function(self.sim_steps)
-
         self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
         self.start_vel = np.zeros(7)
 
@@ -34,8 +29,15 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         utils.EzPickle.__init__(self)
         alr_mujoco_env.AlrMujocoEnv.__init__(self,
                                              self.xml_path,
-                                             apply_gravity_comp=True,
-                                             n_substeps=4)
+                                             apply_gravity_comp=apply_gravity_comp,
+                                             n_substeps=n_substeps)
+
+        self.sim_time = 8 # seconds
+        self.sim_steps = int(self.sim_time / self.dt)
+        if reward_function is None:
+            from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
+            reward_function = BallInACupReward
+        self.reward_function = reward_function(self.sim_steps)
 
     @property
     def current_pos(self):
@@ -47,6 +49,7 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
 
     def configure(self, context):
         self.context = context
+        self.reward_function.reset(context)
 
     def reset_model(self):
         init_pos_all = self.init_qpos.copy()
@@ -56,7 +59,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         goal_final_id = self.sim.model._site_name2id["cup_goal_final"]
 
         self._steps = 0
-        self.reward_function.reset()
         self._q_pos = []
         self._q_vel = []
 
@@ -65,38 +67,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
 
         self.set_state(start_pos, init_vel)
 
-        # Reset the system
-        # self.sim.data.qpos[:] = init_pos_all
-        # self.sim.data.qvel[:] = init_vel
-        # self.sim.data.qpos[0:7] = init_pos_robot
-        #
-        # self.sim.step()
-        #
-        # self.sim.data.qpos[:] = init_pos_all
-        # self.sim.data.qvel[:] = init_vel
-        # self.sim.data.qpos[0:7] = init_pos_robot
-        # self.sim.data.body_xpos[ball_id, :] = np.copy(self.sim.data.site_xpos[goal_final_id, :]) - np.array([0., 0., 0.329])
-        #
-        # # Stabilize the system around the initial position
-        # for i in range(0, 500):
-        #     self.sim.data.qpos[7:] = 0.
-        #     self.sim.data.qvel[7:] = 0.
-        #     # self.sim.data.qpos[7] = -0.2
-        #     cur_pos = self.sim.data.qpos[0:7].copy()
-        #     cur_vel = self.sim.data.qvel[0:7].copy()
-        #     trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel)
-        #     self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy()
-        #     self.sim.step()
-        #     self.render()
-        #
-        # for i in range(0, 500):
-        #     cur_pos = self.sim.data.qpos[0:7].copy()
-        #     cur_vel = self.sim.data.qvel[0:7].copy()
-        #     trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel)
-        #     self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy()
-        #     self.sim.step()
-        #     self.render()
-
         return self._get_obs()
 
     def step(self, a):
@@ -154,7 +124,10 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
 
 
 if __name__ == "__main__":
+    from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
+
     env = ALRBallInACupEnv(reward_function=BallInACupReward)
+    env.configure(None)
     env.reset()
     env.render()
     for i in range(4000):
@@ -7,9 +7,54 @@ import multiprocessing as mp
 import sys
 
 
+def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
+    assert shared_memory is None
+    env = env_fn()
+    parent_pipe.close()
+    try:
+        while True:
+            command, data = pipe.recv()
+            if command == 'reset':
+                observation = env.reset()
+                pipe.send((observation, True))
+            elif command == 'step':
+                observation, reward, done, info = env.step(data)
+                if done:
+                    observation = env.reset()
+                pipe.send(((observation, reward, done, info), True))
+            elif command == 'rollout':
+                rewards = []
+                infos = []
+                for p, c in zip(*data):
+                    reward, info = env.rollout(p, c)
+                    rewards.append(reward)
+                    infos.append(info)
+                pipe.send(((rewards, infos), (True, ) * len(rewards)))
+            elif command == 'seed':
+                env.seed(data)
+                pipe.send((None, True))
+            elif command == 'close':
+                env.close()
+                pipe.send((None, True))
+                break
+            elif command == 'idle':
+                pipe.send((None, True))
+            elif command == '_check_observation_space':
+                pipe.send((data == env.observation_space, True))
+            else:
+                raise RuntimeError('Received unknown command `{0}`. Must '
+                                   'be one of {`reset`, `step`, `seed`, `close`, '
+                                   '`_check_observation_space`}.'.format(command))
+    except (KeyboardInterrupt, Exception):
+        error_queue.put((index,) + sys.exc_info()[:2])
+        pipe.send((None, False))
+    finally:
+        env.close()
+
+
 class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
     def __init__(self, env_fns, n_samples, observation_space=None, action_space=None,
-                 shared_memory=True, copy=True, context=None, daemon=True, worker=None):
+                 shared_memory=False, copy=True, context="spawn", daemon=True, worker=_worker):
         super(DmpAsyncVectorEnv, self).__init__(env_fns,
                                                 observation_space=observation_space,
                                                 action_space=action_space,
@@ -91,7 +136,7 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
         self._raise_if_errors(successes)
         self._state = AsyncState.DEFAULT
 
-        observations_list, rewards, dones, infos = [_flatten_list(r) for r in zip(*results)]
+        rewards, infos = [_flatten_list(r) for r in zip(*results)]
 
         # for now, we ignore the observations and only return the rewards
 
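Putting the pieces together, the 'rollout' command in the new top-level _worker and the (rewards, infos)-only return in rollout_wait() are driven from the parent process roughly as sketched below. The sketch mirrors the example scripts at the end of this commit; the number of worker processes and the parameter shape are assumptions, not values from this file:

import numpy as np

from alr_envs.classic_control.hole_reacher import HoleReacher
from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper
from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv


def make_env(rank, seed=0):
    # same factory pattern as in the example scripts below:
    # build the raw env, wrap it with the DMP wrapper, seed it per worker
    def _init():
        _env = HoleReacher(num_links=5, allow_self_collision=False, allow_wall_collision=False,
                           hole_width=0.15, hole_depth=1, hole_x=1)
        _env = DmpEnvWrapper(_env, num_dof=5, num_basis=5, duration=2, dt=_env.dt,
                             learn_goal=True, alpha_phase=2, start_pos=_env.start_pos,
                             policy_type="velocity")
        _env.seed(seed + rank)
        return _env
    return _init


if __name__ == "__main__":
    n_samples = 4
    envs = DmpAsyncVectorEnv([make_env(rank) for rank in range(2)], n_samples=n_samples)
    params = 50 * np.random.randn(n_samples, 25 + 5)  # one DMP weight/goal vector per rollout
    out = envs(params)  # dispatches 'rollout'; each worker replies with (rewards, infos) only
    print(out)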
@@ -109,55 +154,6 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
         return self.rollout_wait()
 
 
-def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
-    assert shared_memory is None
-    env = env_fn()
-    parent_pipe.close()
-    try:
-        while True:
-            command, data = pipe.recv()
-            if command == 'reset':
-                observation = env.reset()
-                pipe.send((observation, True))
-            elif command == 'step':
-                observation, reward, done, info = env.step(data)
-                if done:
-                    observation = env.reset()
-                pipe.send(((observation, reward, done, info), True))
-            elif command == 'rollout':
-                observations = []
-                rewards = []
-                dones = []
-                infos = []
-                for p, c in zip(*data):
-                    observation, reward, done, info = env.rollout(p, c)
-                    observations.append(observation)
-                    rewards.append(reward)
-                    dones.append(done)
-                    infos.append(info)
-                pipe.send(((observations, rewards, dones, infos), (True, ) * len(rewards)))
-            elif command == 'seed':
-                env.seed(data)
-                pipe.send((None, True))
-            elif command == 'close':
-                env.close()
-                pipe.send((None, True))
-                break
-            elif command == 'idle':
-                pipe.send((None, True))
-            elif command == '_check_observation_space':
-                pipe.send((data == env.observation_space, True))
-            else:
-                raise RuntimeError('Received unknown command `{0}`. Must '
-                                   'be one of {`reset`, `step`, `seed`, `close`, '
-                                   '`_check_observation_space`}.'.format(command))
-    except (KeyboardInterrupt, Exception):
-        error_queue.put((index,) + sys.exc_info()[:2])
-        pipe.send((None, False))
-    finally:
-        env.close()
-
-
 def _flatten_obs(obs):
     assert isinstance(obs, (list, tuple))
     assert len(obs) > 0
@@ -69,15 +69,11 @@ class DmpEnvWrapper(gym.Wrapper):
 
     def __call__(self, params, contexts=None):
         params = np.atleast_2d(params)
-        observations = []
         rewards = []
-        dones = []
         infos = []
         for p, c in zip(params, contexts):
-            observation, reward, done, info = self.rollout(p, c)
-            observations.append(observation)
+            reward, info = self.rollout(p, c)
             rewards.append(reward)
-            dones.append(done)
             infos.append(info)
 
         return np.array(rewards), infos
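With this change a wrapped environment, called directly on a batch of DMP parameter vectors, returns only the summed episode rewards and the per-rollout info dicts. A short sketch using the hole-reacher configuration from the example script further down (contexts are passed explicitly here, one None per rollout, since __call__ zips parameters and contexts):

import numpy as np

from alr_envs.classic_control.hole_reacher import HoleReacher
from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper

env = HoleReacher(num_links=5, allow_self_collision=False, allow_wall_collision=False,
                  hole_width=0.15, hole_depth=1, hole_x=1)
env = DmpEnvWrapper(env, num_dof=5, num_basis=5, duration=2, dt=env.dt,
                    learn_goal=True, alpha_phase=2, start_pos=env.start_pos,
                    policy_type="velocity")

params = 50 * np.random.randn(4, 25 + 5)            # four DMP parameter vectors
rewards, infos = env(params, contexts=[None] * 4)   # np.array of 4 rewards plus a list of infos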
@@ -116,9 +112,8 @@ class DmpEnvWrapper(gym.Wrapper):
         rews = []
         infos = []
 
-        self.env.configure(context)
         self.env.reset()
+        if context is not None:
+            self.env.configure(context)
 
         for t, pos_vel in enumerate(zip(trajectory, velocity)):
             ac = self.policy.get_action(pos_vel[0], pos_vel[1])
@@ -132,4 +127,4 @@ class DmpEnvWrapper(gym.Wrapper):
 
         reward = np.sum(rews)
 
-        return obs, reward, done, info
+        return reward, info
@@ -1,4 +1,4 @@
-from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapperVel
+from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper
 from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv, _worker
 from alr_envs.classic_control.hole_reacher import HoleReacher
 import numpy as np
@@ -16,21 +16,25 @@ if __name__ == "__main__":
         :param rank: (int) index of the subprocess
         """
         def _init():
-            env = HoleReacher(num_links=5,
-                              allow_self_collision=False,
-                              allow_wall_collision=False,
-                              hole_width=0.15,
-                              hole_depth=1,
-                              hole_x=1)
+            _env = HoleReacher(num_links=5,
+                               allow_self_collision=False,
+                               allow_wall_collision=False,
+                               hole_width=0.15,
+                               hole_depth=1,
+                               hole_x=1)
 
-            env = DmpEnvWrapperVel(env,
-                                   num_dof=5,
-                                   num_basis=5,
-                                   duration=2,
-                                   dt=env._dt,
-                                   learn_goal=True)
-            env.seed(seed + rank)
-            return env
+            _env = DmpEnvWrapper(_env,
+                                 num_dof=5,
+                                 num_basis=5,
+                                 duration=2,
+                                 dt=_env.dt,
+                                 learn_goal=True,
+                                 alpha_phase=2,
+                                 start_pos=_env.start_pos,
+                                 policy_type="velocity"
+                                 )
+            _env.seed(seed + rank)
+            return _env
         return _init
 
     n_samples = 4
@@ -45,6 +49,6 @@ if __name__ == "__main__":
     params = np.hstack([50 * np.random.randn(n_samples, 25), np.tile(np.array([np.pi/2, -np.pi/4, -np.pi/4, -np.pi/4, -np.pi/4]), [n_samples, 1])])
 
     # env.reset()
-    out = env.rollout(params)
+    out = env(params)
 
     print(out)
@@ -1,7 +1,6 @@
 from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper
 from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv, _worker
 from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
 import numpy as np
 
 
@@ -17,7 +16,7 @@ if __name__ == "__main__":
         :param rank: (int) index of the subprocess
         """
        def _init():
-            _env = ALRBallInACupEnv(BallInACupReward)
+            _env = ALRBallInACupEnv()
 
             _env = DmpEnvWrapper(_env,
                                  num_dof=3,
@@ -29,7 +28,6 @@ if __name__ == "__main__":
                                  start_pos=_env.start_pos[1::2],
                                  final_pos=_env.start_pos[1::2],
                                  policy_type="motor"
-                                 # contextual=True
                                  )
             _env.seed(seed + rank)
             return _env