from collections import defaultdict

import gym
import numpy as np

from alr_envs.utils.mp_env_async_sampler import AlrMpEnvSampler, AlrContextualMpEnvSampler, DummyDist


def example_mujoco():
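    """Plain gym example: step the ALRReacher Mujoco environment with random actions and render every step."""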
    env = gym.make('alr_envs:ALRReacher-v0')
    rewards = 0
    obs = env.reset()

    # number of environment steps
    for i in range(10000):
        obs, reward, done, info = env.step(env.action_space.sample())
        rewards += reward

        if i % 1 == 0:  # renders every step; increase the modulus to render less often
            env.render()

        if done:
            print(rewards)
            rewards = 0
            obs = env.reset()


def example_mp(env_name="alr_envs:HoleReacherDMP-v0"):
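    """Motion primitive example: each env.step() executes a full DMP trajectory, so one step is one rollout."""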
    # env = gym.make("alr_envs:ViaPointReacherDMP-v0")
    env = gym.make(env_name)
    rewards = 0
    # env.render(mode=None)
    obs = env.reset()

    # number of samples/full trajectories (multiple environment steps)
    for i in range(10):
        obs, reward, done, info = env.step(env.action_space.sample())
        rewards += reward

        if i % 1 == 0:
            # Render the full DMP trajectory. Calling render() once before the loop would also
            # render every trajectory; calling it after each trajectory allows changing the mode.
            # mode=None disables rendering.
            env.render(mode="human")

        if done:
            print(rewards)
            rewards = 0
            obs = env.reset()


def example_async(env_id="alr_envs:HoleReacherDMP-v0", n_cpu=4, seed=int('533D', 16)):
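    """Collect samples from several motion primitive environments in parallel via gym.vector.AsyncVectorEnv."""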
    def make_env(env_id, seed, rank):
        env = gym.make(env_id)
        env.seed(seed + rank)
        return lambda: env

    def sample(env: gym.vector.VectorEnv, n_samples=100):
        # running reward per environment, for logging
        rewards = np.zeros(n_cpu)

        # this generates more samples than requested if n_samples % env.num_envs != 0
        repeat = int(np.ceil(n_samples / env.num_envs))
        vals = defaultdict(list)
        for i in range(repeat):
            obs, reward, done, info = env.step(env.action_space.sample())
            vals['obs'].append(obs)
            vals['reward'].append(reward)
            vals['done'].append(done)
            vals['info'].append(info)
            rewards += reward
            if np.any(done):
                print(rewards[done])
                rewards[done] = 0

        # stack the collected values and keep at most n_samples entries
        return (*map(lambda v: np.stack(v)[:n_samples], vals.values()),)

    envs = gym.vector.AsyncVectorEnv([make_env(env_id, seed, i) for i in range(n_cpu)])

    obs = envs.reset()
    print(sample(envs, 16))


def example_async_sampler(env_name="alr_envs:HoleReacherDetPMP-v1", n_cpu=4):
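    """Sample full rollouts for a batch of parameter vectors (thetas) using the AlrMpEnvSampler helper."""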
    n_samples = 10

    sampler = AlrMpEnvSampler(env_name, num_envs=n_cpu)
    dim = sampler.env.action_space.spaces[0].shape[0]

    thetas = np.random.randn(n_samples, dim)  # usually drawn from a search distribution

    _, rewards, __, ___ = sampler(thetas)

    print(rewards)


def example_async_contextual_sampler(env_name="alr_envs:SimpleReacherDMP-v1", n_cpu=4):
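    """Contextual sampling: parameters are drawn per context from a distribution object with a sample function."""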
    sampler = AlrContextualMpEnvSampler(env_name, num_envs=n_cpu)
    dim = sampler.env.action_space.spaces[0].shape[0]
    dist = DummyDist(dim)  # needs a sample function

    n_samples = 10
    new_samples, new_contexts, obs, new_rewards, done, infos = sampler(dist, n_samples)

    print(new_rewards)


if __name__ == '__main__':
    # example_mujoco()
    example_mp("alr_envs:SimpleReacherDMP-v1")
    # example_async("alr_envs:LongSimpleReacherDMP-v0", 4)
    # example_async_contextual_sampler()
    # env = gym.make("alr_envs:HoleReacherDetPMP-v1")
    # env_name = "alr_envs:ALRBallInACupSimpleDetPMP-v0"
    # example_async_sampler(env_name)
    # example_mp(env_name)