diff --git a/test.py b/test.py
index 27cbdb5..5042e33 100755
--- a/test.py
+++ b/test.py
@@ -7,11 +7,12 @@ import time
 import datetime
 
 from stable_baselines3.common.evaluation import evaluate_policy
-from stable_baselines3.common.policies import ActorCriticCnnPolicy, ActorCriticPolicy, MultiInputActorCriticPolicy
+from metastable_baselines.distributions.distributions import get_legal_setups
 
 from metastable_baselines.ppo import PPO
-# from metastable_baselines.sac import SAC
-from metastable_baselines.ppo.policies import MlpPolicy
+from metastable_baselines.sac import SAC
+from metastable_baselines.ppo.policies import MlpPolicy as MlpPolicyPPO
+from metastable_baselines.sac.policies import MlpPolicy as MlpPolicySAC
 from metastable_baselines.projections import BaseProjectionLayer, FrobeniusProjectionLayer, WassersteinProjectionLayer, KLProjectionLayer
 import columbus
 
@@ -20,11 +21,11 @@ from metastable_baselines.distributions import Strength, ParametrizationType, En
 root_path = '.'
 
 
-def main(env_name='ColumbusCandyland_Aux10-v0', timesteps=2_000_000, showRes=True, saveModel=True, n_eval_episodes=0):
+def main(env_name='ColumbusCandyland_Aux10-v0', timesteps=1_000_000, showRes=True, saveModel=True, n_eval_episodes=0):
     env = gym.make(env_name)
     use_sde = False
     ppo = PPO(
-        MlpPolicy,
+        MlpPolicyPPO,
         env,
         projection=FrobeniusProjectionLayer(),
         policy_kwargs={'dist_kwargs': {'neural_strength': Strength.FULL, 'cov_strength': Strength.FULL, 'parameterization_type':
@@ -39,7 +40,7 @@ def main(env_name='ColumbusCandyland_Aux10-v0', timesteps=2_000_000, showRes=Tru
         ent_coef=0.02,  # 0.1
         vf_coef=0.5,
         use_sde=use_sde,  # False
-        clip_range=0.2,
+        clip_range=1  # 0.2,
     )
     # trl_frob = PPO(
     #     MlpPolicy,
@@ -66,6 +67,47 @@ def main(env_name='ColumbusCandyland_Aux10-v0', timesteps=2_000_000, showRes=Tru
     #            saveModel, n_eval_episodes)
 
 
+def full(env_name='ColumbusCandyland_Aux10-v0', timesteps=35_000, saveModel=True, n_eval_episodes=4):
+    env = gym.make(env_name)
+    use_sde = False
+    skip_num = 8  # 10 (/ start at index)
+    sac = True
+    Model = [PPO, SAC][sac]
+    Policy = [MlpPolicyPPO, MlpPolicySAC][sac]
+    #projection = FrobeniusProjectionLayer()
+    projection = BaseProjectionLayer()
+
+    gen = enumerate(get_legal_setups(
+        allowedEPTs=[EnforcePositiveType.SOFTPLUS, EnforcePositiveType.ABS]))
+    for i in range(skip_num):
+        gen.__next__()
+    for i, setup in gen:
+        (ps, cs, ept, pt) = setup
+        print('{'+str(i)+'}: '+str(setup))
+        model = Model(
+            Policy,
+            env,
+            # projection=projection,
+            policy_kwargs={'dist_kwargs': {'neural_strength': ps, 'cov_strength': cs, 'parameterization_type':
+                                           pt, 'enforce_positive_type': ept, 'prob_squashing_type': ProbSquashingType.NONE}},
+            verbose=0,
+            tensorboard_log=root_path+"/logs_tb/" +
+            env_name+"/"+['ppo', 'sac'][sac]+"_" +
+            ("_".join([str(s) for s in setup])+['', '_sde'][use_sde])+"/",
+            # learning_rate=3e-4,
+            # gamma=0.99,
+            # gae_lambda=0.95,
+            # normalize_advantage=True,
+            # ent_coef=0.02,  # 0.1
+            # vf_coef=0.5,
+            use_sde=use_sde,  # False
+            # clip_range=1  # 0.2,
+        )
+
+        testModel(model, timesteps, False,
+                  saveModel, n_eval_episodes)
+
+
 def testModel(model, timesteps, showRes=False, saveModel=False, n_eval_episodes=16):
     env = model.get_env()
     try:
@@ -108,5 +150,7 @@ def testModel(model, timesteps, showRes=False, saveModel=False, n_eval_episodes=
 if __name__ == '__main__':
     # main('LunarLanderContinuous-v2')
     # main('ColumbusJustState-v0')
-    main('ColumbusStateWithBarriers-v0')
-    # main('ColumbusEasierObstacles-v0')
+    # main('ColumbusStateWithBarriers-v0')
+    # full('ColumbusEasierObstacles-v0')
+    # full('ColumbusStateWithBarriers-v0')
+    full('LunarLanderContinuous-v2')
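
Note on the skip-ahead in the new full(): the manual loop that calls gen.__next__() skip_num times can be expressed with itertools.islice, which drops the first skip_num setups while keeping the original enumerate indices. A minimal self-contained sketch; the setups() generator below is a hypothetical stand-in for get_legal_setups(...) from metastable_baselines:

from itertools import islice

def setups():
    # Hypothetical stand-in for get_legal_setups(); yields dummy setups.
    yield from ['setup_a', 'setup_b', 'setup_c', 'setup_d']

skip_num = 2  # resume the sweep at this index
for i, setup in islice(enumerate(setups()), skip_num, None):
    print('{'+str(i)+'}: '+str(setup))  # prints {2}: setup_c, then {3}: setup_d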