diff --git a/examples/reinforcement_learning/tutorial_AC.py b/examples/reinforcement_learning/tutorial_AC.py index ce5a4679c..c497e714a 100644 --- a/examples/reinforcement_learning/tutorial_AC.py +++ b/examples/reinforcement_learning/tutorial_AC.py @@ -46,11 +46,11 @@ """ import argparse -import os import time +import matplotlib.pyplot as plt +import os import gym -import matplotlib.pyplot as plt import numpy as np import tensorflow as tf @@ -78,6 +78,8 @@ LR_A = 0.001 # learning rate for actor LR_C = 0.01 # learning rate for critic + + ############################### Actor-Critic #################################### @@ -137,12 +139,13 @@ def __init__(self, state_dim, lr=0.01): self.optimizer = tf.optimizers.Adam(lr) - def learn(self, state, reward, state_): + def learn(self, state, reward, state_, done): + d = 0 if done else 1 v_ = self.model(np.array([state_])) with tf.GradientTape() as tape: v = self.model(np.array([state])) - ## TD_error = r + lambda * V(newS) - V(S) - td_error = reward + LAM * v_ - v + ## TD_error = r + d * lambda * V(newS) - V(S) + td_error = reward + d * LAM * v_ - v loss = tf.square(td_error) grad = tape.gradient(loss, self.model.trainable_weights) self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) @@ -203,7 +206,7 @@ def load(self): # load trained weights state_new, reward, done, info = env.step(action) state_new = state_new.astype(np.float32) - if done: reward = -20 # reward shaping trick + if done: reward = -20 # reward shaping trick # these may helpful in some tasks # if abs(s_new[0]) >= env.observation_space.high[0]: # # cart moves more than 2.4 units from the center @@ -215,7 +218,7 @@ def load(self): # load trained weights try: td_error = critic.learn( - state, reward, state_new + state, reward, state_new, done ) # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)] actor.learn(state, action, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() @@ -238,7 +241,7 @@ def load(self): # load trained weights # Early Stopping for quick check if step >= MAX_STEPS: - print("Early Stopping") # Hao Dong: it is important for this task + print("Early Stopping") # Hao Dong: it is important for this task break actor.save() critic.save() diff --git a/examples/reinforcement_learning/tutorial_DPPO.py b/examples/reinforcement_learning/tutorial_DPPO.py index 434599494..dbfd78db5 100644 --- a/examples/reinforcement_learning/tutorial_DPPO.py +++ b/examples/reinforcement_learning/tutorial_DPPO.py @@ -37,8 +37,8 @@ import matplotlib.pyplot as plt import numpy as np import tensorflow as tf - import tensorflow_probability as tfp + import tensorlayer as tl parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') @@ -73,6 +73,7 @@ # ppo-clip parameters EPSILON = 0.2 + ############################### DPPO #################################### @@ -282,7 +283,10 @@ def work(self): GLOBAL_UPDATE_COUNTER += 1 # count to minimum batch size, no need to wait other workers if t == MAX_STEPS - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE: # finish patyh - v_s_ = self.ppo.critic(np.array([s_], np.float32))[0][0] + if done: + v_s_ = 0 + else: + v_s_ = self.ppo.critic(np.array([s_], np.float32))[0][0] discounted_r = [] # compute discounted reward for r in buffer_r[::-1]: v_s_ = r + GAMMA * v_s_ @@ -304,8 +308,7 @@ def work(self): print( 'Training | Episode: {}/{} | Worker: {} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( - GLOBAL_EP + 1, TRAIN_EPISODES, self.wid, ep_r, - time.time() - T0 + GLOBAL_EP + 1, TRAIN_EPISODES, self.wid, ep_r, time.time() - T0 ) ) # record reward changes, plot later @@ -372,6 +375,4 @@ def work(self): print( 'Testing | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( episode + 1, TEST_EPISODES, episode_reward, - time.time() - T0 - ) - ) + time.time() - T0)) diff --git a/examples/reinforcement_learning/tutorial_PPO.py b/examples/reinforcement_learning/tutorial_PPO.py index 40ad06cdc..82d20d2e3 100644 --- a/examples/reinforcement_learning/tutorial_PPO.py +++ b/examples/reinforcement_learning/tutorial_PPO.py @@ -30,8 +30,8 @@ import matplotlib.pyplot as plt import numpy as np import tensorflow as tf - import tensorflow_probability as tfp + import tensorlayer as tl parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') @@ -63,6 +63,7 @@ # ppo-clip parameters EPSILON = 0.2 + ############################### PPO #################################### @@ -70,7 +71,6 @@ class PPO(object): """ PPO class """ - def __init__(self, state_dim, action_dim, action_bound, method='clip'): # critic with tf.name_scope('critic'): @@ -233,13 +233,16 @@ def store_transition(self, state, action, reward): self.action_buffer.append(action) self.reward_buffer.append(reward) - def finish_path(self, next_state): + def finish_path(self, next_state, done): """ Calculate cumulative reward :param next_state: :return: None """ - v_s_ = self.critic(np.array([next_state], np.float32))[0, 0] + if done: + v_s_ = 0 + else: + v_s_ = self.critic(np.array([next_state], np.float32))[0, 0] discounted_r = [] for r in self.reward_buffer[::-1]: v_s_ = r + GAMMA * v_s_ @@ -280,17 +283,15 @@ def finish_path(self, next_state): episode_reward += reward # update ppo - if (step + 1) % BATCH_SIZE == 0: - agent.finish_path(state_) + if len(agent.state_buffer) >= BATCH_SIZE: + agent.finish_path(state_, done) agent.update() if done: break - agent.finish_path(state_) + agent.finish_path(state_, done) print( 'Training | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( - episode + 1, TRAIN_EPISODES, episode_reward, - time.time() - t0 - ) + episode + 1, TRAIN_EPISODES, episode_reward, time.time() - t0) ) if episode == 0: all_episode_reward.append(episode_reward) @@ -318,6 +319,4 @@ def finish_path(self, next_state): print( 'Testing | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( episode + 1, TEST_EPISODES, episode_reward, - time.time() - t0 - ) - ) + time.time() - t0)) diff --git a/examples/reinforcement_learning/tutorial_SAC.py b/examples/reinforcement_learning/tutorial_SAC.py index f5aef5cb9..ef9b28d44 100644 --- a/examples/reinforcement_learning/tutorial_SAC.py +++ b/examples/reinforcement_learning/tutorial_SAC.py @@ -185,7 +185,7 @@ def evaluate(self, state, epsilon=1e-6): std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow normal = Normal(0, 1) - z = normal.sample() + z = normal.sample(mean.shape) action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick action = self.action_range * action_0 # according to original paper, with an extra last term for normalizing different action range @@ -204,7 +204,7 @@ def get_action(self, state, greedy=False): std = tf.math.exp(log_std) normal = Normal(0, 1) - z = normal.sample() + z = normal.sample(mean.shape) action = self.action_range * tf.math.tanh( mean + std * z ) # TanhNormal distribution as actions; reparameterization trick