Resource Description
A PPO (Proximal Policy Optimization) algorithm implemented with TensorFlow. Dependencies: tensorflow-1.4 or later, gym.
Code Snippet and File Information
import tensorflow as tf
import numpy as np
import gym
import copy


class PPO:
    def __init__(self, n_features, n_actions):
        self.n_actions = n_actions
        self.n_features = n_features
        self.learning_rate = 0.0015
        self.sess = tf.Session()
        self.observe = tf.placeholder(tf.float32, [None, self.n_features])
        # Current policy 'pi' (trainable) and the frozen 'old_pi' used for the probability ratio.
        self.v, self.act_prob, self.params = self._build_net('pi', train=True)
        _, self.act_prob_old, self.params_old = self._build_net('old_pi', train=False)
        self._get_loss()
        self.sess.run(tf.global_variables_initializer())

    def _build_net(self, name, train):
        with tf.variable_scope(name):
            initer = tf.initializers.truncated_normal(0.0, 0.1)
            # Value head
            hidden = tf.layers.dense(self.observe, 20, tf.nn.tanh, trainable=train)
            hidden = tf.layers.dense(hidden, 20, tf.nn.tanh, trainable=train)
            v = tf.layers.dense(hidden, 1, activation=None, trainable=train)
            # Policy head
            hidden1 = tf.layers.dense(self.observe, 20, tf.nn.tanh, trainable=train)
            hidden1 = tf.layers.dense(hidden1, 20, tf.nn.tanh, trainable=train)
            hidden1 = tf.layers.dense(hidden1, self.n_actions, tf.nn.tanh, trainable=train)
            act_prob = tf.layers.dense(hidden1, self.n_actions, tf.nn.softmax, trainable=train)
            params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return v, act_prob, params

    def _get_loss(self):
        self.adv = tf.placeholder(tf.float32, [None])
        self.v_next = tf.placeholder(tf.float32, [None])
        self.action = tf.placeholder(tf.int32, [None])
        self.reward = tf.placeholder(tf.float32, [None])
        # One-step TD error for the value function (discount factor 0.95);
        # self.v has shape [None, 1], so squeeze it to match the [None] placeholders.
        td_error = self.reward + 0.95 * self.v_next - tf.squeeze(self.v, axis=1)
        v_loss = tf.reduce_mean(tf.square(td_error))
        # Probability of the taken action under the new and old policies.
        act_encode = tf.one_hot(self.action, self.n_actions)
        prob = tf.reduce_sum(self.act_prob * act_encode, axis=1)
        prob_old = tf.reduce_sum(self.act_prob_old * act_encode, axis=1)
        ratio = tf.exp(tf.log(tf.clip_by_value(prob, 1e-10, 1.0)) - tf.log(tf.clip_by_value(prob_old, 1e-10, 1.0)))
        # Clipped surrogate objective with epsilon = 0.2.
        clip_ratio = tf.clip_by_value(ratio, 1.0 - 0.2, 1.0 + 0.2)
        clip_loss = tf.reduce_mean(tf.minimum(ratio * self.adv, clip_ratio * self.adv))
        # Entropy bonus to encourage exploration.
        entropy_loss = -tf.reduce_mean(tf.reduce_sum(self.act_prob * tf.log(tf.clip_by_value(self.act_prob, 1e-10, 1.0)), axis=1))
        # total_loss is maximized (the optimizer minimizes -total_loss): the surrogate and
        # entropy terms are maximized while the value loss is minimized.
        self.total_loss = clip_loss - v_loss + 0.01 * entropy_loss
        learning_rate = tf.train.exponential_decay(0.0015, 0, 200, 0.95)
        self.train_op = tf.train.AdamOptimizer(learning_rate).minimize(-self.total_loss)
        # Copy the current policy parameters into the old policy.
        self.old_pi_update = [tf.assign(t, e) for t, e in zip(self.params_old, self.params)]

    def learn(self, observe, v_pred, adv, reward, act):
        # The original snippet is truncated here; the remaining feed entries are
        # completed from the placeholders defined in _get_loss.
        loss, _ = self.sess.run([self.total_loss, self.train_op],
                                feed_dict={self.observe: observe, self.v_next: v_pred,
                                           self.adv: adv, self.reward: reward,
                                           self.action: act})
        return loss
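Usage sketch (not part of the original archive): a minimal training loop on gym's CartPole-v0 showing how the PPO class above could be driven. The action sampling, episode buffering, and one-step advantage estimation are assumptions chosen to match the placeholders and the 0.95 discount used in _get_loss; the snippet also assumes the classic gym reset()/step() API (pre-0.26).

import numpy as np
import gym

env = gym.make('CartPole-v0')
ppo = PPO(n_features=env.observation_space.shape[0], n_actions=env.action_space.n)

GAMMA = 0.95  # matches the discount used in _get_loss

for episode in range(500):
    s = env.reset()
    buf_s, buf_a, buf_r = [], [], []
    done = False
    while not done:
        # Sample an action from the current policy.
        prob = ppo.sess.run(ppo.act_prob, feed_dict={ppo.observe: s[None, :]})[0]
        a = np.random.choice(ppo.n_actions, p=prob)
        s_, r, done, _ = env.step(a)
        buf_s.append(s)
        buf_a.append(a)
        buf_r.append(r)
        s = s_

    # Sync the old policy before updating, so the probability ratio starts at 1.
    ppo.sess.run(ppo.old_pi_update)

    obs = np.vstack(buf_s)
    acts = np.array(buf_a)
    rewards = np.array(buf_r, dtype=np.float32)

    # One-step TD advantages, consistent with td_error in _get_loss.
    v = ppo.sess.run(ppo.v, feed_dict={ppo.observe: obs}).flatten()
    obs_next = np.vstack(buf_s[1:] + [s_])
    v_next = ppo.sess.run(ppo.v, feed_dict={ppo.observe: obs_next}).flatten()
    v_next[-1] = 0.0  # no bootstrap value for the terminal state
    adv = rewards + GAMMA * v_next - v

    loss = ppo.learn(obs, v_next, adv, rewards, acts)

A fuller implementation would typically run several learn() epochs per batch of trajectories and normalize the advantages; the single update per episode above is only meant to illustrate the data flow into the placeholders.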