Resource Description
A PPO (Proximal Policy Optimization) algorithm implemented with TensorFlow. Dependencies: tensorflow-1.4 or later, gym.
Code Snippet and File Information
import tensorflow as tf
import numpy as np
import gym
import copy


class PPO:
    def __init__(self, n_features, n_actions):
        self.n_actions = n_actions
        self.n_features = n_features
        self.learning_rate = 0.0015
        self.sess = tf.Session()
        self.observe = tf.placeholder(tf.float32, [None, self.n_features])
        # Current policy 'pi' (trainable) and the frozen 'old_pi' used for the probability ratio.
        self.v, self.act_prob, self.params = self._build_net('pi', train=True)
        _, self.act_prob_old, self.params_old = self._build_net('old_pi', train=False)
        self._get_loss()
        self.sess.run(tf.global_variables_initializer())

    def _build_net(self, name, train):
        with tf.variable_scope(name):
            initer = tf.initializers.truncated_normal(0.0, 0.1)
            # Value head
            hidden = tf.layers.dense(self.observe, 20, tf.nn.tanh, trainable=train)
            hidden = tf.layers.dense(hidden, 20, tf.nn.tanh, trainable=train)
            v = tf.layers.dense(hidden, 1, activation=None, trainable=train)
            # Policy head
            hidden1 = tf.layers.dense(self.observe, 20, tf.nn.tanh, trainable=train)
            hidden1 = tf.layers.dense(hidden1, 20, tf.nn.tanh, trainable=train)
            hidden1 = tf.layers.dense(hidden1, self.n_actions, tf.nn.tanh, trainable=train)
            act_prob = tf.layers.dense(hidden1, self.n_actions, tf.nn.softmax, trainable=train)
            params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return v, act_prob, params

    def _get_loss(self):
        self.adv = tf.placeholder(tf.float32, [None])
        self.v_next = tf.placeholder(tf.float32, [None])
        self.action = tf.placeholder(tf.int32, [None])
        self.reward = tf.placeholder(tf.float32, [None])
        # One-step TD error for the value function (discount factor 0.95);
        # self.v has shape [None, 1], so squeeze it to match the [None] placeholders.
        td_error = self.reward + 0.95 * self.v_next - tf.squeeze(self.v, axis=1)
        v_loss = tf.reduce_mean(tf.square(td_error))
        # Probability of the taken action under the new and old policies.
        act_encode = tf.one_hot(self.action, self.n_actions)
        prob = tf.reduce_sum(self.act_prob * act_encode, axis=1)
        prob_old = tf.reduce_sum(self.act_prob_old * act_encode, axis=1)
        ratio = tf.exp(tf.log(tf.clip_by_value(prob, 1e-10, 1.0)) - tf.log(tf.clip_by_value(prob_old, 1e-10, 1.0)))
        # Clipped surrogate objective with epsilon = 0.2.
        clip_ratio = tf.clip_by_value(ratio, 1.0 - 0.2, 1.0 + 0.2)
        clip_loss = tf.reduce_mean(tf.minimum(ratio * self.adv, clip_ratio * self.adv))
        # Entropy bonus to encourage exploration.
        entropy_loss = -tf.reduce_mean(tf.reduce_sum(self.act_prob * tf.log(tf.clip_by_value(self.act_prob, 1e-10, 1.0)), axis=1))
        # total_loss is maximized (the optimizer minimizes -total_loss): the surrogate and
        # entropy terms are maximized while the value loss is minimized.
        self.total_loss = clip_loss - v_loss + 0.01 * entropy_loss
        learning_rate = tf.train.exponential_decay(0.0015, 0, 200, 0.95)
        self.train_op = tf.train.AdamOptimizer(learning_rate).minimize(-self.total_loss)
        # Copy the current policy parameters into the old policy.
        self.old_pi_update = [tf.assign(t, e) for t, e in zip(self.params_old, self.params)]

    def learn(self, observe, v_pred, adv, reward, act):
        # The original snippet is truncated here; the remaining feed entries are
        # completed from the placeholders defined in _get_loss.
        loss, _ = self.sess.run([self.total_loss, self.train_op],
                                feed_dict={self.observe: observe, self.v_next: v_pred,
                                           self.adv: adv, self.reward: reward,
                                           self.action: act})
        return loss
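Usage sketch (not part of the original archive): a minimal training loop on gym's CartPole-v0 showing how the PPO class above could be driven. The action sampling, episode buffering, and one-step advantage estimation are assumptions chosen to match the placeholders and the 0.95 discount used in _get_loss; the snippet also assumes the classic gym reset()/step() API (pre-0.26).

import numpy as np
import gym

env = gym.make('CartPole-v0')
ppo = PPO(n_features=env.observation_space.shape[0], n_actions=env.action_space.n)

GAMMA = 0.95  # matches the discount used in _get_loss

for episode in range(500):
    s = env.reset()
    buf_s, buf_a, buf_r = [], [], []
    done = False
    while not done:
        # Sample an action from the current policy.
        prob = ppo.sess.run(ppo.act_prob, feed_dict={ppo.observe: s[None, :]})[0]
        a = np.random.choice(ppo.n_actions, p=prob)
        s_, r, done, _ = env.step(a)
        buf_s.append(s)
        buf_a.append(a)
        buf_r.append(r)
        s = s_

    # Sync the old policy before updating, so the probability ratio starts at 1.
    ppo.sess.run(ppo.old_pi_update)

    obs = np.vstack(buf_s)
    acts = np.array(buf_a)
    rewards = np.array(buf_r, dtype=np.float32)

    # One-step TD advantages, consistent with td_error in _get_loss.
    v = ppo.sess.run(ppo.v, feed_dict={ppo.observe: obs}).flatten()
    obs_next = np.vstack(buf_s[1:] + [s_])
    v_next = ppo.sess.run(ppo.v, feed_dict={ppo.observe: obs_next}).flatten()
    v_next[-1] = 0.0  # no bootstrap value for the terminal state
    adv = rewards + GAMMA * v_next - v

    loss = ppo.learn(obs, v_next, adv, rewards, acts)

A fuller implementation would typically run several learn() epochs per batch of trajectories and normalize the advantages; the single update per episode above is only meant to illustrate the data flow into the placeholders.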