Policy Gradient


Playing the game with a policy network

First, the hyperparameters and the frame preprocessing: each 210x160x3 Pong frame is cropped, downsampled, and binarized into a 6400-dimensional (80x80) float vector.

```python
import time

import gym
import numpy as np
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import InputLayer, DenseLayer

# hyperparameters
image_size = 80
D = image_size * image_size
H = 200
batch_size = 10
learning_rate = 1e-4
gamma = 0.99
decay_rate = 0.99
render = False          # display the game environment
# resume = True         # load existing policy network
model_file_name = "model_pong"
np.set_printoptions(threshold=np.nan)

def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    I = I[35:195]       # crop to the playing field
    I = I[::2, ::2, 0]  # downsample by a factor of 2 and keep one channel
    I[I == 144] = 0     # erase background (type 1)
    I[I == 109] = 0     # erase background (type 2)
    I[I != 0] = 1       # paddles and ball become 1
    return I.astype(np.float).ravel()
```

Using a third-party library such as TensorLayer, we define a policy network. The input is a state, defined as the difference between two consecutive game frames; the output is the probability of each of three actions: up, down, and stay. `cross_entropy_reward_loss` is a cross-entropy loss in which the prediction is the network's current output probabilities and the label is the action that was sampled from those probabilities for that state, with each term weighted by the discounted reward. The parameters are then updated with `sess.run(train_op, feed_dict={t_states: epx, t_actions: epy, t_discount_rewards: disR})` once every `batch_size` episodes.

```python
# environment and bookkeeping (the tutorial plays Atari Pong)
env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None
running_reward = None
reward_sum = 0
episode_number = 0

xs, ys, rs = [], [], []  # observations, fake labels, rewards for training

# observation for training and inference
t_states = tf.placeholder(tf.float32, shape=[None, D])

# policy network
network = InputLayer(t_states, name='input')
network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='hidden')
network = DenseLayer(network, n_units=3, name='output')
probs = network.outputs
sampling_prob = tf.nn.softmax(probs)

t_actions = tf.placeholder(tf.int32, shape=[None])
t_discount_rewards = tf.placeholder(tf.float32, shape=[None])
loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards)
train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)

with tf.Session() as sess:
    tl.layers.initialize_global_variables(sess)
    tl.files.load_and_assign_npz(sess, model_file_name + '.npz', network)
    network.print_params()
    network.print_layers()

    start_time = time.time()
    game_number = 0
    while True:
        if render:
            env.render()

        cur_x = prepro(observation)
        x = cur_x - prev_x if prev_x is not None else np.zeros(D)
        x = x.reshape(1, D)
        prev_x = cur_x

        prob = sess.run(sampling_prob, feed_dict={t_states: x})
        # action. 1: STOP 2: UP 3: DOWN
        # action = np.random.choice([1, 2, 3], p=prob.flatten())
        action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])

        observation, reward, done, _ = env.step(action)
        reward_sum += reward
        xs.append(x)           # all observations in an episode
        ys.append(action - 1)  # all fake labels in an episode (action begins from 1, so minus 1)
        rs.append(reward)      # all rewards in an episode

        if done:
            episode_number += 1
            game_number = 0

            if episode_number % batch_size == 0:
                print('batch over...... updating parameters......')
                epx = np.vstack(xs)
                epy = np.asarray(ys)
                epr = np.asarray(rs)
                disR = tl.rein.discount_episode_rewards(epr, gamma)
                disR -= np.mean(disR)
                disR /= np.std(disR)
                xs, ys, rs = [], [], []
                sess.run(train_op, feed_dict={t_states: epx, t_actions: epy, t_discount_rewards: disR})

            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
            reward_sum = 0
            observation = env.reset()  # reset env
            prev_x = None
```
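To make the loss concrete, here is a minimal sketch in plain TensorFlow 1.x of what a cross-entropy reward loss computes, assuming `logits`, the sampled integer `actions`, and the normalized `discount_rewards` as inputs. The names and function are illustrative, not the exact `tl.rein.cross_entropy_reward_loss` source:

```python
import tensorflow as tf

def cross_entropy_reward_loss_sketch(logits, actions, discount_rewards):
    # per-step surrogate: -log pi(a_t | s_t), weighted by the discounted return,
    # so gradient descent on this sum is a policy-gradient (REINFORCE-style) update
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=actions, logits=logits)
    return tf.reduce_sum(cross_entropy * discount_rewards)
```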
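The discounting step can likewise be sketched in plain NumPy. In Pong every non-zero reward marks the end of a rally, so a common convention (and, to my understanding, the default behaviour of `tl.rein.discount_episode_rewards`) is to reset the running return at those boundaries. Treat this as an illustrative re-implementation under that assumption, not the library's exact code:

```python
import numpy as np

def discount_episode_rewards_sketch(rewards, gamma=0.99):
    """Backward pass computing G_t = r_t + gamma * G_{t+1},
    resetting at non-zero rewards (rally boundaries in Pong)."""
    discounted = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        if rewards[t] != 0:
            running = 0.0  # reset the return at a rally boundary
        running = running * gamma + rewards[t]
        discounted[t] = running
    return discounted
```

The training loop then subtracts the mean and divides by the standard deviation of these returns before feeding them in as `t_discount_rewards`, which keeps the scale of the gradient roughly constant across batches.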
