Algorithm description
The script below is a complete PyTorch implementation of DQN (Deep Q-Network) applied to the CartPole balancing game from OpenAI gym. The agent picks actions with an epsilon-greedy policy, stores transitions in an experience replay buffer, and trains an evaluation network against a periodically refreshed target network.
# -*- coding: utf-8 -*-
# import the necessary packages
import torch
import torch.nn as nn
from torch.autograd import Variable  # deprecated in modern PyTorch; plain tensors work the same way
import torch.nn.functional as F
import numpy as np
import gym
# 1. Define some Hyper Parameters
BATCH_SIZE = 32 # batch size of sampling process from buffer
LR = 0.01 # learning rate
EPSILON = 0.9 # epsilon used for epsilon greedy approach
GAMMA = 0.9 # discount factor
TARGET_NETWORK_REPLACE_FREQ = 100 # How frequently the target network updates
MEMORY_CAPACITY = 2000 # The capacity of experience replay buffer
env = gym.make("CartPole-v0") # Use cartpole game as environment
env = env.unwrapped
N_ACTIONS = env.action_space.n # 2 actions
N_STATES = env.observation_space.shape[0] # 4 states
ENV_A_SHAPE = 0 if isinstance(env.action_space.sample(), int) else env.action_space.sample().shape  # 0 for a discrete (int) action, otherwise the shape of an action
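# Note: this script assumes the classic gym API (env.reset() returns just the
# state and env.step() returns 4 values), i.e. gym versions before 0.26.
# A purely illustrative sanity check of the dimensions defined above:
#   print(N_STATES, N_ACTIONS)  # expected output for CartPole: 4 2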
# 2. Define the network used in both target net and the net for training
class Net(nn.Module):
    def __init__(self):
        # Define the network structure, a very simple fully connected network
        super(Net, self).__init__()
        self.fc1 = nn.Linear(N_STATES, 10)    # layer 1
        self.fc1.weight.data.normal_(0, 0.1)  # in-place initialization of the weights of fc1
        self.out = nn.Linear(10, N_ACTIONS)   # layer 2
        self.out.weight.data.normal_(0, 0.1)  # in-place initialization of the weights of out

    def forward(self, x):
        # Define how the input data passes through the network
        x = self.fc1(x)
        x = F.relu(x)
        actions_value = self.out(x)
        return actions_value
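# A minimal usage sketch of Net (illustrative only, not used by the code below):
# a batch containing a single state maps to one Q value per action, and the
# greedy action is the argmax over that row.
#   demo_q = Net()(torch.randn(1, N_STATES))        # shape: (1, N_ACTIONS)
#   greedy_action = torch.max(demo_q, 1)[1].item()  # index of the best action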
# 3. Define the DQN network and its corresponding methods
class DQN(object):
    def __init__(self):
        # -----------Define 2 networks (target and training)------#
        self.eval_net, self.target_net = Net(), Net()
        # Define the counters and the loss function
        self.learn_step_counter = 0  # count the steps of the learning process
        self.memory_counter = 0      # count the transitions stored in the replay buffer
        # ----Define the memory (the replay buffer) and allocate space for it. Each row
        # stores one transition s, a, r, s_, which takes N_STATES * 2 + 2 columns----#
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))
        # ------- Define the optimizer ------#
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        # ------ Define the loss function ------#
        self.loss_func = nn.MSELoss()
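        # A worked example of the buffer layout for CartPole (N_STATES = 4):
        # each row of self.memory holds one flattened transition,
        #   [ s (4 floats) | a (1) | r (1) | s_ (4 floats) ]  ->  4 + 1 + 1 + 4 = 10
        # columns, which is exactly N_STATES * 2 + 2.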
    def choose_action(self, x):
        # This function chooses an action based on an epsilon-greedy policy
        x = torch.unsqueeze(torch.FloatTensor(x), 0)  # add 1 dimension to input state x
        # input only one sample
        if np.random.uniform() < EPSILON:  # greedy: pick the action with the largest Q value
            actions_value = self.eval_net.forward(x)
            action = torch.max(actions_value, 1)[1].data.numpy()
            action = action[0] if ENV_A_SHAPE == 0 else action.reshape(ENV_A_SHAPE)
        else:  # random exploration
            action = np.random.randint(0, N_ACTIONS)
            action = action if ENV_A_SHAPE == 0 else action.reshape(ENV_A_SHAPE)
        return action

    def store_transition(self, s, a, r, s_):
        # This function stores one transition (s, a, r, s_) in the replay buffer
        transition = np.hstack((s, [a, r], s_))
        # if the buffer is full, the new memory overwrites the oldest one
        index = self.memory_counter % MEMORY_CAPACITY
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        # This function performs one learning step: sample a batch of experiences,
        # periodically replace the target network, and back-propagate the TD error

        # update the target network every TARGET_NETWORK_REPLACE_FREQ steps
        if self.learn_step_counter % TARGET_NETWORK_REPLACE_FREQ == 0:
            # assign the parameters of eval_net to target_net
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # randomly sample a batch of experiences from the replay buffer
        sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]
        # split the batch back into s, a, r, s_ and convert them to torch Variables
        b_s = Variable(torch.FloatTensor(b_memory[:, :N_STATES]))
        b_a = Variable(torch.LongTensor(b_memory[:, N_STATES:N_STATES + 1].astype(int)))
        b_r = Variable(torch.FloatTensor(b_memory[:, N_STATES + 1:N_STATES + 2]))
        b_s_ = Variable(torch.FloatTensor(b_memory[:, -N_STATES:]))

        # Q values of the actions that were actually taken, under the training network
        q_eval = self.eval_net(b_s).gather(1, b_a)  # shape (BATCH_SIZE, 1)
        # Q values of the next states under the target network; detach() so that
        # no gradient flows back into target_net
        q_next = self.target_net(b_s_).detach()
        # TD target: r + GAMMA * max_a' Q_target(s_, a')
        q_target = b_r + GAMMA * q_next.max(1)[0].view(BATCH_SIZE, 1)
        loss = self.loss_func(q_eval, q_target)

        self.optimizer.zero_grad()  # reset the gradients to zero
        loss.backward()             # back propagation
        self.optimizer.step()       # update the parameters of eval_net

# 4. Procedures of the DQN algorithm
dqn = DQN()  # create the object of the DQN class
print('\nCollecting experience...')
for i_episode in range(400):  # play 400 episodes of the cartpole game
    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        # take an action based on the current state
        a = dqn.choose_action(s)
        # obtain the reward, the next state and some other information
        s_, r, done, info = env.step(a)

        # reshape the reward so that states closer to the center (small |x|)
        # and more upright (small |theta|) earn more, which speeds up learning
        x, x_dot, theta, theta_dot = s_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        r = r1 + r2

        # store the transition in the replay buffer
        dqn.store_transition(s, a, r, s_)
        ep_r += r
        # once the replay buffer is filled, DQN begins to learn (update its parameters)
        if dqn.memory_counter > MEMORY_CAPACITY:
            dqn.learn()
            if done:
                print('Ep: ', i_episode, ' |', 'Ep_r: ', round(ep_r, 2))
        if done:
            # if the game is over, then skip the while loop.
            break
        # use next state to update the current state.
        s = s_
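To summarize the update rule implemented in learn(): the evaluation network is regressed toward the one-step temporal-difference target q_target = r + GAMMA * max_a' Q_target(s_, a'), where Q_target is the target network. Because target_net is only synchronized with eval_net every TARGET_NETWORK_REPLACE_FREQ learning steps, the regression target stays fixed for a while; together with sampling decorrelated minibatches from the replay buffer, these are the two tricks that stabilize DQN compared with naive Q-learning on a neural network.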
Reference blogs: https://blog.csdn.net/weixin_39274659/article/details/88354638 and https://blog.csdn.net/qq_41871826/article/details/108263919