深度强化学习-D3QN算法原理与代码
Dueling Double Deep Q Network(D3QN)算法结合了Double DQN和Dueling DQN算法的思想,进一步提升了算法的性能。如果对Double DQN和Dueling DQN算法还不太了解的话,可以参考我的这两篇博文:深度强化学习-Double DQN算法原理与代码和深度强化学习-Dueling DQN算法原理与代码,分别详细讲述了这两个算法的原理以及代码实现。本文就带领大家了解一下D3QN算法,代码链接见下方。
代码:https://github.com/indigoLovee/D3QN
喜欢的话可以点个star呢。
1 D3QN算法简介
Dueling Double Deep Q Network(D3QN)算法是在Dueling DQN算法的基础上融入了Double DQN算法的思想,它与Dueling DQN算法唯一的区别在于计算目标值的方式。在Dueling DQN算法中,目标值的计算方式为
$$y = r + \gamma \max_{a'} Q(s', a'; \theta^-)$$
即利用目标网络获取状态 $s'$ 下所有动作的动作价值,然后基于最优动作价值计算目标值。由于这里的最大化操作,导致算法存在“过估计”问题,影响决策的准确性。其中 $\theta^-$ 表示目标网络参数。
在D3QN算法中,目标值的计算方式为
$$y = r + \gamma Q\big(s', \arg\max_{a'} Q(s', a'; \theta); \theta^-\big)$$
即利用评估网络获取状态 $s'$ 下最优动作价值对应的动作,然后利用目标网络计算该动作的动作价值,从而得到目标值。通过两个网络的交互,有效避免了算法的“过估计”问题。其中 $\theta$ 和 $\theta^-$ 分别表示评估网络和目标网络的参数。
这其实就是D3QN算法的核心所在啦,如果已经熟悉Dueling DQN和Double DQN算法的话,这个算法其实是非常容易理解的。
2 D3QN算法代码
经验回放采用集中式均匀回放,代码如下(脚本buffer.py):
import numpy as np
class ReplayBuffer:
    """Fixed-capacity circular replay buffer with uniform random sampling.

    Transitions are written into pre-allocated NumPy arrays. Once
    ``max_size`` transitions have been stored, the oldest entries are
    overwritten.
    """

    def __init__(self, state_dim, action_dim, max_size, batch_size):
        """Allocate storage for ``max_size`` transitions.

        Args:
            state_dim: Length of a single state vector.
            action_dim: Accepted for interface compatibility; not used here
                because each action is stored as a single scalar.
            max_size: Capacity of the buffer (number of transitions).
            batch_size: Number of transitions returned by ``sample_buffer``.
        """
        self.mem_size = max_size
        self.batch_size = batch_size
        self.mem_cnt = 0  # total number of transitions ever stored

        self.state_memory = np.zeros((self.mem_size, state_dim))
        self.action_memory = np.zeros((self.mem_size, ))
        self.reward_memory = np.zeros((self.mem_size, ))
        self.next_state_memory = np.zeros((self.mem_size, state_dim))
        # The ``np.bool`` alias was removed in NumPy 1.24; the builtin
        # ``bool`` works on every NumPy release.
        self.terminal_memory = np.zeros((self.mem_size, ), dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        """Store one transition, overwriting the oldest slot when full."""
        mem_idx = self.mem_cnt % self.mem_size

        self.state_memory[mem_idx] = state
        self.action_memory[mem_idx] = action
        self.reward_memory[mem_idx] = reward
        self.next_state_memory[mem_idx] = state_
        self.terminal_memory[mem_idx] = done

        self.mem_cnt += 1

    def sample_buffer(self):
        """Return a uniformly sampled batch of distinct transitions.

        Returns:
            Tuple ``(states, actions, rewards, next_states, terminals)`` of
            NumPy arrays whose first dimension is ``batch_size``.
        """
        # Only the filled part of the buffer is eligible for sampling.
        mem_len = min(self.mem_size, self.mem_cnt)
        batch = np.random.choice(mem_len, self.batch_size, replace=False)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.next_state_memory[batch]
        terminals = self.terminal_memory[batch]

        return states, actions, rewards, states_, terminals

    def ready(self):
        """Return True once more than ``batch_size`` transitions were stored."""
        return self.mem_cnt > self.batch_size
目标网络的更新方式为软更新,D3QN算法的实现代码如下(脚本D3QN.py):
import torch as T
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from buffer import ReplayBuffer
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = T.device("cuda:0") if T.cuda.is_available() else T.device("cpu")
class DuelingDeepQNetwork(nn.Module):
    """Dueling Q-network: a shared MLP trunk with value and advantage heads.

    The two heads are combined as ``Q = V + A - mean(A)``.
    """

    def __init__(self, alpha, state_dim, action_dim, fc1_dim, fc2_dim):
        """Build the layers and the network's own Adam optimizer.

        Args:
            alpha: Learning rate of the Adam optimizer.
            state_dim: Size of the input state vector.
            action_dim: Number of discrete actions (width of the A head).
            fc1_dim: Width of the first hidden layer.
            fc2_dim: Width of the second hidden layer.
        """
        super(DuelingDeepQNetwork, self).__init__()

        self.fc1 = nn.Linear(state_dim, fc1_dim)
        self.fc2 = nn.Linear(fc1_dim, fc2_dim)
        self.V = nn.Linear(fc2_dim, 1)
        self.A = nn.Linear(fc2_dim, action_dim)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.to(device)

    def forward(self, state):
        """Return the Q-values of every action for ``state``."""
        x = T.relu(self.fc1(state))
        x = T.relu(self.fc2(x))

        V = self.V(x)
        A = self.A(x)
        # Subtracting the mean advantage makes the V/A split identifiable.
        Q = V + A - T.mean(A, dim=-1, keepdim=True)

        return Q

    def save_checkpoint(self, checkpoint_file):
        """Write the network weights to ``checkpoint_file``."""
        T.save(self.state_dict(), checkpoint_file)

    def load_checkpoint(self, checkpoint_file):
        """Load weights from ``checkpoint_file`` onto this network's device."""
        # Without map_location, a checkpoint saved on a GPU cannot be
        # restored on a CPU-only machine; remap to wherever this model lives.
        target_device = next(self.parameters()).device
        self.load_state_dict(T.load(checkpoint_file, map_location=target_device))
class D3QN:
    """Dueling Double DQN agent.

    Holds an evaluation network trained by gradient descent, a target
    network that tracks it through soft (Polyak) updates, a replay buffer,
    and an epsilon-greedy exploration schedule.
    """

    def __init__(self, alpha, state_dim, action_dim, fc1_dim, fc2_dim, ckpt_dir,
                 gamma=0.99, tau=0.005, epsilon=1.0, eps_end=0.01, eps_dec=5e-7,
                 max_size=1000000, batch_size=256):
        """Create the networks and the replay buffer.

        Args:
            alpha: Learning rate handed to both networks.
            state_dim: Size of the state vector.
            action_dim: Number of discrete actions.
            fc1_dim: Width of the first hidden layer.
            fc2_dim: Width of the second hidden layer.
            ckpt_dir: Directory prefix for checkpoints. It is concatenated
                as-is, so it must end with a path separator.
            gamma: Discount factor.
            tau: Soft-update coefficient for the target network.
            epsilon: Initial exploration probability.
            eps_end: Lower bound of the exploration probability.
            eps_dec: Amount subtracted from epsilon after each learning step.
            max_size: Replay buffer capacity.
            batch_size: Mini-batch size used in ``learn``.
        """
        self.gamma = gamma
        self.tau = tau
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.batch_size = batch_size
        self.checkpoint_dir = ckpt_dir
        self.action_space = [i for i in range(action_dim)]

        self.q_eval = DuelingDeepQNetwork(alpha=alpha, state_dim=state_dim, action_dim=action_dim,
                                          fc1_dim=fc1_dim, fc2_dim=fc2_dim)
        self.q_target = DuelingDeepQNetwork(alpha=alpha, state_dim=state_dim, action_dim=action_dim,
                                            fc1_dim=fc1_dim, fc2_dim=fc2_dim)

        self.memory = ReplayBuffer(state_dim=state_dim, action_dim=action_dim,
                                   max_size=max_size, batch_size=batch_size)

        # tau=1.0 is a hard copy: both networks start from identical weights.
        self.update_network_parameters(tau=1.0)

    def update_network_parameters(self, tau=None):
        """Soft-update the target network toward the evaluation network.

        Args:
            tau: Interpolation weight of the evaluation parameters; falls
                back to ``self.tau`` when omitted.
        """
        if tau is None:
            tau = self.tau

        # The update is pure bookkeeping; keep it out of the autograd graph.
        with T.no_grad():
            for q_target_params, q_eval_params in zip(self.q_target.parameters(),
                                                      self.q_eval.parameters()):
                q_target_params.copy_(tau * q_eval_params + (1 - tau) * q_target_params)

    def remember(self, state, action, reward, state_, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, state_, done)

    def decrement_epsilon(self):
        """Decrease epsilon by ``eps_dec`` without going below ``eps_min``."""
        # Clamping avoids the last step undershooting the floor
        # (e.g. 0.0503 - 0.0005 with a floor of 0.05).
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

    def choose_action(self, observation, isTrain=True):
        """Pick an action epsilon-greedily (purely greedy when not training).

        Args:
            observation: Current state vector.
            isTrain: When False, exploration is disabled.

        Returns:
            Index of the chosen action as a Python int.
        """
        # The random draw is made unconditionally, keeping the NumPy random
        # stream independent of ``isTrain``.
        if (np.random.random() < self.epsilon) and isTrain:
            # Exploring: no need to evaluate the network at all.
            return int(np.random.choice(self.action_space))

        # Converting through a single ndarray avoids the slow
        # list-of-arrays tensor construction path.
        state = T.tensor(np.asarray(observation), dtype=T.float).unsqueeze(0).to(device)
        with T.no_grad():
            q_vals = self.q_eval.forward(state)

        return T.argmax(q_vals).item()

    def learn(self):
        """Run one Double-DQN gradient step on a sampled mini-batch.

        Does nothing until the replay buffer reports it is ready. After the
        gradient step the target network is soft-updated and epsilon decays.
        """
        if not self.memory.ready():
            return

        states, actions, rewards, next_states, terminals = self.memory.sample_buffer()
        batch_idx = T.arange(self.batch_size, dtype=T.long).to(device)
        states_tensor = T.tensor(states, dtype=T.float).to(device)
        actions_tensor = T.tensor(actions, dtype=T.long).to(device)
        rewards_tensor = T.tensor(rewards, dtype=T.float).to(device)
        next_states_tensor = T.tensor(next_states, dtype=T.float).to(device)
        terminals_tensor = T.tensor(terminals).to(device)

        with T.no_grad():
            # Double DQN: the evaluation network selects the next action,
            # the target network evaluates it.
            q_next = self.q_target.forward(next_states_tensor)
            max_actions = T.argmax(self.q_eval.forward(next_states_tensor), dim=-1)
            # Terminal states have no future return.
            q_next[terminals_tensor] = 0.0
            target = rewards_tensor + self.gamma * q_next[batch_idx, max_actions]

        q = self.q_eval.forward(states_tensor)[batch_idx, actions_tensor]

        loss = F.mse_loss(q, target)
        self.q_eval.optimizer.zero_grad()
        loss.backward()
        self.q_eval.optimizer.step()

        self.update_network_parameters()
        self.decrement_epsilon()

    def save_models(self, episode):
        """Save both networks, tagging the file names with ``episode``."""
        self.q_eval.save_checkpoint(self.checkpoint_dir + 'Q_eval/D3QN_q_eval_{}.pth'.format(episode))
        print('Saving Q_eval network successfully!')
        self.q_target.save_checkpoint(self.checkpoint_dir + 'Q_target/D3QN_Q_target_{}.pth'.format(episode))
        print('Saving Q_target network successfully!')

    def load_models(self, episode):
        """Load both networks from the checkpoints saved for ``episode``."""
        self.q_eval.load_checkpoint(self.checkpoint_dir + 'Q_eval/D3QN_q_eval_{}.pth'.format(episode))
        print('Loading Q_eval network successfully!')
        self.q_target.load_checkpoint(self.checkpoint_dir + 'Q_target/D3QN_Q_target_{}.pth'.format(episode))
        print('Loading Q_target network successfully!')
算法仿真环境为gym库中的LunarLander-v2,因此需要先配置好gym库。进入Anaconda3中对应的Python环境中,执行下面的指令
pip install gym
但是,这样安装的gym库只包括少量的内置环境,如算法环境、简单文字游戏和经典控制环境,无法使用LunarLander-v2。因此还需要安装一些其他依赖项,具体可以参考我的这篇博文:AttributeError: module ‘gym.envs.box2d‘ has no attribute ‘LunarLander‘ 解决办法
让智能体在环境中训练500轮,训练代码如下(脚本train.py):
import gym
import numpy as np
import argparse
from utils import create_directory, plot_learning_curve
from D3QN import D3QN
# Command-line options for the training run. They are parsed at import time
# and exposed through the module-level ``args`` namespace.
parser = argparse.ArgumentParser()
# Number of training episodes.
parser.add_argument(
    "--max_episodes",
    default=500,
    type=int,
)
# Directory prefix under which network checkpoints are written.
parser.add_argument(
    "--ckpt_dir",
    default="./checkpoints/D3QN/",
    type=str,
)
# Output file of the average-reward curve.
parser.add_argument(
    "--reward_path",
    default="./output_images/reward.png",
    type=str,
)
# Output file of the epsilon curve.
parser.add_argument(
    "--epsilon_path",
    default="./output_images/epsilon.png",
    type=str,
)

args = parser.parse_args()
def main():
    """Train a D3QN agent on LunarLander-v2 and plot the learning curves.

    Runs ``args.max_episodes`` episodes, saves both networks every 50
    episodes, and finally writes the average-reward and epsilon curves to
    ``args.reward_path`` and ``args.epsilon_path``.
    """
    import os  # local import: this script's import block does not provide os

    env = gym.make('LunarLander-v2')
    agent = D3QN(alpha=0.0003, state_dim=env.observation_space.shape[0], action_dim=env.action_space.n,
                 fc1_dim=256, fc2_dim=256, ckpt_dir=args.ckpt_dir, gamma=0.99, tau=0.005, epsilon=1.0,
                 eps_end=0.05, eps_dec=5e-4, max_size=1000000, batch_size=256)
    create_directory(args.ckpt_dir, sub_dirs=['Q_eval', 'Q_target'])

    # Create the figure folders up front so that a missing directory cannot
    # discard the results after the whole training run has finished.
    for figure_path in (args.reward_path, args.epsilon_path):
        figure_dir = os.path.dirname(figure_path)
        if figure_dir:
            os.makedirs(figure_dir, exist_ok=True)

    total_rewards, avg_rewards, epsilon_history = [], [], []

    for episode in range(args.max_episodes):
        total_reward = 0
        done = False
        reset_result = env.reset()
        # gym >= 0.26 returns (observation, info); older releases return the
        # observation alone.
        observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result

        while not done:
            action = agent.choose_action(observation, isTrain=True)
            step_result = env.step(action)
            if len(step_result) == 5:
                # gym >= 0.26: (obs, reward, terminated, truncated, info).
                observation_, reward, terminated, truncated, info = step_result
                done = terminated or truncated
            else:
                # Older gym: (obs, reward, done, info).
                observation_, reward, done, info = step_result
            agent.remember(observation, action, reward, observation_, done)
            agent.learn()
            total_reward += reward
            observation = observation_

        total_rewards.append(total_reward)
        # Moving average over the most recent 100 episodes.
        avg_reward = np.mean(total_rewards[-100:])
        avg_rewards.append(avg_reward)
        epsilon_history.append(agent.epsilon)
        print('EP:{} Reward:{} Avg_reward:{} Epsilon:{}'.
              format(episode+1, total_reward, avg_reward, agent.epsilon))

        if (episode + 1) % 50 == 0:
            agent.save_models(episode+1)

    episodes = list(range(1, args.max_episodes + 1))
    plot_learning_curve(episodes, avg_rewards, title='Reward', ylabel='reward',
                        figure_file=args.reward_path)
    plot_learning_curve(episodes, epsilon_history, title='Epsilon', ylabel='epsilon',
                        figure_file=args.epsilon_path)


if __name__ == '__main__':
    main()
训练时还会用到画图函数和创建文件夹函数,它们均放置在utils.py脚本中,具体代码如下:
import os
import matplotlib.pyplot as plt
def create_directory(path: str, sub_dirs: list):
    """Create each of ``sub_dirs`` under ``path`` and report the outcome.

    Args:
        path: Parent directory prefix. It is concatenated with each
            sub-directory name as-is, so it must end with a path separator.
        sub_dirs: Names of the sub-directories to create.
    """
    for sub_dir in sub_dirs:
        target = path + sub_dir
        if os.path.exists(target):
            # Leading space keeps the message from running into the path.
            print(target + ' is already exist!')
        else:
            os.makedirs(target, exist_ok=True)
            print(target + ' create successfully!')
def plot_learning_curve(episodes, records, title, ylabel, figure_file):
    """Plot ``records`` against ``episodes``, save the figure, then show it.

    Args:
        episodes: X-axis values (episode numbers).
        records: Y-axis values, one per episode.
        title: Figure title.
        ylabel: Label of the y-axis.
        figure_file: Path of the image file to write.
    """
    plt.figure()
    plt.plot(episodes, records, linestyle='-', color='r')
    plt.title(title)
    plt.xlabel('episode')
    plt.ylabel(ylabel)

    # Make sure the destination folder exists before writing the image.
    figure_dir = os.path.dirname(figure_file)
    if figure_dir:
        os.makedirs(figure_dir, exist_ok=True)

    # Save BEFORE show(): once the interactive window is closed the figure
    # is gone, and a later savefig() would write an empty image.
    plt.savefig(figure_file)
    plt.show()
仿真结果如下图所示:
平均累积奖励曲线
epsilon变化曲线
通过平均累积奖励可以看出,D3QN算法大约在300轮(episode)左右时趋于收敛。
更多推荐
所有评论(0)