Sure! Here's an example of how to modify the code to have 10 possible actions:
```python
import gym
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

# Run on GPU when available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Actor network: maps a state to a probability distribution over the actions
class Actor(nn.Module):
    def __init__(self, state_dim, num_actions, hidden_size):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_actions)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # softmax over the last dimension so unbatched states work too
        x = F.softmax(self.fc3(x), dim=-1)
        return x

# Critic network
class Critic(nn.Module):
    def __init__(self, state_dim, hidden_size):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class ActorCriticAgent:
    def __init__(self, state_dim, num_actions=10, hidden_size=128, lr=0.001, gamma=0.99):
        self.actor = Actor(state_dim, num_actions, hidden_size).to(device)
        self.critic = Critic(state_dim, hidden_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)
        self.gamma = gamma
        self.num_actions = num_actions
        self.steps_done = 0

    def get_action(self, state, eps):
        state = torch.FloatTensor(state).to(device)
        # wrap the actor's action probabilities in a Categorical distribution
        dist = Categorical(self.actor(state))
        if eps is None:
            action = dist.sample()
        else:
            # gradually reduce exploration from the initial eps to 0.01 over 10000 steps
            eps_threshold = 0.01
            eps = eps_threshold + (eps - eps_threshold) * \
                math.exp(-1. * self.steps_done / 10000)
            if random.random() > eps:
                action = dist.sample()
            else:
                action = torch.tensor(random.randrange(self.num_actions),
                                      device=device, dtype=torch.long)
            self.steps_done += 1
        return action.item()

    def update(self, state, action, reward, next_state, done):
        state = torch.FloatTensor(state).to(device)
        next_state = torch.FloatTensor(next_state).to(device)
        action = torch.tensor([action], device=device)
        reward = torch.tensor([reward], device=device, dtype=torch.float)
        done = torch.tensor([done], device=device, dtype=torch.float)

        # update critic
        value = self.critic(state)
        next_value = self.critic(next_state).detach()
        # one-step TD target; (1 - done) drops the bootstrap term at episode end
        target = reward + self.gamma * (1 - done) * next_value
        advantage = target - value.detach()
        critic_loss = F.smooth_l1_loss(value, target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update actor
        dist = Categorical(self.actor(state))
        log_prob = dist.log_prob(action)
        entropy = dist.entropy()
        # policy-gradient loss with a small entropy bonus to keep exploring
        actor_loss = -(log_prob * advantage).mean() - 0.01 * entropy
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
```
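For reference, here is a minimal sketch of how the agent could be driven in a training loop. It assumes the classic Gym API (`reset()` returning an observation, `step()` returning a 4-tuple), a flat observation vector, and an environment with 10 discrete actions; the environment id `"YourEnv-v0"` and the episode count are placeholders rather than part of the code above:

```python
# Training-loop sketch (illustrative; env id and episode count are placeholders)
env = gym.make("YourEnv-v0")                # any env with a Discrete(10) action space
state_dim = env.observation_space.shape[0]  # assumes a flat Box observation
agent = ActorCriticAgent(state_dim=state_dim, num_actions=10)

for episode in range(500):
    state = env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        # eps=1.0 starts fully exploratory; get_action decays it internally via steps_done
        action = agent.get_action(state, eps=1.0)
        next_state, reward, done, info = env.step(action)
        agent.update(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
    print(f"Episode {episode}: reward = {episode_reward}")
```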