Prompt Title: Building an Actor-Critic Structure

Created 1 year ago
Votes 0
Views 84 times

Prompt Detail:

Sure! Here's an example of how to modify the code to have 10 possible actions:

```python
import math
import random

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

# the original snippet used `device` without defining it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Actor network: maps a state to a probability distribution over the actions
class Actor(nn.Module):
    def __init__(self, state_dim, num_actions, hidden_size):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_actions)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # dim=-1 (rather than dim=1) also handles unbatched 1-D states
        x = F.softmax(self.fc3(x), dim=-1)
        return x


# Critic network: maps a state to a scalar value estimate V(s)
class Critic(nn.Module):
    def __init__(self, state_dim, hidden_size):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


class ActorCriticAgent:
    def __init__(self, state_dim, num_actions=10, hidden_size=128, lr=0.001, gamma=0.99):
        self.actor = Actor(state_dim, num_actions, hidden_size).to(device)
        self.critic = Critic(state_dim, hidden_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)
        self.gamma = gamma
        self.num_actions = num_actions
        self.steps_done = 0

    def get_action(self, state, eps=None):
        state = torch.FloatTensor(state).to(device)
        # wrap the actor's probabilities in a Categorical so we can sample
        dist = Categorical(self.actor(state))
        if eps is None:
            action = dist.sample()
        else:
            # gradually reduce exploration from eps to 0.01 over ~10000 steps
            eps_threshold = 0.01
            eps = eps_threshold + (eps - eps_threshold) * \
                math.exp(-1. * self.steps_done / 10000)
            if random.random() > eps:
                action = dist.sample()
            else:
                action = torch.tensor(random.randrange(self.num_actions),
                                      device=device, dtype=torch.long)
        self.steps_done += 1
        return action.detach().cpu().item()

    def update(self, state, action, reward, next_state, done):
        state = torch.FloatTensor(state).to(device)
        next_state = torch.FloatTensor(next_state).to(device)
        action = torch.tensor(action, device=device)
        reward = torch.tensor(float(reward), device=device)
        done = torch.tensor(float(done), device=device)

        # update critic: regress V(s) toward the one-step TD target
        value = self.critic(state)
        next_value = self.critic(next_state).detach()  # no gradient through the target
        target = reward + self.gamma * (1 - done) * next_value
        advantage = (target - value).detach()
        critic_loss = F.smooth_l1_loss(value, target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update actor: policy gradient weighted by the advantage, plus an
        # entropy bonus (the 0.01 coefficient is an assumed value; the
        # original snippet was truncated at this point)
        dist = Categorical(self.actor(state))
        log_prob = dist.log_prob(action)
        entropy = dist.entropy()
        actor_loss = -(log_prob * advantage) - 0.01 * entropy
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
```
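For reference, here is a minimal training-loop sketch (not part of the original answer) showing how `ActorCriticAgent` might be driven. It assumes the classic `gym` API, where `reset()` returns the observation and `step()` returns four values; `CartPole-v1` is only a stand-in environment, so `num_actions` is taken from the environment rather than hard-coded to 10:

```python
# Hypothetical usage sketch for ActorCriticAgent (illustration only).
env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]
agent = ActorCriticAgent(state_dim, num_actions=env.action_space.n)

for episode in range(500):
    state = env.reset()
    episode_reward, done = 0.0, False
    while not done:
        # eps=1.0 starts fully exploratory; get_action decays it internally
        action = agent.get_action(state, eps=1.0)
        next_state, reward, done, info = env.step(action)
        agent.update(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
    print(f"episode {episode}: return {episode_reward:.1f}")
```

The agent updates online after every transition, so no replay buffer is needed; that is the usual trade-off of a one-step actor-critic versus DQN-style learning.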
Kyuyeong kim
