a2c.py
import os

# Silence optional CPU driver warnings; this must be set before TensorFlow is imported.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import gym
import numpy as np
import tensorflow as tf
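

# A minimal sketch of how the discounted_rewards argument expected by
# ReinforceWithBaseline.loss() is typically produced. The helper's name and the
# default discount factor gamma=0.99 are illustrative assumptions, not something
# specified by the code below.
def discount(rewards, gamma=0.99):
    """
    Computes the discounted return-to-go for each timestep of an episode.

    :param rewards: list or array of raw rewards collected over one episode
    :param gamma: assumed discount factor
    :return: np.float32 array where entry t is sum_k gamma^k * rewards[t + k]
    """
    discounted = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    # Walk the episode backwards, accumulating the discounted return.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted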


class ReinforceWithBaseline(tf.keras.Model):
    def __init__(self, state_size, num_actions):
        """
        The ReinforceWithBaseline class that inherits from tf.keras.Model.

        The forward pass calculates the policy for the agent given a batch of states. During training,
        ReinforceWithBaseline estimates the value of each state, which is used as a baseline against
        which to compare the policy's performance.

        :param state_size: number of parameters that define the state. You don't necessarily have to use this,
            but it can be used as the input size for your first dense layer.
        :param num_actions: number of actions in an environment
        """
        super(ReinforceWithBaseline, self).__init__()
        self.num_actions = num_actions
        self.state_size = state_size
        self.hidden_size = 512

        # Actor network: hidden ReLU layer followed by a softmax over actions.
        self.dense1 = tf.keras.layers.Dense(
            self.hidden_size, input_shape=(self.state_size,), activation="relu"
        )
        self.dense2 = tf.keras.layers.Dense(self.num_actions, activation="softmax")

        self.learning_rate = 0.001
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)

        # Critic network: hidden layer followed by a single scalar value estimate per state.
        self.critic_hidden_size = 512
        self.critic_dense1 = tf.keras.layers.Dense(
            self.critic_hidden_size, input_shape=(self.state_size,)
        )
        self.critic_dense2 = tf.keras.layers.Dense(1)

    @tf.function
    def call(self, states):
        """
        Performs the forward pass on a batch of states to generate the action probabilities.
        This returns a policy tensor of shape [episode_length, num_actions], where each row is a
        probability distribution over actions for each state.

        :param states: An [episode_length, state_size] dimensioned array
            representing the history of states of an episode
        :return: An [episode_length, num_actions] matrix representing the probability distribution over actions
            of each state in the episode
        """
        x = self.dense1(states)
        policy = self.dense2(x)
        return policy

    def value_function(self, states):
        """
        Performs the forward pass on a batch of states to calculate the value function, to be used as the
        critic in the loss function.

        :param states: An [episode_length, state_size] dimensioned array representing the history of states
            of an episode
        :return: An [episode_length] tensor representing the value of each state
        """
        x = self.critic_dense1(states)
        vals = self.critic_dense2(x)
        # critic_dense2 outputs shape [episode_length, 1]; squeeze to [episode_length] so the
        # values line up with the discounted rewards in the loss instead of broadcasting.
        return tf.squeeze(vals, axis=1)

    def loss(self, states, actions, discounted_rewards):
        """
        Computes the loss for the agent. Refer to the handout to see how this is done.

        Remember that the loss is similar to the loss in reinforce.py, with a few specific changes.
        1) Instead of element-wise multiplying with discounted_rewards, you want to element-wise multiply
           with your advantage. Here, advantage is defined as discounted_rewards - state_values, where
           state_values is calculated by the critic network.
        2) In your actor loss, you must set advantage to be tf.stop_gradient(discounted_rewards - state_values).
           You may need to cast (discounted_rewards - state_values) to tf.float32. tf.stop_gradient is used here
           to stop the loss calculated on the actor network from propagating back to the critic network.
        3) To calculate the loss for your critic network, call value_function on the states and take the sum
           of the squared advantage.

        :param states: A batch of states of shape [episode_length, state_size]
        :param actions: History of actions taken at each timestep of the episode (represented as an [episode_length] array)
        :param discounted_rewards: Discounted rewards throughout a complete episode (represented as an [episode_length] array)
        :return: loss, a TensorFlow scalar
        """
        # Probability the policy assigned to each action that was actually taken.
        prbs = self(states)
        actions = tf.cast(actions, tf.int32)
        indices = tf.stack([tf.range(tf.shape(prbs)[0]), actions], axis=1)
        prb_actions = tf.gather_nd(prbs, indices)
        negative_log_prbs = -tf.math.log(prb_actions)

        # Advantage: how much better the observed return was than the critic's estimate.
        # stop_gradient keeps the actor loss from backpropagating into the critic network.
        discounted_rewards = tf.cast(discounted_rewards, tf.float32)
        state_values = self.value_function(states)
        advantage = tf.stop_gradient(discounted_rewards - state_values)

        actor_loss = tf.reduce_sum(negative_log_prbs * advantage)
        critic_loss = tf.reduce_sum(tf.math.square(discounted_rewards - state_values))
        weight = 0.5
        return actor_loss + weight * critic_loss
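

# A minimal usage sketch showing how call(), loss(), and the discount() helper above fit
# together in a training step. The CartPole-v1 environment, the episode count, and the
# classic gym API (reset() returning a state, step() returning four values) are assumptions
# made for illustration, not requirements of the model class above.
if __name__ == "__main__":
    env = gym.make("CartPole-v1")
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n
    model = ReinforceWithBaseline(state_size, num_actions)

    for episode in range(5):
        states, actions, rewards = [], [], []
        state = env.reset()
        done = False
        while not done:
            # Sample an action from the current policy.
            probs = model(np.asarray([state], dtype=np.float32))
            action = np.random.choice(num_actions, p=probs.numpy()[0])
            next_state, reward, done, _ = env.step(action)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            state = next_state

        # One gradient step on the combined actor/critic loss.
        with tf.GradientTape() as tape:
            loss = model.loss(
                np.asarray(states, dtype=np.float32),
                np.asarray(actions, dtype=np.int32),
                discount(rewards),
            )
        grads = tape.gradient(loss, model.trainable_variables)
        model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print(f"episode {episode}: total reward {sum(rewards):.1f}, loss {float(loss):.3f}")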