# stock_prices1_rl_trader.py

## you also need the plot_rl_rewards.py file
## train with: python3 rl_trader.py -m train   (long form: --mode train)
## test with:  python3 rl_trader.py -m test    (long form: --mode test)

import numpy as np
import pandas as pd

import torch 
import torch.nn as nn
import torch.nn.functional as F

from datetime import datetime
import itertools
import argparse
import re
import os
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

def get_data():
    """Load the stock price history from ``aapl_msi_sbux.csv``.

    The file is read from the current working directory.

    Returns:
        The CSV contents as a 2-D array (``DataFrame.values``): one row per
        time step and one column per stock.  Presumably the columns are
        AAPL, MSI and SBUX in that order -- confirm against the CSV header.
    """
    price_frame = pd.read_csv("aapl_msi_sbux.csv")
    return price_frame.values

## the experience replay memory

class ReplayBuffer:
    """Fixed-capacity ring buffer of (obs, action, reward, next_obs, done) transitions.

    Once ``size`` transitions have been stored, each new transition
    overwrites the oldest one.

    Args:
        obs_dim: Length of a single observation vector.
        act_dim: Unused; kept so existing callers keep working.  Actions are
            stored as one integer index per transition.
        size: Maximum number of transitions held at once.
    """

    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
        # int64 rather than uint8: a uint8 slot silently wraps any action
        # index >= 256, which would corrupt the stored action.
        self.acts_buf = np.zeros(size, dtype=np.int64)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.uint8)
        # ptr: next slot to write; size: number of valid slots so far.
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        """Write one transition at the current slot and advance the ring pointer."""
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        """Return ``batch_size`` transitions drawn uniformly with replacement.

        Returns:
            dict with keys ``s`` (observations), ``s2`` (next observations),
            ``a`` (actions), ``r`` (rewards) and ``d`` (done flags, 0/1).

        Raises:
            ValueError: If nothing has been stored yet.
        """
        if self.size == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(s=self.obs1_buf[idxs],
                    s2=self.obs2_buf[idxs],
                    a=self.acts_buf[idxs],
                    r=self.rews_buf[idxs],
                    d=self.done_buf[idxs],
                    )
    

def get_scaler(env):
    """Fit a scikit-learn ``StandardScaler`` on states visited by random play.

    Takes up to ``env.n_step`` uniformly random actions, starting from
    wherever ``env`` currently is (the environment is not reset here), and
    stops early once the environment reports ``done``.

    Note: the replay buffer could also be populated during this rollout.

    Args:
        env: Environment exposing ``n_step``, ``action_space`` and ``step``.

    Returns:
        The scaler fitted on the collected states.
    """
    visited_states = []
    for _ in range(env.n_step):
        random_action = np.random.choice(env.action_space)
        observation, _reward, finished, _info = env.step(random_action)
        visited_states.append(observation)
        if finished:
            break

    fitted = StandardScaler()
    fitted.fit(visited_states)
    return fitted

def maybe_make_dir(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    Uses ``exist_ok=True`` instead of a separate existence check, so there is
    no window in which another process can create the directory between the
    check and the creation.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

class MLP(nn.Module):
    """Feed-forward network mapping a state vector to one output per action.

    The network is ``n_hidden_layers`` linear layers of width ``hidden_dim``,
    each followed by a ReLU, and a final linear layer with ``n_action``
    outputs.  Without the ReLU the stacked linear layers would collapse to a
    single affine map, so the hidden layers would add no capacity.

    The layers live in an ``nn.ModuleList`` named ``layers``; parameter names
    in the state dict are ``layers.<i>.weight`` / ``layers.<i>.bias``.

    Args:
        n_inputs: Size of the input vector.
        n_action: Number of outputs (one per action).
        n_hidden_layers: Number of hidden layers.
        hidden_dim: Width of every hidden layer.
    """

    def __init__(self, n_inputs, n_action, n_hidden_layers=1, hidden_dim=32):
        super().__init__()

        # Layer widths from input to output; consecutive pairs define the
        # in/out sizes of each linear layer.
        widths = [n_inputs] + [hidden_dim] * n_hidden_layers + [n_action]
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, X):
        """Return the network output for the batch ``X``."""
        # ReLU after every hidden layer; the output layer stays linear.
        for hidden in self.layers[:-1]:
            X = F.relu(hidden(X))
        return self.layers[-1](X)

    def save_weights(self, path):
        """Write the state dict to ``path`` with ``torch.save``."""
        torch.save(self.state_dict(), path)

    def load_weights(self, path):
        """Load a state dict previously written by ``save_weights``.

        NOTE(security): ``torch.load`` may unpickle arbitrary objects; only
        load checkpoint files you trust.
        """
        self.load_state_dict(torch.load(path))

    def predict(self, np_states):
        """Run the network on a NumPy batch without tracking gradients.

        Args:
            np_states: NumPy array of states; it is cast to float32.

        Returns:
            The network output as a NumPy array.
        """
        with torch.no_grad():
            inputs = torch.from_numpy(np_states.astype(np.float32))
            output = self(inputs)
            return output.numpy()
    
def train_one_step(model, criterion, optimizer, inputs, targets):
    """Perform a single gradient update of ``model`` on one batch.

    Args:
        model: Callable network whose parameters ``optimizer`` updates.
        criterion: Loss taking ``(outputs, targets)``.
        optimizer: Optimizer exposing ``zero_grad`` and ``step``.
        inputs: NumPy batch of inputs; cast to float32.
        targets: NumPy batch of targets; cast to float32.
    """
    def as_float_tensor(array):
        # NumPy -> float32 torch tensor
        return torch.from_numpy(array.astype(np.float32))

    batch_x = as_float_tensor(inputs)
    batch_y = as_float_tensor(targets)

    # clear gradients left over from the previous step
    optimizer.zero_grad()

    # forward pass, then backpropagate the loss
    batch_loss = criterion(model(batch_x), batch_y)
    batch_loss.backward()

    # apply the parameter update
    optimizer.step()


class MultiStockEnv:
    """Multi-stock trading environment with a gym-style ``reset``/``step`` API.

    State: a vector of length ``2 * n_stock + 1`` laid out as
    ``[shares owned per stock, price per stock, cash in hand]``.

    Action: an integer in ``[0, 3**n_stock)`` indexing ``action_list``; each
    entry of ``action_list`` holds one digit per stock with
    0 = sell, 1 = hold, 2 = buy.

    Args:
        data: 2-D array of prices, one row per time step, one column per stock.
        initial_investment: Cash available at the start of every episode.
    """

    def __init__(self, data, initial_investment=20000):
        # data
        self.stock_price_history = data
        self.n_step, self.n_stock = self.stock_price_history.shape

        # instance attributes (populated by reset)
        self.initial_investment = initial_investment
        self.cur_step = None
        self.stock_owned = None
        self.stock_price = None
        self.cash_in_hand = None

        self.action_space = np.arange(3**self.n_stock)

        # every per-stock combination: [0,0,0], [0,0,1], [0,0,2], ...
        # 0 = sell, 1 = hold, 2 = buy
        self.action_list = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))

        # shares owned + prices (n_stock each) + cash
        self.state_dim = self.n_stock * 2 + 1

        self.reset()

    def reset(self):
        """Start a new episode at step 0 with no shares and the initial cash.

        Returns:
            The initial state vector.
        """
        self.cur_step = 0
        self.stock_owned = np.zeros(self.n_stock)
        self.stock_price = self.stock_price_history[self.cur_step]
        self.cash_in_hand = self.initial_investment
        return self._get_obs()

    def step(self, action):
        """Advance one time step and execute ``action`` at the new prices.

        Args:
            action: Integer index into ``action_list``.

        Returns:
            ``(state, reward, done, info)`` where ``reward`` is the change in
            portfolio value over the step, ``done`` is True on the last row
            of the price history, and ``info["cur_val"]`` is the portfolio
            value after the step.
        """
        assert action in self.action_space

        # portfolio value before the prices move
        prev_val = self._get_val()

        # move to the next day's prices
        self.cur_step += 1
        self.stock_price = self.stock_price_history[self.cur_step]

        # trade at the new prices
        self._trade(action)

        # portfolio value after the move and the trade
        cur_val = self._get_val()

        # reward is the increase in portfolio value
        reward = cur_val - prev_val

        # done once the price history is exhausted
        done = self.cur_step == self.n_step - 1

        info = {"cur_val": cur_val}

        # conform to the gym API
        return self._get_obs(), reward, done, info

    def _get_obs(self):
        """Return the state vector ``[shares owned, prices, cash]``."""
        obs = np.empty(self.state_dim)
        obs[:self.n_stock] = self.stock_owned
        obs[self.n_stock:2 * self.n_stock] = self.stock_price
        obs[-1] = self.cash_in_hand
        return obs

    def _get_val(self):
        """Return the portfolio value: shares at current prices plus cash."""
        return self.stock_owned.dot(self.stock_price) + self.cash_in_hand

    def _trade(self, action):
        """Apply ``action``: sell everything marked sell, then buy round-robin.

        The full per-stock action vector is decoded before any trade runs.
        Executing trades while the vector is still being decoded would let
        the first "buy" stock absorb all the cash before later stocks are
        even considered.
        """
        # 0 = sell, 1 = hold, 2 = buy
        action_vec = self.action_list[action]

        sell_index = [i for i, a in enumerate(action_vec) if a == 0]
        buy_index = [i for i, a in enumerate(action_vec) if a == 2]

        # sell first so the proceeds are available for buying;
        # to keep things simple we always sell ALL shares of a stock
        for i in sell_index:
            self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
            self.stock_owned[i] = 0

        if buy_index:
            # buy one share of each requested stock in turn, and stop after
            # the first pass in which any requested stock was unaffordable
            can_buy = True
            while can_buy:
                for i in buy_index:
                    if self.cash_in_hand > self.stock_price[i]:
                        self.stock_owned[i] += 1  # buy one share
                        self.cash_in_hand -= self.stock_price[i]
                    else:
                        can_buy = False

class DQNAgent(object):
    """Epsilon-greedy Q-learning agent built on an ``MLP`` and a ``ReplayBuffer``.

    Args:
        state_size: Length of a state vector.
        action_size: Number of discrete actions.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        # learning hyper-parameters
        self.gamma = 0.95  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = .995

        # experience replay and the Q-network
        self.memory = ReplayBuffer(state_size, action_size, size=500)
        self.model = MLP(state_size, action_size)

        # loss and optimizer
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters())

    def update_replay_memory(self, state, action, reward, next_state, done):
        """Record one transition in the replay buffer."""
        self.memory.store(state, action, reward, next_state, done)

    def act(self, state):
        """Pick an action: random with probability ``epsilon``, else greedy."""
        explore = np.random.rand() <= self.epsilon
        if explore:
            return np.random.choice(self.action_size)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])  # index of the highest-valued action

    def replay(self, batch_size=32):
        """Train on one sampled mini-batch, then decay ``epsilon``.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        # not enough experience collected yet
        if self.memory.size < batch_size:
            return

        # draw a mini-batch from the replay memory
        batch = self.memory.sample_batch(batch_size)
        states, next_states = batch['s'], batch['s2']
        actions, rewards, done = batch['a'], batch['r'], batch['d']

        # Q-learning target; (1 - done) drops the bootstrap term on terminal steps
        best_next = np.amax(self.model.predict(next_states), axis=1)
        target = rewards + (1 - done) * self.gamma * best_next

        # Start from the network's own predictions so the error is zero for
        # every action except the one actually taken, whose entry is
        # replaced by the target.
        target_full = self.model.predict(states)
        rows = np.arange(batch_size)
        target_full[rows, actions] = target

        # one gradient step towards the targets
        train_one_step(self.model, self.criterion, self.optimizer, states, target_full)

        # decay exploration until the floor is reached
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)


def play_one_episode(agent, env, is_train):
    """Run one full episode and return the final portfolio value.

    Relies on the module-level names ``scaler`` and ``batch_size`` (set in
    the ``__main__`` section), so it only works once those are defined.

    Args:
        agent: Agent exposing ``act``, ``update_replay_memory`` and ``replay``.
        env: Environment exposing ``reset`` and ``step``.
        is_train: Mode string; the agent learns only when it equals ``'train'``.

    Returns:
        ``info['cur_val']`` from the last step of the episode.
    """
    learning = is_train == 'train'

    # scaler.transform takes a batch, so states are wrapped to shape 1xD
    state = scaler.transform([env.reset()])

    while True:
        action = agent.act(state)
        raw_next_state, reward, done, info = env.step(action)
        next_state = scaler.transform([raw_next_state])
        if learning:
            agent.update_replay_memory(state, action, reward, next_state, done)
            agent.replay(batch_size)
        state = next_state
        if done:
            break

    return info['cur_val']

if __name__=='__main__':
    # Script entry point: train the agent on the first half of the price
    # history, or evaluate saved weights on the second half.
    #
    # NOTE: batch_size and scaler must stay module-level names here;
    # play_one_episode reads them as globals.

    # config
    models_folder='rl_trader_models'
    rewards_folder='rl_trader_rewards'
    num_episodes=2000
    batch_size=32
    initial_investment=20000

    parser=argparse.ArgumentParser()
    # choices rejects anything other than train/test up front; otherwise an
    # arbitrary mode would run an untrained agent and write "<mode>.npy".
    parser.add_argument('-m','--mode',type=str, required=True,
                        choices=('train','test'),
                        help='either "train" or "test"')
    args=parser.parse_args()

    maybe_make_dir(models_folder)
    maybe_make_dir(rewards_folder)

    # first half of the history is for training, second half for testing
    data=get_data()
    n_timesteps, n_stocks =data.shape
    n_train=n_timesteps // 2

    train_data=data[:n_train]
    test_data=data[n_train:]

    env= MultiStockEnv(train_data, initial_investment)
    state_size=env.state_dim
    action_size=len(env.action_space)
    agent=DQNAgent(state_size,action_size)

    # final portfolio value of every episode
    portfolio_value=[]

    if args.mode=='train':
        # fit the state scaler on a random rollout over the training data
        scaler=get_scaler(env)
    else:
        # test mode: reuse the scaler saved by the training run instead of
        # fitting a new one that would be thrown away.
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load scaler files produced by your own training runs.
        with open(f'{models_folder}/scaler.pkl','rb') as f:
            scaler=pickle.load(f)

        # remake the env with test data
        env=MultiStockEnv(test_data,initial_investment)

        # make sure epsilon is not 1;
        # with epsilon=0 the agent would be fully deterministic
        agent.epsilon=0.01

        # load trained weights
        agent.load(f'{models_folder}/dqn.h5')

    # play the game num_episodes times
    for e in range(num_episodes):
        t0=datetime.now()
        val=play_one_episode(agent,env,args.mode)
        dt=datetime.now()-t0
        print(f'episode: {e+1}/{num_episodes}, esisode end value: {val:.2f},duration: {dt}')
        portfolio_value.append(val) # episode end portfolio value

    # save the weights and scaler when training is done
    if args.mode=='train':
        # save the DQN
        agent.save(f'{models_folder}/dqn.h5')

        # save the scaler
        with open(f'{models_folder}/scaler.pkl','wb') as f:
            pickle.dump(scaler,f)

    # save the portfolio value for each episode
    np.save(f'{rewards_folder}/{args.mode}.npy',portfolio_value)