#!/usr/bin/env python
'''
Author: Janak Agrawal
Date Created: March 06, 2020
Date Last Modified: March 06, 2020
License: MIT
Python Version: 3.7
'''
import pandas as pd
import datetime as dt
from matplotlib import pyplot as plt
import tensorflow as tf
import os
import numpy as np
from tensorflow.keras.regularizers import l2
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  #disable GPU devices so TensorFlow runs on the CPU

class ForecastModel:
    '''
    Class for creating an LSTM based time series prediction model.

    Attributes:
        TRAIN_SPLIT (int): the index at which the dataset is split into train and test sets
        HISTORY_SIZE (int): the number of past data points used to make a prediction
        TARGET_SIZE (int): the number of future data points to be predicted
    '''
    def __init__(self, TRAIN_SPLIT, HISTORY_SIZE, TARGET_SIZE):
        '''initialization function for the class'''
        self.TRAIN_SPLIT = TRAIN_SPLIT
        self.HISTORY_SIZE = HISTORY_SIZE
        self.TARGET_SIZE = TARGET_SIZE

    def normalize_data(self, features):
        '''normalizes the data using the mean and standard deviation of the training slice'''
        dataset = features.values
        data_mean = dataset[:self.TRAIN_SPLIT].mean(axis=0)
        data_std = dataset[:self.TRAIN_SPLIT].std(axis=0)
        dataset = (dataset - data_mean) / data_std
        return dataset
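
    # Note: normalize_data computes the mean and standard deviation on the
    # training slice only, so no statistics from the validation period leak
    # into the scaling of the data.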

    def prepare_labelled_data(self, dataset, target, start_index, end_index):
        '''splits the dataset into sliding windows of input samples and their label windows'''
        data = []
        labels = []
        #calculate the start and end index for the iteration
        start_index = start_index + self.HISTORY_SIZE
        if end_index is None:
            end_index = len(dataset) - self.TARGET_SIZE
        #aggregate the windows and append them to data and labels
        for i in range(start_index, end_index):
            indices = range(i - self.HISTORY_SIZE, i)
            data.append(dataset[indices])
            labels.append(target[i:i + self.TARGET_SIZE])
        return np.array(data), np.array(labels)
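
    # Illustrative example (not part of the original script): for a univariate
    # series [s0, s1, s2, s3, s4, s5] with HISTORY_SIZE=3 and TARGET_SIZE=2,
    #   data   = [[s0, s1, s2], [s1, s2, s3]]
    #   labels = [[s3, s4], [s4, s5]]
    # i.e. each sample is a window of the last HISTORY_SIZE points and its label
    # is the next TARGET_SIZE points of the target column.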

    def prepare_model(self, x_train):
        '''implements and compiles the model architecture'''
        #create a sequential model and add layers
        self.model = tf.keras.models.Sequential()
        self.model.add(tf.keras.layers.LSTM(32,
                                            return_sequences=True,
                                            input_shape=x_train.shape[-2:],
                                            activity_regularizer=l2(0.1)))
        self.model.add(tf.keras.layers.LSTM(16, activation='relu', activity_regularizer=l2(0.1)))
        self.model.add(tf.keras.layers.Dense(self.TARGET_SIZE, activity_regularizer=l2(0.1)))
        #compile the model with the Adam optimizer and mean squared error loss
        self.model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mse', metrics=['mae', 'msle', 'mse'])

    def fit(self, train_data, val_data, EPOCHS, EVALUATION_INTERVAL):
        '''trains the model on the batched training dataset'''
        self.model_history = self.model.fit(train_data, epochs=EPOCHS,
                                            steps_per_epoch=EVALUATION_INTERVAL,
                                            validation_data=val_data,
                                            validation_steps=50)
        return self.model_history

    def predict(self, x):
        '''makes a model prediction on data x'''
        return self.model.predict(x)

    def evaluate(self, x, y, BATCH_SIZE):
        '''evaluates the loss of the model on data x against the true values y'''
        return self.model.evaluate(x, y, batch_size=BATCH_SIZE)

class ForecastModelEvaluation:
    '''
    Class to evaluate the performance of a ForecastModel on training and test data.

    Attributes:
        model (ForecastModel): the model whose performance is evaluated
    '''
    def __init__(self, model):
        '''initialization function for the class'''
        self.model = model

    def evaluate(self, x_train, y_train, x_val, y_val, BATCH_SIZE=256):
        '''evaluates the performance of the model on the train and test data'''
        #call the model's evaluate function to get the training losses
        losses = self.model.evaluate(x_train, y_train, BATCH_SIZE)
        #get the names of the losses
        names = self.model.model.metrics_names
        #print the training losses in order
        print("The losses were as follows for training (in-sample):")
        for i in range(1, len(losses)):
            print(names[i] + " : " + str(losses[i]))
        #call the model's evaluate function to get the validation losses
        losses = self.model.evaluate(x_val, y_val, BATCH_SIZE)
        #print the validation losses in order
        print("The losses were as follows for validation (out-of-sample):")
        for i in range(1, len(losses)):
            print(names[i] + " : " + str(losses[i]))
###----------------Data Cleansing-----------------###
#Read data using pandas
df = pd.read_csv('equity_RN628A_hbase_data.csv')
#print(df.head())
#Keep only the data between November 2, 2012 and December 1, 2013
df['dated'] = pd.to_datetime(df['date'])
start_date = dt.datetime(2012,11,2)
end_date = dt.datetime(2013,12,1)
df = df[df['dated']>=start_date]
df = df[df['dated']<=end_date]
#df.head()
#Fill in missing values using spline interpolation
print("Number of NaN in actual_kwh",df['actual_kwh'].isna().sum())
print("Number of NaN in temperature",df['actual_temperature'].isna().sum())
df['actual_kwh'].interpolate(method='spline',order=3,inplace=True)
df['actual_kwh'].fillna(method='bfill',inplace=True)
df['actual_temperature'].interpolate(method='spline',order=3,inplace=True)
print("Number of NaN in actual_kwh after interpolation",df['actual_kwh'].isna().sum())
print("Number of NaN in temperature after interpolation",df['actual_temperature'].isna().sum())
#Display how the data looks
plt.plot(df['actual_kwh'].head(200))
plt.show()
plt.plot(df['actual_temperature'].head(200))
plt.show()
#select the feature columns used by the model (converted to numpy arrays inside normalize_data)
features_name = ['actual_kwh','actual_temperature']
features = df[features_name]
features.index = df['Unnamed: 0']
###------------------------Model Data Preparation--------------------###
#Defining model parameters
TRAIN_SPLIT = int(len(df)*0.8)
TARGET_SIZE = 4*24 #number of points to predict, i.e. 24 hours of 15-minute readings
HISTORY_SIZE = 4*24*5 #number of points used as input history, i.e. 5 days of 15-minute readings
TARGET_FEATURE = 0 #position of target feature in dataset
BATCH_SIZE = 512
EPOCHS = 10
EVALUATION_INTERVAL = 50
BUFFER_SIZE = 100000
#normalize and split dataset
kwh_model = ForecastModel(TRAIN_SPLIT,HISTORY_SIZE,TARGET_SIZE)
dataset = kwh_model.normalize_data(features)
x_train, y_train = kwh_model.prepare_labelled_data(dataset, dataset[:, TARGET_FEATURE], 0, TRAIN_SPLIT)
x_val, y_val = kwh_model.prepare_labelled_data(dataset, dataset[:, TARGET_FEATURE], TRAIN_SPLIT, None)
#Batch the data together
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()
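#Note: cache() keeps the windowed arrays in memory after the first pass,
#shuffle(BUFFER_SIZE) randomizes the order of the training windows, and
#repeat() lets model.fit draw steps_per_epoch batches per epoch indefinitely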
val_data = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_data = val_data.batch(BATCH_SIZE).repeat()
###--------------------Training model-------------------------------###
kwh_model.prepare_model(x_train)
history = kwh_model.fit(train_data,val_data,EPOCHS,EVALUATION_INTERVAL)
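#NOTE: multi_step_plot is used below but is not defined anywhere in this script.
#The helper below is a minimal sketch of what is assumed to be intended, modelled
#on the TensorFlow time-series tutorial function of the same name: it plots the
#observed history of the target feature, the true future values and the model's
#predicted values on a common time axis.
def multi_step_plot(history, true_future, prediction):
    '''plots the input history, the true future and the predicted future'''
    plt.figure(figsize=(12, 6))
    num_in = range(-len(history), 0)  #time steps of the observed history (negative = past)
    num_out = len(true_future)        #number of predicted time steps
    plt.plot(num_in, np.array(history[:, TARGET_FEATURE]), label='History')
    plt.plot(np.arange(num_out), np.array(true_future), 'bo', label='True Future')
    plt.plot(np.arange(num_out), np.array(prediction), 'ro', label='Predicted Future')
    plt.legend(loc='upper left')
    plt.show()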
#make prediction using the model
for x, y in val_data.take(3):
    multi_step_plot(x[0], y[0], kwh_model.predict(x)[0])
###----------------Evaluating model-----------------------------###
kwh_evaluator = ForecastModelEvaluation(kwh_model)
kwh_evaluator.evaluate(x_train,y_train,x_val,y_val,BATCH_SIZE)