-
Notifications
You must be signed in to change notification settings - Fork 2
/
timeseries.py
461 lines (413 loc) · 21.4 KB
/
timeseries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt
from old_stuff.helper_codes.data_set import DataSet
from old_stuff.helper_codes.data_subset import DataSubset
from pandas.plotting import autocorrelation_plot
from numpy import sqrt
from numpy import log
from scipy.stats import boxcox
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.stats.diagnostic as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
import glob
import os
import sys
sys.path.append("../..")
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
class TimeSeries:
    '''
    Time Series class specific to the data subsets of this project (MoPH).

    Builds a date-indexed series of counts (``self.series``) from a data frame by
    optionally filtering rows, aggregating the counts per date and then optionally
    downsampling and smoothing the result.
    '''

    def __init__(self, data_frame=None, downsample=None, rolling_average_window=None, filter=None):
        """
        Initialize the time series class
        additional data manipulation will not be done if params are left as None (or False)
        :param data_frame: DataFrame with a 'date' column. If None, the frame returned by
        ``DataSet().copy_df()`` is used instead
        :param downsample: string, how much to downsample by (e.g. 'W-TUE': weekly, '2W-TUE': biweekly, 'M': month)
        :param rolling_average_window: int, number of days to average over if applying rolling average to count
        :param filter: dict, which columns/values to filter before aggregating by date format is {column: [values]}
        """
        data_set_holder = DataSet()
        if data_frame is None:
            self.df = data_set_holder.copy_df()
        else:
            # work on a copy so the date conversion below does not modify the caller's frame
            self.df = data_frame.copy()
        self.df['date'] = pd.to_datetime(self.df['date'])

        if filter:
            for column, value in filter.items():
                self.df = DataSubset.filter(self.df, column, value)

        subsetter = DataSubset(self.df)
        self.series_df = subsetter.agg_count(['date'])
        data_set_holder.add_time_columns(self.series_df)
        self.series = self.series_df.set_index('date')['count']

        if downsample:
            # every resampling rule (including '2W-TUE') is aggregated the same way: by summing
            self.series = self.series.resample(downsample).sum()

        # smooth data with a rolling average
        if rolling_average_window:
            smoothed = self.rolling_avg(rolling_average_window)
            # replace nan values at start with copies of the first averaged value
            # this is required to make some of the functions work
            self.series = self._fill_leading_window(smoothed, rolling_average_window)

        self.downsample = downsample

    @staticmethod
    def _fill_leading_window(series, window_size):
        """
        fill the NaN values a rolling mean leaves at the start of a series
        :param series: Series, the output of a rolling mean with the given window
        :param window_size: int, the window that was used for the rolling mean
        :return: Series, a copy whose first ``window_size - 1`` entries are set to the first
        averaged value. The series is returned unchanged when the window is 1 or when the
        series is too short to contain any averaged value
        """
        leading = window_size - 1
        if 0 < leading < len(series):
            series = series.copy()
            # positional access: the series is indexed by date, not by integer position
            series.iloc[:leading] = series.iloc[leading]
        return series

    def rolling_avg(self, window_size):
        """
        create a rolling average of the time series
        :param window_size: int the size of the rolling average window
        :return: Series, the time series with average smoothing (the first
        ``window_size - 1`` values are NaN)
        """
        rolling = self.series.rolling(window=window_size)
        return rolling.mean()
class TimeSeriesPlotter:
    '''
    This class can be used for generating time series - related plots. We can use this class with any
    data frame, not just the ones we are dealing with in this project
    '''

    def __init__(self, df, target_variable, service_name=None, mohafaza=None):
        '''
        :param df: dataframe being used
        :param target_variable: name of the target variable column
        :param service_name: name of the service of this dataframe. By default it is None.
        :param mohafaza: name of the Governorate (mohafaza) of this dataframe. By default it is None.
        '''
        self.df = df
        self.target_variable = target_variable
        self.service_name = service_name
        self.mohafaza = mohafaza

    def create_folder(self, folder_path):
        '''
        creates a folder in the specified path (nothing happens if it already exists)
        :param folder_path: path to the folder
        :return:
        '''
        os.makedirs(folder_path, exist_ok=True)

    def create_title_image_name(self, image_title, image_name):
        '''
        this function is responsible for creating a title of the image being created, and the name
        of the image created when it is saved
        :param image_title: title that will appear in the plot
        :param image_name: name of the image that will be saved. Example: if `line_plot` is
        passed, it will be saved as `line_plot.png`
        :return: the title and the name of the image
        '''
        if self.service_name is not None and self.mohafaza is not None:
            image_title = '%s of %s for %s in %s' % (image_title, self.target_variable, self.service_name, self.mohafaza)
            image_name = '%s_%s_%s.png' % (self.service_name, self.mohafaza, image_name)
        else:
            image_title = '%s of %s' % (image_title, self.target_variable)
            image_name = '%s.png' % image_name
        return image_title, image_name

    def _save_current_figure(self, output_folder, image_title, image_name):
        '''
        titles the current matplotlib figure, saves it in the output folder and closes it
        :param output_folder: path to the folder where the image will be saved (created if missing)
        :param image_title: base title, expanded by `create_title_image_name`
        :param image_name: base file name, expanded by `create_title_image_name`
        :return:
        '''
        image_title, image_name = self.create_title_image_name(image_title, image_name)
        plt.suptitle(image_title)
        self.create_folder(output_folder)
        # join instead of `+` so the folder works with or without a trailing separator
        plt.savefig(os.path.join(output_folder, image_name))
        plt.close()

    # function for generating a line plot of a specified column in a specified dataframe
    def generate_lineplot(self, output_folder):
        '''
        this function creates a line plot of the target variable
        :param output_folder: path to the folder where the line plot will be saved
        :return saves the line plot image in the specified folder
        '''
        self.df[self.target_variable].plot()
        self._save_current_figure(output_folder, 'Line plot', 'line_plot')

    # function for generating a histogram of a specified column in a specified dataframe
    def generate_hist(self, output_folder):
        '''
        this function creates a histogram of the target variable
        :param output_folder: path to the folder where the histogram will be saved
        :return saves the histogram image in the specified folder
        '''
        self.df[self.target_variable].hist()
        self._save_current_figure(output_folder, 'Histogram', 'histogram')

    # function for generating a line plot of the square root of a specified column in a specified dataframe
    def generate_sqrt_transform(self, output_folder):
        '''
        this function generates a line plot of the square root transform applied to the target variable passed
        as a parameter of this class.
        :param output_folder: path to the folder where the line plot of the square root transform will be saved
        :return: the square root series of the target variable
        '''
        # np.sqrt explicitly: the module imports `sqrt` from both math and numpy
        series_sqrt = np.sqrt(self.df[self.target_variable])
        plt.plot(series_sqrt)
        self._save_current_figure(output_folder, 'Square Root Transform', 'sqrt_transform')
        return series_sqrt

    # function for generating a line plot of the log transform of a specified column in a specified dataframe
    def generate_log_transform(self, output_folder):
        '''
        this function generates a line plot of the log transform applied to the target variable passed
        as a parameter of this class.
        :param output_folder: path to the folder where the line plot of the log transform will be saved
        :return: the log transform series of the target variable, without the -inf entries
        produced by zero values
        '''
        series_log = np.log(self.df[self.target_variable])
        series_log = series_log[series_log != -np.inf]
        # draw the transformed series; without this the saved image would be an empty figure
        plt.plot(series_log)
        self._save_current_figure(output_folder, 'Log Transform', 'log_transform')
        return series_log

    # function for generating a line plot of the box cox transform of a specified column in a specified dataframe
    def generate_boxcox(self, output_folder):
        '''
        this function generates a line plot of the box cox transform applied to the target variable passed
        as a parameter of this class. Zero values are removed before the transform is applied.
        :param output_folder: path to the folder where the line plot of the box cox transform will be saved
        :return: the box cox transform of the non-zero values of the target variable (an array,
        as returned by `scipy.stats.boxcox`)
        '''
        non_zero = self.df[self.target_variable][self.df[self.target_variable] != 0]
        series_bc, lam = boxcox(non_zero)
        print('Lambda: %f' % lam)
        # draw the transformed values against the rows they came from; without this the
        # saved image would be an empty figure
        plt.plot(non_zero.index, series_bc)
        self._save_current_figure(output_folder, 'Box Cox Transform', 'box_cox_transform')
        return series_bc

    # function for generating a density plot of the target variable of the dataframe being passed
    def density_plot(self, output_folder):
        '''
        function that generates a density plot of the target variable of the dataframe being passed
        :param output_folder: path to the folder where the density plot of the target variable will be saved
        :return: saves the density plot as an image in the specified output folder
        '''
        self.df[self.target_variable].plot(kind='kde')
        self._save_current_figure(output_folder, 'Density Plot', 'density_plot')

    # function that performs Ljung Box Test for detecting white noise on the target variable of the dataframe
    # being passed to this class
    def ljung_box_test(self, output_folder, df_name):
        '''
        function that applies Ljung-Box test for detecting white noise in the target variable of the
        dataframe being passed as a parameter of this class
        :param output_folder: path to the output folder where the dataframe that contains
        the columns returned by the Ljung-Box test will be saved
        :param df_name: name that is associated to the dataframe that will be created
        (saved as `<df_name>.csv`)
        :return: the dataframe created, with the columns `lb`, `p-values`, `bpvalue` and `bpp-values`
        '''
        if self.service_name is not None and self.mohafaza is not None:
            print("testing for %s in %s" % (self.service_name, self.mohafaza))

        result = sm.acorr_ljungbox(self.df[self.target_variable], boxpierce=True)
        if isinstance(result, pd.DataFrame):
            # statsmodels releases that return a DataFrame name the columns explicitly
            lb_stat, lb_pvalue, bp_stat, bp_pvalue = (
                result[column].values
                for column in ('lb_stat', 'lb_pvalue', 'bp_stat', 'bp_pvalue')
            )
        else:
            # older releases return a tuple of four arrays in this order
            lb_stat, lb_pvalue, bp_stat, bp_pvalue = result

        df = pd.DataFrame({'lb': lb_stat, 'p-values': lb_pvalue, 'bpvalue': bp_stat, 'bpp-values': bp_pvalue})
        df.index.name = 'lag_nb'

        self.create_folder(output_folder)
        df.to_csv(os.path.join(output_folder, df_name + '.csv'))

        if (df['p-values'] <= 0.05).all():
            print('all p-values for ljung box <= 0.05')
        if (df['bpp-values'] <= 0.05).all():
            print('all p-values for box pierce <= 0.05')
        print('-----------------------------------------------')
        return df

    # function that performs augmented dickey-fuller test on the target variable of the dataframe
    # being passed to this class
    def dickey_fuller(self, reg='ct'):
        '''
        function that applies Augmented Dickey Fuller Test for stationarity checking of the target variable
        of the dataframe being passed
        :param reg: the regression value used in the adfuller function of statsmodel. By default, it is
        `ct`. The possible values are: `c`, `ct`, `ctt`, `nc`. references: https://www.statsmodels.org/stable/generated/statsmodels.tsa.stattools.adfuller.html
        NOTE(review): newer statsmodels releases appear to spell the no-constant option `n` - confirm
        against the installed version
        :return: prints out the ADF statistic, the p-value, the number of lags used and the critical values
        '''
        if self.service_name is not None and self.mohafaza is not None:
            print("testing for %s in %s" % (self.service_name, self.mohafaza))

        X = self.df[self.target_variable].values
        result = adfuller(X, regression=reg)
        print('ADF Statistic: %f' % result[0])
        print('p-value: %f' % result[1])
        print('Lags used: %d' % result[2])
        print('Critical Values:')
        for key, value in result[4].items():
            print('%s: %.3f' % (key, value))
        print('-----------------------------------------------')

    # function that generates ACF plot of the target variable of the dataframe passed to this class
    def generate_acf(self, output_folder):
        '''
        function that generates an ACF plot of the target variable of the dataframe being passed to this class
        :param output_folder: path to the folder where the acf plot of the target variable will be saved
        :return: saves the acf plot as an image in the specified output folder
        '''
        plot_acf(self.df[self.target_variable])
        self._save_current_figure(output_folder, 'ACF plot', 'acf_plot')

    # function that generates PACF plot of the target variable of the dataframe passed to this class
    def generate_pacf(self, output_folder):
        '''
        function that generates a PACF plot of the target variable of the dataframe being passed to this class
        :param output_folder: path to the folder where the pacf plot of the target variable will be saved
        :return: saves the pacf plot as an image in the specified output folder
        '''
        plot_pacf(self.df[self.target_variable])
        self._save_current_figure(output_folder, 'PACF Plot', 'pacf_plot')

    # function that generates Autocorrelation plot of the target variable of the dataframe passed to this class
    def generate_autocorrelation(self, output_folder):
        '''
        function that generates an auto-correlation plot of the target variable of the dataframe being passed to this class
        :param output_folder: path to the folder where the auto-correlation plot of the target variable will be saved
        :return: saves the auto-correlation plot as an image in the specified output folder
        '''
        autocorrelation_plot(self.df[self.target_variable])
        self._save_current_figure(output_folder, 'Autocorrelation Plot', 'autocorrelation_plot')

    def persistence_model(self, output_folder):
        '''
        function that evaluates a persistence (naive) forecast of the target variable: the prediction
        for each step is the value of the previous step. The last 20% of the rows are used for evaluation.
        The two-column frame (`t+1`, `t-1`) is also written to `../input/persistence_input/`.
        :param output_folder: path to the folder where the actual vs. predicted plot will be saved
        :return: prints R^2, RMSE, MSE, MAE and MAPE and saves the plot in the specified output folder
        '''
        df = pd.DataFrame(self.df[self.target_variable])
        df = add_lags(df, self.target_variable, 1, 't-')
        df.columns = ['t+1', 't-1']

        # add the data frame to persistence model input folder
        destination = '../input/persistence_input/'
        self.create_folder(destination)
        df.to_csv(destination + '%s_%s.csv' % (self.service_name, self.mohafaza))

        X = df.values
        # the first row has no lag value (NaN), so it must never end up in the evaluation split
        train_size = max(int(len(X) * 0.80), 1)
        test = X[train_size:]
        test_X, test_y = test[:, 1], test[:, 0]

        # evaluate persistence model: the forecast is simply the previous observation
        predictions = list(test_X)

        r2 = r2_score(test_y, predictions)
        mse_score = mean_squared_error(test_y, predictions)
        rmse_score = np.sqrt(mse_score)
        mae_score = mean_absolute_error(test_y, predictions)
        mape_score = mean_absolute_percentage_error(test_y, predictions)
        print('Persistence model Scores:\nR^2: %.5f\nRMSE: %.5f\nMSE: %.5f\nMAE: %.5f\nMAPE: %.5f\n' %
              (r2, rmse_score, mse_score, mae_score, mape_score))

        # plot the persistence model, actual vs. predicted
        plt.plot(test_y, label='actual')
        plt.plot(predictions, label='predicted')
        plt.legend()
        self._save_current_figure(output_folder, 'Persistence model', 'persistence_plot')
def mean_absolute_percentage_error(y_true, y_pred):
    '''
    Computes the mean absolute percentage error (MAPE) between a vector of actual values
    and a vector of predicted values. Each error is divided by its actual value, so
    zeros in `y_true` produce inf/nan in the result.
    :param y_true: the actual values
    :param y_pred: the predicted values
    :return: MAPE, expressed as a percentage
    '''
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    # error of every prediction relative to the value it was supposed to match
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100
# function that extracts the time series components (trend, seasonality, and residual) from the target variable of
# the dataframe passed to this function
def get_timeseries_components(df, target_variable, model='additive', freq=None):
    '''
    function that extracts the trend and the seasonality from the target variable of the dataframe
    passed to this function and stores them as new columns of that dataframe
    :param df: dataframe; it is modified in place
    :param target_variable: name of the column that will be decomposed
    :param model: whether the seasonal decomposition is 'additive' or 'multiplicative'
    :param freq: the length of the seasonal cycle in the data. If None, statsmodels has to infer
    it from the index of the dataframe
    :return: the same dataframe with two added columns: `<target_variable>_trend` and
    `<target_variable>_seasonality` (the residual component is not stored)
    '''
    series = df[target_variable]
    if freq is None:
        result = seasonal_decompose(series, model=model, extrapolate_trend='freq')
    else:
        try:
            # current statsmodels releases call the cycle length `period`
            result = seasonal_decompose(series, model=model, period=freq, extrapolate_trend='freq')
        except TypeError:
            # older releases only know the keyword under its former name `freq`
            result = seasonal_decompose(series, model=model, freq=freq, extrapolate_trend='freq')

    df['%s_trend' % target_variable] = result.trend
    df['%s_seasonality' % target_variable] = result.seasonal
    return df
# function that creates certain number of lags from the target variable of the dataframe passed to this function
def add_lags(df, target_variable, nb_lags, col_prefix, col_suffix=None):
    '''
    function that adds one lag column per lag, from 1 up to nb_lags, of the target variable
    to the dataframe passed. The dataframe is modified in place.
    :param df: dataframe
    :param target_variable: the target variable in the dataframe passed. Lags will be created from this column
    :param nb_lags: number of lags that will be added as columns. Example: if nb_lags = 3, the first
    three lags will be created
    :param col_prefix: prefix of the lag column that will be created. Example, if 't-' is passed as a prefix
    and the number of lags is 3, then the column names will be: `t-1`, `t-2`, and `t-3`
    :param col_suffix: optional suffix of the lag column that will be created. Example, if '{t-' is the
    prefix and '}' is the suffix, and the number of lags is 3, then the column names will be
    `{t-1}`, `{t-2}`, and `{t-3}`
    :return: the dataframe passed, now containing the lag columns
    '''
    # a missing suffix simply contributes nothing to the column name
    name_ending = '' if col_suffix is None else col_suffix
    lag = 1
    while lag <= nb_lags:
        column_name = col_prefix + str(lag) + name_ending
        df[column_name] = df[target_variable].shift(lag)
        lag += 1
    return df
# function that applies difference transform to the target variable of the dataframe passed to this function
def difference_series(df, target_variable, interval=1):
    '''
    function that applies a difference transform to the target variable of the dataframe passed
    :param df: dataframe; it is not modified
    :param target_variable: the column that we will apply difference transform to.
    :param interval: by how much should we difference. By default, it is 1. If 1, a difference
    transform by an interval of 1 will be applied
    :return: a copy of the dataframe without its last `interval` rows, in which row i of the target
    variable holds `value[i + interval] - value[i]`. The result is empty when the dataframe has
    no more than `interval` rows.
    :raises ValueError: if interval is smaller than 1
    '''
    if interval < 1:
        raise ValueError('interval must be a positive integer, got %r' % (interval,))

    # plain array: the differencing is positional and must not depend on the index labels
    values = df[target_variable].values
    kept_rows = max(len(values) - interval, 0)

    # exactly `interval` rows lose their partner, so exactly that many are dropped from the end
    differenced = df.iloc[:kept_rows].copy()
    differenced[target_variable] = values[interval:] - values[:kept_rows]
    return differenced
def generate_plots():
    '''
    Runs the plots and statistical tests of `TimeSeriesPlotter` on the `demand` column of every
    csv file in the multivariate data subsets folder whose path contains both one of the
    services and one of the mohafazas listed below.
    :return: saves the plots and test results in sub folders of the output folder
    '''
    services = ['General Medicine', 'Gynaecology', 'Pediatrics', 'Pharmacy']
    mohafazas = ['akkar', 'bikaa', 'Tripoli']
    output_folder = '../old_stuff/output/temporal_structure_plots/'

    # the Faour subsets ('../old_stuff/output/Faour_datasubsets/*.csv') were a previous input
    for csv_path in glob.glob('../old_stuff/output/multivariate_datasubsets/*.csv'):
        # every (service, mohafaza) combination named in the path of this file
        matching_pairs = [
            (service, mohafaza)
            for service in services
            for mohafaza in mohafazas
            if service in csv_path and mohafaza in csv_path
        ]
        for service, mohafaza in matching_pairs:
            df = pd.read_csv(csv_path)
            print('DATAFRAME: %s in %s' % (service, mohafaza))
            tp = TimeSeriesPlotter(df, 'demand', service_name=service, mohafaza=mohafaza)
            print('%s in %s' % (service, mohafaza))

            # distribution of the target variable
            tp.generate_lineplot(output_folder + 'line_plot/')
            tp.generate_hist(output_folder + 'histogram/')
            tp.density_plot(output_folder + '/density_plot/')

            # white noise and stationarity tests
            tp.ljung_box_test(output_folder + 'ljung_box_results/', '%s_%s' % (service, mohafaza))
            tp.dickey_fuller()

            # correlation structure
            tp.generate_acf(output_folder + 'acf/')
            tp.generate_pacf(output_folder + 'pacf/')
            tp.generate_autocorrelation(output_folder + 'autocorrelation/')

            # baseline forecast
            tp.persistence_model('../output/persistence_output/plots/')
# generate every plot and test result only when this file is executed as a script
if __name__ == '__main__':
    generate_plots()