# Time Series Models in Scikit-learn
# Creating features from the past
# Smoothness of a series is related to its autocorrelation (correlation with lagged copies of itself)
# Shift the data 3 time steps behind
print(df.shift(3))
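# Illustrative sketch (not from the original notes): computing the autocorrelation directly,
# assuming `df` holds a single numeric column with the hypothetical name 'value'
print(df['value'].autocorr(lag=1))  # pandas built-in lag-1 autocorrelation
print(np.corrcoef(df['value'][3:], df['value'].shift(3)[3:])[1, 0])  # same idea via a 3-step shift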
# Try creating a dictionary that holds various time lags
data = pd.Series(...)  # the time series of interest (placeholder)
# Number of shifts to use
Shifts = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# Create a dictionary of time-shifted data
TryingShifts = {'lag_{}'.format(ii): data.shift(ii) for ii in Shifts}
# Convert it into a DataFrame
TryingShifts = pd.DataFrame(TryingShifts)
# Note: shifting introduces NaNs at the start, so drop or fill them before fitting
from sklearn.linear_model import Ridge
model = Ridge()
model.fit(TryingShifts, data)
# Visualize the fitted model coefficients
fig, ax = plt.subplots()
ax.bar(TryingShifts.columns, model.coef_)
ax.set(xlabel='Coefficient name', ylabel='Coefficient value')
# Set up formatting
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
#-----------------------------------------------------------------------
# These are the "time lags"
shifts = np.arange(1, 11).astype(int)
# Use a dictionary comprehension to create name: value pairs, one pair per shift
shifted_data = {"lag_{}_day".format(day_shift): prices_perc.shift(day_shift) for day_shift in shifts}
# Convert into a DataFrame for subsequent use
prices_perc_shifted = pd.DataFrame(shifted_data)
# Plot the first 100 samples of each
ax = prices_perc_shifted.iloc[:100].plot(cmap=plt.cm.viridis)
prices_perc.iloc[:100].plot(color='r', lw=2)
ax.legend(loc='best')
plt.show()
# Replace missing values with the median for each column
X = prices_perc_shifted.fillna(np.nanmedian(prices_perc_shifted))
y = prices_perc.fillna(np.nanmedian(prices_perc))
# Fit the model
model = Ridge()
model.fit(X, y)
#-----------------------------------------------------------------------
# Create a function that, given a set of coefficients and feature names, visualizes the coefficient values
def visualize_coefficients(coefs, names, ax):
    # Make a bar plot for the coefficients, including their names on the x-axis
    ax.bar(names, coefs)
    ax.set(xlabel='Coefficient name', ylabel='Coefficient value')
    # Set formatting so it looks nice
    plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    return ax
# Visualize the output data up to "2011-01"
fig, axs = plt.subplots(2, 1, figsize=(10, 5))
y.loc[:'2011-01'].plot(ax=axs[0])
# Run the function to visualize model's coefficients
visualize_coefficients(model.coef_,prices_perc_shifted.columns, ax=axs[1])
plt.show()
#-----------------------------------------------------------------------
# Cross-validating time series data
# Iterating over the split method yields train/test indices
# Cross-validation with KFold
from sklearn.model_selection import KFold
cv = KFold(n_splits=5)
for tr, tt in cv.split(X, y):
    model.fit(X[tr], y[tr])
    model.score(X[tt], y[tt])
# Better for time series: use the time-series CV iterator
from sklearn.model_selection import TimeSeriesSplit
cv = TimeSeriesSplit(n_splits=10)
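# Illustrative sketch (not from the original notes): with TimeSeriesSplit the training
# indices always come before the test indices, so the model never "sees the future"
for tr, tt in cv.split(X, y):
    print('train: {} ... {}  |  test: {} ... {}'.format(tr[0], tr[-1], tt[0], tt[-1]))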
# OR you could use a custom scoring function of the form scorer(estimator, X, y)
def MyFunction(estimator, X, y):
    Yhat = estimator.predict(X)
    CustomScore = MyCustomFunction(Yhat, y)  # MyCustomFunction is a placeholder metric
    return CustomScore
# A custom correlation scoring function
def CustomFunction(est, X, y):
    # Generate predictions and convert to a 1-D vector
    Yhat = est.predict(X).squeeze()
    # Use numpy's 'corrcoef' function to generate the correlation matrix
    CoefficientMatrix = np.corrcoef(Yhat, y.squeeze())
    # Return a single correlation value from the correlation matrix
    CorrelCoef = CoefficientMatrix[1, 0]
    return CorrelCoef
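# Illustrative usage (assumes the X, y and TimeSeriesSplit `cv` defined above): any callable
# with the signature scorer(estimator, X, y) can be passed as the `scoring` argument
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=cv, scoring=CustomFunction)
print(scores)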
#-----------------------------------------------------------------------
# Import ShuffleSplit and create the cross-validation object
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import r2_score
cv = ShuffleSplit(n_splits=10, random_state=1)
# Iterate through CV splits
results = []
for tr, tt in cv.split(X, y):
    # Fit the model on training data
    model.fit(X[tr], y[tr])
    # Generate predictions on the test data, score the predictions, and collect
    prediction = model.predict(X[tt])
    score = r2_score(y[tt], prediction)
    results.append((prediction, score, tt))
# Custom function to quickly visualize predictions
visualize_predictions(results)
#-----------------------------------------------------------------------
# Create KFold cross-validation object
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=False)  # random_state is only meaningful when shuffle=True
# Iterate through CV splits
results = []
for tr, tt in cv.split(X, y):
    # Fit the model on training data
    model.fit(X[tr], y[tr])
    # Generate predictions on the test data and collect
    prediction = model.predict(X[tt])
    results.append((prediction, tt))
# Custom function to quickly visualize predictions
visualize_predictions(results)
#-----------------------------------------------------------------------
# Import TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit
# Create time-series cross-validation object
cv = TimeSeriesSplit(n_splits=10)
# Iterate through CV splits
fig, ax = plt.subplots()
for ii, (tr, tt) in enumerate(cv.split(X, y)):
    # Plot the training data on each iteration, to see the behavior of the CV
    ax.plot(tr, ii + y[tr])
ax.set(title='Training data on each CV iteration', ylabel='CV iteration')
plt.show()
#-----------------------------------------------------------------------
# Stationarity
# A stationary series does not change its properties (mean, standard deviation, trend) over time
# Bootstrapping is a way to estimate the confidence in the mean of a collection of numbers
# The lower and upper percentiles represent the variability of the mean
from sklearn.utils import resample
# cv_coefficients has shape (n_cv_folds, n_coefficients)
NumberBoots = 100
BootMeans = np.zeros([NumberBoots, n_coefficients])
for ii in range(NumberBoots):
    # Generate random indices for our data with replacement, then take the sample mean
    RandomSample = resample(cv_coefficients)
    BootMeans[ii] = RandomSample.mean(axis=0)
# Compute the percentiles of choice for the bootstrapped means
Percentiles = np.percentile(BootMeans, (2.5, 97.5), axis=0)
# This is called a 95% confidence interval
def CorrelCoeff(est, X, y):
    # Return the correlation coefficient between the model prediction and a validation set
    return np.corrcoef(y, est.predict(X))[1, 0]
# Grab the date of the first index of each validation set
FirstIndices = [data.index[tt[0]] for tr, tt in cv.split(X, y)]
# Calculate the CV scores and convert them to a pandas Series
from sklearn.model_selection import cross_val_score
CVScores = cross_val_score(model, X, y, cv=cv, scoring=CorrelCoeff)
CVScores = pd.Series(CVScores, index=FirstIndices)
# Visualizing model scores as a time series
fig, axs = plt.subplots(2, 1, figsize=(10, 5), sharex=True)
# Calculate a rolling mean of scores over time and plot it
CVScoresMean = CVScores.rolling(10, min_periods=1).mean()
CVScoresMean.plot(ax=axs[0])
axs[0].set(title='Validation scores (correlation)', ylim=[0, 1])
# Plot the raw data
data.plot(ax=axs[1])
axs[1].set(title='Validation data')
#-----------------------------------------------------------------------------------------
from sklearn.utils import resample
def bootstrap_interval(data, percentiles=(2.5, 97.5), n_boots=100):
    """Bootstrap a confidence interval for the mean of columns of a 2-D dataset."""
    # Create empty array to fill the results
    bootstrap_means = np.zeros([n_boots, data.shape[-1]])
    for ii in range(n_boots):
        # Generate random indices for data *with* replacement, then take the sample mean
        random_sample = resample(data)
        bootstrap_means[ii] = random_sample.mean(axis=0)
    # Compute the percentiles of choice for the bootstrapped means
    percentiles = np.percentile(bootstrap_means, percentiles, axis=0)
    return percentiles
#-----------------------------------------------------------------------------------------
# Assess the variability of each coefficient across CV splits
# Iterate through CV splits
n_splits = 100
cv = TimeSeriesSplit(n_splits=n_splits)
# Create empty array to collect coefficients
coefficients = np.zeros([n_splits, X.shape[1]])
for ii, (tr, tt) in enumerate(cv.split(X, y)):
    # Fit the model on training data and collect the coefficients
    model.fit(X[tr], y[tr])
    coefficients[ii] = model.coef_
#-----------------------------------------------------------------------------------------
# Calculate a confidence interval around each coefficient
bootstrapped_interval = bootstrap_interval(coefficients, percentiles=(2.5, 97.5), n_boots=100)
# Plot it
fig, ax = plt.subplots()
ax.scatter(feature_names, bootstrapped_interval[0], marker='_', lw=3)
ax.scatter(feature_names, bootstrapped_interval[1], marker='_', lw=3)
ax.set(title='95% confidence interval for model coefficients')
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()
#-----------------------------------------------------------------------------------------
# Now that you've assessed the variability of each coefficient, let's do the same for the
# performance (scores) of the model. Recall that the TimeSeriesSplit object will use
# successively-later indices for each test set. This means that you can treat the scores of
# your validation as a time series, and visualize them over time to see how the model's
# performance changes.
from sklearn.model_selection import cross_val_score
# Generate scores for each split to see how the model performs over time
scores = cross_val_score(model, X, y, cv=cv, scoring=my_pearsonr)
# Convert to a Pandas Series object
scores_series = pd.Series(scores, index=times_scores, name='score')
# Bootstrap a rolling confidence interval for the mean score
from functools import partial
scores_lo = scores_series.rolling(20).aggregate(partial(bootstrap_interval, percentiles=2.5))
scores_hi = scores_series.rolling(20).aggregate(partial(bootstrap_interval, percentiles=97.5))
# Plot the results
fig, ax = plt.subplots()
scores_lo.plot(ax=ax, label="Lower confidence interval")
scores_hi.plot(ax=ax, label="Upper confidence interval")
ax.legend()
plt.show()
#-----------------------------------------------------------------------------------------
# Accounting for non-stationarity
# In this exercise, you will again visualize the variations in model scores, but now for data
# that changes its statistics over time.
# Pre-initialize window sizes
window_sizes = [25, 50, 75, 100]
# Create an empty DataFrame to collect the scores
all_scores = pd.DataFrame(index=times_scores)
# Generate scores for each split to see how the model performs over time
for window in window_sizes:
    # Create a cross-validation object using a limited lookback window
    cv = TimeSeriesSplit(n_splits=100, max_train_size=window)
    # Calculate scores across all CV splits and collect them in a DataFrame
    this_scores = cross_val_score(model, X, y, cv=cv, scoring=my_pearsonr)
    all_scores['Length {}'.format(window)] = this_scores
# Visualize the scores
ax = all_scores.rolling(10).mean().plot(cmap=plt.cm.coolwarm)
ax.set(title='Scores for multiple windows', ylabel='Correlation (r)')
plt.show()
# Further resources:
# tsfresh - a Python library for automated time-series feature extraction
# Quantopian - a platform for quantitative finance research and backtesting
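# Illustrative sketch (not from the original notes) of tsfresh's automated feature extraction;
# assumes a long-format DataFrame `ts_df` with hypothetical columns 'id' (series identifier),
# 'time' (ordering), and 'value' (measurements)
from tsfresh import extract_features
features = extract_features(ts_df, column_id='id', column_sort='time')
print(features.shape)  # one row per series id, one column per extracted feature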