Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

324 state space predictions #325

Merged
merged 43 commits into from
Sep 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
de318b2
review comments
thanasions Sep 16, 2019
47e361d
review comments
thanasions Sep 16, 2019
abe5916
cache tidy-up
thanasions Sep 16, 2019
4d98c59
tests pending #319
thanasions Sep 17, 2019
d5d05d3
Merge branch 'develop' of https://github.com/datasciencecampus/pyGram…
thanasions Sep 17, 2019
77a27ce
switch to smooth line predictions
thanasions Sep 17, 2019
6065e25
Merge branch 'develop' of https://github.com/datasciencecampus/pyGram…
thanasions Sep 17, 2019
5ce0f0c
first commit
thanasions Sep 18, 2019
01d45a3
#324 create table from state space results - work in progress
Sep 18, 2019
937d7c1
first commit
thanasions Sep 19, 2019
a2e0bc9
#324 create table from state space results - with tests
Sep 19, 2019
a1aef5a
#324 Trimmed SD implemented
Sep 19, 2019
9eb5694
Merge branch '324_state-space-predictions' of https://github.com/data…
thanasions Sep 19, 2019
931a892
#324 ...actually install test dependencies now they've been defined...
Sep 19, 2019
da3929c
dictionary with state space test run
thanasions Sep 19, 2019
cf36f85
#324 ...actually install test dependencies in AppVeyor...
Sep 19, 2019
f68f834
Merge remote-tracking branch 'origin/324_state-space-predictions' int…
thanasions Sep 19, 2019
d2c39da
#324 `run()` now calls state space model prediction rather than ARIMA…
Sep 19, 2019
d90b3ab
#324 changed API so extraction of prediction lengths is hidden and te…
Sep 19, 2019
8209ee9
dictionary with state space test run
thanasions Sep 19, 2019
b022eec
Merge branch '324_state-space-predictions' of https://github.com/data…
thanasions Sep 19, 2019
1a85cd8
#324 changed API so window size is passed in, reworked for new data s…
Sep 20, 2019
e1ce77e
Merge branch 'develop' into 324_state-space-predictions
IanGrimstead Sep 20, 2019
d00aab5
#324 SSM now scans look-ahead (k) in range 2..5
Sep 20, 2019
673c7fe
Merge branch '324_state-space-predictions' of https://github.com/data…
Sep 20, 2019
944da18
dictionary with state space test run
thanasions Sep 20, 2019
1b853e7
#324 report window size to HTML
Sep 20, 2019
a4d9430
Merge branch '324_state-space-predictions' of https://github.com/data…
Sep 20, 2019
54976b1
#324 WIP - needs refinement, but works for non-test. Test may blow gr…
Sep 20, 2019
aba5603
#324 corrected SD table
Sep 20, 2019
ed06c20
dictionary with state space test run
thanasions Sep 21, 2019
70b4e5b
ssm in comparison with others
thanasions Sep 22, 2019
04a4063
#324 corrected tests, removed unused code
Sep 23, 2019
632e9d7
simple labelling experients
thanasions Sep 24, 2019
93ca894
Merge remote-tracking branch 'origin/324_state-space-predictions' int…
thanasions Sep 24, 2019
ecdefb7
simple labelling experients
thanasions Sep 24, 2019
4c12296
simple labelling experients
thanasions Sep 24, 2019
20d6241
Merge branch 'develop' into 324_state-space-predictions
IanGrimstead Sep 25, 2019
55939ce
#328 SSM html table tests fixed
Sep 25, 2019
3320a51
Merge branch 'develop' into 324_state-space-predictions
IanGrimstead Sep 25, 2019
10b53ad
simple labelling experients
thanasions Sep 25, 2019
e8cc1aa
#328 multiplot added as option
Sep 25, 2019
2c311da
Merge remote-tracking branch 'origin/324_state-space-predictions' int…
Sep 25, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[run]
omit = tests/*
omit = algorithms/* test* utils/* vanv/* vvcode/* support.py __init__.py
3 changes: 1 addition & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ install:
- python --version
- python -m pip install -U pip
- python -m easy_install -U setuptools
# command to install dependencies
# - python setup.py install
# command to install dependencies; includes extra 'test' specific dependencies
- pip install -e .[test]

script:
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,11 @@ python pygrams.py -o wordcloud
python pygrams.py -o graph
```

Time series analysis also supports a multiplot to present up to 30 terms time series (emergent and declining), output in the `outputs/emergence` folder:
```
python pygrams.py -ts -dh 'publication_date' -o multiplot
```

The output options generate:

- Report is a text file containing top n terms (default is 250 terms, see `-np` for more details)
Expand Down
2 changes: 2 additions & 0 deletions cached/all-mdf-0.05-200501-201841/USPTO-all.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
python pygrams.py -ts -ei gradients -nts 5 -mpq 50 -sma kalman -dt 2018/05/31 -tsdf 2012/06/01 -tsdt 2016/06/01 --test -pns 1 2 3 4 5 6 -dh publication_date -ds USPTO-granted-lite-all.pkl.bz2
38 changes: 19 additions & 19 deletions pygrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from scripts.utils.argschecker import ArgsChecker
from scripts.utils.pygrams_exception import PygramsException

predictor_names = ['All standard predictors', 'Naive', 'Linear', 'Quadratic', 'Cubic', 'ARIMA', 'Holt-Winters',
predictor_names = ['All standard predictors', 'Naive', 'Linear', 'Quadratic', 'Cubic', 'ARIMA', 'Holt-Winters','ssm',
'LSTM-multiLA-stateful', 'LSTM-multiLA-stateless',
'LSTM-1LA-stateful', 'LSTM-1LA-stateless',
'LSTM-multiM-1LA-stateful', 'LSTM-multiM-1LA-stateless']
Expand Down Expand Up @@ -98,7 +98,7 @@ def get_args(command_line_arguments):
# select outputs

parser.add_argument("-o", "--output", nargs='*', default=[],
choices=['graph', 'wordcloud'], # suppress table output option
choices=['graph', 'wordcloud', 'multiplot'], # suppress table output option
help="Note that this can be defined multiple times to get more than one output. ")

# file names etc.
Expand Down Expand Up @@ -136,7 +136,7 @@ def get_args(command_line_arguments):

parser.add_argument("-ei", "--emergence-index", default='porter', choices=('porter', 'quadratic', 'gradients'),
help="Emergence calculation to use")
parser.add_argument("-sma", "--smoothing-alg", default=None, choices=('kalman', 'savgol'),
parser.add_argument("-sma", "--smoothing-alg", default='savgol', choices=('kalman', 'savgol'),
help="Time series smoothing to use")

parser.add_argument("-exp", "--exponential_fitting", default=False, action="store_true",
Expand Down Expand Up @@ -196,7 +196,7 @@ def main(supplied_args):
# emtech integration
if args.timeseries:
if 0 in args.predictor_names:
algs_codes = list(range(1, 6))
algs_codes = list(range(1, 7))
else:
algs_codes = args.predictor_names

Expand All @@ -216,21 +216,21 @@ def main(supplied_args):
title += f' ({emergence})'

html_results, training_values = pipeline.run(predictors_to_run, normalized=args.normalised,
train_test=args.test, emergence=emergence)
if training_values is None:
continue
# save training_values to csv file
#
# training_values: csv file:
# {'term1': [0,2,4,6], 'term2': [2,4,1,3]} 'term1', 0, 2, 4, 6
# 'term2', 2, 4, 1, 3
#
filename = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv')
with open(filename, 'w') as f:
w = csv.writer(f)
for key, values in training_values:
my_list = ["'" + str(key) + "'"] + values
w.writerow(my_list)
train_test=args.test, emergence=emergence)
if training_values is not None:
# save training_values to csv file
#
# training_values: csv file:
# {'term1': [0,2,4,6], 'term2': [2,4,1,3]} 'term1', 0, 2, 4, 6
# 'term2', 2, 4, 1, 3
#
filename = os.path.join('outputs', 'emergence',
args.outputs_name + '_' + emergence + '_time_series.csv')
with open(filename, 'w') as f:
w = csv.writer(f)
for key, values in training_values:
my_list = ["'" + str(key) + "'"] + values
w.writerow(my_list)

html_doc = f'''<!DOCTYPE html>
<html lang="en">
Expand Down
38 changes: 37 additions & 1 deletion scripts/algorithms/code/ssm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
import pandas as pd
from math import sqrt
from scipy.optimize import minimize
from itertools import product

Expand All @@ -15,6 +14,38 @@ def expand_grid(dictionary):
return pd.DataFrame([row for row in product(*dictionary.values())],
columns=dictionary.keys())

def forecast(self, dkf_in, k):

A = dkf_in['Alast'].values[0]
P = dkf_in['Plast'].values[0]
TT = dkf_in['TT'].values[0]
Z = dkf_in['Z'].values[0]
ncA0 = dkf_in['ncA0'].values[0]
gamma = dkf_in['gamma_est'].values[0]
mse_gamma = dkf_in['mse_gamma'].values[0]
sigma2 = dkf_in['sigma2'].values[0]
ncolZ = Z.shape[1]

Agamma = A[:, :ncA0 - 1]
alpha_hat = np.matrix(np.vstack([-999.0]*ncolZ * k).reshape(ncolZ, k))
mse_alpha_hat = np.matrix(np.vstack([-999.0]*ncolZ*ncolZ * k).reshape(ncolZ, ncolZ * k))

aux = np.vstack([-gamma, 1]).reshape(ncA0, 1)
alpha = np.matmul(A, aux)
alpha_hat[: , 0] = alpha

m1 = np.matmul(Agamma, mse_gamma)
m2 = np.matmul(m1, np.transpose(Agamma))
msealpha = (sigma2 * P) + m2

mse_alpha_hat[:, 0:ncolZ] = msealpha
if k > 1:
for i in range(1,k):
alpha_hat[:, i] = TT * alpha
mse_alpha_hat[: , ((i - 1) * ncolZ ): (ncolZ * i)] = TT * msealpha * TT.transpose()
TT = np.matmul(TT, TT)
return alpha_hat, mse_alpha_hat
return None, None

# Calculating and smoothing quarterly timeseries: 1%| | 1.08k/100k [23:31<36:43:32, 1.34s/term]
def param_estimator(self, sigma_gnu, sigma_eta, delta):
Expand Down Expand Up @@ -220,6 +251,11 @@ def smfilt(self, dkf_out):

return alphahat, mse_alphahat

def run_smooth_forecast(self, sigma_gnu=[0.001, 0.01, 0.1], sigma_eta=[0.001, 0.01, 0.1], delta=[0.5, 0.9], k=5):
opt_param = self.param_estimator(sigma_gnu, sigma_eta, delta)
dfk_out = self.dfk_llm_vard(opt_param)
return self.forecast(dfk_out, k)

def run_smoothing(self, sigma_gnu=[0.001, 0.01, 0.1], sigma_eta=[0.001, 0.01, 0.1], delta=[0.5, 0.9]):
opt_param = self.param_estimator(sigma_gnu, sigma_eta, delta)
dfk_out = self.dfk_llm_vard(opt_param)
Expand Down
2 changes: 1 addition & 1 deletion scripts/algorithms/holtwinters_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def __init__(self, data_in, num_prediction_periods):
self.__num_prediction_periods = num_prediction_periods

y = np.array(data_in).reshape(-1, 1)
y = y + 0.000001 # HoltWinters doesn't like zeros
y = np.clip(y, 0.00001, None) # HoltWinters doesn't like zeros

self.__model = Holt(y, exponential=True, damped=True)
self.__results = self.__model.fit(optimized=True)
Expand Down
5 changes: 3 additions & 2 deletions scripts/algorithms/predictor_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
LstmForecasterMultipleModelSingleLookAhead
from scripts.algorithms.naive_predictor import NaivePredictor
from scripts.algorithms.polynomial_predictor import PolynomialPredictor
from scripts.algorithms.state_space import StateSpaceModelObject


class PredictorFactory(object):
Expand Down Expand Up @@ -50,7 +51,7 @@ def predictor_factory(predictor_name, data_name, training_values, num_prediction
return PolynomialPredictor(training_values, num_prediction_periods, degree=3)
elif predictor_name == 'Holt-Winters':
return HoltWintersPredictor(training_values, num_prediction_periods)
# elif predictor_name == 'state-space':
# return StateSpaceModel(training_values, num_prediction_periods)
elif predictor_name == 'ssm':
return StateSpaceModelObject(training_values, num_prediction_periods)
else:
raise ValueError('Unknown predictor: ' + predictor_name)
65 changes: 24 additions & 41 deletions scripts/algorithms/state_space.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,24 @@
# import csv
# import os
# import rpy2.robjects as robjects
#
#
# class StateSpaceModel(object):
#
# def __init__(self, data_in, num_prediction_periods):
# if not all(isinstance(x, float) for x in data_in):
# raise ValueError('Time series must be all float values')
#
# self.__history = data_in
# self.__num_prediction_periods = num_prediction_periods
#
# @property
# def configuration(self):
# return None
#
# def predict_counts(self):
# cwd = os.getcwd()
# rwd = robjects.r('getwd()')
#
# if '/scripts/algorithms/code' not in rwd[0]:
# wd = '''setwd(''' + '\'' + cwd + '''/scripts/algorithms/code')'''
# robjects.r(wd)
# rwd = robjects.r('getwd()')
# output_path = rwd[0] + '/buffer.csv'
# print("path: "+output_path)
#
# with open(output_path, "w") as file:
# writer = csv.writer(file, delimiter='\n')
# writer.writerow(self.__history)
#
# robjects.r('''
# source('predict')
# ''')
#
# r_func = robjects.globalenv['predict']
# out = r_func("buffer.csv",self.__num_prediction_periods)
# os.chdir(cwd)
# return out[0]
import numpy as np

from scripts.algorithms.code.ssm import StateSpaceModel


class StateSpaceModelObject(object):

def __init__(self, data_in, num_prediction_periods):
if not all(isinstance(x, float) for x in data_in):
raise ValueError('Time series must be all float values')

self.__history = data_in
self.__num_prediction_periods = num_prediction_periods

self.__alpha, self.__mse = StateSpaceModel(self.__history).run_smooth_forecast(k=self.__num_prediction_periods)
@property
def configuration(self):
return None

def predict_counts(self):
return np.array(self.__alpha[0])[0]

def predict_derivatives(self):
return np.array(self.__alpha[1])[0]
Loading