Skip to content

Commit

Permalink
324 state space predictions (#325)
Browse files Browse the repository at this point in the history
* #324 create table from state space results - work in progress
* tests TBA

* first commit

* #324 create table from state space results - with tests
* Trimmed SD not implemented

* #324 Trimmed SD implemented

* #324 report window size to HTML

* #324 WIP - needs refinement, but works for non-test. Test may blow graph generation.

* #328 multiplot added as option
  • Loading branch information
IanGrimstead authored and thanasions committed Sep 25, 2019
1 parent 02f464c commit 87412ef
Show file tree
Hide file tree
Showing 21 changed files with 796 additions and 355 deletions.
2 changes: 1 addition & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[run]
omit = tests/*
omit = algorithms/* test* utils/* vanv/* vvcode/* support.py __init__.py
3 changes: 1 addition & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ install:
- python --version
- python -m pip install -U pip
- python -m easy_install -U setuptools
# command to install dependencies
# - python setup.py install
# command to install dependencies; includes extra 'test' specific dependencies
- pip install -e .[test]

script:
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,11 @@ python pygrams.py -o wordcloud
python pygrams.py -o graph
```

Time series analysis also supports a multiplot to present up to 30 terms time series (emergent and declining), output in the `outputs/emergence` folder:
```
python pygrams.py -ts -dh 'publication_date' -o multiplot
```

The output options generate:

- Report is a text file containing top n terms (default is 250 terms, see `-np` for more details)
Expand Down
2 changes: 2 additions & 0 deletions cached/all-mdf-0.05-200501-201841/USPTO-all.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
python pygrams.py -ts -ei gradients -nts 5 -mpq 50 -sma kalman -dt 2018/05/31 -tsdf 2012/06/01 -tsdt 2016/06/01 --test -pns 1 2 3 4 5 6 -dh publication_date -ds USPTO-granted-lite-all.pkl.bz2
38 changes: 19 additions & 19 deletions pygrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from scripts.utils.argschecker import ArgsChecker
from scripts.utils.pygrams_exception import PygramsException

predictor_names = ['All standard predictors', 'Naive', 'Linear', 'Quadratic', 'Cubic', 'ARIMA', 'Holt-Winters',
predictor_names = ['All standard predictors', 'Naive', 'Linear', 'Quadratic', 'Cubic', 'ARIMA', 'Holt-Winters','ssm',
'LSTM-multiLA-stateful', 'LSTM-multiLA-stateless',
'LSTM-1LA-stateful', 'LSTM-1LA-stateless',
'LSTM-multiM-1LA-stateful', 'LSTM-multiM-1LA-stateless']
Expand Down Expand Up @@ -98,7 +98,7 @@ def get_args(command_line_arguments):
# select outputs

parser.add_argument("-o", "--output", nargs='*', default=[],
choices=['graph', 'wordcloud'], # suppress table output option
choices=['graph', 'wordcloud', 'multiplot'], # suppress table output option
help="Note that this can be defined multiple times to get more than one output. ")

# file names etc.
Expand Down Expand Up @@ -136,7 +136,7 @@ def get_args(command_line_arguments):

parser.add_argument("-ei", "--emergence-index", default='porter', choices=('porter', 'quadratic', 'gradients'),
help="Emergence calculation to use")
parser.add_argument("-sma", "--smoothing-alg", default=None, choices=('kalman', 'savgol'),
parser.add_argument("-sma", "--smoothing-alg", default='savgol', choices=('kalman', 'savgol'),
help="Time series smoothing to use")

parser.add_argument("-exp", "--exponential_fitting", default=False, action="store_true",
Expand Down Expand Up @@ -196,7 +196,7 @@ def main(supplied_args):
# emtech integration
if args.timeseries:
if 0 in args.predictor_names:
algs_codes = list(range(1, 6))
algs_codes = list(range(1, 7))
else:
algs_codes = args.predictor_names

Expand All @@ -216,21 +216,21 @@ def main(supplied_args):
title += f' ({emergence})'

html_results, training_values = pipeline.run(predictors_to_run, normalized=args.normalised,
train_test=args.test, emergence=emergence)
if training_values is None:
continue
# save training_values to csv file
#
# training_values: csv file:
# {'term1': [0,2,4,6], 'term2': [2,4,1,3]} 'term1', 0, 2, 4, 6
# 'term2', 2, 4, 1, 3
#
filename = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv')
with open(filename, 'w') as f:
w = csv.writer(f)
for key, values in training_values:
my_list = ["'" + str(key) + "'"] + values
w.writerow(my_list)
train_test=args.test, emergence=emergence)
if training_values is not None:
# save training_values to csv file
#
# training_values: csv file:
# {'term1': [0,2,4,6], 'term2': [2,4,1,3]} 'term1', 0, 2, 4, 6
# 'term2', 2, 4, 1, 3
#
filename = os.path.join('outputs', 'emergence',
args.outputs_name + '_' + emergence + '_time_series.csv')
with open(filename, 'w') as f:
w = csv.writer(f)
for key, values in training_values:
my_list = ["'" + str(key) + "'"] + values
w.writerow(my_list)

html_doc = f'''<!DOCTYPE html>
<html lang="en">
Expand Down
38 changes: 37 additions & 1 deletion scripts/algorithms/code/ssm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
import pandas as pd
from math import sqrt
from scipy.optimize import minimize
from itertools import product

Expand All @@ -15,6 +14,38 @@ def expand_grid(dictionary):
return pd.DataFrame([row for row in product(*dictionary.values())],
columns=dictionary.keys())

def forecast(self, dkf_in, k):

A = dkf_in['Alast'].values[0]
P = dkf_in['Plast'].values[0]
TT = dkf_in['TT'].values[0]
Z = dkf_in['Z'].values[0]
ncA0 = dkf_in['ncA0'].values[0]
gamma = dkf_in['gamma_est'].values[0]
mse_gamma = dkf_in['mse_gamma'].values[0]
sigma2 = dkf_in['sigma2'].values[0]
ncolZ = Z.shape[1]

Agamma = A[:, :ncA0 - 1]
alpha_hat = np.matrix(np.vstack([-999.0]*ncolZ * k).reshape(ncolZ, k))
mse_alpha_hat = np.matrix(np.vstack([-999.0]*ncolZ*ncolZ * k).reshape(ncolZ, ncolZ * k))

aux = np.vstack([-gamma, 1]).reshape(ncA0, 1)
alpha = np.matmul(A, aux)
alpha_hat[: , 0] = alpha

m1 = np.matmul(Agamma, mse_gamma)
m2 = np.matmul(m1, np.transpose(Agamma))
msealpha = (sigma2 * P) + m2

mse_alpha_hat[:, 0:ncolZ] = msealpha
if k > 1:
for i in range(1,k):
alpha_hat[:, i] = TT * alpha
mse_alpha_hat[: , ((i - 1) * ncolZ ): (ncolZ * i)] = TT * msealpha * TT.transpose()
TT = np.matmul(TT, TT)
return alpha_hat, mse_alpha_hat
return None, None

# Calculating and smoothing quarterly timeseries: 1%| | 1.08k/100k [23:31<36:43:32, 1.34s/term]
def param_estimator(self, sigma_gnu, sigma_eta, delta):
Expand Down Expand Up @@ -220,6 +251,11 @@ def smfilt(self, dkf_out):

return alphahat, mse_alphahat

def run_smooth_forecast(self, sigma_gnu=[0.001, 0.01, 0.1], sigma_eta=[0.001, 0.01, 0.1], delta=[0.5, 0.9], k=5):
opt_param = self.param_estimator(sigma_gnu, sigma_eta, delta)
dfk_out = self.dfk_llm_vard(opt_param)
return self.forecast(dfk_out, k)

def run_smoothing(self, sigma_gnu=[0.001, 0.01, 0.1], sigma_eta=[0.001, 0.01, 0.1], delta=[0.5, 0.9]):
opt_param = self.param_estimator(sigma_gnu, sigma_eta, delta)
dfk_out = self.dfk_llm_vard(opt_param)
Expand Down
2 changes: 1 addition & 1 deletion scripts/algorithms/holtwinters_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def __init__(self, data_in, num_prediction_periods):
self.__num_prediction_periods = num_prediction_periods

y = np.array(data_in).reshape(-1, 1)
y = y + 0.000001 # HoltWinters doesn't like zeros
y = np.clip(y, 0.00001, None) # HoltWinters doesn't like zeros

self.__model = Holt(y, exponential=True, damped=True)
self.__results = self.__model.fit(optimized=True)
Expand Down
5 changes: 3 additions & 2 deletions scripts/algorithms/predictor_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
LstmForecasterMultipleModelSingleLookAhead
from scripts.algorithms.naive_predictor import NaivePredictor
from scripts.algorithms.polynomial_predictor import PolynomialPredictor
from scripts.algorithms.state_space import StateSpaceModelObject


class PredictorFactory(object):
Expand Down Expand Up @@ -50,7 +51,7 @@ def predictor_factory(predictor_name, data_name, training_values, num_prediction
return PolynomialPredictor(training_values, num_prediction_periods, degree=3)
elif predictor_name == 'Holt-Winters':
return HoltWintersPredictor(training_values, num_prediction_periods)
# elif predictor_name == 'state-space':
# return StateSpaceModel(training_values, num_prediction_periods)
elif predictor_name == 'ssm':
return StateSpaceModelObject(training_values, num_prediction_periods)
else:
raise ValueError('Unknown predictor: ' + predictor_name)
65 changes: 24 additions & 41 deletions scripts/algorithms/state_space.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,24 @@
# import csv
# import os
# import rpy2.robjects as robjects
#
#
# class StateSpaceModel(object):
#
# def __init__(self, data_in, num_prediction_periods):
# if not all(isinstance(x, float) for x in data_in):
# raise ValueError('Time series must be all float values')
#
# self.__history = data_in
# self.__num_prediction_periods = num_prediction_periods
#
# @property
# def configuration(self):
# return None
#
# def predict_counts(self):
# cwd = os.getcwd()
# rwd = robjects.r('getwd()')
#
# if '/scripts/algorithms/code' not in rwd[0]:
# wd = '''setwd(''' + '\'' + cwd + '''/scripts/algorithms/code')'''
# robjects.r(wd)
# rwd = robjects.r('getwd()')
# output_path = rwd[0] + '/buffer.csv'
# print("path: "+output_path)
#
# with open(output_path, "w") as file:
# writer = csv.writer(file, delimiter='\n')
# writer.writerow(self.__history)
#
# robjects.r('''
# source('predict')
# ''')
#
# r_func = robjects.globalenv['predict']
# out = r_func("buffer.csv",self.__num_prediction_periods)
# os.chdir(cwd)
# return out[0]
import numpy as np

from scripts.algorithms.code.ssm import StateSpaceModel


class StateSpaceModelObject(object):

def __init__(self, data_in, num_prediction_periods):
if not all(isinstance(x, float) for x in data_in):
raise ValueError('Time series must be all float values')

self.__history = data_in
self.__num_prediction_periods = num_prediction_periods

self.__alpha, self.__mse = StateSpaceModel(self.__history).run_smooth_forecast(k=self.__num_prediction_periods)
@property
def configuration(self):
return None

def predict_counts(self):
return np.array(self.__alpha[0])[0]

def predict_derivatives(self):
return np.array(self.__alpha[1])[0]
Loading

0 comments on commit 87412ef

Please sign in to comment.