324 state space predictions (#325)

* #324 create table from state space results - work in progress * tests TBA * first commit * #324 create table from state space results - with tests * Trimmed SD not implemented * #324 Trimmed SD implemented * #324 report window size to HTML * #324 WIP - needs refinement, but works for non-test. Test may blow graph generation. * #328 multiplot added as option
datasciencecampus · Sep 25, 2019 · 87412ef · 87412ef
1 parent 02f464c
commit 87412ef
Show file tree

Hide file tree

Showing 21 changed files with 796 additions and 355 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -1,2 +1,2 @@
 [run]
-omit = tests/*
+omit = algorithms/* test* utils/* vanv/* vvcode/* support.py __init__.py
diff --git a/.travis.yml b/.travis.yml
@@ -45,8 +45,7 @@ install:
   - python --version
   - python -m pip install -U pip
   - python -m easy_install -U setuptools
-  # command to install dependencies
-  #  - python setup.py install
+  # command to install dependencies; includes extra 'test' specific dependencies
   - pip install -e .[test]
 
 script:

diff --git a/README.md b/README.md
@@ -318,6 +318,11 @@ python pygrams.py -o wordcloud
 python pygrams.py -o graph
 ```
 
+Time series analysis also supports a multiplot to present up to 30 terms time series (emergent and declining), output in the `outputs/emergence` folder:
+```
+python pygrams.py -ts -dh 'publication_date' -o multiplot
+```
+
 The output options generate:
 
 - Report is a text file containing top n terms (default is 250 terms, see `-np` for more details)

diff --git a/cached/all-mdf-0.05-200501-201841/USPTO-all.sh b/cached/all-mdf-0.05-200501-201841/USPTO-all.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+python pygrams.py -ts -ei gradients -nts 5 -mpq 50 -sma kalman -dt 2018/05/31 -tsdf 2012/06/01 -tsdt 2016/06/01 --test -pns 1 2 3 4 5 6 -dh publication_date -ds USPTO-granted-lite-all.pkl.bz2
diff --git a/pygrams.py b/pygrams.py
@@ -8,7 +8,7 @@
 from scripts.utils.argschecker import ArgsChecker
 from scripts.utils.pygrams_exception import PygramsException
 
-predictor_names = ['All standard predictors', 'Naive', 'Linear', 'Quadratic', 'Cubic', 'ARIMA', 'Holt-Winters',
+predictor_names = ['All standard predictors', 'Naive', 'Linear', 'Quadratic', 'Cubic', 'ARIMA', 'Holt-Winters','ssm',
                    'LSTM-multiLA-stateful', 'LSTM-multiLA-stateless',
                    'LSTM-1LA-stateful', 'LSTM-1LA-stateless',
                    'LSTM-multiM-1LA-stateful', 'LSTM-multiM-1LA-stateless']
@@ -98,7 +98,7 @@ def get_args(command_line_arguments):
     # select outputs
 
     parser.add_argument("-o", "--output", nargs='*', default=[],
-                        choices=['graph', 'wordcloud'],  # suppress table output option
+                        choices=['graph', 'wordcloud', 'multiplot'],  # suppress table output option
                         help="Note that this can be defined multiple times to get more than one output. ")
 
     # file names etc.
@@ -136,7 +136,7 @@ def get_args(command_line_arguments):
 
     parser.add_argument("-ei", "--emergence-index", default='porter', choices=('porter', 'quadratic', 'gradients'),
                         help="Emergence calculation to use")
-    parser.add_argument("-sma", "--smoothing-alg", default=None, choices=('kalman', 'savgol'),
+    parser.add_argument("-sma", "--smoothing-alg", default='savgol', choices=('kalman', 'savgol'),
                         help="Time series smoothing to use")
 
     parser.add_argument("-exp", "--exponential_fitting", default=False, action="store_true",
@@ -196,7 +196,7 @@ def main(supplied_args):
     # emtech integration
     if args.timeseries:
         if 0 in args.predictor_names:
-            algs_codes = list(range(1, 6))
+            algs_codes = list(range(1, 7))
         else:
             algs_codes = args.predictor_names
 
@@ -216,21 +216,21 @@ def main(supplied_args):
             title += f' ({emergence})'
 
             html_results, training_values = pipeline.run(predictors_to_run, normalized=args.normalised,
-                                                                train_test=args.test, emergence=emergence)
-            if training_values is None:
-                continue
-            # save training_values to csv file
-            #
-            # training_values:                                  csv file:
-            # {'term1': [0,2,4,6], 'term2': [2,4,1,3]}          'term1', 0, 2, 4, 6
-            #                                                   'term2', 2, 4, 1, 3
-            #
-            filename = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv')
-            with open(filename, 'w') as f:
-                w = csv.writer(f)
-                for key, values in training_values:
-                    my_list = ["'" + str(key) + "'"] + values
-                    w.writerow(my_list)
+                                                         train_test=args.test, emergence=emergence)
+            if training_values is not None:
+                # save training_values to csv file
+                #
+                # training_values:                                  csv file:
+                # {'term1': [0,2,4,6], 'term2': [2,4,1,3]}          'term1', 0, 2, 4, 6
+                #                                                   'term2', 2, 4, 1, 3
+                #
+                filename = os.path.join('outputs', 'emergence',
+                                        args.outputs_name + '_' + emergence + '_time_series.csv')
+                with open(filename, 'w') as f:
+                    w = csv.writer(f)
+                    for key, values in training_values:
+                        my_list = ["'" + str(key) + "'"] + values
+                        w.writerow(my_list)
 
             html_doc = f'''<!DOCTYPE html>
                 <html lang="en">

diff --git a/scripts/algorithms/code/ssm.py b/scripts/algorithms/code/ssm.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pandas as pd
-from math import sqrt
 from scipy.optimize import minimize
 from itertools import product
 
@@ -15,6 +14,38 @@ def expand_grid(dictionary):
         return pd.DataFrame([row for row in product(*dictionary.values())],
                             columns=dictionary.keys())
 
+    def forecast(self, dkf_in, k):
+
+        A = dkf_in['Alast'].values[0]
+        P = dkf_in['Plast'].values[0]
+        TT = dkf_in['TT'].values[0]
+        Z = dkf_in['Z'].values[0]
+        ncA0 = dkf_in['ncA0'].values[0]
+        gamma = dkf_in['gamma_est'].values[0]
+        mse_gamma = dkf_in['mse_gamma'].values[0]
+        sigma2 = dkf_in['sigma2'].values[0]
+        ncolZ = Z.shape[1]
+
+        Agamma = A[:, :ncA0 - 1]
+        alpha_hat     = np.matrix(np.vstack([-999.0]*ncolZ * k).reshape(ncolZ, k))
+        mse_alpha_hat = np.matrix(np.vstack([-999.0]*ncolZ*ncolZ * k).reshape(ncolZ, ncolZ * k))
+
+        aux = np.vstack([-gamma, 1]).reshape(ncA0, 1)
+        alpha = np.matmul(A,  aux)
+        alpha_hat[: , 0] = alpha
+
+        m1 = np.matmul(Agamma, mse_gamma)
+        m2 = np.matmul(m1, np.transpose(Agamma))
+        msealpha = (sigma2 * P) + m2
+
+        mse_alpha_hat[:, 0:ncolZ] = msealpha
+        if k > 1:
+            for i in range(1,k):
+                alpha_hat[:, i] = TT * alpha
+                mse_alpha_hat[: , ((i - 1) * ncolZ ): (ncolZ * i)] = TT * msealpha * TT.transpose()
+                TT = np.matmul(TT,  TT)
+            return alpha_hat, mse_alpha_hat
+        return None, None
 
     # Calculating and smoothing quarterly timeseries:   1%|          | 1.08k/100k [23:31<36:43:32, 1.34s/term]
     def param_estimator(self, sigma_gnu, sigma_eta, delta):
@@ -220,6 +251,11 @@ def smfilt(self, dkf_out):
 
         return alphahat, mse_alphahat
 
+    def run_smooth_forecast(self, sigma_gnu=[0.001, 0.01, 0.1], sigma_eta=[0.001, 0.01, 0.1], delta=[0.5, 0.9], k=5):
+        opt_param = self.param_estimator(sigma_gnu, sigma_eta, delta)
+        dfk_out = self.dfk_llm_vard(opt_param)
+        return self.forecast(dfk_out, k)
+
     def run_smoothing(self, sigma_gnu=[0.001, 0.01, 0.1], sigma_eta=[0.001, 0.01, 0.1], delta=[0.5, 0.9]):
         opt_param = self.param_estimator(sigma_gnu, sigma_eta, delta)
         dfk_out = self.dfk_llm_vard(opt_param)

diff --git a/scripts/algorithms/holtwinters_predictor.py b/scripts/algorithms/holtwinters_predictor.py
@@ -10,7 +10,7 @@ def __init__(self, data_in, num_prediction_periods):
         self.__num_prediction_periods = num_prediction_periods
 
         y = np.array(data_in).reshape(-1, 1)
-        y = y + 0.000001  # HoltWinters doesn't like zeros
+        y = np.clip(y, 0.00001, None)  # HoltWinters doesn't like zeros
 
         self.__model = Holt(y, exponential=True, damped=True)
         self.__results = self.__model.fit(optimized=True)

diff --git a/scripts/algorithms/predictor_factory.py b/scripts/algorithms/predictor_factory.py
@@ -8,6 +8,7 @@
     LstmForecasterMultipleModelSingleLookAhead
 from scripts.algorithms.naive_predictor import NaivePredictor
 from scripts.algorithms.polynomial_predictor import PolynomialPredictor
+from scripts.algorithms.state_space import StateSpaceModelObject
 
 
 class PredictorFactory(object):
@@ -50,7 +51,7 @@ def predictor_factory(predictor_name, data_name, training_values, num_prediction
             return PolynomialPredictor(training_values, num_prediction_periods, degree=3)
         elif predictor_name == 'Holt-Winters':
             return HoltWintersPredictor(training_values, num_prediction_periods)
-        # elif predictor_name == 'state-space':
-        #     return StateSpaceModel(training_values, num_prediction_periods)
+        elif predictor_name == 'ssm':
+            return StateSpaceModelObject(training_values, num_prediction_periods)
         else:
             raise ValueError('Unknown predictor: ' + predictor_name)
diff --git a/scripts/algorithms/state_space.py b/scripts/algorithms/state_space.py
@@ -1,41 +1,24 @@
-# import csv
-# import os
-# import rpy2.robjects as robjects
-#
-#
-# class StateSpaceModel(object):
-#
-#     def __init__(self, data_in, num_prediction_periods):
-#         if not all(isinstance(x, float) for x in data_in):
-#             raise ValueError('Time series must be all float values')
-#
-#         self.__history = data_in
-#         self.__num_prediction_periods = num_prediction_periods
-#
-#     @property
-#     def configuration(self):
-#         return None
-#
-#     def predict_counts(self):
-#         cwd = os.getcwd()
-#         rwd = robjects.r('getwd()')
-#
-#         if '/scripts/algorithms/code' not in rwd[0]:
-#             wd = '''setwd(''' + '\'' + cwd + '''/scripts/algorithms/code')'''
-#             robjects.r(wd)
-#         rwd = robjects.r('getwd()')
-#         output_path = rwd[0] + '/buffer.csv'
-#         print("path: "+output_path)
-#
-#         with open(output_path, "w") as file:
-#             writer = csv.writer(file, delimiter='\n')
-#             writer.writerow(self.__history)
-#
-#         robjects.r('''
-#                source('predict')
-#         ''')
-#
-#         r_func = robjects.globalenv['predict']
-#         out = r_func("buffer.csv",self.__num_prediction_periods)
-#         os.chdir(cwd)
-#         return out[0]
+import numpy as np
+
+from scripts.algorithms.code.ssm import StateSpaceModel
+
+
+class StateSpaceModelObject(object):
+
+    def __init__(self, data_in, num_prediction_periods):
+        if not all(isinstance(x, float) for x in data_in):
+            raise ValueError('Time series must be all float values')
+
+        self.__history = data_in
+        self.__num_prediction_periods = num_prediction_periods
+
+        self.__alpha, self.__mse = StateSpaceModel(self.__history).run_smooth_forecast(k=self.__num_prediction_periods)
+    @property
+    def configuration(self):
+        return None
+
+    def predict_counts(self):
+        return np.array(self.__alpha[0])[0]
+
+    def predict_derivatives(self):
+        return np.array(self.__alpha[1])[0]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		#!/usr/bin/env bash
		python pygrams.py -ts -ei gradients -nts 5 -mpq 50 -sma kalman -dt 2018/05/31 -tsdf 2012/06/01 -tsdt 2016/06/01 --test -pns 1 2 3 4 5 6 -dh publication_date -ds USPTO-granted-lite-all.pkl.bz2