Skip to content

Commit

Permalink
#315 clamp smoothed values at 0
Browse files Browse the repository at this point in the history
* cast smoothed data back to lists (from numpy arrays) for consistency
* command line args now restricted to available smoothing and emergence
* added simple test for Holt-Winters to confirm that negative values are not handled
  • Loading branch information
IanGrimstead authored and IanGrimstead committed Sep 17, 2019
1 parent 44e067c commit d764cfb
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 22 deletions.
6 changes: 4 additions & 2 deletions pygrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,10 @@ def get_args(command_line_arguments):
parser.add_argument("-stp", "--steps_ahead", type=int, default=5,
help="number of steps ahead to analyse for")

parser.add_argument("-ei", "--emergence-index", default='porter', help="options are: porter, quadratic, gradients")
parser.add_argument("-sma", "--smoothing-alg", default=None, help="options are: kalman, savgol")
# NOTE(review): argparse interpolates the help string with %-formatting, so the
# placeholder must be "%(default)s"; a bare "%(default)" raises ValueError when
# --help is rendered. Fixed below.
parser.add_argument("-ei", "--emergence-index", default='porter', choices=('porter', 'quadratic', 'gradients'),
                    help="Emergence calculation to use (default: %(default)s)")
parser.add_argument("-sma", "--smoothing-alg", default=None, choices=('kalman', 'savgol'),
                    help="Time series smoothing to use (default: %(default)s)")

parser.add_argument("-exp", "--exponential_fitting", default=False, action="store_true",
help="analyse using exponential type fit or not")
Expand Down
49 changes: 29 additions & 20 deletions scripts/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from os import path

import numpy as np
from scipy.signal import savgol_filter
from tqdm import tqdm

Expand Down Expand Up @@ -155,9 +156,10 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range

# TODO: offer timeseries cache as an option. Then filter dates and terms after reading the cached matrix
print(f'Creating timeseries matrix...')
if cached_folder_name is None or not (path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
if cached_folder_name is None or not (
path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
[self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
self.__weekly_iso_dates] = self.__timeseries_data
Expand Down Expand Up @@ -191,8 +193,8 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
min_date = self.__timeseries_date_dict['from']
max_date = self.__timeseries_date_dict['to']

min_i=0
max_i= len(all_quarters)
min_i = 0
max_i = len(all_quarters)

for i, quarter in enumerate(all_quarters):
if min_date is not None and min_date < quarter:
Expand All @@ -203,7 +205,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
if max_date is not None and max_date < quarter:
break
max_i = i
self.__lims=[min_i, max_i]
self.__lims = [min_i, max_i]
self.__timeseries_quarterly_smoothed = None if sma is None else []

for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term',
Expand All @@ -217,30 +219,37 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
self.__timeseries_quarterly.append(quarterly_values)

if emergence_index == 'gradients' or sma == 'kalman':
if cached_folder_name is None or not (path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
if cached_folder_name is None or not (
path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
for term_index, quarterly_values in tqdm(enumerate(self.__timeseries_quarterly), unit='term',
desc='smoothing quarterly timeseries with kalman filter',
leave=False, unit_scale=True, total=len(self.__timeseries_quarterly)):
desc='smoothing quarterly timeseries with kalman filter',
leave=False, unit_scale=True,
total=len(self.__timeseries_quarterly)):
_, _1, smooth_series_s, _intercept = StateSpaceModel(quarterly_values).run_smoothing()

smooth_series = smooth_series_s[0].tolist()[0]
smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

derivatives = smooth_series_s[1].tolist()[0]
self.__timeseries_derivatives.append(derivatives)
self.__timeseries_quarterly_smoothed.append(smooth_series)

utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, self.__cached_folder_name)
utils.pickle_object('derivatives', self.__timeseries_derivatives, self.__cached_folder_name)

else:
self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s', self.__cached_folder_name)
self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s',
self.__cached_folder_name)
self.__timeseries_derivatives = utils.unpickle_object('derivatives', self.__cached_folder_name)

if sma == 'savgol':
for quarterly_values in tqdm(self.__timeseries_quarterly, unit='term',
desc='savgol smoothing quarterly timeseries',
leave=False, unit_scale=True):
desc='savgol smoothing quarterly timeseries',
leave=False, unit_scale=True):
smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
self.__timeseries_quarterly_smoothed.append(smooth_series)
smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

em = Emergence(all_quarterly_values[min_i:max_i])

Expand Down Expand Up @@ -283,7 +292,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range

def output(self, output_types, wordcloud_title=None, outname=None, nterms=50, n_nmf_topics=0):
for output_type in output_types:
output_factory.create(output_type, self.__term_score_tuples,emergence_list=self.__emergence_list,
output_factory.create(output_type, self.__term_score_tuples, emergence_list=self.__emergence_list,
wordcloud_title=wordcloud_title, tfidf_reduce_obj=self.__tfidf_reduce_obj,
name=outname, nterms=nterms, timeseries_data=self.__timeseries_data,
date_dict=self.__date_dict, pick=self.__pick_method,
Expand All @@ -294,7 +303,7 @@ def term_score_tuples(self):
return self.__term_score_tuples

# run with 30 terms only.
def get_multiplot(self, timeseries_terms_smooth,timeseries, test_terms, term_ngrams, lims, method = 'Net Growth',
def get_multiplot(self, timeseries_terms_smooth, timeseries, test_terms, term_ngrams, lims, method='Net Growth',
category='emergent'):
# libraries and data
import matplotlib.pyplot as plt
Expand Down Expand Up @@ -333,7 +342,7 @@ def get_multiplot(self, timeseries_terms_smooth,timeseries, test_terms, term_ngr

# plot the lineplot
plt.plot(df['x'], df[column], color='b', marker='', linewidth=1.4, alpha=0.9, label=column)
plt.plot(df['x'],df_smooth[column], color='g', linestyle='-', marker='',label='smoothed ground truth')
plt.plot(df['x'], df_smooth[column], color='g', linestyle='-', marker='', label='smoothed ground truth')

plt.axvline(x=lims[0], color='k', linestyle='--')
plt.axvline(x=lims[1], color='k', linestyle='--')
Expand All @@ -351,8 +360,8 @@ def get_multiplot(self, timeseries_terms_smooth,timeseries, test_terms, term_ngr
plt.title(column, loc='left', fontsize=12, fontweight=0)

# general title
plt.suptitle(category +" keywords selection using the " + method + " index", fontsize=13, fontweight=0, color='black',
style='italic')
plt.suptitle(category + " keywords selection using the " + method + " index", fontsize=13, fontweight=0,
color='black', style='italic')

# axis title
plt.show()
Expand Down
43 changes: 43 additions & 0 deletions tests/algorithms/test_holt_winters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import unittest

import numpy.testing as np_test

from scripts.algorithms.holtwinters_predictor import HoltWintersPredictor


class HoltWintersTests(unittest.TestCase):
    """Checks HoltWintersPredictor's handling of awkward input sequences."""

    def test_negatives_in_sequence(self):
        # Multiplicative Holt-Winters cannot model values <= 0, so building the
        # predictor over a series containing a negative must raise. Either the
        # predictor's own guard or the underlying library's check is acceptable.
        periods_ahead = 3
        series_with_negative = [1, 1, -1, 1, 1]

        try:
            HoltWintersPredictor(series_with_negative, periods_ahead)
        except NotImplementedError as nie:
            self.assertEqual(nie.args[0], 'Unable to correct for negative or zero values')
        except ValueError as ve:
            self.assertEqual(ve.args[0],
                             'endog must be strictly positive when using multiplicative trend or seasonal components.')
        else:
            self.fail('Expected to throw due to negative values')

    def test_zeros_in_sequence(self):
        # A zero in the series is correctable, so prediction should succeed.
        periods_ahead = 3
        series_with_zero = [1, 1, 0, 1, 1]

        predictor = HoltWintersPredictor(series_with_zero, periods_ahead)
        prediction = predictor.predict_counts()

        np_test.assert_almost_equal(prediction, [0.8] * periods_ahead, decimal=4)

    def test_static_sequence(self):
        # A constant series should be forecast as that same constant.
        periods_ahead = 3
        constant_series = [1.0] * 5

        predictor = HoltWintersPredictor(constant_series, periods_ahead)
        prediction = predictor.predict_counts()

        np_test.assert_almost_equal(prediction, [1] * periods_ahead, decimal=4)

0 comments on commit d764cfb

Please sign in to comment.