From d764cfba7eb3bc8a70130edd12481870d5c9879c Mon Sep 17 00:00:00 2001 From: IanGrimstead Date: Tue, 17 Sep 2019 11:24:49 +0100 Subject: [PATCH] #315 clamp smoothed values at 0 * cast smoothed data back to lists (from numpy arrays) for consistency * command line args now restricted to available smoothing and emergence * added simple test for holt-winters to confirm -ve values not handled --- pygrams.py | 6 ++-- scripts/pipeline.py | 49 ++++++++++++++++----------- tests/algorithms/test_holt_winters.py | 43 +++++++++++++++++++++++ 3 files changed, 76 insertions(+), 22 deletions(-) create mode 100644 tests/algorithms/test_holt_winters.py diff --git a/pygrams.py b/pygrams.py index cc6dd7d..db7c875 100644 --- a/pygrams.py +++ b/pygrams.py @@ -134,8 +134,10 @@ def get_args(command_line_arguments): parser.add_argument("-stp", "--steps_ahead", type=int, default=5, help="number of steps ahead to analyse for") - parser.add_argument("-ei", "--emergence-index", default='porter', help="options are: porter, quadratic, gradients") - parser.add_argument("-sma", "--smoothing-alg", default=None, help="options are: kalman, savgol") + parser.add_argument("-ei", "--emergence-index", default='porter', choices=('porter', 'quadratic', 'gradients'), + help="Emergence calculation to use (default: %(default)s)") + parser.add_argument("-sma", "--smoothing-alg", default=None, choices=('kalman', 'savgol'), + help="Time series smoothing to use (default: %(default)s)") parser.add_argument("-exp", "--exponential_fitting", default=False, action="store_true", help="analyse using exponential type fit or not") diff --git a/scripts/pipeline.py b/scripts/pipeline.py index 4b68b7e..95ec1f1 100644 --- a/scripts/pipeline.py +++ b/scripts/pipeline.py @@ -1,5 +1,6 @@ from os import path +import numpy as np from scipy.signal import savgol_filter from tqdm import tqdm @@ -155,9 +156,10 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range # TODO: offer timeseries cache as an option. 
Then filter dates and terms after reading the cached matrix print(f'Creating timeseries matrix...') - if cached_folder_name is None or not (path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name)) - and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name)) - and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))): + if cached_folder_name is None or not ( + path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name)) + and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name)) + and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))): self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates) [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week, self.__weekly_iso_dates] = self.__timeseries_data @@ -191,8 +193,8 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range min_date = self.__timeseries_date_dict['from'] max_date = self.__timeseries_date_dict['to'] - min_i=0 - max_i= len(all_quarters) + min_i = 0 + max_i = len(all_quarters) for i, quarter in enumerate(all_quarters): if min_date is not None and min_date < quarter: @@ -203,7 +205,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range if max_date is not None and max_date < quarter: break max_i = i - self.__lims=[min_i, max_i] + self.__lims = [min_i, max_i] self.__timeseries_quarterly_smoothed = None if sma is None else [] for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', @@ -217,30 +219,37 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range self.__timeseries_quarterly.append(quarterly_values) if emergence_index == 'gradients' or sma == 'kalman': - if cached_folder_name is None or not (path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name)) - and 
path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))): + if cached_folder_name is None or not ( + path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name)) + and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))): for term_index, quarterly_values in tqdm(enumerate(self.__timeseries_quarterly), unit='term', - desc='smoothing quarterly timeseries with kalman filter', - leave=False, unit_scale=True, total=len(self.__timeseries_quarterly)): + desc='smoothing quarterly timeseries with kalman filter', + leave=False, unit_scale=True, + total=len(self.__timeseries_quarterly)): _, _1, smooth_series_s, _intercept = StateSpaceModel(quarterly_values).run_smoothing() + smooth_series = smooth_series_s[0].tolist()[0] + smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None) + self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist()) + derivatives = smooth_series_s[1].tolist()[0] self.__timeseries_derivatives.append(derivatives) - self.__timeseries_quarterly_smoothed.append(smooth_series) utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, self.__cached_folder_name) utils.pickle_object('derivatives', self.__timeseries_derivatives, self.__cached_folder_name) else: - self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s', self.__cached_folder_name) + self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s', + self.__cached_folder_name) self.__timeseries_derivatives = utils.unpickle_object('derivatives', self.__cached_folder_name) if sma == 'savgol': for quarterly_values in tqdm(self.__timeseries_quarterly, unit='term', - desc='savgol smoothing quarterly timeseries', - leave=False, unit_scale=True): + desc='savgol smoothing quarterly timeseries', + leave=False, unit_scale=True): smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest') - self.__timeseries_quarterly_smoothed.append(smooth_series) + 
smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None) + self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist()) em = Emergence(all_quarterly_values[min_i:max_i]) @@ -283,7 +292,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range def output(self, output_types, wordcloud_title=None, outname=None, nterms=50, n_nmf_topics=0): for output_type in output_types: - output_factory.create(output_type, self.__term_score_tuples,emergence_list=self.__emergence_list, + output_factory.create(output_type, self.__term_score_tuples, emergence_list=self.__emergence_list, wordcloud_title=wordcloud_title, tfidf_reduce_obj=self.__tfidf_reduce_obj, name=outname, nterms=nterms, timeseries_data=self.__timeseries_data, date_dict=self.__date_dict, pick=self.__pick_method, @@ -294,7 +303,7 @@ def term_score_tuples(self): return self.__term_score_tuples # run with 30 terms only. - def get_multiplot(self, timeseries_terms_smooth,timeseries, test_terms, term_ngrams, lims, method = 'Net Growth', + def get_multiplot(self, timeseries_terms_smooth, timeseries, test_terms, term_ngrams, lims, method='Net Growth', category='emergent'): # libraries and data import matplotlib.pyplot as plt @@ -333,7 +342,7 @@ def get_multiplot(self, timeseries_terms_smooth,timeseries, test_terms, term_ngr # plot the lineplot plt.plot(df['x'], df[column], color='b', marker='', linewidth=1.4, alpha=0.9, label=column) - plt.plot(df['x'],df_smooth[column], color='g', linestyle='-', marker='',label='smoothed ground truth') + plt.plot(df['x'], df_smooth[column], color='g', linestyle='-', marker='', label='smoothed ground truth') plt.axvline(x=lims[0], color='k', linestyle='--') plt.axvline(x=lims[1], color='k', linestyle='--') @@ -351,8 +360,8 @@ def get_multiplot(self, timeseries_terms_smooth,timeseries, test_terms, term_ngr plt.title(column, loc='left', fontsize=12, fontweight=0) # general title - plt.suptitle(category +" keywords selection using 
the " + method + " index", fontsize=13, fontweight=0, color='black', - style='italic') + plt.suptitle(category + " keywords selection using the " + method + " index", fontsize=13, fontweight=0, + color='black', style='italic') # axis title plt.show() diff --git a/tests/algorithms/test_holt_winters.py b/tests/algorithms/test_holt_winters.py new file mode 100644 index 0000000..1f60ff7 --- /dev/null +++ b/tests/algorithms/test_holt_winters.py @@ -0,0 +1,43 @@ +import unittest + +import numpy.testing as np_test + +from scripts.algorithms.holtwinters_predictor import HoltWintersPredictor + + +class HoltWintersTests(unittest.TestCase): + + def test_negatives_in_sequence(self): + time_series = [1, 1, -1, 1, 1] + num_predicted_periods = 3 + + try: + HoltWintersPredictor(time_series, num_predicted_periods) + self.fail('Expected to throw due to negative values') + + except NotImplementedError as nie: + self.assertEqual(nie.args[0], 'Unable to correct for negative or zero values') + + except ValueError as ve: + self.assertEqual(ve.args[0], + 'endog must be strictly positive when using multiplicative trend or seasonal components.') + + def test_zeros_in_sequence(self): + time_series = [1, 1, 0, 1, 1] + num_predicted_periods = 3 + expected_prediction = [0.8] * num_predicted_periods + hw = HoltWintersPredictor(time_series, num_predicted_periods) + + actual_prediction = hw.predict_counts() + + np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=4) + + def test_static_sequence(self): + time_series = [1.0, 1.0, 1.0, 1.0, 1.0] + num_predicted_periods = 3 + expected_prediction = [1] * num_predicted_periods + hw = HoltWintersPredictor(time_series, num_predicted_periods) + + actual_prediction = hw.predict_counts() + + np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=4)