diff --git a/pygrams.py b/pygrams.py index e0b4c3f..b4a0543 100644 --- a/pygrams.py +++ b/pygrams.py @@ -136,6 +136,8 @@ def get_args(command_line_arguments): parser.add_argument("-nrm", "--normalised", default=False, action="store_true", help="analyse using normalised patents counts or not") + parser.add_argument("-sts", "--smooth-timeseries", default=False, action="store_true", + help="smooth timeseries generated from patent counts") args = parser.parse_args(command_line_arguments) @@ -180,6 +182,7 @@ def main(supplied_args): output_name=args.outputs_name, calculate_timeseries=args.timeseries, m_steps_ahead=args.steps_ahead, emergence_index=args.emergence_index, exponential=args.exponential_fitting, nterms=args.nterms, patents_per_quarter_threshold=args.minimum_per_quarter, + smooth_timeseries=args.smooth_timeseries, ) pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, diff --git a/scripts/pipeline.py b/scripts/pipeline.py index 21a393c..540971d 100644 --- a/scripts/pipeline.py +++ b/scripts/pipeline.py @@ -28,7 +28,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range pickled_tfidf_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0, terms_threshold=None, output_name=None, calculate_timeseries=None, m_steps_ahead=5, emergence_index='porter', exponential=False, nterms=50, patents_per_quarter_threshold=20, - ): + smooth_timeseries=False): # load data self.__data_filename = data_filename @@ -191,7 +191,11 @@ def pickle_object(short_name, obj): term_counts_per_week_csc = self.__term_counts_per_week.tocsc() self.__timeseries_quarterly = [] - self.__timeseries_quarterly_smoothed = [] + if smooth_timeseries: + self.__timeseries_quarterly_smoothed = [] + else: + self.__timeseries_quarterly_smoothed = None + self.__term_nonzero_dates = [] all_quarters, all_quarterly_values = self.__x = scripts.utils.date_utils.timeseries_weekly_to_quarterly( self.__weekly_iso_dates, self.__number_of_patents_per_week) @@ -208,12 +212,14 @@ def pickle_object(short_name, obj): non_zero_dates, quarterly_values = utils.fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters) self.__timeseries_quarterly.append(quarterly_values) - smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest') - smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None) - # _, _1, smooth_series_s, _2 = SteadyStateModel(quarterly_values).run_smoothing() - # smooth_series = smooth_series_s[0].tolist()[0] - self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives) + if smooth_timeseries: + smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest') + smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None) + + # _, _1, smooth_series_s, _2 = SteadyStateModel(quarterly_values).run_smoothing() + # smooth_series = smooth_series_s[0].tolist()[0] + self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives) em = Emergence(all_quarterly_values) for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore', diff --git a/scripts/vandv/predictor.py b/scripts/vandv/predictor.py index eb1c917..f4b0448 100644 --- a/scripts/vandv/predictor.py +++ b/scripts/vandv/predictor.py @@ -1,7 +1,6 @@ from tqdm import tqdm from scripts.algorithms.predictor_factory import PredictorFactory as factory -from scripts.utils.date_utils import timeseries_weekly_to_quarterly # TODO quarterly values, should become timeseries @@ -37,12 +36,16 @@ def evaluate_prediction(timeseries_terms, term_ngrams, predictor_names, test_ter if test_forecasts: test_values[term] = [float(x) for x in timeseries_all[-num_prediction_periods - 1:-1]] + if smoothed_series is not None: + training_values_to_use = smoothed_training_values + else: + training_values_to_use = training_values + for predictor_name in predictor_names: results[predictor_name] = {} for test_term in tqdm(test_terms, unit='term', desc='Validating prediction with ' + predictor_name): - - model = factory.predictor_factory(predictor_name, test_term, training_values[test_term], + model = factory.predictor_factory(predictor_name, test_term, training_values_to_use[test_term], num_prediction_periods) predicted_values = model.predict_counts() diff --git a/tests/algorithms/test_holt_winters.py b/tests/algorithms/test_holt_winters.py index f03a627..2d241b7 100644 --- a/tests/algorithms/test_holt_winters.py +++ b/tests/algorithms/test_holt_winters.py @@ -4,7 +4,6 @@ if sys_pf == 'darwin': import matplotlib - matplotlib.use("TkAgg") import unittest @@ -36,10 +35,16 @@ def test_negatives_in_sequence(self): time_series = [1, 1, -1, 1, 1] num_predicted_periods = 3 - with self.assertRaises(NotImplementedError) as nie: + try: HoltWintersPredictor(time_series, num_predicted_periods) + self.fail('Expected to throw due to negative values') + + except NotImplementedError as nie: + self.assertEqual(nie.args[0], 'Unable to correct for negative or zero values') - self.assertEqual(nie.exception.args[0], 'Unable to correct for negative or zero values') + except ValueError as ve: + self.assertEqual(ve.args[0], + 'endog must be strictly positive when using multiplicative trend or seasonal components.') def test_zeros_in_sequence(self): time_series = [1, 1, 0, 1, 1]