Skip to content

Commit

Permalink
#315 Added smoothing to command-line parameters
Browse files Browse the repository at this point in the history
* Corrected Holt-Winters test to work on Linux as well as macOS
  • Loading branch information
IanGrimstead authored and IanGrimstead committed Sep 3, 2019
1 parent f24870f commit 42cc8c5
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 13 deletions.
3 changes: 3 additions & 0 deletions pygrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ def get_args(command_line_arguments):

parser.add_argument("-nrm", "--normalised", default=False, action="store_true",
help="analyse using normalised patents counts or not")
parser.add_argument("-sts", "--smooth-timeseries", default=False, action="store_true",
help="smooth timeseries generated from patent counts")

args = parser.parse_args(command_line_arguments)

Expand Down Expand Up @@ -180,6 +182,7 @@ def main(supplied_args):
output_name=args.outputs_name, calculate_timeseries=args.timeseries, m_steps_ahead=args.steps_ahead,
emergence_index=args.emergence_index, exponential=args.exponential_fitting, nterms=args.nterms,
patents_per_quarter_threshold=args.minimum_per_quarter,
smooth_timeseries=args.smooth_timeseries,
)

pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name,
Expand Down
20 changes: 13 additions & 7 deletions scripts/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
pickled_tfidf_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0,
terms_threshold=None, output_name=None, calculate_timeseries=None, m_steps_ahead=5,
emergence_index='porter', exponential=False, nterms=50, patents_per_quarter_threshold=20,
):
smooth_timeseries=False):

# load data
self.__data_filename = data_filename
Expand Down Expand Up @@ -191,7 +191,11 @@ def pickle_object(short_name, obj):

term_counts_per_week_csc = self.__term_counts_per_week.tocsc()
self.__timeseries_quarterly = []
self.__timeseries_quarterly_smoothed = []
if smooth_timeseries:
self.__timeseries_quarterly_smoothed = []
else:
self.__timeseries_quarterly_smoothed = None

self.__term_nonzero_dates = []
all_quarters, all_quarterly_values = self.__x = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
self.__weekly_iso_dates, self.__number_of_patents_per_week)
Expand All @@ -208,12 +212,14 @@ def pickle_object(short_name, obj):
non_zero_dates, quarterly_values = utils.fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters)

self.__timeseries_quarterly.append(quarterly_values)
smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)

# _, _1, smooth_series_s, _2 = SteadyStateModel(quarterly_values).run_smoothing()
# smooth_series = smooth_series_s[0].tolist()[0]
self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives)
if smooth_timeseries:
smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)

# _, _1, smooth_series_s, _2 = SteadyStateModel(quarterly_values).run_smoothing()
# smooth_series = smooth_series_s[0].tolist()[0]
self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives)

em = Emergence(all_quarterly_values)
for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
Expand Down
9 changes: 6 additions & 3 deletions scripts/vandv/predictor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from tqdm import tqdm

from scripts.algorithms.predictor_factory import PredictorFactory as factory
from scripts.utils.date_utils import timeseries_weekly_to_quarterly


# TODO quarterly values, should become timeseries
Expand Down Expand Up @@ -37,12 +36,16 @@ def evaluate_prediction(timeseries_terms, term_ngrams, predictor_names, test_ter
if test_forecasts:
test_values[term] = [float(x) for x in timeseries_all[-num_prediction_periods - 1:-1]]

if smoothed_series is not None:
training_values_to_use = smoothed_training_values
else:
training_values_to_use = training_values

for predictor_name in predictor_names:
results[predictor_name] = {}

for test_term in tqdm(test_terms, unit='term', desc='Validating prediction with ' + predictor_name):

model = factory.predictor_factory(predictor_name, test_term, training_values[test_term],
model = factory.predictor_factory(predictor_name, test_term, training_values_to_use[test_term],
num_prediction_periods)
predicted_values = model.predict_counts()

Expand Down
11 changes: 8 additions & 3 deletions tests/algorithms/test_holt_winters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

if sys_pf == 'darwin':
import matplotlib

matplotlib.use("TkAgg")

import unittest
Expand Down Expand Up @@ -36,10 +35,16 @@ def test_negatives_in_sequence(self):
time_series = [1, 1, -1, 1, 1]
num_predicted_periods = 3

with self.assertRaises(NotImplementedError) as nie:
try:
HoltWintersPredictor(time_series, num_predicted_periods)
self.fail('Expected to throw due to negative values')

except NotImplementedError as nie:
self.assertEqual(nie.args[0], 'Unable to correct for negative or zero values')

self.assertEqual(nie.exception.args[0], 'Unable to correct for negative or zero values')
except ValueError as ve:
self.assertEqual(ve.args[0],
'endog must be strictly positive when using multiplicative trend or seasonal components.')

def test_zeros_in_sequence(self):
time_series = [1, 1, 0, 1, 1]
Expand Down

0 comments on commit 42cc8c5

Please sign in to comment.