315 clamp redo (#323)
* #315 clamp smoothed values at 0 (see the sketch below)
* cast smoothed data back to lists (from numpy arrays) for consistency
* command line args now restricted to the available smoothing and emergence options
* added a simple test for Holt-Winters to confirm negative values are not handled
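
A minimal sketch of the clamp-and-cast pattern the first two bullets describe, using made-up smoothed values rather than real pipeline output:

import numpy as np

# Made-up smoothed values; real values come from the quarterly smoothing step.
smoothed = np.array([3.2, 1.1, -0.4, 0.0, 2.7])

# Clamp negatives to zero, then cast back to a plain list for consistency
# with the unsmoothed quarterly series.
clamped = np.clip(smoothed, a_min=0, a_max=None).tolist()

print(clamped)  # [3.2, 1.1, 0.0, 0.0, 2.7]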
IanGrimstead authored and thanasions committed Sep 17, 2019
1 parent 5e45e88 commit f7edcbc
Showing 3 changed files with 77 additions and 23 deletions.
8 changes: 5 additions & 3 deletions pygrams.py
@@ -49,7 +49,7 @@ def get_args(command_line_arguments):
parser.add_argument("-ds", "--doc_source", default='USPTO-random-1000.pkl.bz2',
help="the document source to process")
parser.add_argument("-uc", "--use_cache", default=None,
help="Use cached file to speed up queries")
help="Cache file to use, to speed up queries")

# Document column header names
parser.add_argument("-th", "--text_header", default='abstract', help="the column name for the free text")
@@ -134,8 +134,10 @@ def get_args(command_line_arguments):
parser.add_argument("-stp", "--steps_ahead", type=int, default=5,
help="number of steps ahead to analyse for")

parser.add_argument("-ei", "--emergence-index", default='porter', help="options are: porter, quadratic, gradients")
parser.add_argument("-sma", "--smoothing-alg", default=None, help="options are: kalman, savgol")
parser.add_argument("-ei", "--emergence-index", default='porter', choices=('porter', 'quadratic', 'gradients'),
help="Emergence calculation to use (default: %(default))")
parser.add_argument("-sma", "--smoothing-alg", default=None, choices=('kalman', 'savgol'),
help="Time series smoothing to use (default: %(default))")

parser.add_argument("-exp", "--exponential_fitting", default=False, action="store_true",
help="analyse using exponential type fit or not")
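For context, a standalone sketch of the choices restriction added above; this shows only the two options in isolation, not the full pygrams parser:

import argparse

# Standalone sketch of the restricted options; not the full pygrams parser.
parser = argparse.ArgumentParser()
parser.add_argument("-ei", "--emergence-index", default='porter',
                    choices=('porter', 'quadratic', 'gradients'),
                    help="Emergence calculation to use (default: %(default)s)")
parser.add_argument("-sma", "--smoothing-alg", default=None,
                    choices=('kalman', 'savgol'),
                    help="Time series smoothing to use (default: %(default)s)")

print(parser.parse_args(["-ei", "quadratic"]).emergence_index)  # quadratic
# parser.parse_args(["-sma", "median"]) now exits with an "invalid choice" error
# instead of silently accepting an unsupported smoothing algorithm.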
49 changes: 29 additions & 20 deletions scripts/pipeline.py
@@ -1,5 +1,6 @@
from os import path

import numpy as np
from scipy.signal import savgol_filter
from tqdm import tqdm

@@ -155,9 +156,10 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range

# TODO: offer timeseries cache as an option. Then filter dates and terms after reading the cached matrix
print(f'Creating timeseries matrix...')
if cached_folder_name is None or not (path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
if cached_folder_name is None or not (
path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
[self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
self.__weekly_iso_dates] = self.__timeseries_data
@@ -191,8 +193,8 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
min_date = self.__timeseries_date_dict['from']
max_date = self.__timeseries_date_dict['to']

min_i=0
max_i= len(all_quarters)
min_i = 0
max_i = len(all_quarters)

for i, quarter in enumerate(all_quarters):
if min_date is not None and min_date < quarter:
@@ -203,7 +205,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
if max_date is not None and max_date < quarter:
break
max_i = i
self.__lims=[min_i, max_i]
self.__lims = [min_i, max_i]
self.__timeseries_quarterly_smoothed = None if sma is None else []

for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term',
@@ -217,30 +219,37 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
self.__timeseries_quarterly.append(quarterly_values)

if emergence_index == 'gradients' or sma == 'kalman':
if cached_folder_name is None or not (path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
if cached_folder_name is None or not (
path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
for term_index, quarterly_values in tqdm(enumerate(self.__timeseries_quarterly), unit='term',
desc='smoothing quarterly timeseries with kalman filter',
leave=False, unit_scale=True, total=len(self.__timeseries_quarterly)):
desc='smoothing quarterly timeseries with kalman filter',
leave=False, unit_scale=True,
total=len(self.__timeseries_quarterly)):
_, _1, smooth_series_s, _intercept = StateSpaceModel(quarterly_values).run_smoothing()

smooth_series = smooth_series_s[0].tolist()[0]
smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

derivatives = smooth_series_s[1].tolist()[0]
self.__timeseries_derivatives.append(derivatives)
self.__timeseries_quarterly_smoothed.append(smooth_series)

utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, self.__cached_folder_name)
utils.pickle_object('derivatives', self.__timeseries_derivatives, self.__cached_folder_name)

else:
self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s', self.__cached_folder_name)
self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s',
self.__cached_folder_name)
self.__timeseries_derivatives = utils.unpickle_object('derivatives', self.__cached_folder_name)

if sma == 'savgol':
for quarterly_values in tqdm(self.__timeseries_quarterly, unit='term',
desc='savgol smoothing quarterly timeseries',
leave=False, unit_scale=True):
desc='savgol smoothing quarterly timeseries',
leave=False, unit_scale=True):
smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
self.__timeseries_quarterly_smoothed.append(smooth_series)
smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

em = Emergence(all_quarterly_values[min_i:max_i])

Expand Down Expand Up @@ -283,7 +292,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range

def output(self, output_types, wordcloud_title=None, outname=None, nterms=50, n_nmf_topics=0):
for output_type in output_types:
output_factory.create(output_type, self.__term_score_tuples,emergence_list=self.__emergence_list,
output_factory.create(output_type, self.__term_score_tuples, emergence_list=self.__emergence_list,
wordcloud_title=wordcloud_title, tfidf_reduce_obj=self.__tfidf_reduce_obj,
name=outname, nterms=nterms, timeseries_data=self.__timeseries_data,
date_dict=self.__date_dict, pick=self.__pick_method,
Expand All @@ -294,7 +303,7 @@ def term_score_tuples(self):
return self.__term_score_tuples

# run with 30 terms only.
def get_multiplot(self, timeseries_terms_smooth,timeseries, test_terms, term_ngrams, lims, method = 'Net Growth',
def get_multiplot(self, timeseries_terms_smooth, timeseries, test_terms, term_ngrams, lims, method='Net Growth',
category='emergent'):
# libraries and data
import matplotlib.pyplot as plt
@@ -333,7 +342,7 @@ def get_multiplot(self, timeseries_terms_smooth,timeseries, test_terms, term_ngr

# plot the lineplot
plt.plot(df['x'], df[column], color='b', marker='', linewidth=1.4, alpha=0.9, label=column)
plt.plot(df['x'],df_smooth[column], color='g', linestyle='-', marker='',label='smoothed ground truth')
plt.plot(df['x'], df_smooth[column], color='g', linestyle='-', marker='', label='smoothed ground truth')

plt.axvline(x=lims[0], color='k', linestyle='--')
plt.axvline(x=lims[1], color='k', linestyle='--')
@@ -351,8 +360,8 @@ def get_multiplot(self, timeseries_terms_smooth,timeseries, test_terms, term_ngr
plt.title(column, loc='left', fontsize=12, fontweight=0)

# general title
plt.suptitle(category +" keywords selection using the " + method + " index", fontsize=13, fontweight=0, color='black',
style='italic')
plt.suptitle(category + " keywords selection using the " + method + " index", fontsize=13, fontweight=0,
color='black', style='italic')

# axis title
plt.show()
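A rough standalone sketch of the savgol branch above, assuming quarterly_values is a plain list of counts; the window (9) and polynomial order (2) match the pipeline call, but the data is invented:

import numpy as np
from scipy.signal import savgol_filter

# Invented quarterly counts; the real series comes from the timeseries matrix.
quarterly_values = [0, 1, 0, 2, 5, 3, 7, 6, 9, 8, 12, 11]

# Smooth, clamp any negative smoothed values to zero, then cast back to a list.
smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
timeseries_quarterly_smoothed = smooth_series_no_negatives.tolist()

assert all(value >= 0 for value in timeseries_quarterly_smoothed)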
43 changes: 43 additions & 0 deletions tests/algorithms/test_holt_winters.py
@@ -0,0 +1,43 @@
import unittest

import numpy.testing as np_test

from scripts.algorithms.holtwinters_predictor import HoltWintersPredictor


class HoltWintersTests(unittest.TestCase):

def test_negatives_in_sequence(self):
time_series = [1, 1, -1, 1, 1]
num_predicted_periods = 3

try:
HoltWintersPredictor(time_series, num_predicted_periods)
self.fail('Expected to throw due to negative values')

except NotImplementedError as nie:
self.assertEqual(nie.args[0], 'Unable to correct for negative or zero values')

except ValueError as ve:
self.assertEqual(ve.args[0],
'endog must be strictly positive when using multiplicative trend or seasonal components.')

def test_zeros_in_sequence(self):
time_series = [1, 1, 0, 1, 1]
num_predicted_periods = 3
expected_prediction = [0.8] * num_predicted_periods
hw = HoltWintersPredictor(time_series, num_predicted_periods)

actual_prediction = hw.predict_counts()

np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=4)

def test_static_sequence(self):
time_series = [1.0, 1.0, 1.0, 1.0, 1.0]
num_predicted_periods = 3
expected_prediction = [1] * num_predicted_periods
hw = HoltWintersPredictor(time_series, num_predicted_periods)

actual_prediction = hw.predict_counts()

np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=4)
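
The ValueError branch in test_negatives_in_sequence matches the statsmodels error for multiplicative models; a minimal sketch of that underlying constraint, assuming HoltWintersPredictor wraps statsmodels' ExponentialSmoothing with a multiplicative component (an assumption about its internals, not taken from its source):

from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Assumed wiring: a multiplicative Holt-Winters model, as the test's expected
# message suggests; negative (or zero) observations are rejected up front.
try:
    ExponentialSmoothing([1, 1, -1, 1, 1], trend='mul')
except ValueError as error:
    print(error)  # endog must be strictly positive when using multiplicative ...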
