Commit 9e25251: merge into develop

IanGrimstead authored and IanGrimstead committed Sep 16, 2019
2 parents 02086da + b61379d
Showing 10 changed files with 69 additions and 69 deletions.
Binary file modified outputs/cached/derivatives.pkl.bz2
Binary file modified outputs/cached/smooth_series_s.pkl.bz2
Binary file modified outputs/cached/weekly_isodates.pkl.bz2
Binary file modified outputs/cached/weekly_series_global.pkl.bz2
Binary file modified outputs/cached/weekly_series_terms.pkl.bz2
10 changes: 5 additions & 5 deletions pygrams.py
@@ -48,8 +48,8 @@ def get_args(command_line_arguments):
     # Input files
     parser.add_argument("-ds", "--doc_source", default='USPTO-random-1000.pkl.bz2',
                         help="the document source to process")
-    parser.add_argument("-it", "--input_tfidf", default=None,
-                        help="Load a pickled TFIDF output instead of creating TFIDF by processing a document source")
+    parser.add_argument("-uc", "--use-cache", default=None,
+                        help="Use cached file to speed up queries")

     # Document column header names
     parser.add_argument("-th", "--text_header", default='abstract', help="the column name for the free text")
@@ -173,14 +173,14 @@ def main(supplied_args):

     doc_source_file_name = os.path.join(args.path, args.doc_source)

-    if args.input_tfidf is None:
+    if args.use_cache is None:
         pickled_tfidf_folder_name = None
     else:
-        pickled_tfidf_folder_name = os.path.join('outputs', 'tfidf', args.input_tfidf)
+        pickled_tfidf_folder_name = args.use_cache

     pipeline = Pipeline(doc_source_file_name, docs_mask_dict, pick_method=args.pick,
                         ngram_range=(args.min_ngrams, args.max_ngrams), text_header=args.text_header,
-                        pickled_tfidf_folder_name=pickled_tfidf_folder_name,
+                        cached_folder_name=pickled_tfidf_folder_name,
                         max_df=args.max_document_frequency, user_ngrams=args.search_terms,
                         prefilter_terms=args.prefilter_terms, terms_threshold=args.search_terms_threshold,
                         output_name=args.outputs_name, calculate_timeseries=args.timeseries, m_steps_ahead=args.steps_ahead,
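The two hunks above retire -it/--input_tfidf (which resolved its argument under outputs/tfidf) in favour of -uc/--use-cache, whose value flows into Pipeline as cached_folder_name. A minimal sketch of the new round trip, assuming main(supplied_args) accepts an argv-style list as its signature suggests; the cache folder name is hypothetical, and argparse exposes --use-cache as args.use_cache:

# sketch, not repository code: first run builds the cache, second run reuses it
import pygrams

pygrams.main(['-ds', 'USPTO-random-1000.pkl.bz2'])  # args.use_cache is None: TFIDF built, cache written
pygrams.main(['-uc', 'out-mdf-0.1-200001-201852'])  # hypothetical cache folder read back instead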
104 changes: 47 additions & 57 deletions scripts/pipeline.py
@@ -1,6 +1,4 @@
-import bz2
-import pickle
-from os import makedirs, path
+from os import path

 from pandas import read_pickle
 from scipy.signal import savgol_filter
@@ -25,10 +23,9 @@

 class Pipeline(object):
     def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
-                 pickled_tfidf_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0,
+                 cached_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0,
                  terms_threshold=None, output_name=None, calculate_timeseries=None, m_steps_ahead=5,
-                 emergence_index='porter', exponential=False, nterms=50, patents_per_quarter_threshold=20, sma=None,
-                 read_timeseries_from_cache=False, read_state_space_from_cache=False):
+                 emergence_index='porter', exponential=False, nterms=50, patents_per_quarter_threshold=20, sma=None):

         # load data
         self.__data_filename = data_filename
@@ -39,8 +36,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
         self.__emergence_list = []
         self.__pick_method = pick_method
         # calculate or fetch tf-idf mat
-        if pickled_tfidf_folder_name is None:
-
+        if cached_folder_name is None:
             dataframe = data_factory.get(data_filename)
             utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
             utils.remove_empty_documents(dataframe, text_header)
@@ -68,29 +64,21 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
             self.__cpc_dict = utils.cpc_dict(dataframe)
             self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe, docs_mask_dict['date_header'])

-            base_pickle_path = path.join('outputs', 'tfidf')
-            makedirs(base_pickle_path, exist_ok=True)
-
-            def pickle_object(short_name, obj):
-                folder_name = path.join(base_pickle_path, output_name + f'-mdf-{max_df}')
-                makedirs(folder_name, exist_ok=True)
-                file_name = path.join(folder_name, output_name + f'-mdf-{max_df}-{short_name}.pkl.bz2')
-                with bz2.BZ2File(file_name, 'wb') as pickle_file:
-                    pickle.dump(obj, pickle_file, protocol=4, fix_imports=False)
+            min_date = min(self.__dates)
+            max_date = max(self.__dates)

-            pickle_object('tfidf', self.__tfidf_obj)
-            pickle_object('dates', self.__dates)
-            pickle_object('cpc_dict', self.__cpc_dict)
+            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')
+            utils.pickle_object('tfidf', self.__tfidf_obj, self.__cached_folder_name)
+            utils.pickle_object('dates', self.__dates, self.__cached_folder_name)
+            utils.pickle_object('cpc_dict', self.__cpc_dict, self.__cached_folder_name)

         else:
-            print(f'Reading document and TFIDF from pickle {pickled_tfidf_folder_name}')
+            print(f'Reading document and TFIDF from pickle {cached_folder_name}')

-            base_folder = path.basename(pickled_tfidf_folder_name)
-            pickled_base_file_name = path.join(pickled_tfidf_folder_name, base_folder)
-
-            self.__tfidf_obj = read_pickle(pickled_base_file_name + '-tfidf.pkl.bz2')
-            self.__dates = read_pickle(pickled_base_file_name + '-dates.pkl.bz2')
-            self.__cpc_dict = read_pickle(pickled_base_file_name + '-cpc_dict.pkl.bz2')
+            self.__cached_folder_name = path.join('cached', cached_folder_name)
+            self.__tfidf_obj = utils.unpickle_object('tfidf', self.__cached_folder_name)
+            self.__dates = utils.unpickle_object('dates', self.__cached_folder_name)
+            self.__cpc_dict = utils.unpickle_object('cpc_dict', self.__cached_folder_name)

         if self.__dates is not None:
             min_date = min(self.__dates)
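On a first (uncached) run the constructor now derives the cache folder from the output name, max_df and the document date range, so runs with different scope write to different folders. A standalone sketch of the naming scheme, with all values hypothetical:

# sketch of the cache-folder naming used above; values are hypothetical
from os import path

output_name, max_df = 'out', 0.1
min_date, max_date = 200001, 201852  # year-week integers from generate_year_week_dates
print(path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}'))
# -> cached/out-mdf-0.1-200001-201852 (POSIX separators)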
@@ -166,25 +154,22 @@ def pickle_object(short_name, obj):
         if not calculate_timeseries:
             return

-        ###################################### em-tech pipeline ###########################
-
+        # TODO: offer timeseries cache as an option. Then filter dates and terms after reading the cached matrix
+        # TODO: timeseries matrix no longer needs to be sparse. Massive programming and maintenance burden for small gain
         print(f'Creating timeseries matrix...')
-        pickled_base_file_name2 = path.join('outputs', 'cached')
-        if not read_timeseries_from_cache:
+        if cached_folder_name is None or not (path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
+                                              and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
+                                              and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
             self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
             [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
              self.__weekly_iso_dates] = self.__timeseries_data

-            utils.pickle_object('weekly_series_terms', self.__term_counts_per_week, pickled_base_file_name2)
-            utils.pickle_object('weekly_series_global', self.__number_of_patents_per_week, pickled_base_file_name2)
-            utils.pickle_object('weekly_isodates', self.__weekly_iso_dates, pickled_base_file_name2)
+            utils.pickle_object('weekly_series_terms', self.__term_counts_per_week, self.__cached_folder_name)
+            utils.pickle_object('weekly_series_global', self.__number_of_patents_per_week, self.__cached_folder_name)
+            utils.pickle_object('weekly_isodates', self.__weekly_iso_dates, self.__cached_folder_name)
         else:
-            self.__term_counts_per_week = read_pickle(path.join(pickled_base_file_name2, 'weekly_series_terms.pkl.bz2'))
-            self.__number_of_patents_per_week = read_pickle(
-                path.join(pickled_base_file_name2, 'weekly_series_global.pkl.bz2'))
-            self.__weekly_iso_dates = read_pickle(path.join(pickled_base_file_name2, 'weekly_isodates.pkl.bz2'))
+            self.__term_counts_per_week = utils.unpickle_object('weekly_series_terms', self.__cached_folder_name)
+            self.__number_of_patents_per_week = utils.unpickle_object('weekly_series_global', self.__cached_folder_name)
+            self.__weekly_iso_dates = utils.unpickle_object('weekly_isodates', self.__cached_folder_name)
         self.__term_ngrams = self.__tfidf_obj.feature_names

         self.__M = m_steps_ahead
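The new guard recomputes the timeseries matrix unless every expected pickle already exists, so a partially written cache is rebuilt rather than trusted. The same test factored into a helper, purely illustrative and not part of the commit (the import path assumes the repo's scripts/utils layout):

# hypothetical helper, not in the commit, expressing the guard above
from os import path
from scripts.utils import utils

def cache_is_complete(folder_name, short_names):
    # e.g. cache_is_complete(folder, ['weekly_series_terms', 'weekly_series_global', 'weekly_isodates'])
    return folder_name is not None and all(
        path.isfile(utils.pickle_name(name, folder_name)) for name in short_names)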
@@ -223,7 +208,7 @@ def pickle_object(short_name, obj):
         self.__timeseries_quarterly_smoothed = None if sma is None else []

         for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term',
-                               desc='Calculating and smoothing quarterly timeseries',
+                               desc='Calculating quarterly timeseries',
                                leave=False, unit_scale=True):
             row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)
             weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]
@@ -232,27 +217,32 @@ def pickle_object(short_name, obj):
             non_zero_dates, quarterly_values = utils.fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters)
             self.__timeseries_quarterly.append(quarterly_values)

-            # temporary code
-            ##############
-            if (emergence_index == 'gradients' or sma == 'kalman') and not read_state_space_from_cache:
-                _, _1, smooth_series_s, intercept = StateSpaceModel(quarterly_values).run_smoothing()
-                smooth_series = smooth_series_s[0].tolist()[0]
-                derivatives = smooth_series_s[1].tolist()[0]
-                self.__timeseries_derivatives.append(derivatives)
-                self.__timeseries_intercept.append(intercept)
-                self.__timeseries_quarterly_smoothed.append(smooth_series)
+        if emergence_index == 'gradients' or sma == 'kalman':
+            if cached_folder_name is None or not (path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
+                                                  and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
+                for term_index, quarterly_values in tqdm(enumerate(self.__timeseries_quarterly), unit='term',
+                                                         desc='smoothing quarterly timeseries with kalman filter',
+                                                         leave=False, unit_scale=True, total=len(self.__timeseries_quarterly)):
+                    _, _1, smooth_series_s, _intercept = StateSpaceModel(quarterly_values).run_smoothing()
+                    smooth_series = smooth_series_s[0].tolist()[0]
+                    derivatives = smooth_series_s[1].tolist()[0]
+                    self.__timeseries_derivatives.append(derivatives)
+                    self.__timeseries_quarterly_smoothed.append(smooth_series)
+
+                utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, self.__cached_folder_name)
+                utils.pickle_object('derivatives', self.__timeseries_derivatives, self.__cached_folder_name)
+
+            else:
+                self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s', self.__cached_folder_name)
+                self.__timeseries_derivatives = utils.unpickle_object('derivatives', self.__cached_folder_name)

-                utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, pickled_base_file_name2)
-                utils.pickle_object('intercept', self.__timeseries_intercept, pickled_base_file_name2)
-                utils.pickle_object('derivatives', self.__timeseries_derivatives, pickled_base_file_name2)
-            elif sma == 'savgol' :
+        if sma == 'savgol':
             for quarterly_values in tqdm(self.__timeseries_quarterly, unit='term',
                                          desc='savgol smoothing quarterly timeseries',
                                          leave=False, unit_scale=True):
                 smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
                 self.__timeseries_quarterly_smoothed.append(smooth_series)

-            if (emergence_index == 'gradients' or sma == 'kalman') and read_timeseries_from_cache:
-                self.__timeseries_quarterly_smoothed = read_pickle(path.join(pickled_base_file_name2, 'smooth_series_s.pkl.bz2'))
-                self.__timeseries_derivatives = read_pickle(path.join(pickled_base_file_name2, 'derivatives.pkl.bz2'))

         em = Emergence(all_quarterly_values[min_i:max_i])
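The savgol_filter call itself is untouched by this commit (a 9-quarter window, quadratic fit, mode='nearest' edge handling), though it now runs under its own if rather than an elif chained to the Kalman branch. A self-contained sketch with made-up series values:

# self-contained sketch of the savgol smoothing above; series values are made up
from scipy.signal import savgol_filter

quarterly_values = [0, 1, 3, 2, 5, 8, 7, 9, 12, 11, 14]
smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')  # window=9, polyorder=2
print([round(v, 2) for v in smooth_series])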
4 changes: 2 additions & 2 deletions scripts/utils/argschecker.py
@@ -75,7 +75,7 @@ def checkargs(self):
             for idx, word in enumerate(self.args.search_terms):
                 print(f'{idx}. {word}')

-        if self.args.input_tfidf is None and self.args.date_header is None:
+        if self.args.use_cache is None and self.args.date_header is None:
            print()
            print('WARNING: No dates defined - time series analysis will not be possible with the cached object!')
            print()
@@ -134,7 +134,7 @@ def checkargs(self):
             print(f"invalid predictor name number(s) {' '.join(str(e) for e in invalid_predictor_names)} provided (must be between 0 and 12)")
             app_exit = True

-        if self.args.timeseries and self.args.input_tfidf is None and self.args.date_header is None:
+        if self.args.timeseries and self.args.use_cache is None and self.args.date_header is None:

             print(f"date_header is None")
             print(f"Cannot calculate emergence without a specifying a date column")
3 changes: 2 additions & 1 deletion scripts/utils/date_utils.py
@@ -46,7 +46,8 @@ def tfidf_with_dates_to_weekly_term_counts(term_value_array, uspto_week_dates):
     week_dates = []
     week_total = 0

-    for current_row_index in tqdm(range(number_of_rows), 'Calculating weekly term document-counts', unit='document'):
+    for current_row_index in tqdm(range(number_of_rows), 'Calculating weekly term document-counts', unit='document',
+                                  total=number_of_rows):
         new_week = int(uspto_week_dates[current_row_index])

         while new_week > current_week:
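tqdm only shows percentages and an ETA when it knows the total. range(number_of_rows) already reports its length via len(), so the explicit total here is belt-and-braces, but it is essential for length-less iterables such as the enumerate(...) loop added in pipeline.py above. A minimal sketch:

# minimal sketch: enumerate() has no len(), so tqdm needs an explicit total
from tqdm import tqdm

values = [3, 1, 4, 1, 5]
for index, value in tqdm(enumerate(values), total=len(values), unit='item', desc='processing'):
    pass  # drop total= and the bar loses its percentage and ETA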
17 changes: 13 additions & 4 deletions scripts/utils/utils.py
@@ -7,7 +7,7 @@
 from gensim.models import KeyedVectors
 from gensim.scripts.glove2word2vec import glove2word2vec
 from gensim.test.utils import datapath, get_tmpfile
-from pandas import to_datetime
+from pandas import to_datetime, read_pickle
 from pandas.api.types import is_string_dtype


@@ -24,13 +24,22 @@ def fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters):
     return non_zero_dates, quarterly_values


-def pickle_object(short_name, obj, pickle_path):
-    makedirs(pickle_path, exist_ok=True)
-    file_name = path.join(pickle_path, f'{short_name}.pkl.bz2')
+def pickle_object(short_name, obj, folder_name):
+    makedirs(folder_name, exist_ok=True)
+    file_name = pickle_name(short_name, folder_name)
     with bz2.BZ2File(file_name, 'wb') as pickle_file:
         pickle.dump(obj, pickle_file, protocol=4, fix_imports=False)


+def unpickle_object(short_name, folder_name):
+    file_name = pickle_name(short_name, folder_name)
+    return read_pickle(file_name)
+
+
+def pickle_name(short_name, folder_name):
+    return path.join(folder_name, short_name + '.pkl.bz2')
+
+
 def stationary_terms(emergence_list, nterms):
     if len(emergence_list) == 0:
         return []
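Together the three helpers give every cached artefact a single naming scheme, folder_name/short_name.pkl.bz2, written as a protocol-4 pickle under bz2 and read back via pandas.read_pickle, which infers bz2 compression from the suffix. A round-trip sketch, with a hypothetical folder name and an import path that assumes the repo's scripts/utils layout:

# round-trip sketch for the helpers above; folder name is hypothetical
from scripts.utils import utils

folder = 'cached/example-mdf-0.1-200001-201852'
utils.pickle_object('dates', [200001, 200002, 200003], folder)  # writes <folder>/dates.pkl.bz2
assert utils.unpickle_object('dates', folder) == [200001, 200002, 200003]
assert utils.pickle_name('dates', folder).endswith('dates.pkl.bz2')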
