From 9972bcdcbfefbe3ec23b1cf991ae394daae8d867 Mon Sep 17 00:00:00 2001
From: user624086
Date: Thu, 16 May 2019 16:28:44 +0100
Subject: [PATCH 1/4] initial commit

---
 scripts/pipeline.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scripts/pipeline.py b/scripts/pipeline.py
index be8e0b6..f4dfdc8 100644
--- a/scripts/pipeline.py
+++ b/scripts/pipeline.py
@@ -1,5 +1,6 @@
 import bz2
 import pickle
+import csv
 from os import makedirs, path

 from pandas import read_pickle
@@ -278,6 +279,19 @@ def run(self, predictors_to_run, emergence, normalized=False, train_test=False):
         predicted_emergence = map_prediction_to_emergence_label(results, training_values, test_values,
                                                                 predictors_to_run, test_terms=terms)

+        # save training_values to csv file
+        #
+        # training_values:                            csv file:
+        # {'term1': [0,2,4,6], 'term2': [2,4,1,3]}    'term1', 0, 2, 4, 6
+        #                                             'term2', 2, 4, 1, 3
+        #
+        filename = 'outputs/emergence/' + emergence + '_time_series.csv'
+        with open(filename, 'w') as f:
+            w = csv.writer(f)
+            for key, values in training_values.items():
+                my_list = ["'" + str(key) + "'"] + values
+                w.writerow(my_list)
+
         html_results += report_predicted_emergence_labels_html(predicted_emergence)

         html_results += report_prediction_as_graphs_html(results, predictors_to_run, self.__weekly_iso_dates,
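
A note on the writer added above: files handed to csv.writer should be opened with newline='' (per the csv module documentation; without it, extra blank rows appear on Windows), and wrapping each key in hand-built apostrophes stores the quote characters as data rather than relying on CSV quoting. A minimal sketch of a more conventional writer follows; write_time_series_csv is a hypothetical helper name, not part of pyGrams:

    import csv

    def write_time_series_csv(filename, training_values):
        # newline='' lets the csv module manage line endings (avoids blank
        # rows on Windows); csv.writer quotes fields itself when needed
        with open(filename, 'w', newline='') as f:
            w = csv.writer(f)
            for term, counts in training_values.items():
                w.writerow([term] + list(counts))

    # e.g. write_time_series_csv('outputs/emergence/emergent_time_series.csv',
    #                            {'term1': [0, 2, 4, 6], 'term2': [2, 4, 1, 3]})
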
20 or 40)") # Focus source and function parser.add_argument("-f", "--focus", default=None, choices=['set', 'chi2', 'mutual'], help=argparse.SUPPRESS) parser.add_argument("-fs", "--focus_source", default='USPTO-random-1000.pkl.bz2', help=argparse.SUPPRESS) - parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'), help=argparse.SUPPRESS) + parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'), + help=argparse.SUPPRESS) parser.add_argument("-j", "--json", default=True, action="store_true", help=argparse.SUPPRESS) @@ -142,7 +144,6 @@ def get_args(command_line_arguments): def main(supplied_args): - paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds'), os.path.join('outputs', 'table'), os.path.join('outputs', 'emergence')] for path in paths: @@ -157,7 +158,7 @@ def main(supplied_args): outputs.append('report') if args.term_counts: outputs.append('termcounts') - if args.n_nmf_topics >0: + if args.n_nmf_topics > 0: outputs.append('nmf') docs_mask_dict = argscheck.get_docs_mask_dict() @@ -210,8 +211,22 @@ def main(supplied_args): title += f' ({emergence})' - html_results = pipeline_emtech.run(predictors_to_run, normalized=args.normalised, train_test=args.test, - emergence=emergence) + html_results, training_values = pipeline_emtech.run(predictors_to_run, normalized=args.normalised, + train_test=args.test, + emergence=emergence) + + # save training_values to csv file + # + # training_values: csv file: + # {'term1': [0,2,4,6], 'term2': [2,4,1,3]} 'term1', 0, 2, 4, 6 + # 'term2', 2, 4, 1, 3 + # + filename = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv') + with open(filename, 'w') as f: + w = csv.writer(f) + for key, values in training_values: + my_list = ["'" + str(key) + "'"] + values + w.writerow(my_list) html_doc = f''' @@ -226,7 +241,7 @@ def main(supplied_args): ''' - base_file_name = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence) + base_file_name = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence) if args.normalised: base_file_name += '_normalised' @@ -248,9 +263,9 @@ def main(supplied_args): main(sys.argv[1:]) end = time.time() diff = int(end - start) - hours=diff//3600 - minutes=diff//60 - seconds=diff%60 + hours = diff // 3600 + minutes = diff // 60 + seconds = diff % 60 print('') print(f"pyGrams query took {hours}:{minutes:02d}:{seconds:02d} to complete") diff --git a/scripts/pipeline.py b/scripts/pipeline.py index 007e35e..1a8dea3 100644 --- a/scripts/pipeline.py +++ b/scripts/pipeline.py @@ -1,12 +1,11 @@ import bz2 import pickle -import csv from os import makedirs, path from pandas import read_pickle from tqdm import tqdm -import scripts.data_factory as datafactory +import scripts.data_factory as data_factory import scripts.output_factory as output_factory import scripts.utils.date_utils from scripts.algorithms.emergence import Emergence @@ -20,7 +19,6 @@ from scripts.vandv.emergence_labels import map_prediction_to_emergence_label, report_predicted_emergence_labels_html from scripts.vandv.graphs import report_prediction_as_graphs_html from scripts.vandv.predictor import evaluate_prediction -from tqdm import tqdm class Pipeline(object): @@ -37,7 +35,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range # calculate or fetch tf-idf mat if pickled_tfidf_folder_name is None: - dataframe = datafactory.get(data_filename) + dataframe = 
From f93506a575f59d2393eb96a33aacd3ede844b1d2 Mon Sep 17 00:00:00 2001
From: IanGrimstead
Date: Mon, 3 Jun 2019 14:53:12 +0100
Subject: [PATCH 3/4] Merged with develop and tidied to remove knowledge of path (#268)

---
 pygrams.py          | 37 ++++++++++++++++++++++++++-----------
 scripts/pipeline.py | 36 ++++++++++++------------------------
 2 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/pygrams.py b/pygrams.py
index a484add..70dc80b 100644
--- a/pygrams.py
+++ b/pygrams.py
@@ -1,4 +1,5 @@
 import argparse
+import csv
 import os
 import sys
 import time
@@ -20,18 +21,19 @@ def get_args(command_line_arguments):
                                      conflict_handler='resolve')  # allows overridng of arguments

     # suppressed:________________________________________
-    parser.add_argument("-tc", "--term-counts", default=False, action="store_true",  help=argparse.SUPPRESS)
+    parser.add_argument("-tc", "--term-counts", default=False, action="store_true", help=argparse.SUPPRESS)
     parser.add_argument("-ih", "--id_header", default=None, help=argparse.SUPPRESS)
     parser.add_argument("-c", "--cite", default=False, action="store_true", help=argparse.SUPPRESS)
     parser.add_argument("-pt", "--path", default='data', help=argparse.SUPPRESS)

     parser.add_argument("-nmf", "--n_nmf_topics", type=int, default=0, help=argparse.SUPPRESS)
     # help="NMF topic modelling - number of topics (e.g. 20 or 40)")

     # Focus source and function
     parser.add_argument("-f", "--focus", default=None, choices=['set', 'chi2', 'mutual'], help=argparse.SUPPRESS)
     parser.add_argument("-fs", "--focus_source", default='USPTO-random-1000.pkl.bz2', help=argparse.SUPPRESS)

-    parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'), help=argparse.SUPPRESS)
+    parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'),
+                        help=argparse.SUPPRESS)

     parser.add_argument("-j", "--json", default=True, action="store_true", help=argparse.SUPPRESS)
@@ -142,7 +144,6 @@ def get_args(command_line_arguments):


 def main(supplied_args):
-
     paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds'),
              os.path.join('outputs', 'table'), os.path.join('outputs', 'emergence')]
     for path in paths:
@@ -157,7 +158,7 @@ def main(supplied_args):
         outputs.append('report')
     if args.term_counts:
         outputs.append('termcounts')
-    if args.n_nmf_topics >0:
+    if args.n_nmf_topics > 0:
         outputs.append('nmf')

     docs_mask_dict = argscheck.get_docs_mask_dict()
@@ -210,8 +211,22 @@ def main(supplied_args):

         title += f' ({emergence})'

-        html_results = pipeline_emtech.run(predictors_to_run, normalized=args.normalised, train_test=args.test,
-                                           emergence=emergence)
+        html_results, training_values = pipeline_emtech.run(predictors_to_run, normalized=args.normalised,
+                                                            train_test=args.test,
+                                                            emergence=emergence)
+
+        # save training_values to csv file
+        #
+        # training_values:                            csv file:
+        # {'term1': [0,2,4,6], 'term2': [2,4,1,3]}    'term1', 0, 2, 4, 6
+        #                                             'term2', 2, 4, 1, 3
+        #
+        filename = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv')
+        with open(filename, 'w') as f:
+            w = csv.writer(f)
+            for key, values in training_values:
+                my_list = ["'" + str(key) + "'"] + values
+                w.writerow(my_list)

         html_doc = f'''
@@ -226,7 +241,7 @@ def main(supplied_args):

         '''

-        base_file_name = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence)
+        base_file_name = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence)

         if args.normalised:
             base_file_name += '_normalised'
@@ -248,9 +263,9 @@ def main(supplied_args):
     main(sys.argv[1:])
     end = time.time()
     diff = int(end - start)
-    hours=diff//3600
-    minutes=diff//60
-    seconds=diff%60
+    hours = diff // 3600
+    minutes = diff // 60
+    seconds = diff % 60

     print('')
     print(f"pyGrams query took {hours}:{minutes:02d}:{seconds:02d} to complete")

diff --git a/scripts/pipeline.py b/scripts/pipeline.py
index 007e35e..1a8dea3 100644
--- a/scripts/pipeline.py
+++ b/scripts/pipeline.py
@@ -1,12 +1,11 @@
 import bz2
 import pickle
-import csv
 from os import makedirs, path

 from pandas import read_pickle
 from tqdm import tqdm

-import scripts.data_factory as datafactory
+import scripts.data_factory as data_factory
 import scripts.output_factory as output_factory
 import scripts.utils.date_utils
 from scripts.algorithms.emergence import Emergence
@@ -20,7 +19,6 @@
 from scripts.vandv.emergence_labels import map_prediction_to_emergence_label, report_predicted_emergence_labels_html
 from scripts.vandv.graphs import report_prediction_as_graphs_html
 from scripts.vandv.predictor import evaluate_prediction
-from tqdm import tqdm


 class Pipeline(object):
@@ -37,7 +35,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range

         # calculate or fetch tf-idf mat
         if pickled_tfidf_folder_name is None:
-            dataframe = datafactory.get(data_filename)
+            dataframe = data_factory.get(data_filename)
             utils.checkdf(dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)
             utils.remove_empty_documents(dataframe, text_header)
@@ -91,7 +89,8 @@ def pickle_object(short_name, obj):
         if self.__dates is not None:
             min_date = min(self.__dates)
             max_date = max(self.__dates)
-            print(f'Document year-week dates range from {min_date//100}-{(min_date%100):02d} to {max_date//100}-{(max_date%100):02d}')
+            print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
+                  f'to {max_date // 100}-{(max_date % 100):02d}')

         WordAnalyzer.init(
             tokenizer=LemmaTokenizer(),
@@ -141,7 +140,8 @@ def pickle_object(short_name, obj):

         tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)
         tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
-        print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')
+        print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
+              f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

         # todo: no advantage in classes - just create term_count and extract_ngrams as functions
@@ -152,7 +152,8 @@ def pickle_object(short_name, obj):
         self.__term_counts_data = self.__tfidf_reduce_obj.create_terms_count(self.__dates)
         # if other outputs
         self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
-        self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n)
+        self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
+                                                  WordAnalyzer.stemmed_stop_word_set_n)

         # todo: no output method; just if statements to call output functions...?
         # Only supply what they each directly require
@@ -241,7 +242,7 @@ def __init__(self, term_counts_data, m_steps_ahead=5, curves=True, nterms=50, mi
         file.write('Emergent\n')
         for tup in self.__emergence_list[:nterms]:
             print(tup[0] + ": " + str(tup[1]))
-            file.write(tup[0] + ": " + str(tup[1])+ '\n')
+            file.write(tup[0] + ": " + str(tup[1]) + '\n')
         print()
         file.write('\n')
         print('Stationary')
@@ -253,10 +254,10 @@ def __init__(self, term_counts_data, m_steps_ahead=5, curves=True, nterms=50, mi
         file.write('\n')

         print('Declining')
-        file.write('Declining'+ '\n')
+        file.write('Declining' + '\n')
         for tup in self.__emergence_list[-nterms:]:
             print(tup[0] + ": " + str(tup[1]))
-            file.write(tup[0] + ": " + str(tup[1])+ '\n')
+            file.write(tup[0] + ": " + str(tup[1]) + '\n')
         print()
         file.write('\n')
         # construct a terms list for n emergent n stationary? n declining
@@ -288,19 +289,6 @@ def run(self, predictors_to_run, emergence, normalized=False, train_test=False):
         predicted_emergence = map_prediction_to_emergence_label(results, training_values, test_values,
                                                                 predictors_to_run, test_terms=terms)

-        # save training_values to csv file
-        #
-        # training_values:                            csv file:
-        # {'term1': [0,2,4,6], 'term2': [2,4,1,3]}    'term1', 0, 2, 4, 6
-        #                                             'term2', 2, 4, 1, 3
-        #
-        filename = path.join('outputs', 'emergence', emergence + '_time_series.csv')
-        with open(filename, 'w') as f:
-            w = csv.writer(f)
-            for key, values in training_values.items():
-                my_list = ["'" + str(key) + "'"] + values
-                w.writerow(my_list)
-
         html_results += report_predicted_emergence_labels_html(predicted_emergence)

         html_results += report_prediction_as_graphs_html(results, predictors_to_run, self.__weekly_iso_dates,
@@ -309,4 +297,4 @@ def run(self, predictors_to_run, emergence, normalized=False, train_test=False):
                                                          normalised=normalized,
                                                          test_forecasts=train_test)

-        return html_results
+        return html_results, training_values.items()
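
Two interface changes matter here. First, Pipeline.run() now returns the pair (html_results, training_values.items()), so the CSV export moves out of the pipeline and up into pygrams.py; because an items view already yields (key, value) pairs, the caller iterates it without a further .items() call. A self-contained sketch of that shape:

    # stand-in for the second element returned by Pipeline.run()
    training_values = {'term1': [0, 2, 4, 6], 'term2': [2, 4, 1, 3]}.items()

    for term, counts in training_values:  # dict_items yields (key, value) pairs
        print(term, counts)

Second, the relocated export calls bare path.join although pygrams.py imports the whole os module, and this patch also rewrites base_file_name from os.path.join to path.join; the subject of the next patch ("change path.join to os.path.join x2") indicates these two call sites are exactly what had to be corrected. Separately, in the tidied timing code, minutes = diff // 60 is total elapsed minutes, so the H:MM:SS report overstates once a run passes an hour; a corrected split (an editorial suggestion, not part of the patch) would be:

    hours = diff // 3600
    minutes = (diff % 3600) // 60  # minutes within the current hour, not total minutes
    seconds = diff % 60
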
From b18fd34e07f0d61bc47372254aac9fb5667b770f Mon Sep 17 00:00:00 2001
From: user624086
Date: Tue, 4 Jun 2019 14:54:36 +0100
Subject: [PATCH 4/4] Update pygrams.py

change path.join to os.path.join x2
---
 pygrams.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pygrams.py b/pygrams.py
index 890f135..298aa3c 100644
--- a/pygrams.py
+++ b/pygrams.py
@@ -221,7 +221,7 @@ def main(supplied_args):
         # {'term1': [0,2,4,6], 'term2': [2,4,1,3]}    'term1', 0, 2, 4, 6
         #                                             'term2', 2, 4, 1, 3
         #
-        filename = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv')
+        filename = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv')
         with open(filename, 'w') as f:
             w = csv.writer(f)
             for key, values in training_values:
@@ -241,7 +241,7 @@ def main(supplied_args):

         '''

-        base_file_name = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence)
+        base_file_name = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence)

         if args.normalised:
             base_file_name += '_normalised'
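
With the full series applied, each emergence run writes outputs/emergence/<outputs_name>_<emergence>_time_series.csv next to its HTML report. A sketch of loading one back for inspection; the filename is a hypothetical example, the values are assumed numeric, and strip("'") undoes the literal apostrophes the writer wraps around each term:

    import csv

    filename = 'outputs/emergence/out_emergent_time_series.csv'  # hypothetical

    time_series = {}
    with open(filename, newline='') as f:
        for row in csv.reader(f):
            if not row:
                continue  # skip blank rows left by writers opened without newline=''
            term = row[0].strip("'")  # terms were saved as "'term'"
            time_series[term] = [float(v) for v in row[1:]]
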