save time series to file #270

Merged
merged 9 commits on Jun 4, 2019
pygrams.py: 35 changes (25 additions, 10 deletions)
@@ -1,4 +1,5 @@
import argparse
+import csv
import os
import sys
import time
@@ -20,18 +21,19 @@ def get_args(command_line_arguments):
conflict_handler='resolve')  # allows overriding of arguments

# suppressed:________________________________________
parser.add_argument("-tc", "--term-counts", default=False, action="store_true", help=argparse.SUPPRESS)
parser.add_argument("-tc", "--term-counts", default=False, action="store_true", help=argparse.SUPPRESS)
parser.add_argument("-ih", "--id_header", default=None, help=argparse.SUPPRESS)
parser.add_argument("-c", "--cite", default=False, action="store_true", help=argparse.SUPPRESS)
parser.add_argument("-pt", "--path", default='data', help=argparse.SUPPRESS)
parser.add_argument("-nmf", "--n_nmf_topics", type=int, default=0, help=argparse.SUPPRESS)
# help="NMF topic modelling - number of topics (e.g. 20 or 40)")
# help="NMF topic modelling - number of topics (e.g. 20 or 40)")

# Focus source and function
parser.add_argument("-f", "--focus", default=None, choices=['set', 'chi2', 'mutual'],
help=argparse.SUPPRESS)
parser.add_argument("-fs", "--focus_source", default='USPTO-random-1000.pkl.bz2', help=argparse.SUPPRESS)
parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'), help=argparse.SUPPRESS)
parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'),
help=argparse.SUPPRESS)

parser.add_argument("-j", "--json", default=True, action="store_true",
help=argparse.SUPPRESS)
@@ -142,7 +144,6 @@ def get_args(command_line_arguments):


def main(supplied_args):
-
paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds'),
os.path.join('outputs', 'table'), os.path.join('outputs', 'emergence')]
for path in paths:
@@ -157,7 +158,7 @@ def main(supplied_args):
outputs.append('report')
if args.term_counts:
outputs.append('termcounts')
-if args.n_nmf_topics >0:
+if args.n_nmf_topics > 0:
outputs.append('nmf')

docs_mask_dict = argscheck.get_docs_mask_dict()
@@ -210,8 +211,22 @@ def main(supplied_args):

title += f' ({emergence})'

-html_results = pipeline_emtech.run(predictors_to_run, normalized=args.normalised, train_test=args.test,
-                                   emergence=emergence)
+html_results, training_values = pipeline_emtech.run(predictors_to_run, normalized=args.normalised,
+                                                    train_test=args.test,
+                                                    emergence=emergence)

# save training_values to csv file
#
# training_values: csv file:
# {'term1': [0,2,4,6], 'term2': [2,4,1,3]} 'term1', 0, 2, 4, 6
# 'term2', 2, 4, 1, 3
#
filename = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv')
with open(filename, 'w') as f:
w = csv.writer(f)
for key, values in training_values:
my_list = ["'" + str(key) + "'"] + values
w.writerow(my_list)

html_doc = f'''<!DOCTYPE html>
<html lang="en">
@@ -248,9 +263,9 @@ def main(supplied_args):
main(sys.argv[1:])
end = time.time()
diff = int(end - start)
-hours=diff//3600
-minutes=diff//60
-seconds=diff%60
+hours = diff // 3600
+minutes = diff // 60
+seconds = diff % 60

print('')
print(f"pyGrams query took {hours}:{minutes:02d}:{seconds:02d} to complete")
scripts/pipeline.py: 26 changes (15 additions, 11 deletions)
@@ -1,9 +1,11 @@
import bz2
import pickle

from os import makedirs, path

from pandas import read_pickle
-import scripts.data_factory as datafactory
+from tqdm import tqdm
+
+import scripts.data_factory as data_factory
import scripts.output_factory as output_factory
import scripts.utils.date_utils
from scripts.algorithms.emergence import Emergence
@@ -17,7 +19,6 @@
from scripts.vandv.emergence_labels import map_prediction_to_emergence_label, report_predicted_emergence_labels_html
from scripts.vandv.graphs import report_prediction_as_graphs_html
from scripts.vandv.predictor import evaluate_prediction
-from tqdm import tqdm


class Pipeline(object):
@@ -34,7 +35,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
# calculate or fetch tf-idf mat
if pickled_tfidf_folder_name is None:

-dataframe = datafactory.get(data_filename)
+dataframe = data_factory.get(data_filename)
utils.checkdf(dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)
utils.remove_empty_documents(dataframe, text_header)

@@ -88,7 +89,8 @@ def pickle_object(short_name, obj):
if self.__dates is not None:
min_date = min(self.__dates)
max_date = max(self.__dates)
-print(f'Document year-week dates range from {min_date//100}-{(min_date%100):02d} to {max_date//100}-{(max_date%100):02d}')
+print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
+      f'to {max_date // 100}-{(max_date % 100):02d}')
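The divisions by 100 work because each date is packed as a single integer, year * 100 + week (e.g. 201923 for week 23 of 2019). A tiny illustration, with a helper name that is not in the codebase:

    def unpack_year_week(date_int):
        # 201923 -> (2019, 23)
        return date_int // 100, date_int % 100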

WordAnalyzer.init(
tokenizer=LemmaTokenizer(),
@@ -138,7 +140,8 @@ def pickle_object(short_name, obj):
tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
-print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')
+print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
+      f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

# todo: no advantage in classes - just create term_count and extract_ngrams as functions

@@ -149,7 +152,8 @@ def pickle_object(short_name, obj):
self.__term_counts_data = self.__tfidf_reduce_obj.create_terms_count(self.__dates)
# if other outputs
self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
-self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n)
+self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
+                                          WordAnalyzer.stemmed_stop_word_set_n)

# todo: no output method; just if statements to call output functions...?
# Only supply what they each directly require
@@ -238,7 +242,7 @@ def __init__(self, term_counts_data, m_steps_ahead=5, curves=True, nterms=50, mi
file.write('Emergent\n')
for tup in self.__emergence_list[:nterms]:
print(tup[0] + ": " + str(tup[1]))
-file.write(tup[0] + ": " + str(tup[1])+ '\n')
+file.write(tup[0] + ": " + str(tup[1]) + '\n')
print()
file.write('\n')
print('Stationary')
@@ -250,10 +254,10 @@ def __init__(self, term_counts_data, m_steps_ahead=5, curves=True, nterms=50, mi
file.write('\n')

print('Declining')
-file.write('Declining'+ '\n')
+file.write('Declining' + '\n')
for tup in self.__emergence_list[-nterms:]:
print(tup[0] + ": " + str(tup[1]))
-file.write(tup[0] + ": " + str(tup[1])+ '\n')
+file.write(tup[0] + ": " + str(tup[1]) + '\n')
print()
file.write('\n')
# construct a terms list for n emergent n stationary? n declining
@@ -293,4 +297,4 @@ def run(self, predictors_to_run, emergence, normalized=False, train_test=False):
normalised=normalized,
test_forecasts=train_test)

-return html_results
+return html_results, training_values.items()
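Returning training_values.items() hands the caller an iterable of (term, series) pairs, which is exactly what the new CSV loop in pygrams.py unpacks. A minimal illustration, reusing the example data from the pygrams.py comment above:

    training_values = {'term1': [0, 2, 4, 6], 'term2': [2, 4, 1, 3]}
    for term, series in training_values.items():
        print(term, series)  # term1 [0, 2, 4, 6], then term2 [2, 4, 1, 3]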