From 9972bcdcbfefbe3ec23b1cf991ae394daae8d867 Mon Sep 17 00:00:00 2001
From: user624086
Date: Thu, 16 May 2019 16:28:44 +0100
Subject: [PATCH 1/4] initial commit

---
 scripts/pipeline.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scripts/pipeline.py b/scripts/pipeline.py
index be8e0b6..f4dfdc8 100644
--- a/scripts/pipeline.py
+++ b/scripts/pipeline.py
@@ -1,5 +1,6 @@
 import bz2
 import pickle
+import csv
 from os import makedirs, path

 from pandas import read_pickle
@@ -278,6 +279,19 @@ def run(self, predictors_to_run, emergence, normalized=False, train_test=False):
         predicted_emergence = map_prediction_to_emergence_label(results, training_values, test_values,
                                                                 predictors_to_run, test_terms=terms)

+        # save training_values to csv file
+        #
+        # training_values:                            csv file:
+        # {'term1': [0,2,4,6], 'term2': [2,4,1,3]}    'term1', 0, 2, 4, 6
+        #                                             'term2', 2, 4, 1, 3
+        #
+        filename = 'outputs/emergence/' + emergence + '_time_series.csv'
+        with open(filename, 'w') as f:
+            w = csv.writer(f)
+            for key, values in training_values.items():
+                my_list = ["'" + str(key) + "'"] + values
+                w.writerow(my_list)
+
         html_results += report_predicted_emergence_labels_html(predicted_emergence)

         html_results += report_prediction_as_graphs_html(results, predictors_to_run, self.__weekly_iso_dates,
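
A note on the writer added above: files handed to csv.writer should be opened with newline='' (per the csv module documentation; without it, extra blank rows appear on Windows), and wrapping each key in hand-built apostrophes stores the quote characters as data rather than relying on CSV quoting. A minimal sketch of a more conventional writer follows; write_time_series_csv is a hypothetical helper name, not part of pyGrams:

    import csv

    def write_time_series_csv(filename, training_values):
        # newline='' lets the csv module manage line endings (avoids blank
        # rows on Windows); csv.writer quotes fields itself when needed
        with open(filename, 'w', newline='') as f:
            w = csv.writer(f)
            for term, counts in training_values.items():
                w.writerow([term] + list(counts))

    # e.g. write_time_series_csv('outputs/emergence/emergent_time_series.csv',
    #                            {'term1': [0, 2, 4, 6], 'term2': [2, 4, 1, 3]})
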
20 or 40)") # Focus source and function parser.add_argument("-f", "--focus", default=None, choices=['set', 'chi2', 'mutual'], help=argparse.SUPPRESS) parser.add_argument("-fs", "--focus_source", default='USPTO-random-1000.pkl.bz2', help=argparse.SUPPRESS) - parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'), help=argparse.SUPPRESS) + parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'), + help=argparse.SUPPRESS) parser.add_argument("-j", "--json", default=True, action="store_true", help=argparse.SUPPRESS) @@ -142,7 +144,6 @@ def get_args(command_line_arguments): def main(supplied_args): - paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds'), os.path.join('outputs', 'table'), os.path.join('outputs', 'emergence')] for path in paths: @@ -157,7 +158,7 @@ def main(supplied_args): outputs.append('report') if args.term_counts: outputs.append('termcounts') - if args.n_nmf_topics >0: + if args.n_nmf_topics > 0: outputs.append('nmf') docs_mask_dict = argscheck.get_docs_mask_dict() @@ -210,8 +211,22 @@ def main(supplied_args): title += f' ({emergence})' - html_results = pipeline_emtech.run(predictors_to_run, normalized=args.normalised, train_test=args.test, - emergence=emergence) + html_results, training_values = pipeline_emtech.run(predictors_to_run, normalized=args.normalised, + train_test=args.test, + emergence=emergence) + + # save training_values to csv file + # + # training_values: csv file: + # {'term1': [0,2,4,6], 'term2': [2,4,1,3]} 'term1', 0, 2, 4, 6 + # 'term2', 2, 4, 1, 3 + # + filename = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv') + with open(filename, 'w') as f: + w = csv.writer(f) + for key, values in training_values: + my_list = ["'" + str(key) + "'"] + values + w.writerow(my_list) html_doc = f''' @@ -226,7 +241,7 @@ def main(supplied_args): ''' - base_file_name = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence) + base_file_name = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence) if args.normalised: base_file_name += '_normalised' @@ -248,9 +263,9 @@ def main(supplied_args): main(sys.argv[1:]) end = time.time() diff = int(end - start) - hours=diff//3600 - minutes=diff//60 - seconds=diff%60 + hours = diff // 3600 + minutes = diff // 60 + seconds = diff % 60 print('') print(f"pyGrams query took {hours}:{minutes:02d}:{seconds:02d} to complete") diff --git a/scripts/pipeline.py b/scripts/pipeline.py index 007e35e..1a8dea3 100644 --- a/scripts/pipeline.py +++ b/scripts/pipeline.py @@ -1,12 +1,11 @@ import bz2 import pickle -import csv from os import makedirs, path from pandas import read_pickle from tqdm import tqdm -import scripts.data_factory as datafactory +import scripts.data_factory as data_factory import scripts.output_factory as output_factory import scripts.utils.date_utils from scripts.algorithms.emergence import Emergence @@ -20,7 +19,6 @@ from scripts.vandv.emergence_labels import map_prediction_to_emergence_label, report_predicted_emergence_labels_html from scripts.vandv.graphs import report_prediction_as_graphs_html from scripts.vandv.predictor import evaluate_prediction -from tqdm import tqdm class Pipeline(object): @@ -37,7 +35,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range # calculate or fetch tf-idf mat if pickled_tfidf_folder_name is None: - dataframe = datafactory.get(data_filename) + dataframe = 
From f93506a575f59d2393eb96a33aacd3ede844b1d2 Mon Sep 17 00:00:00 2001
From: IanGrimstead
Date: Mon, 3 Jun 2019 14:53:12 +0100
Subject: [PATCH 3/4] Merged with develop and tidied to remove knowledge of path (#268)

---
 pygrams.py          | 37 ++++++++++++++++++++++++++-----------
 scripts/pipeline.py | 36 ++++++++++++------------------------
 2 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/pygrams.py b/pygrams.py
index a484add..70dc80b 100644
--- a/pygrams.py
+++ b/pygrams.py
@@ -1,4 +1,5 @@
 import argparse
+import csv
 import os
 import sys
 import time
@@ -20,18 +21,19 @@ def get_args(command_line_arguments):
                                      conflict_handler='resolve')  # allows overridng of arguments

     # suppressed:________________________________________
-    parser.add_argument("-tc", "--term-counts", default=False, action="store_true",  help=argparse.SUPPRESS)
+    parser.add_argument("-tc", "--term-counts", default=False, action="store_true", help=argparse.SUPPRESS)
     parser.add_argument("-ih", "--id_header", default=None, help=argparse.SUPPRESS)
     parser.add_argument("-c", "--cite", default=False, action="store_true", help=argparse.SUPPRESS)
     parser.add_argument("-pt", "--path", default='data', help=argparse.SUPPRESS)

     parser.add_argument("-nmf", "--n_nmf_topics", type=int, default=0, help=argparse.SUPPRESS)
     # help="NMF topic modelling - number of topics (e.g. 20 or 40)")

     # Focus source and function
     parser.add_argument("-f", "--focus", default=None, choices=['set', 'chi2', 'mutual'], help=argparse.SUPPRESS)
     parser.add_argument("-fs", "--focus_source", default='USPTO-random-1000.pkl.bz2', help=argparse.SUPPRESS)

-    parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'), help=argparse.SUPPRESS)
+    parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'),
+                        help=argparse.SUPPRESS)

     parser.add_argument("-j", "--json", default=True, action="store_true", help=argparse.SUPPRESS)
@@ -142,7 +144,6 @@ def get_args(command_line_arguments):


 def main(supplied_args):
-
     paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds'),
              os.path.join('outputs', 'table'), os.path.join('outputs', 'emergence')]
     for path in paths:
@@ -157,7 +158,7 @@ def main(supplied_args):
         outputs.append('report')
     if args.term_counts:
         outputs.append('termcounts')
-    if args.n_nmf_topics >0:
+    if args.n_nmf_topics > 0:
         outputs.append('nmf')

     docs_mask_dict = argscheck.get_docs_mask_dict()
@@ -210,8 +211,22 @@ def main(supplied_args):

         title += f' ({emergence})'

-        html_results = pipeline_emtech.run(predictors_to_run, normalized=args.normalised, train_test=args.test,
-                                           emergence=emergence)
+        html_results, training_values = pipeline_emtech.run(predictors_to_run, normalized=args.normalised,
+                                                            train_test=args.test,
+                                                            emergence=emergence)
+
+        # save training_values to csv file
+        #
+        # training_values:                            csv file:
+        # {'term1': [0,2,4,6], 'term2': [2,4,1,3]}    'term1', 0, 2, 4, 6
+        #                                             'term2', 2, 4, 1, 3
+        #
+        filename = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv')
+        with open(filename, 'w') as f:
+            w = csv.writer(f)
+            for key, values in training_values:
+                my_list = ["'" + str(key) + "'"] + values
+                w.writerow(my_list)

         html_doc = f'''
@@ -226,7 +241,7 @@ def main(supplied_args):

         '''

-        base_file_name = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence)
+        base_file_name = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence)

         if args.normalised:
             base_file_name += '_normalised'
@@ -248,9 +263,9 @@ def main(supplied_args):
     main(sys.argv[1:])
     end = time.time()
     diff = int(end - start)
-    hours=diff//3600
-    minutes=diff//60
-    seconds=diff%60
+    hours = diff // 3600
+    minutes = diff // 60
+    seconds = diff % 60

     print('')
     print(f"pyGrams query took {hours}:{minutes:02d}:{seconds:02d} to complete")

diff --git a/scripts/pipeline.py b/scripts/pipeline.py
index 007e35e..1a8dea3 100644
--- a/scripts/pipeline.py
+++ b/scripts/pipeline.py
@@ -1,12 +1,11 @@
 import bz2
 import pickle
-import csv
 from os import makedirs, path

 from pandas import read_pickle
 from tqdm import tqdm

-import scripts.data_factory as datafactory
+import scripts.data_factory as data_factory
 import scripts.output_factory as output_factory
 import scripts.utils.date_utils
 from scripts.algorithms.emergence import Emergence
@@ -20,7 +19,6 @@
 from scripts.vandv.emergence_labels import map_prediction_to_emergence_label, report_predicted_emergence_labels_html
 from scripts.vandv.graphs import report_prediction_as_graphs_html
 from scripts.vandv.predictor import evaluate_prediction
-from tqdm import tqdm


 class Pipeline(object):
@@ -37,7 +35,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range

         # calculate or fetch tf-idf mat
         if pickled_tfidf_folder_name is None:
-            dataframe = datafactory.get(data_filename)
+            dataframe = data_factory.get(data_filename)
             utils.checkdf(dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)
             utils.remove_empty_documents(dataframe, text_header)
@@ -91,7 +89,8 @@ def pickle_object(short_name, obj):
         if self.__dates is not None:
             min_date = min(self.__dates)
             max_date = max(self.__dates)
-            print(f'Document year-week dates range from {min_date//100}-{(min_date%100):02d} to {max_date//100}-{(max_date%100):02d}')
+            print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
+                  f'to {max_date // 100}-{(max_date % 100):02d}')

         WordAnalyzer.init(
             tokenizer=LemmaTokenizer(),
@@ -141,7 +140,8 @@ def pickle_object(short_name, obj):

         tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)
         tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
-        print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')
+        print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
+              f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

         # todo: no advantage in classes - just create term_count and extract_ngrams as functions
@@ -152,7 +152,8 @@ def pickle_object(short_name, obj):
         self.__term_counts_data = self.__tfidf_reduce_obj.create_terms_count(self.__dates)
         # if other outputs
         self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
-        self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n)
+        self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
+                                                  WordAnalyzer.stemmed_stop_word_set_n)

         # todo: no output method; just if statements to call output functions...?
         # Only supply what they each directly require
@@ -241,7 +242,7 @@ def __init__(self, term_counts_data, m_steps_ahead=5, curves=True, nterms=50, mi
         file.write('Emergent\n')
         for tup in self.__emergence_list[:nterms]:
             print(tup[0] + ": " + str(tup[1]))
-            file.write(tup[0] + ": " + str(tup[1])+ '\n')
+            file.write(tup[0] + ": " + str(tup[1]) + '\n')
         print()
         file.write('\n')
         print('Stationary')
@@ -253,10 +254,10 @@ def __init__(self, term_counts_data, m_steps_ahead=5, curves=True, nterms=50, mi
         file.write('\n')

         print('Declining')
-        file.write('Declining'+ '\n')
+        file.write('Declining' + '\n')
         for tup in self.__emergence_list[-nterms:]:
             print(tup[0] + ": " + str(tup[1]))
-            file.write(tup[0] + ": " + str(tup[1])+ '\n')
+            file.write(tup[0] + ": " + str(tup[1]) + '\n')
         print()
         file.write('\n')
         # construct a terms list for n emergent n stationary? n declining
@@ -288,19 +289,6 @@ def run(self, predictors_to_run, emergence, normalized=False, train_test=False):
         predicted_emergence = map_prediction_to_emergence_label(results, training_values, test_values,
                                                                 predictors_to_run, test_terms=terms)

-        # save training_values to csv file
-        #
-        # training_values:                            csv file:
-        # {'term1': [0,2,4,6], 'term2': [2,4,1,3]}    'term1', 0, 2, 4, 6
-        #                                             'term2', 2, 4, 1, 3
-        #
-        filename = path.join('outputs', 'emergence', emergence + '_time_series.csv')
-        with open(filename, 'w') as f:
-            w = csv.writer(f)
-            for key, values in training_values.items():
-                my_list = ["'" + str(key) + "'"] + values
-                w.writerow(my_list)
-
         html_results += report_predicted_emergence_labels_html(predicted_emergence)

         html_results += report_prediction_as_graphs_html(results, predictors_to_run, self.__weekly_iso_dates,
@@ -309,4 +297,4 @@ def run(self, predictors_to_run, emergence, normalized=False, train_test=False):
                                                          normalised=normalized,
                                                          test_forecasts=train_test)

-        return html_results
+        return html_results, training_values.items()
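
Two interface changes matter here. First, Pipeline.run() now returns the pair (html_results, training_values.items()), so the CSV export moves out of the pipeline and up into pygrams.py; because an items view already yields (key, value) pairs, the caller iterates it without a further .items() call. A self-contained sketch of that shape:

    # stand-in for the second element returned by Pipeline.run()
    training_values = {'term1': [0, 2, 4, 6], 'term2': [2, 4, 1, 3]}.items()

    for term, counts in training_values:  # dict_items yields (key, value) pairs
        print(term, counts)

Second, the relocated export calls bare path.join although pygrams.py imports the whole os module, and this patch also rewrites base_file_name from os.path.join to path.join; the subject of the next patch ("change path.join to os.path.join x2") indicates these two call sites are exactly what had to be corrected. Separately, in the tidied timing code, minutes = diff // 60 is total elapsed minutes, so the H:MM:SS report overstates once a run passes an hour; a corrected split (an editorial suggestion, not part of the patch) would be:

    hours = diff // 3600
    minutes = (diff % 3600) // 60  # minutes within the current hour, not total minutes
    seconds = diff % 60
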
From b18fd34e07f0d61bc47372254aac9fb5667b770f Mon Sep 17 00:00:00 2001
From: user624086
Date: Tue, 4 Jun 2019 14:54:36 +0100
Subject: [PATCH 4/4] Update pygrams.py

change path.join to os.path.join x2
---
 pygrams.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pygrams.py b/pygrams.py
index 890f135..298aa3c 100644
--- a/pygrams.py
+++ b/pygrams.py
@@ -221,7 +221,7 @@ def main(supplied_args):
         # {'term1': [0,2,4,6], 'term2': [2,4,1,3]}    'term1', 0, 2, 4, 6
         #                                             'term2', 2, 4, 1, 3
         #
-        filename = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv')
+        filename = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv')
         with open(filename, 'w') as f:
             w = csv.writer(f)
             for key, values in training_values:
@@ -241,7 +241,7 @@ def main(supplied_args):

         '''

-        base_file_name = path.join('outputs', 'emergence', args.outputs_name + '_' + emergence)
+        base_file_name = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence)

         if args.normalised:
             base_file_name += '_normalised'
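
With the full series applied, each emergence run writes outputs/emergence/<outputs_name>_<emergence>_time_series.csv next to its HTML report. A sketch of loading one back for inspection; the filename is a hypothetical example, the values are assumed numeric, and strip("'") undoes the literal apostrophes the writer wraps around each term:

    import csv

    filename = 'outputs/emergence/out_emergent_time_series.csv'  # hypothetical

    time_series = {}
    with open(filename, newline='') as f:
        for row in csv.reader(f):
            if not row:
                continue  # skip blank rows left by writers opened without newline=''
            term = row[0].strip("'")  # terms were saved as "'term'"
            time_series[term] = [float(v) for v in row[1:]]
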