save time series to file #270

Merged
merged 9 commits on Jun 4, 2019
pygrams.py: 35 changes (25 additions, 10 deletions)
@@ -1,4 +1,5 @@
import argparse
+import csv
import os
import sys
import time
@@ -20,18 +21,19 @@ def get_args(command_line_arguments):
conflict_handler='resolve')  # allows overriding of arguments

# suppressed:________________________________________
parser.add_argument("-tc", "--term-counts", default=False, action="store_true", help=argparse.SUPPRESS)
parser.add_argument("-tc", "--term-counts", default=False, action="store_true", help=argparse.SUPPRESS)
parser.add_argument("-ih", "--id_header", default=None, help=argparse.SUPPRESS)
parser.add_argument("-c", "--cite", default=False, action="store_true", help=argparse.SUPPRESS)
parser.add_argument("-pt", "--path", default='data', help=argparse.SUPPRESS)
parser.add_argument("-nmf", "--n_nmf_topics", type=int, default=0, help=argparse.SUPPRESS)
# help="NMF topic modelling - number of topics (e.g. 20 or 40)")
# help="NMF topic modelling - number of topics (e.g. 20 or 40)")

# Focus source and function
parser.add_argument("-f", "--focus", default=None, choices=['set', 'chi2', 'mutual'],
help=argparse.SUPPRESS)
parser.add_argument("-fs", "--focus_source", default='USPTO-random-1000.pkl.bz2', help=argparse.SUPPRESS)
parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'), help=argparse.SUPPRESS)
parser.add_argument("-tn", "--table_name", default=os.path.join('outputs', 'table', 'table.xlsx'),
help=argparse.SUPPRESS)

parser.add_argument("-j", "--json", default=True, action="store_true",
help=argparse.SUPPRESS)
@@ -142,7 +144,6 @@ def get_args(command_line_arguments):


def main(supplied_args):
-
paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds'),
os.path.join('outputs', 'table'), os.path.join('outputs', 'emergence')]
for path in paths:
@@ -157,7 +158,7 @@ def main(supplied_args):
outputs.append('report')
if args.term_counts:
outputs.append('termcounts')
-if args.n_nmf_topics >0:
+if args.n_nmf_topics > 0:
outputs.append('nmf')

docs_mask_dict = argscheck.get_docs_mask_dict()
@@ -210,8 +211,22 @@ def main(supplied_args):

title += f' ({emergence})'

-html_results = pipeline_emtech.run(predictors_to_run, normalized=args.normalised, train_test=args.test,
-                                   emergence=emergence)
+html_results, training_values = pipeline_emtech.run(predictors_to_run, normalized=args.normalised,
+                                                    train_test=args.test,
+                                                    emergence=emergence)

# save training_values to csv file
#
# training_values: csv file:
# {'term1': [0,2,4,6], 'term2': [2,4,1,3]} 'term1', 0, 2, 4, 6
# 'term2', 2, 4, 1, 3
#
filename = os.path.join('outputs', 'emergence', args.outputs_name + '_' + emergence + '_time_series.csv')
with open(filename, 'w') as f:
w = csv.writer(f)
for key, values in training_values:
my_list = ["'" + str(key) + "'"] + values
w.writerow(my_list)

html_doc = f'''<!DOCTYPE html>
<html lang="en">
@@ -248,9 +263,9 @@ def main(supplied_args):
main(sys.argv[1:])
end = time.time()
diff = int(end - start)
-hours=diff//3600
-minutes=diff//60
-seconds=diff%60
+hours = diff // 3600
+minutes = diff // 60
+seconds = diff % 60

print('')
print(f"pyGrams query took {hours}:{minutes:02d}:{seconds:02d} to complete")
scripts/pipeline.py: 26 changes (15 additions, 11 deletions)
@@ -1,9 +1,11 @@
import bz2
import pickle

from os import makedirs, path

from pandas import read_pickle
-import scripts.data_factory as datafactory
+from tqdm import tqdm
+
+import scripts.data_factory as data_factory
import scripts.output_factory as output_factory
import scripts.utils.date_utils
from scripts.algorithms.emergence import Emergence
@@ -17,7 +19,6 @@
from scripts.vandv.emergence_labels import map_prediction_to_emergence_label, report_predicted_emergence_labels_html
from scripts.vandv.graphs import report_prediction_as_graphs_html
from scripts.vandv.predictor import evaluate_prediction
-from tqdm import tqdm


class Pipeline(object):
@@ -34,7 +35,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
# calculate or fetch tf-idf mat
if pickled_tfidf_folder_name is None:

-dataframe = datafactory.get(data_filename)
+dataframe = data_factory.get(data_filename)
utils.checkdf(dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)
utils.remove_empty_documents(dataframe, text_header)

@@ -88,7 +89,8 @@ def pickle_object(short_name, obj):
if self.__dates is not None:
min_date = min(self.__dates)
max_date = max(self.__dates)
-print(f'Document year-week dates range from {min_date//100}-{(min_date%100):02d} to {max_date//100}-{(max_date%100):02d}')
+print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
+      f'to {max_date // 100}-{(max_date % 100):02d}')
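The divisions by 100 work because each date is packed as a single integer, year * 100 + week (e.g. 201923 for week 23 of 2019). A tiny illustration, with a helper name that is not in the codebase:

    def unpack_year_week(date_int):
        # 201923 -> (2019, 23)
        return date_int // 100, date_int % 100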

WordAnalyzer.init(
tokenizer=LemmaTokenizer(),
@@ -138,7 +140,8 @@ def pickle_object(short_name, obj):
tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
-print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')
+print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
+      f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

# todo: no advantage in classes - just create term_count and extract_ngrams as functions

@@ -149,7 +152,8 @@ def pickle_object(short_name, obj):
self.__term_counts_data = self.__tfidf_reduce_obj.create_terms_count(self.__dates)
# if other outputs
self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
-self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n)
+self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
+                                          WordAnalyzer.stemmed_stop_word_set_n)

# todo: no output method; just if statements to call output functions...?
# Only supply what they each directly require
@@ -238,7 +242,7 @@ def __init__(self, term_counts_data, m_steps_ahead=5, curves=True, nterms=50, mi
file.write('Emergent\n')
for tup in self.__emergence_list[:nterms]:
print(tup[0] + ": " + str(tup[1]))
-file.write(tup[0] + ": " + str(tup[1])+ '\n')
+file.write(tup[0] + ": " + str(tup[1]) + '\n')
print()
file.write('\n')
print('Stationary')
@@ -250,10 +254,10 @@ def __init__(self, term_counts_data, m_steps_ahead=5, curves=True, nterms=50, mi
file.write('\n')

print('Declining')
-file.write('Declining'+ '\n')
+file.write('Declining' + '\n')
for tup in self.__emergence_list[-nterms:]:
print(tup[0] + ": " + str(tup[1]))
-file.write(tup[0] + ": " + str(tup[1])+ '\n')
+file.write(tup[0] + ": " + str(tup[1]) + '\n')
print()
file.write('\n')
# construct a terms list for n emergent n stationary? n declining
@@ -293,4 +297,4 @@ def run(self, predictors_to_run, emergence, normalized=False, train_test=False):
normalised=normalized,
test_forecasts=train_test)

-return html_results
+return html_results, training_values.items()
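Returning training_values.items() hands the caller an iterable of (term, series) pairs, which is exactly what the new CSV loop in pygrams.py unpacks. A minimal illustration, reusing the example data from the pygrams.py comment above:

    training_values = {'term1': [0, 2, 4, 6], 'term2': [2, 4, 1, 3]}
    for term, series in training_values.items():
        print(term, series)  # term1 [0, 2, 4, 6], then term2 [2, 4, 1, 3]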