Skip to content

Commit

Permalink
JSON config file now generated (#52)
Browse files Browse the repository at this point in the history
* Renamed fdg folder to visuals, matched source code to new folder #49
Removed unnecessary folder creation in outputs
* Implemented JSON output and tested via Mocks (#47)
  • Loading branch information
IanGrimstead authored Sep 12, 2018
1 parent e6e9f31 commit d49cc67
Show file tree
Hide file tree
Showing 9 changed files with 186 additions and 51 deletions.
134 changes: 84 additions & 50 deletions detect.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import argparse
import json
import os
import sys

from pandas import Timestamp, read_pickle, ExcelWriter

Expand All @@ -24,7 +26,7 @@ def year2pandas_earliest_date(year_in):
return Timestamp(year_string)


def get_args():
def get_args(command_line_arguments):
parser = argparse.ArgumentParser(description="create report, wordcloud, and fdg graph for patent texts")

parser.add_argument("-f", "--focus", default=False, action="store_true",
Expand All @@ -36,7 +38,8 @@ def get_args():
help="options are <median> <max> <sum> <avg> defaults to sum. Average is over non zero values")
parser.add_argument("-o", "--output", default='report', choices=['fdg', 'wordcloud', 'report', 'table', 'all'],
help="options are: <fdg> <wordcloud> <report> <table> <all>")

parser.add_argument("-j", "--json", default=False, action="store_true",
help="Output configuration as JSON file alongside output report")
parser.add_argument("-yf", "--year_from", type=int, default=2000, help="The first year for the patent cohort")
parser.add_argument("-yt", "--year_to", type=int, default=0, help="The last year for the patent cohort (0 is now)")

Expand Down Expand Up @@ -66,7 +69,7 @@ def get_args():

parser.add_argument("-nltk", "--nltk_path", default=None, help="custom path for NLTK data")

args = parser.parse_args()
args = parser.parse_args(command_line_arguments)
return args


Expand Down Expand Up @@ -104,55 +107,15 @@ def check_cpc_between_years(args, df):
exit(0)


# Build a TFIDF model from one patent pickle file, restricted to the requested year range and
# (optionally) a CPC classification.
# NOTE(review): this span is a diff view with both revisions interleaved. The first `def` line and
# the first `df = ...` line use the old parameter name `filename`; the line directly after each one
# is its replacement using `pickle_file_name`. Only one line of each pair belongs in the real file.
def get_tfidf(args, filename, cpc):
def get_tfidf(args, pickle_file_name, cpc):
# Convert the year bounds from the command line into date objects for filtering.
date_from = year2pandas_earliest_date(args.year_from)
date_to = year2pandas_latest_date(args.year_to)

# Load the pickled patents, filtered by classification and date range.
df = PatentsPickle2DataFrame(filename, classification=cpc, date_from=date_from, date_to=date_to).data_frame
df = PatentsPickle2DataFrame(pickle_file_name, classification=cpc, date_from=date_from, date_to=date_to).data_frame
# NOTE(review): check_cpc_between_years ends with exit(0) on some path, so this call may
# terminate the process instead of returning -- confirm the exact condition in that function.
check_cpc_between_years(args, df)
# The n-gram range comes straight from the min_n / max_n command-line arguments.
return TFIDF(df, tokenizer=LemmaTokenizer(), ngram_range=(args.min_n, args.max_n))


# Command-line entry point.
# NOTE(review): this appears to be the pre-commit version of main() (the removed side of the diff
# hunk); a later definition of main() on this page calls get_args(sys.argv[1:]) and adds the JSON
# configuration step. Confirm which side is wanted before editing this one.
def main():
# Make sure every output folder exists before anything is written.
paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'json'), os.path.join('outputs', 'wordclouds'),
os.path.join('outputs', 'table')]
for path in paths:
os.makedirs(path, exist_ok=True)

# Parse and validate the command-line arguments.
args = get_args()
checkargs(args)

# Optionally register a custom NLTK data directory; nltk is imported only when needed.
if args.nltk_path:
import nltk
nltk.data.path.append(args.nltk_path)

# TFIDF for the main patent source, filtered by the requested CPC classification.
path = os.path.join('data', args.patent_source + ".pkl.bz2")
tfidf = get_tfidf(args, path, args.cpc_classification)

# A second TFIDF (no CPC filter) is built only for focus mode or the table output.
newtfidf = None
if args.focus or args.output == 'table':
path2 = os.path.join('data', args.focus_source + ".pkl.bz2")
newtfidf = get_tfidf(args, path2, None)

# Citation counts are loaded only when requested.
citation_count_dict = None
if args.cite:
citation_count_dict = load_citation_count_dict()

out = args.output

# Passed to run_report / run_table, where it scales the number of n-grams requested.
ngram_multiplier = 4

# NOTE(review): 'all' is already matched by the wordcloud branch of this if/elif chain, so the
# `or out == 'all'` on the table branch can never be true -- run_table is skipped for 'all'.
if out == 'report':
run_report(args, ngram_multiplier, tfidf, newtfidf, citation_count_dict=citation_count_dict)
elif out == 'wordcloud' or out == 'all':
run_report(args, ngram_multiplier, tfidf, newtfidf, wordclouds=True, citation_count_dict=citation_count_dict)
elif out == 'table' or out == 'all':
run_table(args, ngram_multiplier, tfidf, newtfidf, citation_count_dict)

# The force-directed graph is a separate check, so it does run for 'all'.
if out == 'fdg' or out == 'all':
run_fdg(args, tfidf, newtfidf)


def load_citation_count_dict():
citation_count_dict = read_pickle(FilePaths.us_patents_citation_dictionary_1of2_pickle_name)
citation_count_dict_pt2 = read_pickle(FilePaths.us_patents_citation_dictionary_2of2_pickle_name)
Expand All @@ -178,11 +141,11 @@ def run_report(args, ngram_multiplier, tfidf, tfidf_random=None, wordclouds=Fals
number_of_ngrams_to_return=ngram_multiplier * num_ngrams,
pick=args.pick, time=args.time,
citation_count_dict=citation_count_dict)
set_terms = set(terms) if not args.focus \
else tfidf.detect_popular_ngrams_in_corpus_excluding_common(tfidf_random,
number_of_ngrams_to_return=ngram_multiplier * num_ngrams,
pick=args.pick, time=args.time,
citation_count_dict=citation_count_dict)
set_terms = set(terms) if not args.focus else \
tfidf.detect_popular_ngrams_in_corpus_excluding_common(tfidf_random,
number_of_ngrams_to_return=ngram_multiplier * num_ngrams,
pick=args.pick, time=args.time,
citation_count_dict=citation_count_dict)

dict_freqs = dict([((p[1]), p[0]) for p in ngrams_scores_tuple if p[1] in set_terms])

Expand All @@ -209,5 +172,76 @@ def run_fdg(args, tf_idf, tf_idf2=None):
graph.save_graph("key-terms", 'data')


def write_config_to_json(args, patent_pickle_file_name):
    """Save the run configuration as a JSON file next to the report.

    The JSON file name is the absolute report path with its extension
    replaced by ``.json``.

    Args:
        args: Parsed command-line namespace. Reads ``report_name``,
            ``year_from``, ``year_to``, ``cpc_classification``, ``pick``,
            ``time``, ``cite`` and ``focus``.
        patent_pickle_file_name: Path of the patent pickle file; recorded
            in the JSON as an absolute path.
    """
    data_path = os.path.abspath(patent_pickle_file_name)
    report_path = os.path.abspath(args.report_name)
    report_stem, _ = os.path.splitext(report_path)
    config_path = report_stem + '.json'

    # A missing CPC classification is recorded as an empty string, not null.
    cpc = args.cpc_classification
    if cpc is None:
        cpc = ''

    paths_section = {'data': data_path, 'tech_report': report_path}
    year_section = {'from': args.year_from, 'to': args.year_to}
    parameters_section = {
        'cpc': cpc,
        'pick': args.pick,
        'time': args.time,
        'cite': args.cite,
        'focus': args.focus,
    }
    config = {
        'paths': paths_section,
        'year': year_section,
        'parameters': parameters_section,
    }

    with open(config_path, 'w') as json_file:
        json.dump(config, json_file)


def main():
    """Run the pipeline selected on the command line.

    Steps, in order: create the output folders, parse and validate the
    arguments, optionally write the JSON configuration, build the TFIDF
    model(s), optionally load citation counts, then produce the requested
    outputs (``report``, ``wordcloud``, ``table``, ``fdg`` or ``all``).

    For ``all`` the report is produced with word clouds enabled, followed by
    the table and the force-directed graph.
    """
    output_folders = (
        os.path.join('outputs', 'reports'),
        os.path.join('outputs', 'wordclouds'),
        os.path.join('outputs', 'table'),
    )
    for folder in output_folders:
        os.makedirs(folder, exist_ok=True)

    args = get_args(sys.argv[1:])
    checkargs(args)

    patent_pickle_file_name = os.path.join('data', args.patent_source + ".pkl.bz2")

    if args.json:
        write_config_to_json(args, patent_pickle_file_name)

    if args.nltk_path:
        # Imported lazily so nltk is only touched when a custom path is given.
        import nltk
        nltk.data.path.append(args.nltk_path)

    tfidf = get_tfidf(args, patent_pickle_file_name, args.cpc_classification)

    out = args.output
    wants_table = out in ('table', 'all')

    # The focus-source TFIDF is needed for focus mode and for the table.
    # Previously it was not built for 'all', and the table step was never
    # reached for 'all' either (see below).
    focus_tfidf = None
    if args.focus or wants_table:
        focus_path = os.path.join('data', args.focus_source + ".pkl.bz2")
        focus_tfidf = get_tfidf(args, focus_path, None)

    # Report and graph keep receiving the focus TFIDF only in focus mode,
    # exactly as before; the table always receives it.
    newtfidf = focus_tfidf if args.focus else None

    citation_count_dict = None
    if args.cite:
        citation_count_dict = load_citation_count_dict()

    ngram_multiplier = 4

    if out == 'report':
        run_report(args, ngram_multiplier, tfidf, newtfidf, citation_count_dict=citation_count_dict)
    elif out in ('wordcloud', 'all'):
        run_report(args, ngram_multiplier, tfidf, newtfidf, wordclouds=True, citation_count_dict=citation_count_dict)

    # Independent check: in the old if/elif chain the wordcloud branch
    # consumed 'all', so the table branch was unreachable for 'all'.
    if wants_table:
        run_table(args, ngram_multiplier, tfidf, focus_tfidf, citation_count_dict)

    if out in ('fdg', 'all'):
        run_fdg(args, tfidf, newtfidf)


# Run the command-line pipeline only when this file is executed as a script, not when imported.
if __name__ == '__main__':
main()
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion scripts/visualization/graphs/fdgprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def __create_graph_json(self):
def save_graph(self, fname, varname):

graph = self.__create_graph_json()
file_name = os.path.join('outputs', 'fdg', fname + '.js')
file_name = os.path.join('outputs', 'visuals', fname + '.js')
with open(file_name, 'w') as js_temp:
js_temp.write(varname + " = '[")
json.dump(graph, js_temp)
Expand Down
101 changes: 101 additions & 0 deletions tests/test_detect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import os
import unittest
from unittest import mock

import detect


class TestDetect(unittest.TestCase):
    """Tests for ``detect`` argument parsing and JSON configuration output."""

    def _write_config_under_mocks(self, command_line, pickle_name):
        """Parse ``command_line`` and call ``write_config_to_json`` with I/O mocked.

        ``detect.open`` and ``detect.json.dump`` are both patched, so nothing
        touches the file system.

        Returns:
            A tuple ``(args, mock_open, dumped)`` where ``dumped`` is the first
            positional argument that was passed to ``json.dump``.
        """
        with mock.patch("detect.json.dump", create=True) as mock_json_dump, \
                mock.patch("detect.open", create=True) as mock_open:
            args = detect.get_args(command_line)
            detect.write_config_to_json(args, pickle_name)
            dumped = mock_json_dump.call_args[0][0]
        return args, mock_open, dumped

    def test_args_json_not_requested(self):
        parsed = detect.get_args([])
        self.assertFalse(parsed.json)

    def test_args_json_requested_short(self):
        parsed = detect.get_args(['-j'])
        self.assertTrue(parsed.json)

    def test_args_json_requested_long(self):
        parsed = detect.get_args(['--json'])
        self.assertTrue(parsed.json)

    def test_args_report_name_requested_long(self):
        parsed = detect.get_args(['--report_name=my/test/name.txt'])
        self.assertEqual('my/test/name.txt', parsed.report_name)

    def test_args_patent_source_requested_long(self):
        parsed = detect.get_args(['--patent_source=my-test'])
        self.assertEqual('my-test', parsed.patent_source)

    def test_json_configuration_encoding_minimal(self):
        # Only -j and the report name are given, so every other value is a default.
        root = os.path.abspath(os.sep)
        patent_pickle_file_name = 'test.pkl'
        report_file_name = os.path.join(root, 'dummy', 'test.txt')
        json_file_name = os.path.join(root, 'dummy', 'test.json')

        args, mock_open, actual_json = self._write_config_under_mocks(
            ['-j', f'--report_name={report_file_name}'], patent_pickle_file_name)

        self.assertTrue(args.json)
        mock_open.assert_called_with(json_file_name, 'w')

        expected_paths = {
            'data': os.path.abspath('test.pkl'),
            'tech_report': report_file_name,
        }
        expected_parameters = {
            'cite': False,
            'cpc': '',
            'focus': False,
            'pick': 'sum',
            'time': False,
        }
        expected_json = {
            'paths': expected_paths,
            'year': {'from': 2000, 'to': 0},
            'parameters': expected_parameters,
        }
        self.assertEqual(expected_json, actual_json)

    def test_json_configuration_encoding_maximal(self):
        # Every option recorded in the JSON file is set explicitly.
        root = os.path.abspath(os.sep)
        patent_pickle_file_name = os.path.join('dummy', 'test.pkl')
        report_file_name = os.path.join(root, 'dummy', 'test.txt')
        json_file_name = os.path.join(root, 'dummy', 'test.json')
        command_line = ['-j', f'--report_name={report_file_name}', '-c', '-t', '-f', '-p=max', '-cpc=Y12',
                        '-yf=1998', '-yt=2001']

        args, mock_open, actual_json = self._write_config_under_mocks(command_line, patent_pickle_file_name)

        self.assertTrue(args.json)
        mock_open.assert_called_with(json_file_name, 'w')

        expected_paths = {
            'data': os.path.abspath(patent_pickle_file_name),
            'tech_report': report_file_name,
        }
        expected_parameters = {
            'cite': True,
            'cpc': 'Y12',
            'focus': True,
            'pick': 'max',
            'time': True,
        }
        expected_json = {
            'paths': expected_paths,
            'year': {'from': 1998, 'to': 2001},
            'parameters': expected_parameters,
        }
        self.assertEqual(expected_json, actual_json)


# Allow the test module to be run directly as a script, not only via a test runner.
if __name__ == '__main__':
unittest.main()

0 comments on commit d49cc67

Please sign in to comment.