Skip to content

Commit

Permalink
Merge branch 'develop' of https://github.com/datasciencecampus/pyGrams
Browse files Browse the repository at this point in the history
…into develop

# Conflicts:
#	pygrams.py
#	scripts/utils/utils.py
  • Loading branch information
IanGrimstead authored and IanGrimstead committed Sep 16, 2019
2 parents 9e25251 + 8cced07 commit 44e067c
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 71 deletions.
10 changes: 5 additions & 5 deletions scripts/output_factory.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import bz2
import json
import pickle
from bz2 import BZ2File
from os import makedirs, path
from pickle import dump

from scripts.nmf_wrapper import nmf_topic_modelling
from scripts.terms_graph import TermsGraph
Expand Down Expand Up @@ -32,16 +32,16 @@ def create(output_type, output, emergence_list=[], wordcloud_title=None, tfidf_r
graph.save_graph("key-terms", 'data')

elif output_type == 'wordcloud':
dict_freqs = dict([(p[0], (p[1])) for p in output])
dict_freqs = {p[0]: p[1] for p in output}
wordcloud = MultiCloudPlot(freqsin=dict_freqs, max_words=len(output))
filename_and_path = path.join('outputs', 'wordclouds', name)
wordcloud.plot_cloud(wordcloud_title, filename_and_path)

elif output_type == 'termcounts':
term_counts_filename = path.join('outputs', 'termcounts', name + '-term_counts.pkl.bz2')
makedirs(path.dirname(term_counts_filename), exist_ok=True)
with bz2.BZ2File(term_counts_filename, 'wb') as pickle_file:
pickle.dump(timeseries_data, pickle_file, protocol=4)
with BZ2File(term_counts_filename, 'wb') as pickle_file:
dump(timeseries_data, pickle_file, protocol=4)

elif output_type == 'json_config':
doc_pickle_file_name = path.abspath(doc_pickle_file_name)
Expand Down
1 change: 0 additions & 1 deletion scripts/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from os import path

from pandas import read_pickle
from scipy.signal import savgol_filter
from tqdm import tqdm

Expand Down
17 changes: 13 additions & 4 deletions scripts/utils/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import array as arr
import bz2
import pickle
from bz2 import BZ2File
from os import path, makedirs
from pickle import dump

import numpy as np
from gensim.models import KeyedVectors
Expand All @@ -27,8 +27,17 @@ def fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters):
def pickle_object(short_name, obj, folder_name):
    """Pickle *obj* to a bz2-compressed file in *folder_name*.

    The target folder is created if missing; the file path comes from
    pickle_name(), so it ends in '.pkl.bz2'.

    :param short_name: base name of the pickle file (without extension)
    :param obj: any picklable object to store
    :param folder_name: directory to write the pickle into
    """
    makedirs(folder_name, exist_ok=True)
    file_name = pickle_name(short_name, folder_name)
    # Protocol 4 is readable by Python 3.4+; fix_imports=False because
    # Python 2 compatibility is not required.
    with BZ2File(file_name, 'wb') as pickle_file:
        dump(obj, pickle_file, protocol=4, fix_imports=False)


def unpickle_object(short_name, folder_name):
    """Load and return an object previously stored with pickle_object()."""
    return read_pickle(pickle_name(short_name, folder_name))


def pickle_name(short_name, folder_name):
    """Build the '<folder_name>/<short_name>.pkl.bz2' path used by the pickle helpers."""
    return path.join(folder_name, f'{short_name}.pkl.bz2')


def unpickle_object( short_name, folder_name):
Expand Down
136 changes: 75 additions & 61 deletions tests/test_pygrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@
from scripts.utils.pygrams_exception import PygramsException


def bz2file_fake(file_name, state):
    """Stand-in for bz2.BZ2File in tests: a context manager yielding *file_name*.

    Lets tests capture the path a pickle was "written" to without touching
    the file system. Only write mode ('wb') is supported.

    :param file_name: path the caller tried to open
    :param state: open mode; must be 'wb'
    :return: a MagicMock usable as a context manager
    """
    assert state == 'wb', "Only supports file.open in write mode"
    m = MagicMock()
    # Entering the context yields the file name so callers can record
    # which file was "opened".  (The original assigned a throwaway Mock
    # here first — dead code, immediately overwritten.)
    m.__enter__.return_value = file_name
    # __exit__ must return a falsy value: the original returned a truthy
    # Mock(), which would silently swallow exceptions raised inside the
    # 'with' block and hide test failures.
    m.__exit__.return_value = False
    return m


class TestPyGrams(unittest.TestCase):
data_source_name = 'dummy.pkl.bz2'
out_name = 'out'
Expand Down Expand Up @@ -110,14 +119,6 @@ def open_fake_file(file_name, state):

mock_open.side_effect = open_fake_file

def bz2file_fake(file_name, state):
self.assertEqual(state, 'wb', "Only supports file.open in write mode")
m = MagicMock()
m.__enter__.return_value = Mock()
m.__exit__.return_value = Mock()
m.__enter__.return_value = file_name
return m

mock_bz2file.side_effect = bz2file_fake

def isfile_fake(file_name):
Expand All @@ -128,15 +129,18 @@ def isfile_fake(file_name):

mock_path_isfile.side_effect = isfile_fake

def assertTfidfOutputs(self, assert_func, mock_pickle_dump, mock_makedirs, max_df):
def assertTfidfOutputs(self, assert_func, mock_pickle_dump, mock_makedirs, max_df, min_date=200052,
max_date=200052):
self.assertTrue(self.publication_date_auto_tested)
self.assertTrue(self.patent_id_auto_tested)

mock_makedirs.assert_called_with(self.tfidfOutputFolder(self.out_name, max_df), exist_ok=True)
mock_makedirs.assert_called_with(self.tfidfOutputFolder(self.out_name, max_df, min_date, max_date),
exist_ok=True)

results_checked = False
expected_tfidf_file_name = self.tfidfFileName(self.out_name, max_df, min_date, max_date)
for dump_args in mock_pickle_dump.call_args_list:
if dump_args[0][1] == self.tfidfFileName(self.out_name, max_df):
if dump_args[0][1] == expected_tfidf_file_name:
tfidf_obj = dump_args[0][0]
assert_func(tfidf_matrix=tfidf_obj.tfidf_matrix, feature_names=tfidf_obj.feature_names)

Expand All @@ -150,10 +154,10 @@ def assertTimeSeriesOutputs(self, assert_func, mock_pickle_dump, mock_makedirs):
self.assertTrue(self.publication_date_auto_tested)
self.assertTrue(self.patent_id_auto_tested)

# mock_makedirs.assert_called_with(self.termCountsOutputFolder(), exist_ok=True)
expected_term_counts_filename = self.termCountsFileName(self.out_name)
results_checked = False
for dump_args in mock_pickle_dump.call_args_list:
if dump_args[0][1] == self.termCountsFileName(self.out_name):
if dump_args[0][1] == expected_term_counts_filename:
[term_counts_per_week, feature_names, number_of_documents_per_week, week_iso_dates] = dump_args[0][0]

assert_func(term_counts_per_week, feature_names, number_of_documents_per_week, week_iso_dates)
Expand All @@ -165,13 +169,13 @@ def assertTimeSeriesOutputs(self, assert_func, mock_pickle_dump, mock_makedirs):
self.fail('Term counts results were not matched - were filenames correct?')

@staticmethod
def tfidfOutputFolder(data_source_name, max_df):
return os.path.join('outputs', 'tfidf', data_source_name + f'-mdf-{max_df}')
def tfidfOutputFolder(data_source_name, max_df, min_date, max_date):
return os.path.join('cached', data_source_name + f'-mdf-{max_df}-{min_date}-{max_date}')

@staticmethod
def tfidfFileName(data_source_name, max_df):
return os.path.join(TestPyGrams.tfidfOutputFolder(data_source_name, max_df),
data_source_name + f'-mdf-{max_df}-tfidf.pkl.bz2')
def tfidfFileName(data_source_name, max_df, min_date, max_date):
return os.path.join(TestPyGrams.tfidfOutputFolder(data_source_name, max_df, min_date, max_date),
'tfidf.pkl.bz2')

@staticmethod
def termCountsOutputFolder():
Expand All @@ -182,10 +186,10 @@ def termCountsFileName(data_source_name):
return os.path.join(TestPyGrams.termCountsOutputFolder(), data_source_name + '-term_counts.pkl.bz2')

@mock.patch("scripts.data_factory.read_pickle", create=True)
@mock.patch("pickle.dump", create=True)
@mock.patch("scripts.utils.utils.dump", create=True)
@mock.patch("scripts.text_processing.open", create=True)
@mock.patch("bz2.BZ2File", create=True)
@mock.patch("scripts.pipeline.makedirs", create=True)
@mock.patch("scripts.utils.utils.BZ2File", create=True)
@mock.patch("scripts.utils.utils.makedirs", create=True)
@mock.patch("os.path.isfile", create=True)
def test_simple_output_tfidf(self, mock_path_isfile, mock_makedirs, mock_bz2file, mock_open, mock_pickle_dump,
mock_read_pickle):
Expand All @@ -207,29 +211,35 @@ def assert_tfidf_outputs(tfidf_matrix, feature_names):

self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs, max_df)

@mock.patch("scripts.pipeline.read_pickle", create=True)
@mock.patch("scripts.data_factory.read_pickle", create=True)
@mock.patch("pickle.dump", create=True)
@mock.patch("scripts.text_processing.open", create=True)
@mock.patch("bz2.BZ2File", create=True)
@mock.patch("scripts.pipeline.makedirs", create=True)
@mock.patch("scripts.utils.utils.read_pickle", create=True)
@mock.patch("scripts.utils.utils.dump", create=True)
@mock.patch("scripts.utils.utils.BZ2File", create=True)
@mock.patch("scripts.utils.utils.makedirs", create=True)
@mock.patch("scripts.output_factory.dump", create=True)
@mock.patch("scripts.output_factory.BZ2File", create=True)
@mock.patch("scripts.output_factory.makedirs", create=True)
@mock.patch("os.path.isfile", create=True)
def test_simple_output_tfidf_pickle_and_unpickle_and_write_to_timeseries(self, mock_path_isfile,
mock_output_makedirs,
mock_pipeline_makedirs, mock_bz2file,
mock_output_bz2file,
mock_output_pickle_dump,
mock_utils_makedirs,
mock_utils_bz2file,
mock_utils_pickle_dump,
mock_utils_read_pickle,
mock_open,
mock_pickle_dump,
mock_factory_read_pickle,
mock_pipeline_read_pickle):
mock_factory_read_pickle
):
fake_df_data = {
'abstract': [
'abstract'
]
}

# Make a note of the dumped TFIDF object for later
self.preparePyGrams(fake_df_data, mock_factory_read_pickle, mock_open, mock_bz2file, mock_path_isfile)
self.preparePyGrams(fake_df_data, mock_factory_read_pickle, mock_open, mock_utils_bz2file, mock_path_isfile)
args = ['-ds', self.data_source_name, '--date_header', 'publication_date', '--max_document_frequency', '1.0']
pygrams.main(args)

Expand All @@ -250,20 +260,17 @@ def find_matching_pickle(mock_pickle_dump, pickle_file_name):
return args[0][0]
return None

dumped_tfidf_file_name = os.path.join('outputs', 'tfidf', self.out_name + '-mdf-1.0',
self.out_name + '-mdf-1.0-tfidf.pkl.bz2')
self.dumped_tfidf = find_matching_pickle(mock_pickle_dump, dumped_tfidf_file_name)
dumped_tfidf_file_name = os.path.join('cached', self.out_name + '-mdf-1.0-200052-200052', 'tfidf.pkl.bz2')
self.dumped_tfidf = find_matching_pickle(mock_utils_pickle_dump, dumped_tfidf_file_name)

dumped_dates_file_name = os.path.join('outputs', 'tfidf', self.out_name + '-mdf-1.0',
self.out_name + '-mdf-1.0-dates.pkl.bz2')
self.dumped_dates = find_matching_pickle(mock_pickle_dump, dumped_dates_file_name)
dumped_dates_file_name = os.path.join('cached', self.out_name + '-mdf-1.0-200052-200052', 'dates.pkl.bz2')
self.dumped_dates = find_matching_pickle(mock_utils_pickle_dump, dumped_dates_file_name)

dumped_cpc_dict_file_name = os.path.join('outputs', 'tfidf', self.out_name + '-mdf-1.0',
self.out_name + '-mdf-1.0-cpc_dict.pkl.bz2')
self.dumped_cpc_dict = find_matching_pickle(mock_pickle_dump, dumped_cpc_dict_file_name)
dumped_cpc_dict_file_name = os.path.join('cached', self.out_name + '-mdf-1.0-200052-200052', 'cpc_dict.pkl.bz2')
self.dumped_cpc_dict = find_matching_pickle(mock_utils_pickle_dump, dumped_cpc_dict_file_name)

mock_factory_read_pickle.side_effect = factory_read_pickle_fake
mock_pickle_dump.reset_mock(return_value=True, side_effect=True)
mock_utils_pickle_dump.reset_mock(return_value=True, side_effect=True)

# Instead support TFIDF pickle read - and return the TFIDF object previously saved to disc
def pipeline_read_pickle_fake(pickle_file_name):
Expand All @@ -276,12 +283,13 @@ def pipeline_read_pickle_fake(pickle_file_name):
else:
self.fail(f'Should not be reading {pickle_file_name} via a factory if TFIDF was requested from pickle')

mock_pipeline_read_pickle.side_effect = pipeline_read_pickle_fake
mock_pipeline_read_pickle.return_value = self.dumped_tfidf
mock_output_bz2file.side_effect = bz2file_fake
mock_utils_read_pickle.side_effect = pipeline_read_pickle_fake
mock_utils_read_pickle.return_value = self.dumped_tfidf
args = ['-ds', self.data_source_name, '-ts', '-tc',
'--date_header',
'publication_date', '--max_document_frequency', '1.0',
'--input_tfidf', self.out_name + '-mdf-1.0']
'--use_cache', self.out_name + '-mdf-1.0-200052-200052']
pygrams.main(args)

def assert_timeseries_outputs(term_counts_per_week, feature_names, number_of_documents_per_week,
Expand All @@ -292,13 +300,13 @@ def assert_timeseries_outputs(term_counts_per_week, feature_names, number_of_doc
self.assertListEqual(number_of_documents_per_week, [1])
self.assertListEqual(week_iso_dates, [200052])

self.assertTimeSeriesOutputs(assert_timeseries_outputs, mock_pickle_dump, mock_output_makedirs)
self.assertTimeSeriesOutputs(assert_timeseries_outputs, mock_output_pickle_dump, mock_output_makedirs)

@mock.patch("scripts.data_factory.read_pickle", create=True)
@mock.patch("pickle.dump", create=True)
@mock.patch("scripts.utils.utils.dump", create=True)
@mock.patch("scripts.text_processing.open", create=True)
@mock.patch("bz2.BZ2File", create=True)
@mock.patch("scripts.pipeline.makedirs", create=True)
@mock.patch("scripts.utils.utils.BZ2File", create=True)
@mock.patch("scripts.utils.utils.makedirs", create=True)
@mock.patch("os.path.isfile", create=True)
def test_simple_two_patents_unigrams_only_output_tfidf(self, mock_path_isfile, mock_makedirs, mock_bz2file,
mock_open, mock_pickle_dump, mock_read_pickle):
Expand Down Expand Up @@ -339,17 +347,17 @@ def assert_tfidf_outputs(tfidf_matrix, feature_names):
self.assertListAlmostEqual(tfidf_as_lists[0], [l2norm_tfidf_abstract, l2norm_tfidf_one, 0], places=4)
self.assertListAlmostEqual(tfidf_as_lists[1], [l2norm_tfidf_abstract, 0, l2norm_tfidf_one], places=4)

self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs, max_df)
self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs, max_df, 200051, 200052)

"""
Extended from test_simple_two_patents_unigrams_only_output_tfidf - sets prefilter-terms to remove 'noise' terms
"""

@mock.patch("scripts.data_factory.read_pickle", create=True)
@mock.patch("pickle.dump", create=True)
@mock.patch("scripts.utils.utils.dump", create=True)
@mock.patch("scripts.text_processing.open", create=True)
@mock.patch("bz2.BZ2File", create=True)
@mock.patch("scripts.pipeline.makedirs", create=True)
@mock.patch("scripts.utils.utils.BZ2File", create=True)
@mock.patch("scripts.utils.utils.makedirs", create=True)
@mock.patch("os.path.isfile", create=True)
def test_simple_two_patents_unigrams_and_prefilter_only_output_tfidf(self, mock_path_isfile, mock_makedirs,
mock_bz2file, mock_open, mock_pickle_dump,
Expand All @@ -362,8 +370,8 @@ def test_simple_two_patents_unigrams_and_prefilter_only_output_tfidf(self, mock_
}
max_df = 1.0
self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open, mock_bz2file, mock_path_isfile)
args = ['-ds', self.data_source_name, '--date_header',
'publication_date', '--max_document_frequency', str(max_df), '--max_ngrams', '1',
args = ['-ds', self.data_source_name, '--date_header', 'publication_date',
'--max_document_frequency', str(max_df), '--max_ngrams', '1',
'--prefilter_terms', '1']

pygrams.main(args)
Expand All @@ -388,23 +396,28 @@ def assert_tfidf_outputs(tfidf_matrix, feature_names):
self.assertListAlmostEqual(tfidf_as_lists[0], [l2norm_tfidf_abstract], places=4)
self.assertListAlmostEqual(tfidf_as_lists[1], [l2norm_tfidf_abstract], places=4)

self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs, max_df)
self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs, max_df, 200051, 200052)

@mock.patch("scripts.data_factory.read_pickle", create=True)
@mock.patch("pickle.dump", create=True)
@mock.patch("scripts.utils.utils.dump", create=True)
@mock.patch("scripts.utils.utils.BZ2File", create=True)
@mock.patch("scripts.text_processing.open", create=True)
@mock.patch("bz2.BZ2File", create=True)
@mock.patch("scripts.output_factory.dump", create=True)
@mock.patch("scripts.output_factory.BZ2File", create=True)
@mock.patch("scripts.output_factory.makedirs", create=True)
@mock.patch("os.path.isfile", create=True)
def test_unibitri_reduction_output_termcounts(self, mock_path_isfile, mock_makedirs, mock_bz2file, mock_open,
mock_pickle_dump, mock_read_pickle):
def test_unibitri_reduction_output_termcounts(self, mock_path_isfile, mock_of_makedirs,
mock_of_bz2file, mock_of_dump, mock_open,
mock_utils_bz2file, mock_utils_dump, mock_read_pickle):
fake_df_data = {
'abstract': [
'abstract 1, of the patent with extra stuff'
]
}

self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open, mock_bz2file, mock_path_isfile)
mock_of_bz2file.side_effect = bz2file_fake

self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open, mock_utils_bz2file, mock_path_isfile)
args = ['-ts', '-tc', '-ds', self.data_source_name, '--id_header', 'patent_id', '--date_header',
'publication_date', '--max_document_frequency', '1.0']

Expand All @@ -417,7 +430,7 @@ def assert_outputs(term_counts_per_week, feature_names, number_of_documents_per_
self.assertListEqual(number_of_documents_per_week, [1])
self.assertListEqual(week_iso_dates, [200052])

self.assertTimeSeriesOutputs(assert_outputs, mock_pickle_dump, mock_makedirs)
self.assertTimeSeriesOutputs(assert_outputs, mock_of_dump, mock_of_makedirs)

@unittest.skip("json compulsory now, so not an option")
def test_args_json_not_requested(self):
Expand Down Expand Up @@ -510,7 +523,8 @@ def test_graph_creation(self, mock_open, mock_json_dump):
json_file_name = os.path.join('outputs', 'reports', 'key-terms.json')
graph_report_name = os.path.join('outputs', 'reports', fname + '_graph.txt')

test_args = ['--doc_source', 'USPTO-random-100.pkl.bz2', '-o', 'graph', '--outputs_name', fname]
test_args = ['--doc_source', 'USPTO-random-100.pkl.bz2', '--date_header', 'publication_date', '-o', 'graph',
'--outputs_name', fname]
pygrams.main(test_args)

mock_open.assert_any_call(json_file_name, 'w')
Expand Down

0 comments on commit 44e067c

Please sign in to comment.