Skip to content

Commit

Permalink
Merge branch 'develop' into 212-pmdarima
Browse files (browse the repository at this point in the history)
  • Loading branch information
thanasions authored Mar 22, 2019
2 parents 3272a7f + 39d5ab2 commit cf3614f
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 12 deletions.
2 changes: 1 addition & 1 deletion scripts/data_factory.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,7 +10,7 @@ def get(doc_source_file_name):
if not os.path.isfile(doc_source_file_name):
raise PygramsException('file: ' + doc_source_file_name + ' does not exist in data folder')

if doc_source_file_name.endswith('.pkl.bz2'):
if doc_source_file_name.endswith('.pkl.bz2') or doc_source_file_name.endswith('.pkl'):
return read_pickle(doc_source_file_name)
elif doc_source_file_name.endswith('.xls'):
return read_excel(doc_source_file_name)
Expand Down
2 changes: 1 addition & 1 deletion scripts/pipeline.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -70,7 +70,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
self.__text_lengths = self.__dataframe[text_header].map(len).tolist()
self.__dataframe.drop(columns=[text_header], inplace=True)

tfidf_filename = path.join('outputs', 'tfidf', output_name + '-tfidf.pkl.bz2')
tfidf_filename = path.join('outputs', 'tfidf', output_name + f'-tfidf-mdf-{max_df}.pkl.bz2')
makedirs(path.dirname(tfidf_filename), exist_ok=True)
with bz2.BZ2File(tfidf_filename, 'wb') as pickle_file:
pickle.dump(
Expand Down
20 changes: 10 additions & 10 deletions tests/test_pygrams.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -126,14 +126,14 @@ def isfile_fake(file_name):

mock_path_isfile.side_effect = isfile_fake

def assertTfidfOutputs(self, assert_func, mock_pickle_dump, mock_makedirs):
def assertTfidfOutputs(self, assert_func, mock_pickle_dump, mock_makedirs, max_df):
self.assertTrue(self.publication_date_auto_tested)
self.assertTrue(self.patent_id_auto_tested)

mock_makedirs.assert_called_with(self.tfidfOutputFolder(), exist_ok=True)
results_checked = False
for dump_args in mock_pickle_dump.call_args_list:
if dump_args[0][1] == self.tfidfFileName(self.out_name):
if dump_args[0][1] == self.tfidfFileName(self.out_name, max_df):
tfidf_pickle = dump_args[0][0]
tfidf_obj = tfidf_pickle[0]

Expand Down Expand Up @@ -168,8 +168,8 @@ def tfidfOutputFolder():
return os.path.join('outputs', 'tfidf')

@staticmethod
def tfidfFileName(data_source_name):
return os.path.join(TestPyGrams.tfidfOutputFolder(), data_source_name + '-tfidf.pkl.bz2')
def tfidfFileName(data_source_name, max_df):
return os.path.join(TestPyGrams.tfidfOutputFolder(), data_source_name + f'-tfidf-mdf-{max_df}.pkl.bz2')

@staticmethod
def termCountsOutputFolder():
Expand All @@ -192,17 +192,17 @@ def test_simple_output_tfidf(self, mock_path_isfile, mock_makedirs, mock_bz2file
'abstract'
]
}

max_df = 1.0
self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open, mock_bz2file, mock_path_isfile)
args = ['-ds', self.data_source_name, '--date_header', 'publication_date', '--max_document_frequency', '1.0']
args = ['-ds', self.data_source_name, '--date_header', 'publication_date', '--max_document_frequency', str(max_df)]

pygrams.main(args)

def assert_tfidf_outputs(tfidf_matrix, feature_names):
self.assertEqual(tfidf_matrix.todense(), np.ones(shape=(1, 1)), 'TFIDF should be 1x1 matrix of 1')
self.assertListEqual(feature_names, ['abstract'])

self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs)
self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs, max_df)

@mock.patch("scripts.pipeline.read_pickle", create=True)
@mock.patch("scripts.data_factory.read_pickle", create=True)
Expand Down Expand Up @@ -271,10 +271,10 @@ def test_simple_two_patents_unigrams_only_output_tfidf(self, mock_path_isfile, m
'abstract two'
]
}

max_df=1.0
self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open, mock_bz2file, mock_path_isfile)
args = ['-ds', self.data_source_name, '--date_header',
'publication_date', '--max_document_frequency', '1.0', '--max_ngrams', '1']
'publication_date', '--max_document_frequency', str(max_df), '--max_ngrams', '1']

pygrams.main(args)

Expand All @@ -301,7 +301,7 @@ def assert_tfidf_outputs(tfidf_matrix, feature_names):
self.assertListAlmostEqual(tfidf_as_lists[0], [l2norm_tfidf_abstract, l2norm_tfidf_one, 0], places=4)
self.assertListAlmostEqual(tfidf_as_lists[1], [l2norm_tfidf_abstract, 0, l2norm_tfidf_one], places=4)

self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs)
self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs, max_df)

@mock.patch("scripts.data_factory.read_pickle", create=True)
@mock.patch("pickle.dump", create=True)
Expand Down

0 comments on commit cf3614f

Please sign in to comment.