csv dates come as strings. Type-check to see what's going on and convert (#232)

* Moved things around a bit: type-check after dataframe creation, inside the not-read-from-pickle clause (the conversion pattern is sketched below). If read from pickle, the conversion should already have been taken care of.
thanasions authored Apr 9, 2019
1 parent 200e6d8 commit 1c9c49b
Showing 6 changed files with 89 additions and 93 deletions.
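The fix applies a standard pandas idiom: dates loaded from CSV arrive as plain strings, so the affected column is type-checked with is_string_dtype and converted with to_datetime before any date arithmetic. A minimal sketch of that idiom, using an assumed publication_date column:

import pandas as pd
from pandas.api.types import is_string_dtype

# Dates read from CSV arrive as strings; type-check and convert in place.
df = pd.DataFrame({'publication_date': ['2018-01-03', '2018-02-14']})
if is_string_dtype(df['publication_date']):
    df['publication_date'] = pd.to_datetime(df['publication_date'])
print(df['publication_date'].dtype)  # datetime64[ns] after conversion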
43 changes: 4 additions & 39 deletions scripts/pipeline.py
@@ -2,8 +2,7 @@
import pickle
from os import makedirs, path

from pandas import read_pickle, to_datetime
from pandas.api.types import is_string_dtype
from pandas import read_pickle
from tqdm import tqdm

import scripts.data_factory as datafactory
@@ -22,40 +21,6 @@
from scripts.vandv.predictor import evaluate_prediction


def checkdf(df, emtec, docs_mask_dict, text_header, term_counts):
app_exit = False

if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None or term_counts:
if docs_mask_dict['date_header'] not in df.columns:
print(f"date_header '{docs_mask_dict['date_header']}' not in dataframe")
app_exit = True

if docs_mask_dict['date_header'] is not None:
if is_string_dtype(df[docs_mask_dict['date_header']]):
df[docs_mask_dict['date_header']] = to_datetime(df[docs_mask_dict['date_header']])

min_date = min(df[docs_mask_dict['date_header']])
max_date = max(df[docs_mask_dict['date_header']])
print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')
else:
print('Document dates not specified')

if text_header not in df.columns:
print(f"text_header '{text_header}' not in dataframe")
app_exit = True

if app_exit:
exit(0)


def remove_empty_documents(data_frame, text_header):
num_docs_before_sift = data_frame.shape[0]
data_frame.dropna(subset=[text_header], inplace=True)
num_docs_after_sift = data_frame.shape[0]
num_docs_sifted = num_docs_before_sift - num_docs_after_sift
print(f'Dropped {num_docs_sifted:,} from {num_docs_before_sift:,} docs due to empty text field')


class Pipeline(object):
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3),
normalize_rows=False, text_header='abstract', term_counts=False,
@@ -70,11 +35,11 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
self.__pick_method = pick_method
# calculate or fetch tf-idf mat
if pickled_tf_idf_file_name is None:

self.__dataframe = datafactory.get(data_filename)
checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)
utils.checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)
utils.remove_empty_documents(self.__dataframe, text_header)

remove_empty_documents(self.__dataframe, text_header)
self.__tfidf_obj = TFIDF(text_series=self.__dataframe[text_header], ngram_range=ngram_range,
max_document_frequency=max_df, tokenizer=LemmaTokenizer())

3 changes: 2 additions & 1 deletion scripts/tfidf_reduce.py
@@ -2,6 +2,7 @@
from tqdm import tqdm

from scripts.utils.date_utils import tfidf_with_dates_to_weekly_term_counts
import pandas as pd


class TfidfReduce(object):
@@ -72,7 +73,7 @@ def extract_ngrams_from_docset(self, pick_method, verbose=True):
return ngrams_scores_tuple

def create_terms_count(self, df, dates_header):
dates = df[dates_header]
dates = df[dates_header].tolist()
document_week_dates = [iso_date[0] * 100 + iso_date[1] for iso_date in
[d.isocalendar() for d in dates]]

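In create_terms_count, the dates are now materialised as a plain list before iterating, and each date is reduced to a sortable weekly key of the form iso_year * 100 + iso_week. A small sketch of that key scheme, assuming the dates are already datetime-like (which the checkdf conversion guarantees):

from datetime import date

# Weekly key = ISO year * 100 + ISO week number.
dates = [date(2018, 1, 3), date(2018, 12, 31)]  # 31 Dec 2018 falls in ISO week 1 of 2019
document_week_dates = [d.isocalendar()[0] * 100 + d.isocalendar()[1] for d in dates]
print(document_week_dates)  # [201801, 201901]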
38 changes: 37 additions & 1 deletion scripts/utils/utils.py
@@ -6,6 +6,9 @@
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

from pandas import to_datetime
from pandas.api.types import is_string_dtype


def bisearch_csr(array, target, start, end):
while start <= end:
@@ -164,4 +167,37 @@ def stop_tup(tuples, unigrams, ngrams, digits=True):
break
if not word_in_ngrams:
new_tuples.append(tuple)
return new_tuples

def checkdf(df, emtec, docs_mask_dict, text_header, term_counts):
app_exit = False

if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None or term_counts:
if docs_mask_dict['date_header'] not in df.columns:
print(f"date_header '{docs_mask_dict['date_header']}' not in dataframe")
app_exit = True

if docs_mask_dict['date_header'] is not None:
if is_string_dtype(df[docs_mask_dict['date_header']]):
df[docs_mask_dict['date_header']] = to_datetime(df[docs_mask_dict['date_header']])

min_date = min(df[docs_mask_dict['date_header']])
max_date = max(df[docs_mask_dict['date_header']])
print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')
else:
print('Document dates not specified')

if text_header not in df.columns:
print(f"text_header '{text_header}' not in dataframe")
app_exit = True

if app_exit:
exit(0)


def remove_empty_documents(data_frame, text_header):
num_docs_before_sift = data_frame.shape[0]
data_frame.dropna(subset=[text_header], inplace=True)
num_docs_after_sift = data_frame.shape[0]
num_docs_sifted = num_docs_before_sift - num_docs_after_sift
print(f'Dropped {num_docs_sifted:,} from {num_docs_before_sift:,} docs due to empty text field')
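With the helpers relocated, callers such as Pipeline reach them through the utils module. A minimal usage sketch, assuming toy data with 'abstract' and 'publication_date' columns and a docs_mask_dict shaped like the ones in the tests below:

import pandas as pd
from scripts.utils import utils

# Assumed toy data; mirrors the call sites in pipeline.py.
df = pd.DataFrame({'abstract': ['a widget', None, 'a gadget'],
                   'publication_date': ['2018-01-03', '2018-02-14', '2018-12-31']})
docs_mask_dict = {'time': None, 'date': None, 'date_header': 'publication_date'}

# Converts the string date column in place and prints the date range.
utils.checkdf(df, False, docs_mask_dict, 'abstract', term_counts=False)
# Drops the row with an empty text field, reporting how many were removed.
utils.remove_empty_documents(df, 'abstract')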
4 changes: 1 addition & 3 deletions tests/test_documents_filter.py
@@ -7,16 +7,14 @@
class TestDocumentsFilter(unittest.TestCase):

def setUp(self):

self.__docs_mask_dict = {}
# doc_weights
self.__docs_mask_dict['filter_by'] = 'union'
self.__docs_mask_dict['cpc'] = None
self.__docs_mask_dict['time'] = None
self.__docs_mask_dict['cite'] = []
self.__docs_mask_dict['columns'] = None
self.__docs_mask_dict['date'] = None
self.__docs_mask_dict['date_header'] = None
self.__docs_mask_dict['date_header'] = 'publication_date'
# [self.args.year_from, year_to, self.args.month_from, month_to, self.args.date_header]

df = pd.read_pickle(FilePaths.us_patents_random_100_pickle_name)
2 changes: 1 addition & 1 deletion tests/test_filter_terms.py
@@ -7,7 +7,7 @@
from tests.support import assert_list_almost_equal


class TestDocumentsFilter(unittest.TestCase):
class TestTermsFilter(unittest.TestCase):

def setUp(self):
df = pd.read_pickle(FilePaths.us_patents_random_100_pickle_name)
92 changes: 44 additions & 48 deletions tests/test_tfidf_reduce.py
@@ -1,15 +1,7 @@
import unittest

import numpy as np
import os
import pandas as pd

from scripts import FilePaths
from scripts.filter_terms import FilterTerms
from scripts.text_processing import StemTokenizer
from scripts.tfidf_mask import TfidfMask
from scripts.tfidf_reduce import TfidfReduce
from scripts.tfidf_wrapper import TFIDF
from scripts.utils import utils
from scripts.pipeline import Pipeline
from tests import support


@@ -21,79 +13,83 @@ def setUpClass(cls):
max_n = 3
max_df = 0.3
ngram_range = (min_n, max_n)
df = pd.read_pickle(FilePaths.us_patents_random_100_pickle_name)
tfidf_obj = TFIDF(df['abstract'], ngram_range=ngram_range, max_document_frequency=max_df,
tokenizer=StemTokenizer())

doc_weights = list(np.ones(len(df)))
date_to = pd.to_datetime('today').date()
date_from = pd.to_datetime('1900-01-01')

# term weights - embeddings
filter_output_obj = FilterTerms(tfidf_obj.feature_names, None, None)
term_weights = filter_output_obj.ngram_weights_vec
docs_mask_dict={}
docs_mask_dict['filter_by'] = 'union'
docs_mask_dict['cpc'] = None
docs_mask_dict['time'] = None
docs_mask_dict['cite'] = []
docs_mask_dict['columns'] = None
docs_mask_dict['date'] = {
'to': date_to,
'from': date_from
}
docs_mask_dict['date_header'] = 'publication_date'

tfidf_mask_obj = TfidfMask(tfidf_obj, ngram_range=ngram_range)
tfidf_mask_obj.update_mask(doc_weights, term_weights)
tfidf_mask = tfidf_mask_obj.tfidf_mask
filename = os.path.join('tests', 'data', 'USPTO-random-100.csv')

# mask the tfidf matrix
tfidf_matrix = tfidf_obj.tfidf_matrix
tfidf_masked = tfidf_mask.multiply(tfidf_matrix)
tfidf_masked = utils.remove_all_null_rows(tfidf_masked)
cls.__pipeline = Pipeline(filename, docs_mask_dict, ngram_range=ngram_range,
text_header='abstract', term_counts=True,
max_df=max_df, output_name='test')

print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')

tfidf_reduce_obj = TfidfReduce(tfidf_masked, tfidf_obj.feature_names)
cls.__term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset('sum')
cls.__term_score_tuples = cls.__pipeline.term_score_tuples

def test_terms(self):
term_score_tuples = self.__term_score_tuples
actual_terms = [x for _, x in term_score_tuples]
expected_terms = ['mount surfac',
expected_terms = ['mounting surface',
'transmit path',
'electron element',
'electronic element',
'link document',
'amid deriv',
'amide derivative',
'valproic acid',
'voic messag',
'pharmaceut formul',
'jack mechan',
'voice message',
'jack mechanism',
'pharmaceutical formulation',
'light beam',
'angular velocity',
'contact beam',
'angular veloc',
'shorter tuft',
'conduct materi',
'endodont instrument',
'conductive material',
'endodontic instrument',
'mass offset',
'section bend',
'compon materi',
'connect portion',
'termin channel'
'component material',
'terminal channel',
'stationary household appliance',
'fault point'
]

self.assertListEqual(actual_terms[:20], expected_terms)

def test_scores(self):
term_score_tuples = self.__term_score_tuples
actual_scores = [x for x, _ in term_score_tuples]
expected_scores = [0.9449111825230679,
expected_scores = [0.8728715609439694,
0.8259734063804905,
0.7754588414852185,
0.7276068751089988,
0.7620007620011429,
0.7071067811865476,
0.7071067811865476,
0.7071067811865475,
0.6882472016116852,
0.6666666666666666,
0.6396021490668312,
0.6246950475544241,
0.6198903382379372,
0.6031800939323297,
0.6000595413031171,
0.5834599659915781,
0.5806718350868961,
0.5773502691896257,
0.5773502691896257,
0.5773502691896257,
0.5669467095138407,
0.5611088299627696,
0.5597177778726654]
0.5597177778726654,
0.5570860145311556,
0.5568900989230109]

support.assert_list_almost_equal(self, actual_scores[:20], expected_scores)

def test_timeseries_mat(self):
timeseries_mat = self.__pipeline.term_counts_data
self.assertEqual(sum(timeseries_mat[2]), 100)
