filtering rows now gets rid of corresponding rows in df (#249)
* filtering rows now gets rid of corresponding rows in df
* gensim & scipy version limited due to introduced instability in current versions
thanasions authored and IanGrimstead committed May 1, 2019
1 parent ce921da commit f08cfe2
Showing 4 changed files with 30 additions and 8 deletions.
8 changes: 4 additions & 4 deletions scripts/documents_filter.py
@@ -19,9 +19,9 @@ def __init__(self, df, docs_mask_dict):
             doc_set = self.__filter_dates(df, docs_mask_dict['date'], docs_mask_dict['date_header'])
             self.__add_set(doc_set, docs_mask_dict['filter_by'])

-        self.__doc_weights = [0.0] * len(df) if len(self.__doc_indices) > 0 else [1.0] * len(df)
+        self.__doc_filters = [0.0] * len(df) if len(self.__doc_indices) > 0 else [1.0] * len(df)
         for i in self.__doc_indices:
-            self.__doc_weights[i] = 1.0
+            self.__doc_filters[i] = 1.0

     def __add_set(self, doc_set, filter_by):
         if filter_by == 'intersection':
@@ -33,8 +33,8 @@ def __add_set(self, doc_set, filter_by):
             self.__doc_indices = self.__doc_indices.union(set(doc_set))

     @property
-    def doc_weights(self):
-        return self.__doc_weights
+    def doc_filters(self):
+        return self.__doc_filters

     @property
     def doc_indices(self):
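For orientation, the rename above reflects that DocumentsFilter now exposes a plain 0/1 filter vector rather than a weight vector; downstream code multiplies it into per-document weights. A minimal standalone sketch of that idea (num_docs, matching_indices and weights below are invented for illustration):

# Sketch: build a 0/1 document filter from a set of matched row indices, then
# combine it with per-document weights, mirroring the
# "doc_weights = [a * b for a, b in zip(doc_filters, doc_weights)]" step used later.
num_docs = 5
matching_indices = {1, 3}  # hypothetical rows selected by the column/date filters

# If any filter matched, start from all zeros and switch the matches on;
# otherwise leave every document enabled.
doc_filters = [0.0] * num_docs if len(matching_indices) > 0 else [1.0] * num_docs
for i in matching_indices:
    doc_filters[i] = 1.0

weights = [0.2, 0.9, 0.4, 0.7, 0.1]  # hypothetical per-document weights
combined = [f * w for f, w in zip(doc_filters, weights)]
print(combined)  # [0.0, 0.9, 0.0, 0.7, 0.0]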
5 changes: 2 additions & 3 deletions scripts/pipeline.py
@@ -84,7 +84,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
         # then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place

         # docs weights( column, dates subset + time, citations etc.)
-        doc_filters = DocumentsFilter(self.__dataframe, docs_mask_dict).doc_weights
+        doc_filters = DocumentsFilter(self.__dataframe, docs_mask_dict).doc_filters

         # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
         # combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)
@@ -114,8 +114,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
         tfidf_matrix = self.__tfidf_obj.tfidf_matrix
         tfidf_masked = tfidf_mask.multiply(tfidf_matrix)

-        tfidf_masked = utils.remove_all_null_rows(tfidf_masked)
-
+        tfidf_masked= utils.remove_all_null_rows(tfidf_masked)
         print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')

         # todo: no advantage in classes - just create term_count and extract_ngrams as functions
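As context for the second hunk: the mask is multiplied element-wise into the TF-IDF matrix and rows left with no non-zero entries are then dropped. A standalone sketch with invented matrices, writing the null-row removal inline as a stand-in for utils.remove_all_null_rows:

import numpy as np
from scipy.sparse import csr_matrix

tfidf_matrix = csr_matrix(np.array([[0.1, 0.0, 0.4],
                                    [0.0, 0.3, 0.0],
                                    [0.2, 0.0, 0.5]]))
tfidf_mask = csr_matrix(np.array([[1.0, 0.0, 1.0],
                                  [0.0, 0.0, 0.0],  # this document is filtered out entirely
                                  [1.0, 1.0, 1.0]]))

# Element-wise multiply zeroes out the masked entries.
tfidf_masked = tfidf_mask.multiply(tfidf_matrix)

# Inline equivalent of remove_all_null_rows: keep rows with at least one non-zero entry.
rows_with_data = np.unique(tfidf_masked.nonzero()[0])
tfidf_masked = csr_matrix(tfidf_masked)[rows_with_data]

print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')
# -> Processing TFIDF matrix of 2 / 3 documents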
11 changes: 11 additions & 0 deletions scripts/utils/utils.py
@@ -10,6 +10,16 @@
 from pandas.api.types import is_string_dtype


+def remove_all_null_rows_global(sparse_mat, df):
+    nonzero_row_indices, _ = sparse_mat.nonzero()
+    unique_nonzero_indices = np.unique(nonzero_row_indices)
+
+    df = df.reset_index(drop=True)
+    df = df.ix[unique_nonzero_indices]
+    df = df.reset_index(drop=True)
+    return sparse_mat[unique_nonzero_indices], df
+
+
 def bisearch_csr(array, target, start, end):
     while start <= end:
         middle = (start + end) // 2
@@ -169,6 +179,7 @@ def stop_tup(tuples, unigrams, ngrams, digits=True):
             new_tuples.append(tuple)
     return new_tuples

+
 def checkdf(df, emtec, docs_mask_dict, text_header, term_counts):
     app_exit = False

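A usage sketch of the new helper, rewritten here to stand alone (the committed version relies on the older pandas .ix indexer, for which .iloc is substituted below; the example matrix and DataFrame are invented):

# Standalone sketch of the helper's behaviour: drop all-zero rows from the
# sparse matrix and the matching rows from the DataFrame so the two stay aligned.
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix


def remove_all_null_rows_global(sparse_mat, df):
    nonzero_row_indices, _ = sparse_mat.nonzero()
    unique_nonzero_indices = np.unique(nonzero_row_indices)

    df = df.reset_index(drop=True)
    df = df.iloc[unique_nonzero_indices]  # committed code uses df.ix here
    df = df.reset_index(drop=True)
    return sparse_mat[unique_nonzero_indices], df


mat = csr_matrix(np.array([[0.0, 0.5],
                           [0.0, 0.0],  # all-zero row: its document is removed from df too
                           [0.3, 0.0]]))
df = pd.DataFrame({'abstract': ['doc a', 'doc b', 'doc c']})

mat_clean, df_clean = remove_all_null_rows_global(mat, df)
print(mat_clean.shape)                # (2, 2)
print(df_clean['abstract'].tolist())  # ['doc a', 'doc c']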
14 changes: 13 additions & 1 deletion tests/test_tfidf_mask.py
@@ -49,7 +49,7 @@ def init_mask(self, cpc, min_n, uni_factor=0.8):
         self.__tfidf_obj = TFIDF(self.__df['abstract'], ngram_range=(min_n, self.__max_n),
                                  max_document_frequency=self.__max_df, tokenizer=StemTokenizer())

-        doc_filters = DocumentsFilter(self.__df, docs_mask_dict).doc_weights
+        doc_filters = DocumentsFilter(self.__df, docs_mask_dict).doc_filters
         doc_weights = DocumentsWeights(self.__df, docs_mask_dict['time'], docs_mask_dict['cite'],
                                        docs_mask_dict['date_header']).weights
         doc_weights = [a * b for a, b in zip(doc_filters, doc_weights)]
@@ -105,11 +105,23 @@ def test_num_non_zeros_clean_rows_clean_unigrams(self):
         tfidf_mask_nozero_rows = utils.remove_all_null_rows(self.__tfidf_mask)
         self.assertEqual(26, len(tfidf_mask_nozero_rows.data))

+    def test_num_non_zeros_clean_rows_clean_unigrams_and_df(self):
+        self.init_mask('Y02', 1, uni_factor=0.4)
+        tfidf_mask_nozero_rows, self.__df = utils.remove_all_null_rows_global(self.__tfidf_mask, self.__df)
+        self.assertEqual(26, len(tfidf_mask_nozero_rows.data))
+        self.assertEqual(1, len(self.__df.index))
+
     def test_num_non_zeros_clean_rows(self):
         self.init_mask('Y02', 2)
         tfidf_mask_nozero_rows = utils.remove_all_null_rows(self.__tfidf_mask)
         self.assertEqual(20, len(tfidf_mask_nozero_rows.data))

+    def test_num_non_zeros_clean_rows_and_df(self):
+        self.init_mask('Y02', 2)
+        tfidf_mask_nozero_rows, self.__df = utils.remove_all_null_rows_global(self.__tfidf_mask, self.__df)
+        self.assertEqual(20, len(tfidf_mask_nozero_rows.data))
+        self.assertEqual(1, len(self.__df.index))
+
     def test_no_negative_weights(self):
         self.init_mask(None, 2)
         data = self.__tfidf_mask.data
