filtering rows now gets rid of corresponding rows in df (#249)
* filtering rows now gets rid of corresponding rows in df
* gensim & scipy version limited due to introduced instability in current versions
thanasions authored and IanGrimstead committed May 1, 2019
1 parent ce921da commit f08cfe2
Showing 4 changed files with 30 additions and 8 deletions.
8 changes: 4 additions & 4 deletions scripts/documents_filter.py
@@ -19,9 +19,9 @@ def __init__(self, df, docs_mask_dict):
             doc_set = self.__filter_dates(df, docs_mask_dict['date'], docs_mask_dict['date_header'])
             self.__add_set(doc_set, docs_mask_dict['filter_by'])

-        self.__doc_weights = [0.0] * len(df) if len(self.__doc_indices) > 0 else [1.0] * len(df)
+        self.__doc_filters = [0.0] * len(df) if len(self.__doc_indices) > 0 else [1.0] * len(df)
         for i in self.__doc_indices:
-            self.__doc_weights[i] = 1.0
+            self.__doc_filters[i] = 1.0

     def __add_set(self, doc_set, filter_by):
         if filter_by == 'intersection':
@@ -33,8 +33,8 @@ def __add_set(self, doc_set, filter_by):
             self.__doc_indices = self.__doc_indices.union(set(doc_set))

     @property
-    def doc_weights(self):
-        return self.__doc_weights
+    def doc_filters(self):
+        return self.__doc_filters

     @property
     def doc_indices(self):
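For orientation, the rename above reflects that DocumentsFilter now exposes a plain 0/1 filter vector rather than a weight vector; downstream code multiplies it into per-document weights. A minimal standalone sketch of that idea (num_docs, matching_indices and weights below are invented for illustration):

# Sketch: build a 0/1 document filter from a set of matched row indices, then
# combine it with per-document weights, mirroring the
# "doc_weights = [a * b for a, b in zip(doc_filters, doc_weights)]" step used later.
num_docs = 5
matching_indices = {1, 3}  # hypothetical rows selected by the column/date filters

# If any filter matched, start from all zeros and switch the matches on;
# otherwise leave every document enabled.
doc_filters = [0.0] * num_docs if len(matching_indices) > 0 else [1.0] * num_docs
for i in matching_indices:
    doc_filters[i] = 1.0

weights = [0.2, 0.9, 0.4, 0.7, 0.1]  # hypothetical per-document weights
combined = [f * w for f, w in zip(doc_filters, weights)]
print(combined)  # [0.0, 0.9, 0.0, 0.7, 0.0]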
5 changes: 2 additions & 3 deletions scripts/pipeline.py
@@ -84,7 +84,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
         # then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place

         # docs weights( column, dates subset + time, citations etc.)
-        doc_filters = DocumentsFilter(self.__dataframe, docs_mask_dict).doc_weights
+        doc_filters = DocumentsFilter(self.__dataframe, docs_mask_dict).doc_filters

         # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
         # combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)
@@ -114,8 +114,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
         tfidf_matrix = self.__tfidf_obj.tfidf_matrix
         tfidf_masked = tfidf_mask.multiply(tfidf_matrix)

-        tfidf_masked = utils.remove_all_null_rows(tfidf_masked)
-
+        tfidf_masked= utils.remove_all_null_rows(tfidf_masked)
         print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')

         # todo: no advantage in classes - just create term_count and extract_ngrams as functions
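As context for the second hunk: the mask is multiplied element-wise into the TF-IDF matrix and rows left with no non-zero entries are then dropped. A standalone sketch with invented matrices, writing the null-row removal inline as a stand-in for utils.remove_all_null_rows:

import numpy as np
from scipy.sparse import csr_matrix

tfidf_matrix = csr_matrix(np.array([[0.1, 0.0, 0.4],
                                    [0.0, 0.3, 0.0],
                                    [0.2, 0.0, 0.5]]))
tfidf_mask = csr_matrix(np.array([[1.0, 0.0, 1.0],
                                  [0.0, 0.0, 0.0],  # this document is filtered out entirely
                                  [1.0, 1.0, 1.0]]))

# Element-wise multiply zeroes out the masked entries.
tfidf_masked = tfidf_mask.multiply(tfidf_matrix)

# Inline equivalent of remove_all_null_rows: keep rows with at least one non-zero entry.
rows_with_data = np.unique(tfidf_masked.nonzero()[0])
tfidf_masked = csr_matrix(tfidf_masked)[rows_with_data]

print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')
# -> Processing TFIDF matrix of 2 / 3 documents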
11 changes: 11 additions & 0 deletions scripts/utils/utils.py
@@ -10,6 +10,16 @@
 from pandas.api.types import is_string_dtype


+def remove_all_null_rows_global(sparse_mat, df):
+    nonzero_row_indices, _ = sparse_mat.nonzero()
+    unique_nonzero_indices = np.unique(nonzero_row_indices)
+
+    df = df.reset_index(drop=True)
+    df = df.ix[unique_nonzero_indices]
+    df = df.reset_index(drop=True)
+    return sparse_mat[unique_nonzero_indices], df
+
+
 def bisearch_csr(array, target, start, end):
     while start <= end:
         middle = (start + end) // 2
@@ -169,6 +179,7 @@ def stop_tup(tuples, unigrams, ngrams, digits=True):
             new_tuples.append(tuple)
     return new_tuples

+
 def checkdf(df, emtec, docs_mask_dict, text_header, term_counts):
     app_exit = False

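A usage sketch of the new helper, rewritten here to stand alone (the committed version relies on the older pandas .ix indexer, for which .iloc is substituted below; the example matrix and DataFrame are invented):

# Standalone sketch of the helper's behaviour: drop all-zero rows from the
# sparse matrix and the matching rows from the DataFrame so the two stay aligned.
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix


def remove_all_null_rows_global(sparse_mat, df):
    nonzero_row_indices, _ = sparse_mat.nonzero()
    unique_nonzero_indices = np.unique(nonzero_row_indices)

    df = df.reset_index(drop=True)
    df = df.iloc[unique_nonzero_indices]  # committed code uses df.ix here
    df = df.reset_index(drop=True)
    return sparse_mat[unique_nonzero_indices], df


mat = csr_matrix(np.array([[0.0, 0.5],
                           [0.0, 0.0],  # all-zero row: its document is removed from df too
                           [0.3, 0.0]]))
df = pd.DataFrame({'abstract': ['doc a', 'doc b', 'doc c']})

mat_clean, df_clean = remove_all_null_rows_global(mat, df)
print(mat_clean.shape)                # (2, 2)
print(df_clean['abstract'].tolist())  # ['doc a', 'doc c']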
14 changes: 13 additions & 1 deletion tests/test_tfidf_mask.py
@@ -49,7 +49,7 @@ def init_mask(self, cpc, min_n, uni_factor=0.8):
         self.__tfidf_obj = TFIDF(self.__df['abstract'], ngram_range=(min_n, self.__max_n),
                                  max_document_frequency=self.__max_df, tokenizer=StemTokenizer())

-        doc_filters = DocumentsFilter(self.__df, docs_mask_dict).doc_weights
+        doc_filters = DocumentsFilter(self.__df, docs_mask_dict).doc_filters
         doc_weights = DocumentsWeights(self.__df, docs_mask_dict['time'], docs_mask_dict['cite'],
                                        docs_mask_dict['date_header']).weights
         doc_weights = [a * b for a, b in zip(doc_filters, doc_weights)]
@@ -105,11 +105,23 @@ def test_num_non_zeros_clean_rows_clean_unigrams(self):
         tfidf_mask_nozero_rows = utils.remove_all_null_rows(self.__tfidf_mask)
         self.assertEqual(26, len(tfidf_mask_nozero_rows.data))

+    def test_num_non_zeros_clean_rows_clean_unigrams_and_df(self):
+        self.init_mask('Y02', 1, uni_factor=0.4)
+        tfidf_mask_nozero_rows, self.__df = utils.remove_all_null_rows_global(self.__tfidf_mask, self.__df)
+        self.assertEqual(26, len(tfidf_mask_nozero_rows.data))
+        self.assertEqual(1, len(self.__df.index))
+
     def test_num_non_zeros_clean_rows(self):
         self.init_mask('Y02', 2)
         tfidf_mask_nozero_rows = utils.remove_all_null_rows(self.__tfidf_mask)
         self.assertEqual(20, len(tfidf_mask_nozero_rows.data))

+    def test_num_non_zeros_clean_rows_and_df(self):
+        self.init_mask('Y02', 2)
+        tfidf_mask_nozero_rows, self.__df = utils.remove_all_null_rows_global(self.__tfidf_mask, self.__df)
+        self.assertEqual(20, len(tfidf_mask_nozero_rows.data))
+        self.assertEqual(1, len(self.__df.index))
+
     def test_no_negative_weights(self):
         self.init_mask(None, 2)
         data = self.__tfidf_mask.data
