Skip to content

Commit

Permalink
278 move mask (#283)
Browse files Browse the repository at this point in the history
* resolves #278
  • Loading branch information
thanasions authored May 22, 2019
1 parent 9bb15a2 commit be5ab28
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 14 deletions.
7 changes: 4 additions & 3 deletions scripts/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import bz2
import pickle
from os import makedirs, path

from os import makedirs, path
from pandas import read_pickle
from tqdm import tqdm

import scripts.data_factory as datafactory
import scripts.output_factory as output_factory
import scripts.utils.date_utils
Expand All @@ -19,6 +17,7 @@
from scripts.vandv.emergence_labels import map_prediction_to_emergence_label, report_predicted_emergence_labels_html
from scripts.vandv.graphs import report_prediction_as_graphs_html
from scripts.vandv.predictor import evaluate_prediction
from tqdm import tqdm


class Pipeline(object):
Expand All @@ -43,6 +42,8 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
ngram_range=ngram_range,
max_document_frequency=max_df,
tokenizer=LemmaTokenizer())
tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

if prefilter_terms != 0:
tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
Expand Down
13 changes: 7 additions & 6 deletions scripts/tfidf_mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


class TfidfMask(object):
def __init__(self, tfidf_obj, ngram_range=(2, 3), uni_factor=0.8):
def __init__(self, tfidf_obj, ngram_range=(2, 3), uni_factor=0.8, unbias=False):
self.__tfidf_matrix = tfidf_obj.tfidf_matrix
self.__feature_names = tfidf_obj.feature_names
self.__tfidf_mask = self.__tfidf_matrix.copy()
Expand All @@ -11,12 +11,13 @@ def __init__(self, tfidf_obj, ngram_range=(2, 3), uni_factor=0.8):
self.__uni_factor = uni_factor
self.__idf = tfidf_obj.idf

# do unigrams
if ngram_range[0] == 1:
self.__clean_unigrams(self.__max_bigram())
if unbias:
# do unigrams
if ngram_range[0] == 1:
self.__clean_unigrams(self.__max_bigram())

for i in range(ngram_range[0], ngram_range[1]):
self.__unbias_ngrams(i + 1)
for i in range(ngram_range[0], ngram_range[1]):
self.__unbias_ngrams(i + 1)

@property
def tfidf_mask(self):
Expand Down
8 changes: 7 additions & 1 deletion scripts/tfidf_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ def __trigger_transformer(self):
self.__l2_norm = utils.l2normvec(self.__tfidf_matrix)
self.__tfidf_matrix = utils.apply_l2normvec(self.__tfidf_matrix, self.__l2_norm)

def apply_weights(self, weights_matrix):
    """Scale the count and TF-IDF matrices element-wise by ``weights_matrix``.

    Both matrices are replaced by their element-wise product with
    ``weights_matrix``. The weighted counts are then rounded to the nearest
    integer and stored as ``uint8``.

    Args:
        weights_matrix: Matrix of per-entry weights accepted by the
            matrices' ``multiply`` method (the pipeline passes
            ``TfidfMask.tfidf_mask``).
    """
    self.__count_matrix = self.__count_matrix.multiply(weights_matrix)
    self.__tfidf_matrix = self.__tfidf_matrix.multiply(weights_matrix)

    # Round all stored entries in one vectorised pass instead of a Python-level
    # loop. np.rint rounds halves to the nearest even value, exactly like the
    # built-in round(), and astype() yields uint8 even when no entries are
    # stored, so the dtype no longer depends on the data.
    # NOTE(review): uint8 only represents 0..255 -- confirm weighted counts
    # can never fall outside that range.
    self.__count_matrix.data = np.rint(self.__count_matrix.data).astype(np.uint8)

@property
def l2_norm(self):
return self.__l2_norm
Expand All @@ -82,4 +87,5 @@ def vocabulary(self):

@property
def feature_names(self):
return self.__feature_names
return self.__feature_names

4 changes: 2 additions & 2 deletions tests/test_pygrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,9 +399,9 @@ def test_unibitri_reduction_output_termcounts(self, mock_path_isfile, mock_maked
pygrams.main(args)

def assert_outputs(term_counts_per_week, feature_names, number_of_documents_per_week, week_iso_dates):
self.assertListEqual(feature_names, ['abstract', 'extra', 'extra stuff', 'patent', 'stuff', 'with'])
self.assertListEqual(feature_names, ['abstract', 'extra stuff', 'patent', 'with'])
term_counts_as_lists = term_counts_per_week.todense().tolist()
self.assertListEqual(term_counts_as_lists[0], [1, 0, 1, 1, 0, 1])
self.assertListEqual(term_counts_as_lists[0], [1, 1, 1, 1])
self.assertListEqual(number_of_documents_per_week, [1])
self.assertListEqual(week_iso_dates, [200052])

Expand Down
2 changes: 1 addition & 1 deletion tests/test_terms_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def setUpClass(cls):
filter_output_obj = FilterTerms(tfidf_obj.feature_names, None, None)
term_weights = filter_output_obj.ngram_weights_vec

tfidf_mask_obj = TfidfMask(tfidf_obj, ngram_range=ngram_range)
tfidf_mask_obj = TfidfMask(tfidf_obj, ngram_range=ngram_range, unbias=True)
tfidf_mask_obj.update_mask(doc_weights, term_weights)
tfidf_mask = tfidf_mask_obj.tfidf_mask

Expand Down
2 changes: 1 addition & 1 deletion tests/test_tfidf_mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def init_mask(self, cpc, min_n, uni_factor=0.8):
filter_output_obj = FilterTerms(self.__tfidf_obj.feature_names, None)
term_weights = filter_output_obj.ngram_weights_vec

tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=(min_n, self.__max_n), uni_factor=uni_factor)
tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=(min_n, self.__max_n), uni_factor=uni_factor, unbias=True)
tfidf_mask_obj.update_mask(doc_filters, term_weights)
self.__tfidf_mask = tfidf_mask_obj.tfidf_mask

Expand Down

0 comments on commit be5ab28

Please sign in to comment.