278 move mask #283

Merged: 7 commits, May 22, 2019
7 changes: 4 additions & 3 deletions scripts/pipeline.py
@@ -1,10 +1,8 @@
import bz2
import pickle
from os import makedirs, path

from os import makedirs, path
from pandas import read_pickle
from tqdm import tqdm

import scripts.data_factory as datafactory
import scripts.output_factory as output_factory
import scripts.utils.date_utils
@@ -19,6 +17,7 @@
from scripts.vandv.emergence_labels import map_prediction_to_emergence_label, report_predicted_emergence_labels_html
from scripts.vandv.graphs import report_prediction_as_graphs_html
from scripts.vandv.predictor import evaluate_prediction
from tqdm import tqdm


class Pipeline(object):
@@ -43,6 +42,8 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
ngram_range=ngram_range,
max_document_frequency=max_df,
tokenizer=LemmaTokenizer())
tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

if prefilter_terms != 0:
tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
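A note on this pipeline.py change: the mask is built from the freshly computed TF-IDF object and folded straight back into it (via the new apply_weights in tfidf_wrapper.py below), ahead of the prefilter_terms / TfidfReduce step. The snippet below is a toy numpy illustration of why applying a mask before a top-N prefilter can matter; it is not pyGrams code and the numbers are invented.

import numpy as np

# Toy TF-IDF scores for five terms and a mask that zeroes two of them.
scores = np.array([0.9, 0.8, 0.7, 0.2, 0.1])
mask = np.array([1.0, 0.0, 1.0, 1.0, 0.0])
top_n = 3

# Prefilter first, mask afterwards: zeroed terms can still occupy top-N slots.
top_before_mask = np.argsort(scores)[::-1][:top_n]
print((scores * mask)[top_before_mask])  # [0.9 0.  0.7] - one slot is wasted

# Mask first, prefilter afterwards (the ordering used here): every top-N slot
# goes to a term that survives the mask.
masked = scores * mask
top_after_mask = np.argsort(masked)[::-1][:top_n]
print(masked[top_after_mask])  # [0.9 0.7 0.2]
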
13 changes: 7 additions & 6 deletions scripts/tfidf_mask.py
@@ -2,7 +2,7 @@


class TfidfMask(object):
def __init__(self, tfidf_obj, ngram_range=(2, 3), uni_factor=0.8):
def __init__(self, tfidf_obj, ngram_range=(2, 3), uni_factor=0.8, unbias=False):
self.__tfidf_matrix = tfidf_obj.tfidf_matrix
self.__feature_names = tfidf_obj.feature_names
self.__tfidf_mask = self.__tfidf_matrix.copy()
@@ -11,12 +11,13 @@ def __init__(self, tfidf_obj, ngram_range=(2, 3), uni_factor=0.8):
self.__uni_factor = uni_factor
self.__idf = tfidf_obj.idf

# do unigrams
if ngram_range[0] == 1:
self.__clean_unigrams(self.__max_bigram())
if unbias:
# do unigrams
Contributor comment on the line above: Remove as it doesn't help?
Suggested change: delete the "# do unigrams" comment.

if ngram_range[0] == 1:
self.__clean_unigrams(self.__max_bigram())

for i in range(ngram_range[0], ngram_range[1]):
self.__unbias_ngrams(i + 1)
for i in range(ngram_range[0], ngram_range[1]):
self.__unbias_ngrams(i + 1)

@property
def tfidf_mask(self):
8 changes: 7 additions & 1 deletion scripts/tfidf_wrapper.py
@@ -56,6 +56,11 @@ def __trigger_transformer(self):
self.__l2_norm = utils.l2normvec(self.__tfidf_matrix)
self.__tfidf_matrix = utils.apply_l2normvec(self.__tfidf_matrix, self.__l2_norm)

def apply_weights(self, weights_matrix):
self.__count_matrix = self.__count_matrix.multiply(weights_matrix)
self.__tfidf_matrix = self.__tfidf_matrix.multiply(weights_matrix)
self.__count_matrix.data = np.array([np.uint8(round(x)) for x in self.__count_matrix.data])

@property
def l2_norm(self):
return self.__l2_norm
@@ -82,4 +87,5 @@ def vocabulary(self):

@property
def feature_names(self):
return self.__feature_names
return self.__feature_names
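
For readers skimming the diff: the new apply_weights folds a weights (mask) matrix into both the count matrix and the TF-IDF matrix by element-wise multiplication, then rounds the weighted counts back to uint8. Below is a minimal, self-contained sketch of that arithmetic, assuming numpy and scipy.sparse (which the wrapper already relies on); the toy matrices are invented for illustration.

import numpy as np
from scipy.sparse import csr_matrix

count_matrix = csr_matrix(np.array([[2, 0, 3],
                                    [1, 4, 0]]))
weights_matrix = csr_matrix(np.array([[1.0, 0.0, 0.5],
                                      [0.0, 1.0, 1.0]]))

# Element-wise (not matrix) multiplication, as apply_weights does.
weighted_counts = count_matrix.multiply(weights_matrix)
# Round the weighted counts back to small unsigned integers.
weighted_counts.data = np.array([np.uint8(round(x)) for x in weighted_counts.data])

print(weighted_counts.toarray())
# [[2 0 2]   3 * 0.5 = 1.5 rounds to 2; fully masked cells drop to 0
#  [0 4 0]]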

4 changes: 2 additions & 2 deletions tests/test_pygrams.py
@@ -399,9 +399,9 @@ def test_unibitri_reduction_output_termcounts(self, mock_path_isfile, mock_maked
pygrams.main(args)

def assert_outputs(term_counts_per_week, feature_names, number_of_documents_per_week, week_iso_dates):
self.assertListEqual(feature_names, ['abstract', 'extra', 'extra stuff', 'patent', 'stuff', 'with'])
self.assertListEqual(feature_names, ['abstract', 'extra stuff', 'patent', 'with'])
term_counts_as_lists = term_counts_per_week.todense().tolist()
self.assertListEqual(term_counts_as_lists[0], [1, 0, 1, 1, 0, 1])
self.assertListEqual(term_counts_as_lists[0], [1, 1, 1, 1])
self.assertListEqual(number_of_documents_per_week, [1])
self.assertListEqual(week_iso_dates, [200052])

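A note on these updated expectations: in the old assertions the unigrams 'extra' and 'stuff' already had zero counts, and they are exactly the constituents of the retained bigram 'extra stuff'. With the mask now applied to the TF-IDF object itself, such zeroed terms no longer make it into the feature list at all; that reading is an inference from the diff, not something stated in the PR. A small self-check of the new expected values, using only the data in the assertions above:

# Old expectations (6 terms) versus new (4 terms), copied from the test above.
old_features = ['abstract', 'extra', 'extra stuff', 'patent', 'stuff', 'with']
old_counts = [1, 0, 1, 1, 0, 1]

# Dropping the zero-count columns reproduces the new expected values exactly.
kept = [(f, c) for f, c in zip(old_features, old_counts) if c != 0]
assert [f for f, _ in kept] == ['abstract', 'extra stuff', 'patent', 'with']
assert [c for _, c in kept] == [1, 1, 1, 1]
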
2 changes: 1 addition & 1 deletion tests/test_terms_graph.py
@@ -33,7 +33,7 @@ def setUpClass(cls):
filter_output_obj = FilterTerms(tfidf_obj.feature_names, None, None)
term_weights = filter_output_obj.ngram_weights_vec

tfidf_mask_obj = TfidfMask(tfidf_obj, ngram_range=ngram_range)
tfidf_mask_obj = TfidfMask(tfidf_obj, ngram_range=ngram_range, unbias=True)
tfidf_mask_obj.update_mask(doc_weights, term_weights)
tfidf_mask = tfidf_mask_obj.tfidf_mask

2 changes: 1 addition & 1 deletion tests/test_tfidf_mask.py
@@ -55,7 +55,7 @@ def init_mask(self, cpc, min_n, uni_factor=0.8):
filter_output_obj = FilterTerms(self.__tfidf_obj.feature_names, None)
term_weights = filter_output_obj.ngram_weights_vec

tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=(min_n, self.__max_n), uni_factor=uni_factor)
tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=(min_n, self.__max_n), uni_factor=uni_factor, unbias=True)
tfidf_mask_obj.update_mask(doc_filters, term_weights)
self.__tfidf_mask = tfidf_mask_obj.tfidf_mask
