From f1646a6a163a40e1924f34f15b4d115e1f516caa Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 15 Dec 2017 16:10:47 +0530 Subject: [PATCH] change old tests --- gensim/models/tfidfmodel.py | 25 +++++++++++-------------- gensim/sklearn_api/tfidf.py | 7 ++++--- gensim/test/test_sklearn_api.py | 7 +++---- 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e396618c1f..a47b4c291a 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -53,7 +53,7 @@ class TfidfModel(interfaces.TransformationABC): """ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", - wlocal=None, wglobal=None, wnormalize=None): + wlocal=None, wglobal=None, normalize=None): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -82,7 +82,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", mapping (then `corpus`, if specified, is ignored). """ self.id2word = id2word - self.wlocal, self.wglobal, self.wnormalize = wlocal, wglobal, wnormalize + self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None n_tf, n_df, n_n = smartirs @@ -106,13 +106,14 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", elif n_tf == "p": self.wglobal = lambda docfreq, totaldocs: math.log((float(totaldocs) - docfreq) / docfreq) - if self.wnormalize is None: - if n_n == "n": - self.wnormalize = lambda x: x - elif n_n == "c": - self.wnormalize = matutils.unitvec - elif n_n == "t": - self.wnormalize = matutils.unitvec + if self.normalize is None or isinstance(self.normalize, bool): + if n_n == "n" or self.normalize is False: + self.normalize = lambda x: x + elif n_n == "c" or self.normalize is True: + self.normalize = matutils.unitvec + # TODO write byte-size normalisation + # elif n_n == "b": + # self.normalize = matutils.unitvec if dictionary is not None: # user supplied a Dictionary object, which already contains all the @@ -160,10 +161,6 @@ def initialize(self, corpus): # and finally compute the idf weights n_features = max(dfs) if dfs else 0 - logger.info( - "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", - self.num_docs, n_features, self.num_nnz - ) def __getitem__(self, bow, eps=1e-12): """ @@ -185,7 +182,7 @@ def __getitem__(self, bow, eps=1e-12): # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - vector = self.wnormalize(vector) + vector = self.normalize(vector) # make sure there are no explicit zeroes in the vector (must be sparse) vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index c0a45f1823..a4822d90ae 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -21,14 +21,15 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator): Base Tf-Idf module """ - def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, - wglobal=gensim.models.tfidfmodel.df2idf, normalize=True): + def __init__(self, id2word=None, dictionary=None, smartirs="ntc", wlocal=None, + wglobal=None, normalize=True): """ Sklearn wrapper for Tf-Idf model. """ self.gensim_model = None self.id2word = id2word self.dictionary = dictionary + self.smartirs = smartirs self.wlocal = wlocal self.wglobal = wglobal self.normalize = normalize @@ -38,7 +39,7 @@ def fit(self, X, y=None): Fit the model according to the given training data. """ self.gensim_model = TfidfModel( - corpus=X, id2word=self.id2word, dictionary=self.dictionary, + corpus=X, id2word=self.id2word, dictionary=self.dictionary, smartirs="ntc", wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize ) return self diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 3793c79948..947804c59d 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -498,7 +498,6 @@ def testPersistence(self): original_matrix = self.model.transform(original_bow) passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1) self.assertTrue(passed) - def testModelNotFitted(self): lsi_wrapper = LsiTransformer(id2word=dictionary, num_topics=2) texts_new = ['graph', 'eulerian'] @@ -973,13 +972,13 @@ def testTransform(self): def testSetGetParams(self): # updating only one param - self.model.set_params(normalize=False) + self.model.set_params(smartirs='nnn') model_params = self.model.get_params() - self.assertEqual(model_params["normalize"], False) + self.assertEqual(model_params["smartirs"], 'nnn') # verify that the attributes values are also changed for `gensim_model` after fitting self.model.fit(self.corpus) - self.assertEqual(getattr(self.model.gensim_model, 'normalize'), False) + self.assertEqual(getattr(self.model.gensim_model, 'smartirs'), 'nnn') def testPipeline(self): with open(datapath('mini_newsgroup'), 'rb') as f: