From 460dc1cb9921817f71b40b412e11a6d413926472 Mon Sep 17 00:00:00 2001 From: Amr Mohamed Date: Sun, 28 Apr 2019 04:37:50 +0200 Subject: [PATCH] Fix the off-by-one bug in the TFIDF model. (#2392) * Fix the off-by-one bug in the TFIDF model. Fixes #2375. Use len to compute the number of features. Since the ids are zero-indexed, using max causes an off-by-one bug. * Use the maximum token identifier to compute the number of TFIDF features * Tweak the number of features computation --- gensim/models/tfidfmodel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index a4cbedcd22..b45d1652a1 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -390,10 +390,9 @@ def initialize(self, corpus): self.num_nnz = numnnz self.dfs = dfs # and finally compute the idf weights - n_features = max(dfs) if dfs else 0 logger.info( "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", - self.num_docs, n_features, self.num_nnz + self.num_docs, max(dfs.keys()) + 1 if dfs else 0, self.num_nnz ) self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)