Code style fixes to the TFIDF module #1313

Merged: 1 commit, May 22, 2017
27 changes: 16 additions & 11 deletions gensim/models/tfidfmodel.py
@@ -28,8 +28,7 @@ def precompute_idfs(wglobal, dfs, total_docs):
     """Precompute the inverse document frequency mapping for all terms."""
     # not strictly necessary and could be computed on the fly in TfidfModel.__getitem__.
     # this method is here just to speed things up a little.
-    return dict((termid, wglobal(df, total_docs))
-                for termid, df in iteritems(dfs))
+    return dict((termid, wglobal(df, total_docs)) for termid, df in iteritems(dfs))
 
 
 class TfidfModel(interfaces.TransformationABC):
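
For orientation, precompute_idfs only maps each term id to its global weight. A minimal standalone sketch of that mapping, assuming the default df2idf-style weighting (roughly add + log2(total_docs / doc_freq)); the helper toy_df2idf and the numbers are illustrative, not part of the patch:

    import math

    def toy_df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
        # roughly gensim's default global weighting: add + log(total_docs / doc_freq) in the given base
        return add + math.log(1.0 * totaldocs / docfreq, log_base)

    dfs = {0: 3, 1: 1, 2: 2}     # term id -> number of documents containing the term
    total_docs = 4
    idfs = {termid: toy_df2idf(df, total_docs) for termid, df in dfs.items()}
    print(idfs)                  # {0: ~0.415, 1: 2.0, 2: 1.0}
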
@@ -49,8 +48,9 @@ class TfidfModel(interfaces.TransformationABC):
 
     Model persistency is achieved via its load/save methods.
     """
-    def __init__(self, corpus=None, id2word=None, dictionary=None,
-                 wlocal=utils.identity, wglobal=df2idf, normalize=True):
+    def __init__(
+            self, corpus=None, id2word=None, dictionary=None,
+            wlocal=utils.identity, wglobal=df2idf, normalize=True):
         """
         Compute tf-idf by multiplying a local component (term frequency) with a
         global component (inverse document frequency), and normalizing
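
The docstring gives the formula: weight(term, doc) = wlocal(tf) * wglobal(df, total_docs), optionally normalized to unit length. A self-contained worked example using the default-style weights named in the signature above (identity local weight, log2 IDF); the terms and counts are made up:

    import math

    total_docs = 4
    doc_freq = {"cat": 2, "mat": 1}    # how many documents contain each term
    bow = {"cat": 3, "mat": 1}         # term frequencies in one document

    raw = {t: tf * math.log2(total_docs / doc_freq[t]) for t, tf in bow.items()}
    norm = math.sqrt(sum(w * w for w in raw.values()))
    tfidf = {t: w / norm for t, w in raw.items()}
    print(tfidf)   # raw cat = 3*1 = 3.0, mat = 1*2 = 2.0; normalized to ~0.832 and ~0.555
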
@@ -87,11 +87,13 @@ def __init__(self, corpus=None, id2word=None, dictionary=None,
             # statistics we need to construct the IDF mapping. we can skip the
             # step that goes through the corpus (= an optimization).
             if corpus is not None:
-                logger.warning("constructor received both corpus and explicit "
-                               "inverse document frequencies; ignoring the corpus")
+                logger.warning(
+                    "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus")
             self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
             self.dfs = dictionary.dfs.copy()
             self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
+            if id2word is None:
+                self.id2word = dictionary
         elif corpus is not None:
             self.initialize(corpus)
         else:
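
This branch prefers an explicit Dictionary over a corpus: when both are given, the warning above fires and the corpus is ignored. A quick usage sketch, assuming gensim is installed; the toy texts are made up:

    from gensim.corpora import Dictionary
    from gensim.models import TfidfModel

    texts = [["cat", "sat", "mat"], ["cat", "hat"], ["dog", "mat"]]
    dct = Dictionary(texts)                    # already holds num_docs, num_nnz, dfs
    corpus = [dct.doc2bow(t) for t in texts]

    model_a = TfidfModel(corpus, id2word=dct)  # statistics gathered by scanning the corpus
    model_b = TfidfModel(dictionary=dct)       # statistics taken straight from the Dictionary
    # passing both would log the warning above and ignore the corpus:
    # model_c = TfidfModel(corpus, dictionary=dct)
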
@@ -114,7 +116,7 @@ def initialize(self, corpus):
         numnnz, docno = 0, -1
         for docno, bow in enumerate(corpus):
             if docno % 10000 == 0:
-                logger.info("PROGRESS: processing document #%i" % docno)
+                logger.info("PROGRESS: processing document #%i", docno)
             numnnz += len(bow)
             for termid, _ in bow:
                 dfs[termid] = dfs.get(termid, 0) + 1
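
The loop above is plain document-frequency bookkeeping: dfs counts, per term id, how many documents contain the term, and numnnz counts non-zero matrix entries. A toy standalone version (numbers made up):

    # toy corpus in bag-of-words form: one list of (term_id, term_frequency) pairs per document
    corpus = [
        [(0, 2), (1, 1)],
        [(0, 1), (2, 3)],
        [(2, 1)],
    ]
    dfs, numnnz = {}, 0
    for docno, bow in enumerate(corpus):
        numnnz += len(bow)                        # total number of non-zero entries
        for termid, _ in bow:
            dfs[termid] = dfs.get(termid, 0) + 1  # number of documents containing the term
    print(dfs, numnnz)                            # {0: 2, 1: 1, 2: 2} 5
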
@@ -126,8 +128,9 @@
 
         # and finally compute the idf weights
         n_features = max(dfs) if dfs else 0
-        logger.info("calculating IDF weights for %i documents and %i features (%i matrix non-zeros)" %
-                    (self.num_docs, n_features, self.num_nnz))
+        logger.info(
+            "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)",
+            self.num_docs, n_features, self.num_nnz)
         self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
 
 
@@ -142,8 +145,10 @@ def __getitem__(self, bow, eps=1e-12):
 
         # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
         # as strict application of the IDF formula would dictate)
-        vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
-                  for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]
+        vector = [
+            (termid, self.wlocal(tf) * self.idfs.get(termid))
+            for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0
+        ]
 
         # and finally, normalize the vector either to unit length, or use a
         # user-defined normalization function
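
Putting __getitem__ to work: terms whose IDF comes out as zero, for example a term that occurs in every training document, are filtered from the result by the comprehension above, and the remaining weights are then normalized. A quick sketch, assuming gensim is installed (toy texts; the printed ids depend on the Dictionary):

    from gensim.corpora import Dictionary
    from gensim.models import TfidfModel

    texts = [["cat", "mat"], ["cat", "hat"], ["cat", "dog"]]
    dct = Dictionary(texts)
    corpus = [dct.doc2bow(t) for t in texts]

    tfidf = TfidfModel(corpus)
    # "cat" occurs in every document, so its IDF is log2(3/3) = 0 and it is dropped;
    # "mat" keeps a non-zero weight, normalized to unit length.
    print(tfidf[dct.doc2bow(["cat", "mat"])])     # e.g. [(1, 1.0)] with mat's id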