Skip to content

Commit

Permalink
remove document frequency from the API (instead, Dictionary will have 2 counters)
Browse files Browse the repository at this point in the history
  • Loading branch information
piskvorky committed Jun 23, 2017
1 parent 392c672 commit fd96ee8
Showing 1 changed file with 9 additions and 13 deletions.
22 changes: 9 additions & 13 deletions gensim/models/fast_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
"""
Fast & memory efficient counting of things (and n-grams of things).
This module is designed to count *item* and *document* frequencies over large, streamed corpora (lazy iteration).
This module is designed to count item frequencies over large, streamed corpora (lazy iteration).
Such counts are useful in various other modules, such as Dictionary, TfIdf, Phrases etc.
"""

Expand Down Expand Up @@ -42,12 +44,9 @@ class FastCounter(object):
Fast counting of item frequency and document frequency across large, streamed iterables.
"""

def __init__(self, doc2items=iter_gram1, collect_df=False):
def __init__(self, doc2items=iter_gram1):
self.doc2items = doc2items
self.collect_df = collect_df

self.item_counts = Counter() # TODO replace by some GIL-free low-level struct
self.doc_counts = Counter() # TODO replace by some GIL-free low-level struct
self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct

def hash(self, key):
return hash(key)
Expand All @@ -62,11 +61,8 @@ def update(self, documents):
# TODO: release GIL, so we can run update() in parallel threads.
# Or maybe not needed, if we create multiple FastCounters from multiple input streams using
# multiprocessing, and only .merge() them at the end.
item_cnts = Counter(self.hash(ngram) for ngram in self.doc2items(document))
self.item_counts.update(item_cnts)
if self.collect_df:
# increment by 1 per unique key ("document frequency")
self.doc_counts.update(iterkeys(item_cnts))
hash2cnt = Counter(self.hash(ngram) for ngram in self.doc2items(document))
self.hash2cnt.update(hash2cnt)

# self.prune_vocab()

Expand All @@ -79,7 +75,7 @@ def prune_vocab(self):

def get(self, key, default=None):
"""Return the item frequency of `key` (or `default` if key not present)."""
return self.item_counts.get(self.hash(key), default)
return self.hash2cnt.get(self.hash(key), default)

def merge(self, other):
"""
Expand All @@ -89,7 +85,7 @@ def merge(self, other):
raise NotImplementedError

def __len__(self):
return len(self.item_counts)
return len(self.hash2cnt)

def __str__(self):
return "%s<%i items>" % (self.__class__.__name__, len(self))
Expand Down

0 comments on commit fd96ee8

Please sign in to comment.