Commit
implement missing API methods
piskvorky committed Jun 23, 2017
1 parent c601cbf commit a4fdfdb
Showing 2 changed files with 20 additions and 15 deletions.
30 changes: 17 additions & 13 deletions gensim/models/fast_counter.py
@@ -18,6 +18,8 @@

from six import iterkeys, iteritems

+ from gensim import utils

logger = logging.getLogger(__name__)


@@ -44,8 +46,10 @@ class FastCounter(object):
Fast counting of item frequency and document frequency across large, streamed iterables.
"""

- def __init__(self, doc2items=iter_gram1):
+ def __init__(self, doc2items=iter_gram1, max_size=None):
self.doc2items = doc2items
+ self.max_size = max_size
+ self.min_reduce = 0
self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct

def hash(self, item):
@@ -62,26 +66,28 @@ def update(self, documents):
# Or maybe not needed, if we create multiple FastCounters from multiple input streams using
# multiprocessing, and only .merge() them at the end.
self.hash2cnt.update(self.hash(ngram) for ngram in self.doc2items(document))

- # self.prune_vocab()
+ self.prune_items()

return self # for easier chaining

- def prune_vocab(self):
-     # Trim data structures to fit in memory, if too large.
-     # Or use a fixed-size data structure to start with (hyperloglog?)
-     raise NotImplementedError
+ def prune_items(self):
+     """Trim data structures to fit in memory, if too large."""
+     # XXX: Or use a fixed-size data structure to start with (hyperloglog?)
+     while self.max_size and len(self) > self.max_size:
+         self.min_reduce += 1
+         utils.prune_vocab(self.hash2cnt, self.min_reduce)

def get(self, item, default=None):
"""Return the item frequency of `item` (or `default` if item not present)."""
return self.hash2cnt.get(self.hash(item), default)

def merge(self, other):
"""
- Merge counts from other into self, in-place.
+ Merge counts from another FastCounter into self, in-place.
"""
# rare operation, no need to optimize too much
- raise NotImplementedError
+ self.hash2cnt.update(other.hash2cnt)
+ self.min_reduce = max(self.min_reduce, other.min_reduce)
+ self.prune_items()

def __len__(self):
return len(self.hash2cnt)
@@ -95,7 +101,7 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000):
self.threshold = threshold
self.min_count = min_count
self.max_vocab_size = max_vocab_size
- self.counter = FastCounter(iter_gram12)
+ self.counter = FastCounter(iter_gram12, max_size=max_vocab_size)

def add_documents(self, documents):
self.counter.update(documents)
@@ -107,8 +113,6 @@ def export_phrases(self, document):
Yield all collocations (pairs of adjacent closely related tokens) from the
input `document`, as 2-tuples `(score, bigram)`.
"""
- if not self.counter:
-     return
norm = 1.0 * len(self.counter)
for bigram in iter_gram2(document):
pa, pb, pab = self.counter.get((bigram[0],)), self.counter.get((bigram[1],)), self.counter.get(bigram, 0)
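Taken together, the changes above give FastCounter a usable end-to-end API: bounded counting via max_size/prune_items(), and combining partial counts via merge(). A minimal usage sketch, assuming gensim/models/fast_counter.py exists as shown in this commit (the sample shards and the max_size value are made up for illustration):

from gensim.models.fast_counter import FastCounter, iter_gram12

# Two document "shards", e.g. coming from separate input streams/processes.
shard_a = [["the", "quick", "brown", "fox"], ["the", "lazy", "dog"]]
shard_b = [["the", "quick", "fox"], ["brown", "fox", "jumps"]]

# Count unigrams + bigrams per shard, capping memory via max_size;
# update() returns self, so calls can be chained.
counter = FastCounter(iter_gram12, max_size=100000).update(shard_a)
other = FastCounter(iter_gram12, max_size=100000).update(shard_b)

# Combine the partial counts -- the TODO in update() suggests this is the
# intended pattern: count each stream separately, then merge() at the end.
counter.merge(other)

print(len(counter))              # number of distinct hashed items kept
print(counter.get(("the",), 0))  # unigram frequency lookup, 0 if pruned or absent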
5 changes: 3 additions & 2 deletions gensim/utils.py
@@ -1121,8 +1121,9 @@ def prune_vocab(vocab, min_reduce, trim_rule=None):
if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule): # vocab[w] <= min_reduce:
result += vocab[w]
del vocab[w]
logger.info("pruned out %i tokens with count <=%i (before %i, after %i)",
old_len - len(vocab), min_reduce, old_len, len(vocab))
logger.info(
"pruned out %i tokens with count <=%i (before %i, after %i)",
old_len - len(vocab), min_reduce, old_len, len(vocab))
return result


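For reference, a small sketch of what the utils.prune_vocab() helper called by prune_items() does, based on the code shown above and assuming gensim is importable (the toy vocab is made up): entries with count <= min_reduce are deleted in place, the summary message is logged, and the total count mass removed is returned.

import logging
from gensim import utils

logging.basicConfig(level=logging.INFO)  # make the "pruned out ..." log message visible

vocab = {"the": 100, "quick": 3, "fox": 1}
pruned = utils.prune_vocab(vocab, min_reduce=3)

print(vocab)   # {'the': 100} -- entries with count <= 3 were dropped
print(pruned)  # 4 -- total count mass removed (3 + 1)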
