Commit
implement missing API methods
piskvorky committed Jun 23, 2017
1 parent c601cbf commit a4fdfdb
Showing 2 changed files with 20 additions and 15 deletions.
30 changes: 17 additions & 13 deletions gensim/models/fast_counter.py
@@ -18,6 +18,8 @@

from six import iterkeys, iteritems

+ from gensim import utils

logger = logging.getLogger(__name__)


@@ -44,8 +46,10 @@ class FastCounter(object):
Fast counting of item frequency and document frequency across large, streamed iterables.
"""

- def __init__(self, doc2items=iter_gram1):
+ def __init__(self, doc2items=iter_gram1, max_size=None):
self.doc2items = doc2items
+ self.max_size = max_size
+ self.min_reduce = 0
self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct

def hash(self, item):
@@ -62,26 +66,28 @@ def update(self, documents):
# Or maybe not needed, if we create multiple FastCounters from multiple input streams using
# multiprocessing, and only .merge() them at the end.
self.hash2cnt.update(self.hash(ngram) for ngram in self.doc2items(document))

- # self.prune_vocab()
+ self.prune_items()

return self # for easier chaining

- def prune_vocab(self):
-     # Trim data structures to fit in memory, if too large.
-     # Or use a fixed-size data structure to start with (hyperloglog?)
-     raise NotImplementedError
+ def prune_items(self):
+     """Trim data structures to fit in memory, if too large."""
+     # XXX: Or use a fixed-size data structure to start with (hyperloglog?)
+     while self.max_size and len(self) > self.max_size:
+         self.min_reduce += 1
+         utils.prune_vocab(self.hash2cnt, self.min_reduce)

def get(self, item, default=None):
"""Return the item frequency of `item` (or `default` if item not present)."""
return self.hash2cnt.get(self.hash(item), default)

def merge(self, other):
"""
- Merge counts from other into self, in-place.
+ Merge counts from another FastCounter into self, in-place.
"""
# rare operation, no need to optimize too much
- raise NotImplementedError
+ self.hash2cnt.update(other.hash2cnt)
+ self.min_reduce = max(self.min_reduce, other.min_reduce)
+ self.prune_items()

def __len__(self):
return len(self.hash2cnt)
@@ -95,7 +101,7 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000):
self.threshold = threshold
self.min_count = min_count
self.max_vocab_size = max_vocab_size
- self.counter = FastCounter(iter_gram12)
+ self.counter = FastCounter(iter_gram12, max_size=max_vocab_size)

def add_documents(self, documents):
self.counter.update(documents)
@@ -107,8 +113,6 @@ def export_phrases(self, document):
Yield all collocations (pairs of adjacent closely related tokens) from the
input `document`, as 2-tuples `(score, bigram)`.
"""
- if not self.counter:
-     return
norm = 1.0 * len(self.counter)
for bigram in iter_gram2(document):
pa, pb, pab = self.counter.get((bigram[0],)), self.counter.get((bigram[1],)), self.counter.get(bigram, 0)
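Taken together, the changes above give FastCounter a usable end-to-end API: bounded counting via max_size/prune_items(), and combining partial counts via merge(). A minimal usage sketch, assuming gensim/models/fast_counter.py exists as shown in this commit (the sample shards and the max_size value are made up for illustration):

from gensim.models.fast_counter import FastCounter, iter_gram12

# Two document "shards", e.g. coming from separate input streams/processes.
shard_a = [["the", "quick", "brown", "fox"], ["the", "lazy", "dog"]]
shard_b = [["the", "quick", "fox"], ["brown", "fox", "jumps"]]

# Count unigrams + bigrams per shard, capping memory via max_size;
# update() returns self, so calls can be chained.
counter = FastCounter(iter_gram12, max_size=100000).update(shard_a)
other = FastCounter(iter_gram12, max_size=100000).update(shard_b)

# Combine the partial counts -- the TODO in update() suggests this is the
# intended pattern: count each stream separately, then merge() at the end.
counter.merge(other)

print(len(counter))              # number of distinct hashed items kept
print(counter.get(("the",), 0))  # unigram frequency lookup, 0 if pruned or absent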
5 changes: 3 additions & 2 deletions gensim/utils.py
@@ -1121,8 +1121,9 @@ def prune_vocab(vocab, min_reduce, trim_rule=None):
if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule): # vocab[w] <= min_reduce:
result += vocab[w]
del vocab[w]
logger.info("pruned out %i tokens with count <=%i (before %i, after %i)",
old_len - len(vocab), min_reduce, old_len, len(vocab))
logger.info(
"pruned out %i tokens with count <=%i (before %i, after %i)",
old_len - len(vocab), min_reduce, old_len, len(vocab))
return result


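For reference, a small sketch of what the utils.prune_vocab() helper called by prune_items() does, based on the code shown above and assuming gensim is importable (the toy vocab is made up): entries with count <= min_reduce are deleted in place, the summary message is logged, and the total count mass removed is returned.

import logging
from gensim import utils

logging.basicConfig(level=logging.INFO)  # make the "pruned out ..." log message visible

vocab = {"the": 100, "quick": 3, "fox": 1}
pruned = utils.prune_vocab(vocab, min_reduce=3)

print(vocab)   # {'the': 100} -- entries with count <= 3 were dropped
print(pruned)  # 4 -- total count mass removed (3 + 1)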
