Fast object counting + Phrases #1446

Closed · wants to merge 7 commits
150 changes: 150 additions & 0 deletions gensim/models/fast_counter.py
@@ -0,0 +1,150 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Fast & memory efficient counting of things (and n-grams of things).

This module is designed to count item frequencies over large, streamed corpora (lazy iteration).

Such counts are useful in various other modules, such as Dictionary, TfIdf, Phrases etc.

"""

import sys
import os
from collections import defaultdict
import logging

from gensim import utils
from gensim.models.fast_counter_cython import FastCounterCython, FastCounterPreshed

logger = logging.getLogger(__name__)


def iter_ngrams(document, ngrams):
    assert ngrams[0] <= ngrams[1]

    for n in range(ngrams[0], ngrams[1] + 1):
        for ngram in zip(*[document[i:] for i in range(n)]):
            logger.debug("yielding ngram %r", ngram)
            yield ngram

def iter_gram1(document):
    return iter_ngrams(document, (1, 1))

def iter_gram2(document):
    return iter_ngrams(document, (2, 2))

def iter_gram12(document):
    return iter_ngrams(document, (1, 2))


class FastCounter(object):
    """
    Fast counting of item frequencies across large, streamed iterables.
    """

    def __init__(self, doc2items=iter_gram1, max_size=None):
        self.doc2items = doc2items
        self.max_size = max_size
        self.min_reduce = 0
        self.hash2cnt = defaultdict(int)

    def hash(self, item):
        return hash(item)
Contributor:
Maybe use mmh3 for this purpose?
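For illustration, a minimal sketch of what that swap might look like (assumes the mmh3 package is installed and that `item` is a tuple of string tokens, as produced by iter_ngrams; joining on a space is an arbitrary choice):

import mmh3

    def hash(self, item):
        # mmh3.hash64 returns a pair of 64-bit ints; keeping the first one is enough here
        return mmh3.hash64(u' '.join(item))[0]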


    def update(self, documents):
        """
        Update the relevant ngram counters from the iterable `documents`.

        If the memory structures get too large, clip them (then the internal counts may be only approximate).
        """
        for document in documents:
            for item in self.doc2items(document):
                self.hash2cnt[self.hash(item)] += 1
Collaborator:
The fact that this implementation ignores hash collisions is an important detail to document.
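A tiny illustration of what that means in practice (hypothetical, using a deliberately weakened hash so collisions are easy to force): distinct items whose hashes collide silently share one counter, so get() can over-report rare items.

from collections import defaultdict

def weak_hash(item):
    return hash(item) % 10  # absurdly small range, just to provoke collisions

hash2cnt = defaultdict(int)
for item in ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota', 'kappa', 'lambda', 'mu']:
    hash2cnt[weak_hash(item)] += 1
# len(hash2cnt) < 12 by the pigeonhole principle: some buckets hold counts for
# more than one distinct item, and the counter cannot tell them apart afterwards.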

            self.prune_items()

        return self  # for easier chaining

    def prune_items(self):
        """Trim data structures to fit in memory, if too large."""
        # XXX: Or use a fixed-size data structure to start with (hyperloglog?)
        while self.max_size and len(self) > self.max_size:
            self.min_reduce += 1
            utils.prune_vocab(self.hash2cnt, self.min_reduce)
Contributor:
This can potentially be very slow, as every utils.prune_vocab call is going to iterate over the entire vocabulary. It could be worthwhile to store a min_count property. Depending on the data structures used, this might not be trivial though.
Anyway, the discussion right now is about API/design and not optimization, I suppose.

Owner Author:
What would this min_count property do?

Collaborator:
I'd want to see profiling that indicates this survey is a major time cost before spending much time optimizing. If it needed to be optimized, it might be possible to retain a min_reduce/min_count threshold and, during tallying, track when a key's tally exceeds this count, perhaps removing it from a 'most-eligible-for-pruning' list.

With the existing algorithm, where on each prune everything under the count floor reached last time is automatically discarded, this hypothetical "most eligible" list could be discarded first – without even looking at the existing counts again. Maybe that'd be enough... though if keeping this 'escalating min_reduce' you'd still need to iterate over all the remaining keys. (It seems a lot like GC, with a 'tenured' set that might be able to skip some collections.)

But as I've mentioned in previous discussions around Phrases: I suspect this 'escalating floor prune' is a somewhat undesirable algorithm, injecting more imprecision/bias into the final counts than necessary, and (maybe) losing a lot of keys that would survive a truly unbounded counting method. (TL;DR: because of typical frequency distributions plus the always-escalating floor, every prune may wind up eliminating 90%+ of all existing words, starving some words of the chance to ever survive a prune, or in some cases inefficiently leaving the final dict at a small fraction of the desired max_size.) Maybe it's necessary to prevent too-frequent prunes, but I'd prefer an option to be more precise via less-frequent prunes – especially if I'd hypothetically minimized the full time cost of Phrases analysis via other process improvements or effective parallelism.
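For comparison, a minimal sketch of the "less frequent but more precise" alternative described above: instead of an ever-escalating count floor, keep exactly the top max_size keys at each prune (a hypothetical variant of FastCounter.prune_items, not code from this PR):

import heapq

    def prune_items(self):
        # hypothetical: keep only the `max_size` highest-count keys, drop the rest
        if self.max_size and len(self.hash2cnt) > self.max_size:
            survivors = heapq.nlargest(self.max_size, self.hash2cnt.items(), key=lambda kv: kv[1])
            self.hash2cnt = defaultdict(int, survivors)

Each prune then costs roughly O(n log max_size), but the result holds exactly max_size keys, chosen by current count rather than by whatever happens to survive an escalating floor.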

Owner Author (@piskvorky), Jun 26, 2017:
It looks like we're on to optimizations and choosing algorithms now, so let me tick off the "API design" checkbox.

API conclusion: mimic the Counter API, except add a parameter that allows restricting its maximum size, ideally in bytes. Make it clear approximations were necessary to achieve this, and what the trade-offs are (Counter methods we cannot support, perf-memory balance implications).

@jayantj @menshikh-iv do you agree?
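A rough sketch of what that could look like (the class name, parameter names and the bytes-per-entry constant are illustrative assumptions, not a settled design):

class BoundedCounter(object):
    """Counter-like API with an approximate cap on memory use, given in bytes (sketch)."""

    BYTES_PER_ENTRY = 10  # assumed average per-key footprint; a real value needs measurement

    def __init__(self, max_size=None):
        self.max_items = max_size // self.BYTES_PER_ENTRY if max_size else None
        self.min_reduce = 0
        self.counts = {}

    def update(self, items):
        for item in items:
            self.counts[item] = self.counts.get(item, 0) + 1
        while self.max_items and len(self.counts) > self.max_items:
            # approximation kicks in here: low-count keys are discarded,
            # so later lookups may under-report
            self.min_reduce += 1
            self.counts = {k: v for k, v in self.counts.items() if v > self.min_reduce}

    def __getitem__(self, item):
        return self.counts.get(item, 0)

    def most_common(self, n=None):
        ranked = sorted(self.counts.items(), key=lambda kv: kv[1], reverse=True)
        return ranked if n is None else ranked[:n]

The key deviation from collections.Counter is that once pruning kicks in, lookups become lower bounds rather than exact counts, and methods such as elements() are hard to support faithfully.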

Contributor:
@piskvorky I agree with you.
Also, maybe we need to add a static method for size estimation (for example, if we store X tokens, how many megabytes of RAM we need for that). It can be useful when choosing the max_size parameter.
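Something like the following, perhaps (a hedged sketch; the 10-bytes-per-token figure is a placeholder assumption that would need calibrating against the real backing structure):

    @staticmethod
    def estimate_memory(num_tokens, bytes_per_token=10):
        """Rough RAM estimate, in megabytes, for counting `num_tokens` distinct tokens."""
        return num_tokens * bytes_per_token / (1024.0 * 1024.0)

    # e.g. estimate_memory(100000000) ~= 954 MB, i.e. roughly the
    # "1GB ~= 100,000,000 items" rule of thumb mentioned in the reply below.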

Owner Author (@piskvorky), Jun 30, 2017:
Yes, though the best place for that is probably the docstring. Something like max_size==1GB gives you room for approximately 100,000,000 items, and the number scales linearly with more/less RAM (or whatever the case may be).


    def get(self, item, default=None):
Contributor:
__getitem__ would be good too.
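E.g. a minimal sketch of that addition (hypothetical; returns 0 for unseen items, mirroring collections.Counter rather than raising KeyError):

    def __getitem__(self, item):
        # Counter-style lookup: missing items count as 0
        return self.hash2cnt.get(self.hash(item), 0)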

"""Return the item frequency of `item` (or `default` if item not present)."""
return self.hash2cnt.get(self.hash(item), default)

    def merge(self, other):
        """
        Merge counts from another FastCounter into self, in-place.
        """
        self.hash2cnt.update(other.hash2cnt)
        self.min_reduce = max(self.min_reduce, other.min_reduce)
        self.prune_items()

    def __len__(self):
        return len(self.hash2cnt)

    def __str__(self):
        return "%s<%i items>" % (self.__class__.__name__, len(self))


class Phrases(object):
    def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000):
        self.threshold = threshold
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        # self.counter = FastCounter(iter_gram12, max_size=max_vocab_size)
        self.counter = FastCounterCython()
        # self.counter = FastCounterPreshed()

    def add_documents(self, documents):
        self.counter.update(documents)

        return self  # for easier chaining

    def export_phrases(self, document):
        """
        Yield all collocations (pairs of adjacent closely related tokens) from the
        input `document`, as 2-tuples `(score, bigram)`.
        """
        norm = 1.0 * len(self.counter)
        for bigram in iter_gram2(document):
            pa, pb, pab = self.counter.get((bigram[0],)), self.counter.get((bigram[1],)), self.counter.get(bigram, 0)
            if pa and pb:
                score = norm / pa / pb * (pab - self.min_count)
                if score > self.threshold:
                    yield score, bigram


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger.info("running %s", " ".join(sys.argv))

    # check and process cmdline input
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 2:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    infile = sys.argv[1]

    from gensim.models.word2vec import Text8Corpus
    documents = Text8Corpus(infile)

    logger.info("training phrases")
    bigram = Phrases(min_count=5, threshold=100).add_documents(documents)
    logger.info("finished training phrases")
    print(bigram.counter)
    # for doc in documents:
    #     s = u' '.join(doc)
    #     for _, bigram in bigram.export_phrases(doc):
    #         s = s.replace(u' '.join(bigram), u'_'.join(bigram))
    #     print(utils.to_utf8(s))

    logger.info("finished running %s", " ".join(sys.argv))
137 changes: 137 additions & 0 deletions gensim/models/fast_counter_cython.pyx
@@ -0,0 +1,137 @@
#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# coding: utf-8
#
# Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

from collections import defaultdict

from libc.stdint cimport int64_t, uint64_t

cimport preshed.counter


cdef uint64_t chash(obj):
    # TODO use something faster, can assume string
    return <uint64_t>hash(obj)


class FastCounterCython(object):
    """
    Fast counting of item frequencies across large, streamed iterables.
    """

    def __init__(self, doc2items=None, max_size=None):
        self.doc2items = doc2items
        self.max_size = max_size
        self.min_reduce = 0
        self.hash2cnt = defaultdict(int)

    def update(self, documents):
        """
        Update the relevant ngram counters from the iterable `documents`.

        If the memory structures get too large, clip them (then the internal counts may be only approximate).
        """
        cdef Py_ssize_t idx, l
        cdef uint64_t h1, h2
        hash2cnt = self.hash2cnt
        for document in documents:
            l = len(document)
            if l:
                h1 = chash(document[0])
                hash2cnt[h1] += 1
                for idx in range(1, l):
                    h2 = chash(document[idx])
                    hash2cnt[h2] += 1
                    hash2cnt[h1 + h2] += 1
Collaborator:
Simple addition to determine the hash of a bigram gives (A, B) and (B, A) the same hash value – problematic for phrase promotion.

Owner Author (@piskvorky), Jun 24, 2017:
Yeah, I had XOR here before, but then A ^ A == 0 == B ^ B, which is also not good. The hashing requires a little more thought, too hackish.

Contributor:
We can use h1 ** h2, but this will lead to overflow

Owner Author (@piskvorky), Jun 26, 2017:
That's not a native operation (instruction), so probably not a good idea.

Google shows me this for boost::hash_combine

size_t hash_combine( size_t lhs, size_t rhs ) {
  lhs^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2);
  return lhs;
}

That looks fancy, but probably anything that breaks the symmetry would be enough, such as 3*lhs + rhs.
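For instance, a minimal Cython sketch of an order-sensitive combine, following the boost recipe above (illustrative only, not code from this PR):

cdef uint64_t hash_combine(uint64_t lhs, uint64_t rhs):
    # asymmetric mix: hash_combine(a, b) != hash_combine(b, a) in general
    lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2)
    return lhs

# inside update(), the bigram count would then become:
#     hash2cnt[hash_combine(h1, h2)] += 1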

Owner Author:
Here's a repo with a bunch of useful hashing algorithms & their comparison: https://github.com/rurban/smhasher

Owner Author:
One more interesting resource on hashing (in the context of Bloom filters / min-sketch counting in #508 ): https://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf
CC @isamaru
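For context, a toy pure-Python count-min sketch along the lines of that paper (an illustrative sketch of the fixed-memory idea, not code proposed here): memory is bounded by width × depth counters, and queries return an upper bound on the true count.

import random

class CountMinSketch(object):
    def __init__(self, width=2 ** 20, depth=4, seed=42):
        rnd = random.Random(seed)
        self.width, self.depth = width, depth
        self.seeds = [rnd.getrandbits(64) for _ in range(depth)]
        self.tables = [[0] * width for _ in range(depth)]

    def _buckets(self, item):
        return [hash((seed, item)) % self.width for seed in self.seeds]

    def add(self, item, count=1):
        for row, bucket in zip(self.tables, self._buckets(item)):
            row[bucket] += count

    def get(self, item):
        # the minimum over rows bounds the true count from above
        return min(row[bucket] for row, bucket in zip(self.tables, self._buckets(item)))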

                    h1 = h2

        # FIXME: add optimized prune

        return self  # for easier chaining

    def prune_items(self):
        """Trim data structures to fit in memory, if too large."""
        # XXX: Or use a fixed-size data structure to start with (hyperloglog?)
        pass

    def get(self, item, default=None):
        """Return the item frequency of `item` (or `default` if item not present)."""
        return self.hash2cnt.get(chash(item), default)

    def merge(self, other):
        """
        Merge counts from another FastCounter into self, in-place.
        """
        self.hash2cnt.update(other.hash2cnt)
        self.min_reduce = max(self.min_reduce, other.min_reduce)
        self.prune_items()

    def __len__(self):
        return len(self.hash2cnt)

    def __str__(self):
        return "%s<%i items>" % (self.__class__.__name__, len(self))


class FastCounterPreshed(object):
    """
    Fast counting of item frequencies across large, streamed iterables.
    """

    def __init__(self, doc2items=None, max_size=None):
        self.doc2items = doc2items
        self.max_size = max_size
        self.min_reduce = 0
        self.hash2cnt = preshed.counter.PreshCounter()  # TODO replace by some GIL-free low-level struct

    def update(self, documents):
        """
        Update the relevant ngram counters from the iterable `documents`.

        If the memory structures get too large, clip them (then the internal counts may be only approximate).
        """
        cdef Py_ssize_t idx, l
        cdef uint64_t h1, h2
        cdef preshed.counter.PreshCounter hash2cnt = self.hash2cnt
        for document in documents:
            l = len(document)
            if l:
                h1 = chash(document[0])
                hash2cnt.inc(h1, 1)
                for idx in range(1, l):
                    h2 = chash(document[idx])
                    hash2cnt.inc(h2, 1)
                    hash2cnt.inc(h1 + h2, 1)
                    h1 = h2

        # FIXME: add optimized prune

        return self  # for easier chaining

    def prune_items(self):
        """Trim data structures to fit in memory, if too large."""
        # XXX: Or use a fixed-size data structure to start with (hyperloglog?)
        pass

    def get(self, item, default=None):
        """Return the item frequency of `item` (or `default` if item not present)."""
        return self.hash2cnt.get(chash(item), default)

    def merge(self, other):
        """
        Merge counts from another FastCounter into self, in-place.
        """
        self.hash2cnt.update(other.hash2cnt)
        self.min_reduce = max(self.min_reduce, other.min_reduce)
        self.prune_items()

    def __len__(self):
        return len(self.hash2cnt)

    def __str__(self):
        return "%s<%i items>" % (self.__class__.__name__, len(self))
20 changes: 10 additions & 10 deletions gensim/models/phrases.py
@@ -134,12 +134,6 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
        should be a byte string (e.g. b'_').

        """
        if min_count <= 0:
            raise ValueError("min_count should be at least 1")

        if threshold <= 0:
            raise ValueError("threshold should be positive")

        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
@@ -169,7 +163,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
            if sentence_no % progress_per == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                            (sentence_no, total_words, len(vocab)))
            sentence = [utils.any2utf8(w) for w in sentence]
            # sentence = [utils.any2utf8(w) for w in sentence]
            for bigram in zip(sentence, sentence[1:]):
                vocab[bigram[0]] += 1
                vocab[delimiter.join(bigram)] += 1
@@ -394,7 +388,7 @@ def __getitem__(self, sentence):

if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info("running %s" % " ".join(sys.argv))
    logger.info("running %s", " ".join(sys.argv))

    # check and process cmdline input
    program = os.path.basename(sys.argv[0])
@@ -408,6 +402,12 @@ def __getitem__(self, sentence):
    sentences = Text8Corpus(infile)

    # test_doc = LineSentence('test/test_data/testcorpus.txt')
    logger.info("training phrases")
    bigram = Phrases(sentences, min_count=5, threshold=100)
    for s in bigram[sentences]:
        print(utils.to_utf8(u' '.join(s)))
    print(bigram)
    logger.info("finished training phrases")

    # for s in bigram[sentences]:
    #     print(utils.to_utf8(u' '.join(s)))

    logger.info("finished running %s", " ".join(sys.argv))
5 changes: 3 additions & 2 deletions gensim/utils.py
@@ -1121,8 +1121,9 @@ def prune_vocab(vocab, min_reduce, trim_rule=None):
        if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule):  # vocab[w] <= min_reduce:
            result += vocab[w]
            del vocab[w]
    logger.info("pruned out %i tokens with count <=%i (before %i, after %i)",
                old_len - len(vocab), min_reduce, old_len, len(vocab))
    logger.info(
        "pruned out %i tokens with count <=%i (before %i, after %i)",
        old_len - len(vocab), min_reduce, old_len, len(vocab))
    return result


5 changes: 4 additions & 1 deletion setup.py
@@ -249,7 +249,10 @@ def finalize_options(self):
                include_dirs=[model_dir]),
            Extension('gensim.models.doc2vec_inner',
                sources=['./gensim/models/doc2vec_inner.c'],
                include_dirs=[model_dir])
                include_dirs=[model_dir]),
            Extension('gensim.models.fast_counter_cython',
                sources=['./gensim/models/fast_counter_cython.c'],
                include_dirs=[model_dir]),
        ],
        cmdclass=cmdclass,
        packages=find_packages(),