From 392c672a49ab1762a837eedb584149f583ea2c10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 23 Jun 2017 23:41:21 +0900 Subject: [PATCH 1/7] wip: API design for fast_counter & phrases --- gensim/models/fast_counter.py | 123 ++++++++++++++++++++++++++++++++++ gensim/models/phrases.py | 6 -- 2 files changed, 123 insertions(+), 6 deletions(-) create mode 100644 gensim/models/fast_counter.py diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py new file mode 100644 index 0000000000..431a3d39a2 --- /dev/null +++ b/gensim/models/fast_counter.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2017 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Fast & memory efficient counting of things (and n-grams of things). + +This module is designed to count *item* and *document* frequencies over large, streamed corpora (lazy iteration). + +""" + +from collections import Counter +import logging + +from six import iterkeys, iteritems + +logger = logging.getLogger(__name__) + + +def iter_ngrams(document, ngrams): + assert ngrams[0] <= ngrams[1] + + for n in range(ngrams[0], ngrams[1] + 1): + for ngram in zip(*[document[i:] for i in range(n)]): + logger.debug("yielding ngram %r", ngram) + yield ngram + +def iter_gram1(document): + return iter_ngrams(document, (1, 1)) + +def iter_gram2(document): + return iter_ngrams(document, (2, 2)) + +def iter_gram12(document): + return iter_ngrams(document, (1, 2)) + + +class FastCounter(object): + """ + Fast counting of item frequency and document frequency across large, streamed iterables. + """ + + def __init__(self, doc2items=iter_gram1, collect_df=False): + self.doc2items = doc2items + self.collect_df = collect_df + + self.item_counts = Counter() # TODO replace by some GIL-free low-level struct + self.doc_counts = Counter() # TODO replace by some GIL-free low-level struct + + def hash(self, key): + return hash(key) + + def update(self, documents): + """ + Update the relevant ngram counters from the iterable `documents`. + + If the memory structures get too large, clip them (then the internal counts may be only approximate). + """ + for document in documents: + # TODO: release GIL, so we can run update() in parallel threads. + # Or maybe not needed, if we create multiple FastCounters from multiple input streams using + # multiprocessing, and only .merge() them at the end. + item_cnts = Counter(self.hash(ngram) for ngram in self.doc2items(document)) + self.item_counts.update(item_cnts) + if self.collect_df: + # increment by 1 per unique key ("document frequency") + self.doc_counts.update(iterkeys(item_cnts)) + + # self.prune_vocab() + + return self # for easier chaining + + def prune_vocab(self): + # Trim data structures to fit in memory, if too large. + # Or use a fixed-size data structure to start with (hyperloglog?) + raise NotImplementedError + + def get(self, key, default=None): + """Return the item frequency of `key` (or `default` if key not present).""" + return self.item_counts.get(self.hash(key), default) + + def merge(self, other): + """ + Merge counts from other into self, in-place. 
+ """ + # rare operation, no need to optimize too much + raise NotImplementedError + + def __len__(self): + return len(self.item_counts) + + def __str__(self): + return "%s<%i items>" % (self.__class__.__name__, len(self)) + + +class Phrases(object): + def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000): + self.threshold = threshold + self.min_count = min_count + self.max_vocab_size = max_vocab_size + self.counter = FastCounter(iter_gram12) + + def add_documents(self, documents): + self.counter.update(documents) + + return self # for easier chaining + + def export_phrases(self, document): + """ + Yield all collocations (pairs of adjacent closely related tokens) from the + input `document`, as 2-tuples `(score, bigram)`. + """ + if not self.counter: + return + norm = 1.0 * len(self.counter) + for bigram in iter_gram2(document): + pa, pb, pab = self.counter.get((bigram[0],)), self.counter.get((bigram[1],)), self.counter.get(bigram, 0) + if pa and pb: + score = norm / pa / pb * (pab - self.min_count) + if score > self.threshold: + yield score, bigram diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index be735b865a..65d9b93b02 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -134,12 +134,6 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, should be a byte string (e.g. b'_'). """ - if min_count <= 0: - raise ValueError("min_count should be at least 1") - - if threshold <= 0: - raise ValueError("threshold should be positive") - self.min_count = min_count self.threshold = threshold self.max_vocab_size = max_vocab_size From 6a98f8656dccc810a9d2c99a23e2ae75e0afeea1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 00:05:32 +0900 Subject: [PATCH 2/7] remove document frequency from the API (instead, Dictionary will have 2 counters) --- gensim/models/fast_counter.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py index 431a3d39a2..a5d79764fb 100644 --- a/gensim/models/fast_counter.py +++ b/gensim/models/fast_counter.py @@ -7,7 +7,9 @@ """ Fast & memory efficient counting of things (and n-grams of things). -This module is designed to count *item* and *document* frequencies over large, streamed corpora (lazy iteration). +This module is designed to count item frequencies over large, streamed corpora (lazy iteration). + +Such counts are useful in various other modules, such as Dictionary, TfIdf, Phrases etc. """ @@ -42,12 +44,9 @@ class FastCounter(object): Fast counting of item frequency and document frequency across large, streamed iterables. """ - def __init__(self, doc2items=iter_gram1, collect_df=False): + def __init__(self, doc2items=iter_gram1): self.doc2items = doc2items - self.collect_df = collect_df - - self.item_counts = Counter() # TODO replace by some GIL-free low-level struct - self.doc_counts = Counter() # TODO replace by some GIL-free low-level struct + self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct def hash(self, key): return hash(key) @@ -62,11 +61,7 @@ def update(self, documents): # TODO: release GIL, so we can run update() in parallel threads. # Or maybe not needed, if we create multiple FastCounters from multiple input streams using # multiprocessing, and only .merge() them at the end. 
- item_cnts = Counter(self.hash(ngram) for ngram in self.doc2items(document)) - self.item_counts.update(item_cnts) - if self.collect_df: - # increment by 1 per unique key ("document frequency") - self.doc_counts.update(iterkeys(item_cnts)) + self.hash2cnt.update(self.hash(ngram) for ngram in self.doc2items(document)) # self.prune_vocab() @@ -79,7 +74,7 @@ def prune_vocab(self): def get(self, key, default=None): """Return the item frequency of `key` (or `default` if key not present).""" - return self.item_counts.get(self.hash(key), default) + return self.hash2cnt.get(self.hash(key), default) def merge(self, other): """ @@ -89,7 +84,7 @@ def merge(self, other): raise NotImplementedError def __len__(self): - return len(self.item_counts) + return len(self.hash2cnt) def __str__(self): return "%s<%i items>" % (self.__class__.__name__, len(self)) From c601cbf8d34016230dd847a41eabb2f7f1ff2159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 00:09:12 +0900 Subject: [PATCH 3/7] consistency: rename `key` to `item` --- gensim/models/fast_counter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py index a5d79764fb..cfd106d2ab 100644 --- a/gensim/models/fast_counter.py +++ b/gensim/models/fast_counter.py @@ -48,8 +48,8 @@ def __init__(self, doc2items=iter_gram1): self.doc2items = doc2items self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct - def hash(self, key): - return hash(key) + def hash(self, item): + return hash(item) def update(self, documents): """ @@ -72,9 +72,9 @@ def prune_vocab(self): # Or use a fixed-size data structure to start with (hyperloglog?) raise NotImplementedError - def get(self, key, default=None): - """Return the item frequency of `key` (or `default` if key not present).""" - return self.hash2cnt.get(self.hash(key), default) + def get(self, item, default=None): + """Return the item frequency of `item` (or `default` if item not present).""" + return self.hash2cnt.get(self.hash(item), default) def merge(self, other): """ From a4fdfdb0417d936e9ffbea835fde9a3bccae67fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 00:27:34 +0900 Subject: [PATCH 4/7] implement missing API methods --- gensim/models/fast_counter.py | 30 +++++++++++++++++------------- gensim/utils.py | 5 +++-- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py index cfd106d2ab..b65f3c37cf 100644 --- a/gensim/models/fast_counter.py +++ b/gensim/models/fast_counter.py @@ -18,6 +18,8 @@ from six import iterkeys, iteritems +from gensim import utils + logger = logging.getLogger(__name__) @@ -44,8 +46,10 @@ class FastCounter(object): Fast counting of item frequency and document frequency across large, streamed iterables. """ - def __init__(self, doc2items=iter_gram1): + def __init__(self, doc2items=iter_gram1, max_size=None): self.doc2items = doc2items + self.max_size = max_size + self.min_reduce = 0 self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct def hash(self, item): @@ -62,15 +66,16 @@ def update(self, documents): # Or maybe not needed, if we create multiple FastCounters from multiple input streams using # multiprocessing, and only .merge() them at the end. 
self.hash2cnt.update(self.hash(ngram) for ngram in self.doc2items(document)) - - # self.prune_vocab() + self.prune_items() return self # for easier chaining - def prune_vocab(self): - # Trim data structures to fit in memory, if too large. - # Or use a fixed-size data structure to start with (hyperloglog?) - raise NotImplementedError + def prune_items(self): + """Trim data structures to fit in memory, if too large.""" + # XXX: Or use a fixed-size data structure to start with (hyperloglog?) + while self.max_size and len(self) > self.max_size: + self.min_reduce += 1 + utils.prune_vocab(self.hash2cnt, self.min_reduce) def get(self, item, default=None): """Return the item frequency of `item` (or `default` if item not present).""" @@ -78,10 +83,11 @@ def get(self, item, default=None): def merge(self, other): """ - Merge counts from other into self, in-place. + Merge counts from another FastCounter into self, in-place. """ - # rare operation, no need to optimize too much - raise NotImplementedError + self.hash2cnt.update(other.hash2cnt) + self.min_reduce = max(self.min_reduce, other.min_reduce) + self.prune_items() def __len__(self): return len(self.hash2cnt) @@ -95,7 +101,7 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000): self.threshold = threshold self.min_count = min_count self.max_vocab_size = max_vocab_size - self.counter = FastCounter(iter_gram12) + self.counter = FastCounter(iter_gram12, max_size=max_vocab_size) def add_documents(self, documents): self.counter.update(documents) @@ -107,8 +113,6 @@ def export_phrases(self, document): Yield all collocations (pairs of adjacent closely related tokens) from the input `document`, as 2-tuples `(score, bigram)`. """ - if not self.counter: - return norm = 1.0 * len(self.counter) for bigram in iter_gram2(document): pa, pb, pab = self.counter.get((bigram[0],)), self.counter.get((bigram[1],)), self.counter.get(bigram, 0) diff --git a/gensim/utils.py b/gensim/utils.py index dd391f887b..9118d97e49 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1121,8 +1121,9 @@ def prune_vocab(vocab, min_reduce, trim_rule=None): if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule): # vocab[w] <= min_reduce: result += vocab[w] del vocab[w] - logger.info("pruned out %i tokens with count <=%i (before %i, after %i)", - old_len - len(vocab), min_reduce, old_len, len(vocab)) + logger.info( + "pruned out %i tokens with count <=%i (before %i, after %i)", + old_len - len(vocab), min_reduce, old_len, len(vocab)) return result From 24f5b63e923a50b4d669178ab0462680b85d1984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 02:19:52 +0900 Subject: [PATCH 5/7] WIP: simple cython impl --- gensim/models/fast_counter_cython.pyx | 68 +++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 gensim/models/fast_counter_cython.pyx diff --git a/gensim/models/fast_counter_cython.pyx b/gensim/models/fast_counter_cython.pyx new file mode 100644 index 0000000000..94f83b0030 --- /dev/null +++ b/gensim/models/fast_counter_cython.pyx @@ -0,0 +1,68 @@ +#!/usr/bin/env cython +# cython: boundscheck=False +# cython: wraparound=False +# cython: cdivision=True +# coding: utf-8 +# +# Copyright (C) 2017 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +from collections import Counter + +cimport preshed.counter + + +class FastCounterCython(object): + """ + Fast counting of item frequency frequency across large, streamed iterables. 
+ """ + + def __init__(self, doc2items=None, max_size=None): + self.doc2items = doc2items + self.max_size = max_size + self.min_reduce = 0 + self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct + + def update(self, documents): + """ + Update the relevant ngram counters from the iterable `documents`. + + If the memory structures get too large, clip them (then the internal counts may be only approximate). + """ + hash2cnt = self.hash2cnt + for document in documents: + # TODO: release GIL, so we can run update() in parallel threads. + # Or maybe not needed, if we create multiple FastCounters from multiple input streams using + # multiprocessing, and only .merge() them at the end. + if document: + hash2cnt[hash(document[0])] += 1 + for idx in range(len(document) - 1): + hash2cnt[hash(document[idx + 1])] += 1 + hash2cnt[hash((document[idx], document[idx + 1]))] += 1 + + # FIXME: add optimized prune + + return self # for easier chaining + + def prune_items(self): + """Trim data structures to fit in memory, if too large.""" + # XXX: Or use a fixed-size data structure to start with (hyperloglog?) + pass + + def get(self, item, default=None): + """Return the item frequency of `item` (or `default` if item not present).""" + return self.hash2cnt.get(hash(item), default) + + def merge(self, other): + """ + Merge counts from another FastCounter into self, in-place. + """ + self.hash2cnt.update(other.hash2cnt) + self.min_reduce = max(self.min_reduce, other.min_reduce) + self.prune_items() + + def __len__(self): + return len(self.hash2cnt) + + def __str__(self): + return "%s<%i items>" % (self.__class__.__name__, len(self)) From f87722a020be6ea4cc23f14ed7d812709c208e76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 03:06:52 +0900 Subject: [PATCH 6/7] cythonized Phrases (untested, mildly optimized) --- gensim/models/fast_counter.py | 38 ++++++++++-- gensim/models/fast_counter_cython.pyx | 89 ++++++++++++++++++++++++--- gensim/models/phrases.py | 14 +++-- setup.py | 5 +- 4 files changed, 127 insertions(+), 19 deletions(-) diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py index b65f3c37cf..fb5374652e 100644 --- a/gensim/models/fast_counter.py +++ b/gensim/models/fast_counter.py @@ -13,12 +13,13 @@ """ +import sys +import os from collections import Counter import logging -from six import iterkeys, iteritems - from gensim import utils +from gensim.models.fast_counter_cython import FastCounterCython, FastCounterPreshed logger = logging.getLogger(__name__) @@ -43,7 +44,7 @@ def iter_gram12(document): class FastCounter(object): """ - Fast counting of item frequency and document frequency across large, streamed iterables. + Fast counting of item frequency frequency across large, streamed iterables. 
""" def __init__(self, doc2items=iter_gram1, max_size=None): @@ -101,7 +102,9 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000): self.threshold = threshold self.min_count = min_count self.max_vocab_size = max_vocab_size - self.counter = FastCounter(iter_gram12, max_size=max_vocab_size) + # self.counter = FastCounter(iter_gram12, max_size=max_vocab_size) + # self.counter = FastCounterCython() + self.counter = FastCounterPreshed() def add_documents(self, documents): self.counter.update(documents) @@ -120,3 +123,30 @@ def export_phrases(self, document): score = norm / pa / pb * (pab - self.min_count) if score > self.threshold: yield score, bigram + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) + logger.info("running %s", " ".join(sys.argv)) + + # check and process cmdline input + program = os.path.basename(sys.argv[0]) + if len(sys.argv) < 2: + print(globals()['__doc__'] % locals()) + sys.exit(1) + infile = sys.argv[1] + + from gensim.models.word2vec import Text8Corpus + documents = Text8Corpus(infile) + + logger.info("training phrases") + bigram = Phrases(min_count=5, threshold=100).add_documents(documents) + logger.info("finished training phrases") + print(bigram.counter) + # for doc in documents: + # s = u' '.join(doc) + # for _, bigram in bigram.export_phrases(doc): + # s = s.replace(u' '.join(bigram), u'_'.join(bigram)) + # print(utils.to_utf8(s)) + + logger.info("finished running %s", " ".join(sys.argv)) diff --git a/gensim/models/fast_counter_cython.pyx b/gensim/models/fast_counter_cython.pyx index 94f83b0030..24450a0fb0 100644 --- a/gensim/models/fast_counter_cython.pyx +++ b/gensim/models/fast_counter_cython.pyx @@ -9,9 +9,16 @@ from collections import Counter +from libc.stdint cimport int64_t, uint64_t + cimport preshed.counter +cdef uint64_t chash(obj): + # TODO use something faster, can assume string + return hash(obj) + + class FastCounterCython(object): """ Fast counting of item frequency frequency across large, streamed iterables. @@ -21,7 +28,7 @@ class FastCounterCython(object): self.doc2items = doc2items self.max_size = max_size self.min_reduce = 0 - self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct + self.hash2cnt = Counter() def update(self, documents): """ @@ -29,16 +36,78 @@ class FastCounterCython(object): If the memory structures get too large, clip them (then the internal counts may be only approximate). """ + cdef int idx, l + cdef uint64_t h1, h2 hash2cnt = self.hash2cnt for document in documents: - # TODO: release GIL, so we can run update() in parallel threads. - # Or maybe not needed, if we create multiple FastCounters from multiple input streams using - # multiprocessing, and only .merge() them at the end. - if document: - hash2cnt[hash(document[0])] += 1 - for idx in range(len(document) - 1): - hash2cnt[hash(document[idx + 1])] += 1 - hash2cnt[hash((document[idx], document[idx + 1]))] += 1 + l = len(document) + if l: + h1 = chash(document[0]) + hash2cnt[h1] += 1 + for idx in range(1, l): + h2 = chash(document[idx]) + hash2cnt[h2] += 1 + hash2cnt[h1 ^ h2] += 1 + h1 = h2 + + # FIXME: add optimized prune + + return self # for easier chaining + + def prune_items(self): + """Trim data structures to fit in memory, if too large.""" + # XXX: Or use a fixed-size data structure to start with (hyperloglog?) 
+ pass + + def get(self, item, default=None): + """Return the item frequency of `item` (or `default` if item not present).""" + return self.hash2cnt.get(chash(item), default) + + def merge(self, other): + """ + Merge counts from another FastCounter into self, in-place. + """ + self.hash2cnt.update(other.hash2cnt) + self.min_reduce = max(self.min_reduce, other.min_reduce) + self.prune_items() + + def __len__(self): + return len(self.hash2cnt) + + def __str__(self): + return "%s<%i items>" % (self.__class__.__name__, len(self)) + + +class FastCounterPreshed(object): + """ + Fast counting of item frequency frequency across large, streamed iterables. + """ + + def __init__(self, doc2items=None, max_size=None): + self.doc2items = doc2items + self.max_size = max_size + self.min_reduce = 0 + self.hash2cnt = preshed.counter.PreshCounter() # TODO replace by some GIL-free low-level struct + + def update(self, documents): + """ + Update the relevant ngram counters from the iterable `documents`. + + If the memory structures get too large, clip them (then the internal counts may be only approximate). + """ + cdef int idx, l + cdef uint64_t h1, h2 + cdef preshed.counter.PreshCounter hash2cnt = self.hash2cnt + for document in documents: + l = len(document) + if l: + h1 = chash(document[0]) + hash2cnt.inc(h1, 1) + for idx in range(1, l): + h2 = chash(document[idx]) + hash2cnt.inc(h2, 1) + hash2cnt.inc(h1 ^ h2, 1) + h1 = h2 # FIXME: add optimized prune @@ -51,7 +120,7 @@ class FastCounterCython(object): def get(self, item, default=None): """Return the item frequency of `item` (or `default` if item not present).""" - return self.hash2cnt.get(hash(item), default) + return self.hash2cnt.get(chash(item), default) def merge(self, other): """ diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 65d9b93b02..f3082f620c 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -163,7 +163,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): if sentence_no % progress_per == 0: logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" % (sentence_no, total_words, len(vocab))) - sentence = [utils.any2utf8(w) for w in sentence] + # sentence = [utils.any2utf8(w) for w in sentence] for bigram in zip(sentence, sentence[1:]): vocab[bigram[0]] += 1 vocab[delimiter.join(bigram)] += 1 @@ -388,7 +388,7 @@ def __getitem__(self, sentence): if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) - logging.info("running %s" % " ".join(sys.argv)) + logger.info("running %s", " ".join(sys.argv)) # check and process cmdline input program = os.path.basename(sys.argv[0]) @@ -402,6 +402,12 @@ def __getitem__(self, sentence): sentences = Text8Corpus(infile) # test_doc = LineSentence('test/test_data/testcorpus.txt') + logger.info("training phrases") bigram = Phrases(sentences, min_count=5, threshold=100) - for s in bigram[sentences]: - print(utils.to_utf8(u' '.join(s))) + print bigram + logger.info("finished training phrases") + + # for s in bigram[sentences]: + # print(utils.to_utf8(u' '.join(s))) + + logger.info("finished running %s", " ".join(sys.argv)) diff --git a/setup.py b/setup.py index 1e88cb129d..dbd52ee017 100644 --- a/setup.py +++ b/setup.py @@ -249,7 +249,10 @@ def finalize_options(self): include_dirs=[model_dir]), Extension('gensim.models.doc2vec_inner', sources=['./gensim/models/doc2vec_inner.c'], - include_dirs=[model_dir]) + include_dirs=[model_dir]), + 
Extension('gensim.models.fast_counter_cython', + sources=['./gensim/models/fast_counter_cython.c'], + include_dirs=[model_dir]), ], cmdclass=cmdclass, packages=find_packages(), From df84033653389fb0b7a09d93af497ad00863219e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 03:24:47 +0900 Subject: [PATCH 7/7] replace Counter by defaultdict --- gensim/models/fast_counter.py | 14 ++++++-------- gensim/models/fast_counter_cython.pyx | 12 ++++++------ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py index fb5374652e..1f59d4ee25 100644 --- a/gensim/models/fast_counter.py +++ b/gensim/models/fast_counter.py @@ -15,7 +15,7 @@ import sys import os -from collections import Counter +from collections import defaultdict import logging from gensim import utils @@ -51,7 +51,7 @@ def __init__(self, doc2items=iter_gram1, max_size=None): self.doc2items = doc2items self.max_size = max_size self.min_reduce = 0 - self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct + self.hash2cnt = defaultdict(int) def hash(self, item): return hash(item) @@ -63,10 +63,8 @@ def update(self, documents): If the memory structures get too large, clip them (then the internal counts may be only approximate). """ for document in documents: - # TODO: release GIL, so we can run update() in parallel threads. - # Or maybe not needed, if we create multiple FastCounters from multiple input streams using - # multiprocessing, and only .merge() them at the end. - self.hash2cnt.update(self.hash(ngram) for ngram in self.doc2items(document)) + for item in self.doc2items(document): + self.hash2cnt[self.hash(item)] += 1 self.prune_items() return self # for easier chaining @@ -103,8 +101,8 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000): self.min_count = min_count self.max_vocab_size = max_vocab_size # self.counter = FastCounter(iter_gram12, max_size=max_vocab_size) - # self.counter = FastCounterCython() - self.counter = FastCounterPreshed() + self.counter = FastCounterCython() + # self.counter = FastCounterPreshed() def add_documents(self, documents): self.counter.update(documents) diff --git a/gensim/models/fast_counter_cython.pyx b/gensim/models/fast_counter_cython.pyx index 24450a0fb0..dad283032c 100644 --- a/gensim/models/fast_counter_cython.pyx +++ b/gensim/models/fast_counter_cython.pyx @@ -7,7 +7,7 @@ # Copyright (C) 2017 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -from collections import Counter +from collections import defaultdict from libc.stdint cimport int64_t, uint64_t @@ -28,7 +28,7 @@ class FastCounterCython(object): self.doc2items = doc2items self.max_size = max_size self.min_reduce = 0 - self.hash2cnt = Counter() + self.hash2cnt = defaultdict(int) def update(self, documents): """ @@ -36,7 +36,7 @@ class FastCounterCython(object): If the memory structures get too large, clip them (then the internal counts may be only approximate). """ - cdef int idx, l + cdef Py_ssize_t idx, l cdef uint64_t h1, h2 hash2cnt = self.hash2cnt for document in documents: @@ -47,7 +47,7 @@ class FastCounterCython(object): for idx in range(1, l): h2 = chash(document[idx]) hash2cnt[h2] += 1 - hash2cnt[h1 ^ h2] += 1 + hash2cnt[h1 + h2] += 1 h1 = h2 # FIXME: add optimized prune @@ -95,7 +95,7 @@ class FastCounterPreshed(object): If the memory structures get too large, clip them (then the internal counts may be only approximate). 
""" - cdef int idx, l + cdef Py_ssize_t idx, l cdef uint64_t h1, h2 cdef preshed.counter.PreshCounter hash2cnt = self.hash2cnt for document in documents: @@ -106,7 +106,7 @@ class FastCounterPreshed(object): for idx in range(1, l): h2 = chash(document[idx]) hash2cnt.inc(h2, 1) - hash2cnt.inc(h1 ^ h2, 1) + hash2cnt.inc(h1 + h2, 1) h1 = h2 # FIXME: add optimized prune