From df84033653389fb0b7a09d93af497ad00863219e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?=
Date: Sat, 24 Jun 2017 03:24:47 +0900
Subject: [PATCH] replace Counter by defaultdict

---
 gensim/models/fast_counter.py         | 14 ++++++--------
 gensim/models/fast_counter_cython.pyx | 12 ++++++------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py
index fb5374652e..1f59d4ee25 100644
--- a/gensim/models/fast_counter.py
+++ b/gensim/models/fast_counter.py
@@ -15,7 +15,7 @@
 import sys
 import os
 
-from collections import Counter
+from collections import defaultdict
 import logging
 
 from gensim import utils
@@ -51,7 +51,7 @@ def __init__(self, doc2items=iter_gram1, max_size=None):
         self.doc2items = doc2items
         self.max_size = max_size
         self.min_reduce = 0
-        self.hash2cnt = Counter()  # TODO replace by some GIL-free low-level struct
+        self.hash2cnt = defaultdict(int)
 
     def hash(self, item):
         return hash(item)
@@ -63,10 +63,8 @@ def update(self, documents):
         If the memory structures get too large, clip them (then the internal counts may be only approximate).
         """
         for document in documents:
-            # TODO: release GIL, so we can run update() in parallel threads.
-            # Or maybe not needed, if we create multiple FastCounters from multiple input streams using
-            # multiprocessing, and only .merge() them at the end.
-            self.hash2cnt.update(self.hash(ngram) for ngram in self.doc2items(document))
+            for item in self.doc2items(document):
+                self.hash2cnt[self.hash(item)] += 1
         self.prune_items()
 
         return self  # for easier chaining
@@ -103,8 +101,8 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000):
         self.min_count = min_count
         self.max_vocab_size = max_vocab_size
         # self.counter = FastCounter(iter_gram12, max_size=max_vocab_size)
-        # self.counter = FastCounterCython()
-        self.counter = FastCounterPreshed()
+        self.counter = FastCounterCython()
+        # self.counter = FastCounterPreshed()
 
     def add_documents(self, documents):
         self.counter.update(documents)
diff --git a/gensim/models/fast_counter_cython.pyx b/gensim/models/fast_counter_cython.pyx
index 24450a0fb0..dad283032c 100644
--- a/gensim/models/fast_counter_cython.pyx
+++ b/gensim/models/fast_counter_cython.pyx
@@ -7,7 +7,7 @@
 # Copyright (C) 2017 Radim Rehurek
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-from collections import Counter
+from collections import defaultdict
 
 from libc.stdint cimport int64_t, uint64_t
 
@@ -28,7 +28,7 @@ class FastCounterCython(object):
         self.doc2items = doc2items
         self.max_size = max_size
         self.min_reduce = 0
-        self.hash2cnt = Counter()
+        self.hash2cnt = defaultdict(int)
 
     def update(self, documents):
         """
@@ -36,7 +36,7 @@
         If the memory structures get too large, clip them (then the internal counts may be only approximate).
 
         """
-        cdef int idx, l
+        cdef Py_ssize_t idx, l
         cdef uint64_t h1, h2
         hash2cnt = self.hash2cnt
         for document in documents:
@@ -47,7 +47,7 @@
             for idx in range(1, l):
                 h2 = chash(document[idx])
                 hash2cnt[h2] += 1
-                hash2cnt[h1 ^ h2] += 1
+                hash2cnt[h1 + h2] += 1
                 h1 = h2
 
         # FIXME: add optimized prune
@@ -95,7 +95,7 @@ class FastCounterPreshed(object):
         If the memory structures get too large, clip them (then the internal counts may be only approximate).
 
         """
-        cdef int idx, l
+        cdef Py_ssize_t idx, l
         cdef uint64_t h1, h2
         cdef preshed.counter.PreshCounter hash2cnt = self.hash2cnt
         for document in documents:
@@ -106,7 +106,7 @@ class FastCounterPreshed(object):
             for idx in range(1, l):
                 h2 = chash(document[idx])
                 hash2cnt.inc(h2, 1)
-                hash2cnt.inc(h1 ^ h2, 1)
+                hash2cnt.inc(h1 + h2, 1)
                 h1 = h2
 
         # FIXME: add optimized prune