From df84033653389fb0b7a09d93af497ad00863219e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?=
Date: Sat, 24 Jun 2017 03:24:47 +0900
Subject: [PATCH] replace Counter by defaultdict

---
 gensim/models/fast_counter.py         | 14 ++++++--------
 gensim/models/fast_counter_cython.pyx | 12 ++++++------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py
index fb5374652e..1f59d4ee25 100644
--- a/gensim/models/fast_counter.py
+++ b/gensim/models/fast_counter.py
@@ -15,7 +15,7 @@
 import sys
 import os
 
-from collections import Counter
+from collections import defaultdict
 import logging
 
 from gensim import utils
@@ -51,7 +51,7 @@ def __init__(self, doc2items=iter_gram1, max_size=None):
         self.doc2items = doc2items
         self.max_size = max_size
         self.min_reduce = 0
-        self.hash2cnt = Counter()  # TODO replace by some GIL-free low-level struct
+        self.hash2cnt = defaultdict(int)
 
     def hash(self, item):
         return hash(item)
@@ -63,10 +63,8 @@ def update(self, documents):
         If the memory structures get too large, clip them (then the internal counts may be only approximate).
         """
         for document in documents:
-            # TODO: release GIL, so we can run update() in parallel threads.
-            # Or maybe not needed, if we create multiple FastCounters from multiple input streams using
-            # multiprocessing, and only .merge() them at the end.
-            self.hash2cnt.update(self.hash(ngram) for ngram in self.doc2items(document))
+            for item in self.doc2items(document):
+                self.hash2cnt[self.hash(item)] += 1
         self.prune_items()
 
         return self  # for easier chaining
@@ -103,8 +101,8 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000):
         self.min_count = min_count
         self.max_vocab_size = max_vocab_size
         # self.counter = FastCounter(iter_gram12, max_size=max_vocab_size)
-        # self.counter = FastCounterCython()
-        self.counter = FastCounterPreshed()
+        self.counter = FastCounterCython()
+        # self.counter = FastCounterPreshed()
 
     def add_documents(self, documents):
         self.counter.update(documents)
diff --git a/gensim/models/fast_counter_cython.pyx b/gensim/models/fast_counter_cython.pyx
index 24450a0fb0..dad283032c 100644
--- a/gensim/models/fast_counter_cython.pyx
+++ b/gensim/models/fast_counter_cython.pyx
@@ -7,7 +7,7 @@
 # Copyright (C) 2017 Radim Rehurek
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-from collections import Counter
+from collections import defaultdict
 
 from libc.stdint cimport int64_t, uint64_t
 
@@ -28,7 +28,7 @@ class FastCounterCython(object):
         self.doc2items = doc2items
         self.max_size = max_size
         self.min_reduce = 0
-        self.hash2cnt = Counter()
+        self.hash2cnt = defaultdict(int)
 
     def update(self, documents):
         """
@@ -36,7 +36,7 @@
         If the memory structures get too large, clip them (then the internal counts may be only approximate).
 
         """
-        cdef int idx, l
+        cdef Py_ssize_t idx, l
         cdef uint64_t h1, h2
         hash2cnt = self.hash2cnt
         for document in documents:
@@ -47,7 +47,7 @@
             for idx in range(1, l):
                 h2 = chash(document[idx])
                 hash2cnt[h2] += 1
-                hash2cnt[h1 ^ h2] += 1
+                hash2cnt[h1 + h2] += 1
                 h1 = h2
 
         # FIXME: add optimized prune
@@ -95,7 +95,7 @@ class FastCounterPreshed(object):
         If the memory structures get too large, clip them (then the internal counts may be only approximate).
 
         """
-        cdef int idx, l
+        cdef Py_ssize_t idx, l
         cdef uint64_t h1, h2
         cdef preshed.counter.PreshCounter hash2cnt = self.hash2cnt
         for document in documents:
@@ -106,7 +106,7 @@ class FastCounterPreshed(object):
             for idx in range(1, l):
                 h2 = chash(document[idx])
                 hash2cnt.inc(h2, 1)
-                hash2cnt.inc(h1 ^ h2, 1)
+                hash2cnt.inc(h1 + h2, 1)
                 h1 = h2
 
         # FIXME: add optimized prune