replace Counter by defaultdict
piskvorky committed Jun 23, 2017
1 parent f87722a commit df84033
Showing 2 changed files with 12 additions and 14 deletions.
14 changes: 6 additions & 8 deletions gensim/models/fast_counter.py
@@ -15,7 +15,7 @@
 import sys
 import os
-from collections import Counter
+from collections import defaultdict
 import logging
 
 from gensim import utils
@@ -51,7 +51,7 @@ def __init__(self, doc2items=iter_gram1, max_size=None):
         self.doc2items = doc2items
         self.max_size = max_size
         self.min_reduce = 0
-        self.hash2cnt = Counter()  # TODO replace by some GIL-free low-level struct
+        self.hash2cnt = defaultdict(int)
 
     def hash(self, item):
         return hash(item)
@@ -63,10 +63,8 @@ def update(self, documents):
         If the memory structures get too large, clip them (then the internal counts may be only approximate).
         """
         for document in documents:
-            # TODO: release GIL, so we can run update() in parallel threads.
-            # Or maybe not needed, if we create multiple FastCounters from multiple input streams using
-            # multiprocessing, and only .merge() them at the end.
-            self.hash2cnt.update(self.hash(ngram) for ngram in self.doc2items(document))
+            for item in self.doc2items(document):
+                self.hash2cnt[self.hash(item)] += 1
             self.prune_items()
 
         return self  # for easier chaining
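
The rewrite above swaps Counter.update() over a generator for explicit increments into a defaultdict(int), whose missing keys default to 0, so a bare += 1 needs no membership check. A minimal sketch (not from the commit) showing the two styles agree:

from collections import Counter, defaultdict

tokens = ["to", "be", "or", "not", "to", "be"]

# Old style: Counter consumes the hashes in a single update() call.
c = Counter()
c.update(hash(t) for t in tokens)

# New style: defaultdict(int) returns 0 for absent keys, so the
# increment works without any try/except or setdefault dance.
d = defaultdict(int)
for t in tokens:
    d[hash(t)] += 1

assert dict(c) == dict(d)
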
@@ -103,8 +101,8 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000):
         self.min_count = min_count
         self.max_vocab_size = max_vocab_size
         # self.counter = FastCounter(iter_gram12, max_size=max_vocab_size)
-        # self.counter = FastCounterCython()
-        self.counter = FastCounterPreshed()
+        self.counter = FastCounterCython()
+        # self.counter = FastCounterPreshed()
 
     def add_documents(self, documents):
         self.counter.update(documents)
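
Judging by the commit title, the motivation for the swap is the overhead of Counter's update machinery on small, frequent batches. A throwaway micro-benchmark along these lines can check that on a given machine (illustrative only, not part of the commit; numbers vary by Python version):

import timeit

setup = "from collections import Counter, defaultdict; tokens = list(range(1000)) * 10"

counter_style = "c = Counter(); c.update(hash(t) for t in tokens)"

defaultdict_style = (
    "d = defaultdict(int)\n"
    "for t in tokens:\n"
    "    d[hash(t)] += 1"
)

print("Counter:    ", timeit.timeit(counter_style, setup, number=200))
print("defaultdict:", timeit.timeit(defaultdict_style, setup, number=200))
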
12 changes: 6 additions & 6 deletions gensim/models/fast_counter_cython.pyx
@@ -7,7 +7,7 @@
 # Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-from collections import Counter
+from collections import defaultdict
 
 from libc.stdint cimport int64_t, uint64_t
 
@@ -28,15 +28,15 @@ class FastCounterCython(object):
         self.doc2items = doc2items
         self.max_size = max_size
         self.min_reduce = 0
-        self.hash2cnt = Counter()
+        self.hash2cnt = defaultdict(int)
 
     def update(self, documents):
         """
         Update the relevant ngram counters from the iterable `documents`.
         If the memory structures get too large, clip them (then the internal counts may be only approximate).
         """
-        cdef int idx, l
+        cdef Py_ssize_t idx, l
         cdef uint64_t h1, h2
         hash2cnt = self.hash2cnt
         for document in documents:
@@ -47,7 +47,7 @@
                 for idx in range(1, l):
                     h2 = chash(document[idx])
                     hash2cnt[h2] += 1
-                    hash2cnt[h1 ^ h2] += 1
+                    hash2cnt[h1 + h2] += 1
                     h1 = h2
 
     # FIXME: add optimized prune
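
For context, the loop in this hunk counts every unigram hash plus one combined bucket per adjacent pair of tokens. A pure-Python reconstruction of the scheme (my sketch, with Python's hash standing in for chash); note that both the old h1 ^ h2 and the new h1 + h2 are symmetric, so a bigram and its reversal share a bucket:

from collections import defaultdict

def count_grams(document, hash2cnt):
    # Count unigram hashes, plus a combined bucket per adjacent pair,
    # mirroring the Cython loop above.
    if not document:
        return
    h1 = hash(document[0])
    hash2cnt[h1] += 1
    for token in document[1:]:
        h2 = hash(token)
        hash2cnt[h2] += 1
        hash2cnt[h1 + h2] += 1  # bigram bucket, per the updated line
        h1 = h2

counts = defaultdict(int)
count_grams(["new", "york", "city"], counts)
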
@@ -95,7 +95,7 @@ class FastCounterPreshed(object):
         If the memory structures get too large, clip them (then the internal counts may be only approximate).
         """
-        cdef int idx, l
+        cdef Py_ssize_t idx, l
         cdef uint64_t h1, h2
         cdef preshed.counter.PreshCounter hash2cnt = self.hash2cnt
         for document in documents:
@@ -106,7 +106,7 @@
                 for idx in range(1, l):
                     h2 = chash(document[idx])
                     hash2cnt.inc(h2, 1)
-                    hash2cnt.inc(h1 ^ h2, 1)
+                    hash2cnt.inc(h1 + h2, 1)
                     h1 = h2
 
     # FIXME: add optimized prune