From 392c672a49ab1762a837eedb584149f583ea2c10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 23 Jun 2017 23:41:21 +0900 Subject: [PATCH 1/7] wip: API design for fast_counter & phrases --- gensim/models/fast_counter.py | 123 ++++++++++++++++++++++++++++++++++ gensim/models/phrases.py | 6 -- 2 files changed, 123 insertions(+), 6 deletions(-) create mode 100644 gensim/models/fast_counter.py diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py new file mode 100644 index 0000000000..431a3d39a2 --- /dev/null +++ b/gensim/models/fast_counter.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2017 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Fast & memory efficient counting of things (and n-grams of things). + +This module is designed to count *item* and *document* frequencies over large, streamed corpora (lazy iteration). + +""" + +from collections import Counter +import logging + +from six import iterkeys, iteritems + +logger = logging.getLogger(__name__) + + +def iter_ngrams(document, ngrams): + assert ngrams[0] <= ngrams[1] + + for n in range(ngrams[0], ngrams[1] + 1): + for ngram in zip(*[document[i:] for i in range(n)]): + logger.debug("yielding ngram %r", ngram) + yield ngram + +def iter_gram1(document): + return iter_ngrams(document, (1, 1)) + +def iter_gram2(document): + return iter_ngrams(document, (2, 2)) + +def iter_gram12(document): + return iter_ngrams(document, (1, 2)) + + +class FastCounter(object): + """ + Fast counting of item frequency and document frequency across large, streamed iterables. + """ + + def __init__(self, doc2items=iter_gram1, collect_df=False): + self.doc2items = doc2items + self.collect_df = collect_df + + self.item_counts = Counter() # TODO replace by some GIL-free low-level struct + self.doc_counts = Counter() # TODO replace by some GIL-free low-level struct + + def hash(self, key): + return hash(key) + + def update(self, documents): + """ + Update the relevant ngram counters from the iterable `documents`. + + If the memory structures get too large, clip them (then the internal counts may be only approximate). + """ + for document in documents: + # TODO: release GIL, so we can run update() in parallel threads. + # Or maybe not needed, if we create multiple FastCounters from multiple input streams using + # multiprocessing, and only .merge() them at the end. + item_cnts = Counter(self.hash(ngram) for ngram in self.doc2items(document)) + self.item_counts.update(item_cnts) + if self.collect_df: + # increment by 1 per unique key ("document frequency") + self.doc_counts.update(iterkeys(item_cnts)) + + # self.prune_vocab() + + return self # for easier chaining + + def prune_vocab(self): + # Trim data structures to fit in memory, if too large. + # Or use a fixed-size data structure to start with (hyperloglog?) + raise NotImplementedError + + def get(self, key, default=None): + """Return the item frequency of `key` (or `default` if key not present).""" + return self.item_counts.get(self.hash(key), default) + + def merge(self, other): + """ + Merge counts from other into self, in-place. 
+ """ + # rare operation, no need to optimize too much + raise NotImplementedError + + def __len__(self): + return len(self.item_counts) + + def __str__(self): + return "%s<%i items>" % (self.__class__.__name__, len(self)) + + +class Phrases(object): + def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000): + self.threshold = threshold + self.min_count = min_count + self.max_vocab_size = max_vocab_size + self.counter = FastCounter(iter_gram12) + + def add_documents(self, documents): + self.counter.update(documents) + + return self # for easier chaining + + def export_phrases(self, document): + """ + Yield all collocations (pairs of adjacent closely related tokens) from the + input `document`, as 2-tuples `(score, bigram)`. + """ + if not self.counter: + return + norm = 1.0 * len(self.counter) + for bigram in iter_gram2(document): + pa, pb, pab = self.counter.get((bigram[0],)), self.counter.get((bigram[1],)), self.counter.get(bigram, 0) + if pa and pb: + score = norm / pa / pb * (pab - self.min_count) + if score > self.threshold: + yield score, bigram diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index be735b865a..65d9b93b02 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -134,12 +134,6 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, should be a byte string (e.g. b'_'). """ - if min_count <= 0: - raise ValueError("min_count should be at least 1") - - if threshold <= 0: - raise ValueError("threshold should be positive") - self.min_count = min_count self.threshold = threshold self.max_vocab_size = max_vocab_size From 6a98f8656dccc810a9d2c99a23e2ae75e0afeea1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 00:05:32 +0900 Subject: [PATCH 2/7] remove document frequency from the API (instead, Dictionary will have 2 counters) --- gensim/models/fast_counter.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py index 431a3d39a2..a5d79764fb 100644 --- a/gensim/models/fast_counter.py +++ b/gensim/models/fast_counter.py @@ -7,7 +7,9 @@ """ Fast & memory efficient counting of things (and n-grams of things). -This module is designed to count *item* and *document* frequencies over large, streamed corpora (lazy iteration). +This module is designed to count item frequencies over large, streamed corpora (lazy iteration). + +Such counts are useful in various other modules, such as Dictionary, TfIdf, Phrases etc. """ @@ -42,12 +44,9 @@ class FastCounter(object): Fast counting of item frequency and document frequency across large, streamed iterables. """ - def __init__(self, doc2items=iter_gram1, collect_df=False): + def __init__(self, doc2items=iter_gram1): self.doc2items = doc2items - self.collect_df = collect_df - - self.item_counts = Counter() # TODO replace by some GIL-free low-level struct - self.doc_counts = Counter() # TODO replace by some GIL-free low-level struct + self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct def hash(self, key): return hash(key) @@ -62,11 +61,7 @@ def update(self, documents): # TODO: release GIL, so we can run update() in parallel threads. # Or maybe not needed, if we create multiple FastCounters from multiple input streams using # multiprocessing, and only .merge() them at the end. 
- item_cnts = Counter(self.hash(ngram) for ngram in self.doc2items(document)) - self.item_counts.update(item_cnts) - if self.collect_df: - # increment by 1 per unique key ("document frequency") - self.doc_counts.update(iterkeys(item_cnts)) + self.hash2cnt.update(self.hash(ngram) for ngram in self.doc2items(document)) # self.prune_vocab() @@ -79,7 +74,7 @@ def prune_vocab(self): def get(self, key, default=None): """Return the item frequency of `key` (or `default` if key not present).""" - return self.item_counts.get(self.hash(key), default) + return self.hash2cnt.get(self.hash(key), default) def merge(self, other): """ @@ -89,7 +84,7 @@ def merge(self, other): raise NotImplementedError def __len__(self): - return len(self.item_counts) + return len(self.hash2cnt) def __str__(self): return "%s<%i items>" % (self.__class__.__name__, len(self)) From c601cbf8d34016230dd847a41eabb2f7f1ff2159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 00:09:12 +0900 Subject: [PATCH 3/7] consistency: rename `key` to `item` --- gensim/models/fast_counter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py index a5d79764fb..cfd106d2ab 100644 --- a/gensim/models/fast_counter.py +++ b/gensim/models/fast_counter.py @@ -48,8 +48,8 @@ def __init__(self, doc2items=iter_gram1): self.doc2items = doc2items self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct - def hash(self, key): - return hash(key) + def hash(self, item): + return hash(item) def update(self, documents): """ @@ -72,9 +72,9 @@ def prune_vocab(self): # Or use a fixed-size data structure to start with (hyperloglog?) raise NotImplementedError - def get(self, key, default=None): - """Return the item frequency of `key` (or `default` if key not present).""" - return self.hash2cnt.get(self.hash(key), default) + def get(self, item, default=None): + """Return the item frequency of `item` (or `default` if item not present).""" + return self.hash2cnt.get(self.hash(item), default) def merge(self, other): """ From a4fdfdb0417d936e9ffbea835fde9a3bccae67fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 00:27:34 +0900 Subject: [PATCH 4/7] implement missing API methods --- gensim/models/fast_counter.py | 30 +++++++++++++++++------------- gensim/utils.py | 5 +++-- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py index cfd106d2ab..b65f3c37cf 100644 --- a/gensim/models/fast_counter.py +++ b/gensim/models/fast_counter.py @@ -18,6 +18,8 @@ from six import iterkeys, iteritems +from gensim import utils + logger = logging.getLogger(__name__) @@ -44,8 +46,10 @@ class FastCounter(object): Fast counting of item frequency and document frequency across large, streamed iterables. """ - def __init__(self, doc2items=iter_gram1): + def __init__(self, doc2items=iter_gram1, max_size=None): self.doc2items = doc2items + self.max_size = max_size + self.min_reduce = 0 self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct def hash(self, item): @@ -62,15 +66,16 @@ def update(self, documents): # Or maybe not needed, if we create multiple FastCounters from multiple input streams using # multiprocessing, and only .merge() them at the end. 
self.hash2cnt.update(self.hash(ngram) for ngram in self.doc2items(document)) - - # self.prune_vocab() + self.prune_items() return self # for easier chaining - def prune_vocab(self): - # Trim data structures to fit in memory, if too large. - # Or use a fixed-size data structure to start with (hyperloglog?) - raise NotImplementedError + def prune_items(self): + """Trim data structures to fit in memory, if too large.""" + # XXX: Or use a fixed-size data structure to start with (hyperloglog?) + while self.max_size and len(self) > self.max_size: + self.min_reduce += 1 + utils.prune_vocab(self.hash2cnt, self.min_reduce) def get(self, item, default=None): """Return the item frequency of `item` (or `default` if item not present).""" @@ -78,10 +83,11 @@ def get(self, item, default=None): def merge(self, other): """ - Merge counts from other into self, in-place. + Merge counts from another FastCounter into self, in-place. """ - # rare operation, no need to optimize too much - raise NotImplementedError + self.hash2cnt.update(other.hash2cnt) + self.min_reduce = max(self.min_reduce, other.min_reduce) + self.prune_items() def __len__(self): return len(self.hash2cnt) @@ -95,7 +101,7 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000): self.threshold = threshold self.min_count = min_count self.max_vocab_size = max_vocab_size - self.counter = FastCounter(iter_gram12) + self.counter = FastCounter(iter_gram12, max_size=max_vocab_size) def add_documents(self, documents): self.counter.update(documents) @@ -107,8 +113,6 @@ def export_phrases(self, document): Yield all collocations (pairs of adjacent closely related tokens) from the input `document`, as 2-tuples `(score, bigram)`. """ - if not self.counter: - return norm = 1.0 * len(self.counter) for bigram in iter_gram2(document): pa, pb, pab = self.counter.get((bigram[0],)), self.counter.get((bigram[1],)), self.counter.get(bigram, 0) diff --git a/gensim/utils.py b/gensim/utils.py index dd391f887b..9118d97e49 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1121,8 +1121,9 @@ def prune_vocab(vocab, min_reduce, trim_rule=None): if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule): # vocab[w] <= min_reduce: result += vocab[w] del vocab[w] - logger.info("pruned out %i tokens with count <=%i (before %i, after %i)", - old_len - len(vocab), min_reduce, old_len, len(vocab)) + logger.info( + "pruned out %i tokens with count <=%i (before %i, after %i)", + old_len - len(vocab), min_reduce, old_len, len(vocab)) return result From 24f5b63e923a50b4d669178ab0462680b85d1984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 02:19:52 +0900 Subject: [PATCH 5/7] WIP: simple cython impl --- gensim/models/fast_counter_cython.pyx | 68 +++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 gensim/models/fast_counter_cython.pyx diff --git a/gensim/models/fast_counter_cython.pyx b/gensim/models/fast_counter_cython.pyx new file mode 100644 index 0000000000..94f83b0030 --- /dev/null +++ b/gensim/models/fast_counter_cython.pyx @@ -0,0 +1,68 @@ +#!/usr/bin/env cython +# cython: boundscheck=False +# cython: wraparound=False +# cython: cdivision=True +# coding: utf-8 +# +# Copyright (C) 2017 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +from collections import Counter + +cimport preshed.counter + + +class FastCounterCython(object): + """ + Fast counting of item frequency frequency across large, streamed iterables. 
+ """ + + def __init__(self, doc2items=None, max_size=None): + self.doc2items = doc2items + self.max_size = max_size + self.min_reduce = 0 + self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct + + def update(self, documents): + """ + Update the relevant ngram counters from the iterable `documents`. + + If the memory structures get too large, clip them (then the internal counts may be only approximate). + """ + hash2cnt = self.hash2cnt + for document in documents: + # TODO: release GIL, so we can run update() in parallel threads. + # Or maybe not needed, if we create multiple FastCounters from multiple input streams using + # multiprocessing, and only .merge() them at the end. + if document: + hash2cnt[hash(document[0])] += 1 + for idx in range(len(document) - 1): + hash2cnt[hash(document[idx + 1])] += 1 + hash2cnt[hash((document[idx], document[idx + 1]))] += 1 + + # FIXME: add optimized prune + + return self # for easier chaining + + def prune_items(self): + """Trim data structures to fit in memory, if too large.""" + # XXX: Or use a fixed-size data structure to start with (hyperloglog?) + pass + + def get(self, item, default=None): + """Return the item frequency of `item` (or `default` if item not present).""" + return self.hash2cnt.get(hash(item), default) + + def merge(self, other): + """ + Merge counts from another FastCounter into self, in-place. + """ + self.hash2cnt.update(other.hash2cnt) + self.min_reduce = max(self.min_reduce, other.min_reduce) + self.prune_items() + + def __len__(self): + return len(self.hash2cnt) + + def __str__(self): + return "%s<%i items>" % (self.__class__.__name__, len(self)) From f87722a020be6ea4cc23f14ed7d812709c208e76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 03:06:52 +0900 Subject: [PATCH 6/7] cythonized Phrases (untested, mildly optimized) --- gensim/models/fast_counter.py | 38 ++++++++++-- gensim/models/fast_counter_cython.pyx | 89 ++++++++++++++++++++++++--- gensim/models/phrases.py | 14 +++-- setup.py | 5 +- 4 files changed, 127 insertions(+), 19 deletions(-) diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py index b65f3c37cf..fb5374652e 100644 --- a/gensim/models/fast_counter.py +++ b/gensim/models/fast_counter.py @@ -13,12 +13,13 @@ """ +import sys +import os from collections import Counter import logging -from six import iterkeys, iteritems - from gensim import utils +from gensim.models.fast_counter_cython import FastCounterCython, FastCounterPreshed logger = logging.getLogger(__name__) @@ -43,7 +44,7 @@ def iter_gram12(document): class FastCounter(object): """ - Fast counting of item frequency and document frequency across large, streamed iterables. + Fast counting of item frequency frequency across large, streamed iterables. 
""" def __init__(self, doc2items=iter_gram1, max_size=None): @@ -101,7 +102,9 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000): self.threshold = threshold self.min_count = min_count self.max_vocab_size = max_vocab_size - self.counter = FastCounter(iter_gram12, max_size=max_vocab_size) + # self.counter = FastCounter(iter_gram12, max_size=max_vocab_size) + # self.counter = FastCounterCython() + self.counter = FastCounterPreshed() def add_documents(self, documents): self.counter.update(documents) @@ -120,3 +123,30 @@ def export_phrases(self, document): score = norm / pa / pb * (pab - self.min_count) if score > self.threshold: yield score, bigram + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) + logger.info("running %s", " ".join(sys.argv)) + + # check and process cmdline input + program = os.path.basename(sys.argv[0]) + if len(sys.argv) < 2: + print(globals()['__doc__'] % locals()) + sys.exit(1) + infile = sys.argv[1] + + from gensim.models.word2vec import Text8Corpus + documents = Text8Corpus(infile) + + logger.info("training phrases") + bigram = Phrases(min_count=5, threshold=100).add_documents(documents) + logger.info("finished training phrases") + print(bigram.counter) + # for doc in documents: + # s = u' '.join(doc) + # for _, bigram in bigram.export_phrases(doc): + # s = s.replace(u' '.join(bigram), u'_'.join(bigram)) + # print(utils.to_utf8(s)) + + logger.info("finished running %s", " ".join(sys.argv)) diff --git a/gensim/models/fast_counter_cython.pyx b/gensim/models/fast_counter_cython.pyx index 94f83b0030..24450a0fb0 100644 --- a/gensim/models/fast_counter_cython.pyx +++ b/gensim/models/fast_counter_cython.pyx @@ -9,9 +9,16 @@ from collections import Counter +from libc.stdint cimport int64_t, uint64_t + cimport preshed.counter +cdef uint64_t chash(obj): + # TODO use something faster, can assume string + return hash(obj) + + class FastCounterCython(object): """ Fast counting of item frequency frequency across large, streamed iterables. @@ -21,7 +28,7 @@ class FastCounterCython(object): self.doc2items = doc2items self.max_size = max_size self.min_reduce = 0 - self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct + self.hash2cnt = Counter() def update(self, documents): """ @@ -29,16 +36,78 @@ class FastCounterCython(object): If the memory structures get too large, clip them (then the internal counts may be only approximate). """ + cdef int idx, l + cdef uint64_t h1, h2 hash2cnt = self.hash2cnt for document in documents: - # TODO: release GIL, so we can run update() in parallel threads. - # Or maybe not needed, if we create multiple FastCounters from multiple input streams using - # multiprocessing, and only .merge() them at the end. - if document: - hash2cnt[hash(document[0])] += 1 - for idx in range(len(document) - 1): - hash2cnt[hash(document[idx + 1])] += 1 - hash2cnt[hash((document[idx], document[idx + 1]))] += 1 + l = len(document) + if l: + h1 = chash(document[0]) + hash2cnt[h1] += 1 + for idx in range(1, l): + h2 = chash(document[idx]) + hash2cnt[h2] += 1 + hash2cnt[h1 ^ h2] += 1 + h1 = h2 + + # FIXME: add optimized prune + + return self # for easier chaining + + def prune_items(self): + """Trim data structures to fit in memory, if too large.""" + # XXX: Or use a fixed-size data structure to start with (hyperloglog?) 
+ pass + + def get(self, item, default=None): + """Return the item frequency of `item` (or `default` if item not present).""" + return self.hash2cnt.get(chash(item), default) + + def merge(self, other): + """ + Merge counts from another FastCounter into self, in-place. + """ + self.hash2cnt.update(other.hash2cnt) + self.min_reduce = max(self.min_reduce, other.min_reduce) + self.prune_items() + + def __len__(self): + return len(self.hash2cnt) + + def __str__(self): + return "%s<%i items>" % (self.__class__.__name__, len(self)) + + +class FastCounterPreshed(object): + """ + Fast counting of item frequency frequency across large, streamed iterables. + """ + + def __init__(self, doc2items=None, max_size=None): + self.doc2items = doc2items + self.max_size = max_size + self.min_reduce = 0 + self.hash2cnt = preshed.counter.PreshCounter() # TODO replace by some GIL-free low-level struct + + def update(self, documents): + """ + Update the relevant ngram counters from the iterable `documents`. + + If the memory structures get too large, clip them (then the internal counts may be only approximate). + """ + cdef int idx, l + cdef uint64_t h1, h2 + cdef preshed.counter.PreshCounter hash2cnt = self.hash2cnt + for document in documents: + l = len(document) + if l: + h1 = chash(document[0]) + hash2cnt.inc(h1, 1) + for idx in range(1, l): + h2 = chash(document[idx]) + hash2cnt.inc(h2, 1) + hash2cnt.inc(h1 ^ h2, 1) + h1 = h2 # FIXME: add optimized prune @@ -51,7 +120,7 @@ class FastCounterCython(object): def get(self, item, default=None): """Return the item frequency of `item` (or `default` if item not present).""" - return self.hash2cnt.get(hash(item), default) + return self.hash2cnt.get(chash(item), default) def merge(self, other): """ diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 65d9b93b02..f3082f620c 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -163,7 +163,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): if sentence_no % progress_per == 0: logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" % (sentence_no, total_words, len(vocab))) - sentence = [utils.any2utf8(w) for w in sentence] + # sentence = [utils.any2utf8(w) for w in sentence] for bigram in zip(sentence, sentence[1:]): vocab[bigram[0]] += 1 vocab[delimiter.join(bigram)] += 1 @@ -388,7 +388,7 @@ def __getitem__(self, sentence): if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) - logging.info("running %s" % " ".join(sys.argv)) + logger.info("running %s", " ".join(sys.argv)) # check and process cmdline input program = os.path.basename(sys.argv[0]) @@ -402,6 +402,12 @@ def __getitem__(self, sentence): sentences = Text8Corpus(infile) # test_doc = LineSentence('test/test_data/testcorpus.txt') + logger.info("training phrases") bigram = Phrases(sentences, min_count=5, threshold=100) - for s in bigram[sentences]: - print(utils.to_utf8(u' '.join(s))) + print bigram + logger.info("finished training phrases") + + # for s in bigram[sentences]: + # print(utils.to_utf8(u' '.join(s))) + + logger.info("finished running %s", " ".join(sys.argv)) diff --git a/setup.py b/setup.py index 1e88cb129d..dbd52ee017 100644 --- a/setup.py +++ b/setup.py @@ -249,7 +249,10 @@ def finalize_options(self): include_dirs=[model_dir]), Extension('gensim.models.doc2vec_inner', sources=['./gensim/models/doc2vec_inner.c'], - include_dirs=[model_dir]) + include_dirs=[model_dir]), + 
Extension('gensim.models.fast_counter_cython', + sources=['./gensim/models/fast_counter_cython.c'], + include_dirs=[model_dir]), ], cmdclass=cmdclass, packages=find_packages(), From df84033653389fb0b7a09d93af497ad00863219e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 24 Jun 2017 03:24:47 +0900 Subject: [PATCH 7/7] replace Counter by defaultdict --- gensim/models/fast_counter.py | 14 ++++++-------- gensim/models/fast_counter_cython.pyx | 12 ++++++------ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py index fb5374652e..1f59d4ee25 100644 --- a/gensim/models/fast_counter.py +++ b/gensim/models/fast_counter.py @@ -15,7 +15,7 @@ import sys import os -from collections import Counter +from collections import defaultdict import logging from gensim import utils @@ -51,7 +51,7 @@ def __init__(self, doc2items=iter_gram1, max_size=None): self.doc2items = doc2items self.max_size = max_size self.min_reduce = 0 - self.hash2cnt = Counter() # TODO replace by some GIL-free low-level struct + self.hash2cnt = defaultdict(int) def hash(self, item): return hash(item) @@ -63,10 +63,8 @@ def update(self, documents): If the memory structures get too large, clip them (then the internal counts may be only approximate). """ for document in documents: - # TODO: release GIL, so we can run update() in parallel threads. - # Or maybe not needed, if we create multiple FastCounters from multiple input streams using - # multiprocessing, and only .merge() them at the end. - self.hash2cnt.update(self.hash(ngram) for ngram in self.doc2items(document)) + for item in self.doc2items(document): + self.hash2cnt[self.hash(item)] += 1 self.prune_items() return self # for easier chaining @@ -103,8 +101,8 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000): self.min_count = min_count self.max_vocab_size = max_vocab_size # self.counter = FastCounter(iter_gram12, max_size=max_vocab_size) - # self.counter = FastCounterCython() - self.counter = FastCounterPreshed() + self.counter = FastCounterCython() + # self.counter = FastCounterPreshed() def add_documents(self, documents): self.counter.update(documents) diff --git a/gensim/models/fast_counter_cython.pyx b/gensim/models/fast_counter_cython.pyx index 24450a0fb0..dad283032c 100644 --- a/gensim/models/fast_counter_cython.pyx +++ b/gensim/models/fast_counter_cython.pyx @@ -7,7 +7,7 @@ # Copyright (C) 2017 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -from collections import Counter +from collections import defaultdict from libc.stdint cimport int64_t, uint64_t @@ -28,7 +28,7 @@ class FastCounterCython(object): self.doc2items = doc2items self.max_size = max_size self.min_reduce = 0 - self.hash2cnt = Counter() + self.hash2cnt = defaultdict(int) def update(self, documents): """ @@ -36,7 +36,7 @@ class FastCounterCython(object): If the memory structures get too large, clip them (then the internal counts may be only approximate). """ - cdef int idx, l + cdef Py_ssize_t idx, l cdef uint64_t h1, h2 hash2cnt = self.hash2cnt for document in documents: @@ -47,7 +47,7 @@ class FastCounterCython(object): for idx in range(1, l): h2 = chash(document[idx]) hash2cnt[h2] += 1 - hash2cnt[h1 ^ h2] += 1 + hash2cnt[h1 + h2] += 1 h1 = h2 # FIXME: add optimized prune @@ -95,7 +95,7 @@ class FastCounterPreshed(object): If the memory structures get too large, clip them (then the internal counts may be only approximate). 
""" - cdef int idx, l + cdef Py_ssize_t idx, l cdef uint64_t h1, h2 cdef preshed.counter.PreshCounter hash2cnt = self.hash2cnt for document in documents: @@ -106,7 +106,7 @@ class FastCounterPreshed(object): for idx in range(1, l): h2 = chash(document[idx]) hash2cnt.inc(h2, 1) - hash2cnt.inc(h1 ^ h2, 1) + hash2cnt.inc(h1 + h2, 1) h1 = h2 # FIXME: add optimized prune