From 392c672a49ab1762a837eedb584149f583ea2c10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?=
Date: Fri, 23 Jun 2017 23:41:21 +0900
Subject: [PATCH] wip: API design for fast_counter & phrases

---
 gensim/models/fast_counter.py | 123 ++++++++++++++++++++++++++++++++++
 gensim/models/phrases.py      |   6 --
 2 files changed, 123 insertions(+), 6 deletions(-)
 create mode 100644 gensim/models/fast_counter.py

diff --git a/gensim/models/fast_counter.py b/gensim/models/fast_counter.py
new file mode 100644
index 0000000000..431a3d39a2
--- /dev/null
+++ b/gensim/models/fast_counter.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2017 Radim Rehurek
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Fast & memory efficient counting of things (and n-grams of things).
+
+This module is designed to count *item* and *document* frequencies over large, streamed corpora (lazy iteration).
+
+"""
+
+from collections import Counter
+import logging
+
+from six import iterkeys, iteritems
+
+logger = logging.getLogger(__name__)
+
+
+def iter_ngrams(document, ngrams):
+    assert ngrams[0] <= ngrams[1]
+
+    for n in range(ngrams[0], ngrams[1] + 1):
+        for ngram in zip(*[document[i:] for i in range(n)]):
+            logger.debug("yielding ngram %r", ngram)
+            yield ngram
+
+def iter_gram1(document):
+    return iter_ngrams(document, (1, 1))
+
+def iter_gram2(document):
+    return iter_ngrams(document, (2, 2))
+
+def iter_gram12(document):
+    return iter_ngrams(document, (1, 2))
+
+
+class FastCounter(object):
+    """
+    Fast counting of item frequency and document frequency across large, streamed iterables.
+    """
+
+    def __init__(self, doc2items=iter_gram1, collect_df=False):
+        self.doc2items = doc2items
+        self.collect_df = collect_df
+
+        self.item_counts = Counter()  # TODO replace by some GIL-free low-level struct
+        self.doc_counts = Counter()  # TODO replace by some GIL-free low-level struct
+
+    def hash(self, key):
+        return hash(key)
+
+    def update(self, documents):
+        """
+        Update the relevant ngram counters from the iterable `documents`.
+
+        If the memory structures get too large, clip them (then the internal counts may be only approximate).
+        """
+        for document in documents:
+            # TODO: release GIL, so we can run update() in parallel threads.
+            # Or maybe not needed, if we create multiple FastCounters from multiple input streams using
+            # multiprocessing, and only .merge() them at the end.
+            item_cnts = Counter(self.hash(ngram) for ngram in self.doc2items(document))
+            self.item_counts.update(item_cnts)
+            if self.collect_df:
+                # increment by 1 per unique key ("document frequency")
+                self.doc_counts.update(iterkeys(item_cnts))
+
+            # self.prune_vocab()
+
+        return self  # for easier chaining
+
+    def prune_vocab(self):
+        # Trim data structures to fit in memory, if too large.
+        # Or use a fixed-size data structure to start with (hyperloglog?)
+        raise NotImplementedError
+
+    def get(self, key, default=None):
+        """Return the item frequency of `key` (or `default` if key not present)."""
+        return self.item_counts.get(self.hash(key), default)
+
+    def merge(self, other):
+        """
+        Merge counts from other into self, in-place.
+        """
+        # rare operation, no need to optimize too much
+        raise NotImplementedError
+
+    def __len__(self):
+        return len(self.item_counts)
+
+    def __str__(self):
+        return "%s<%i items>" % (self.__class__.__name__, len(self))
+
+
+class Phrases(object):
+    def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000):
+        self.threshold = threshold
+        self.min_count = min_count
+        self.max_vocab_size = max_vocab_size
+        self.counter = FastCounter(iter_gram12)
+
+    def add_documents(self, documents):
+        self.counter.update(documents)
+
+        return self  # for easier chaining
+
+    def export_phrases(self, document):
+        """
+        Yield all collocations (pairs of adjacent closely related tokens) from the
+        input `document`, as 2-tuples `(score, bigram)`.
+        """
+        if not self.counter:
+            return
+        norm = 1.0 * len(self.counter)
+        for bigram in iter_gram2(document):
+            pa, pb, pab = self.counter.get((bigram[0],)), self.counter.get((bigram[1],)), self.counter.get(bigram, 0)
+            if pa and pb:
+                score = norm / pa / pb * (pab - self.min_count)
+                if score > self.threshold:
+                    yield score, bigram
diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index be735b865a..65d9b93b02 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -134,12 +134,6 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
             should be a byte string (e.g. b'_').
 
         """
-        if min_count <= 0:
-            raise ValueError("min_count should be at least 1")
-
-        if threshold <= 0:
-            raise ValueError("threshold should be positive")
-
        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size