From 023262107f07ba6ccfab00f7ae6691ded557dcd0 Mon Sep 17 00:00:00 2001 From: Yu Yin Date: Thu, 12 Jul 2018 16:16:34 +0800 Subject: [PATCH 1/2] Clarify `prune_at` documentation According to the code, the `prune_at` parameter in `Dictionary.__init__` and `add_documents` is only for reducing memory usage and makes no guarantee of correctness, but the existing documentation of this parameter was confusing to users. --- gensim/corpora/dictionary.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 84e2ed9945..476510f3a5 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -56,7 +56,8 @@ def __init__(self, documents=None, prune_at=2000000): documents : iterable of iterable of str, optional Documents to be used to initialize the mapping and collect corpus statistics. prune_at : int, optional - Dictionary will keep no more than `prune_at` words in its mapping, to limit its RAM footprint. + Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM + footprint. The correctness is not guaranteed. Use `filter_extremes` to perform proper filtering. Examples -------- @@ -172,7 +173,8 @@ def add_documents(self, documents, prune_at=2000000): documents : iterable of iterable of str Input corpus. All tokens should be already **tokenized and normalized**. prune_at : int, optional - Dictionary will keep no more than `prune_at` words in its mapping, to limit its RAM footprint. + Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM + footprint. The correctness is not guaranteed. Use `filter_extremes` to perform proper filtering. 
Examples -------- From ebc51c57001a2338818805b1453c325384bcae12 Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Tue, 31 Jul 2018 11:53:35 +0500 Subject: [PATCH 2/2] add link to method --- gensim/corpora/dictionary.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 476510f3a5..1e13692a2d 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -57,7 +57,8 @@ def __init__(self, documents=None, prune_at=2000000): Documents to be used to initialize the mapping and collect corpus statistics. prune_at : int, optional Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM - footprint. The correctness is not guaranteed. Use `filter_extremes` to perform proper filtering. + footprint; correctness is not guaranteed. + Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering. Examples -------- @@ -174,7 +175,8 @@ def add_documents(self, documents, prune_at=2000000): Input corpus. All tokens should be already **tokenized and normalized**. prune_at : int, optional Dictionary will try to keep no more than `prune_at` words in its mapping, to limit its RAM - footprint. The correctness is not guaranteed. Use `filter_extremes` to perform proper filtering. + footprint; correctness is not guaranteed. + Use :meth:`~gensim.corpora.dictionary.Dictionary.filter_extremes` to perform proper filtering. Examples --------