From 42d2df4278a0b37a658885e062996788dfdcbe7f Mon Sep 17 00:00:00 2001
From: jshah02
Date: Mon, 27 Jul 2020 10:51:39 +0530
Subject: [PATCH 1/2] bug fix: wikicorpus getstream from data file-path

Replace fname with input
---
 gensim/corpora/wikicorpus.py | 4 ++--
 gensim/test/test_corpora.py  | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 7214d6b2b0..6ba40580bb 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -620,7 +620,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.
 
         """
-        self.fname = fname
+        self.input = fname
         self.filter_namespaces = filter_namespaces
         self.filter_articles = filter_articles
         self.metadata = False
@@ -677,7 +677,7 @@ def get_texts(self):
         texts = \
             ((text, self.lemmatize, title, pageid, tokenization_params)
              for title, text, pageid
-             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles))
+             in extract_pages(bz2.BZ2File(self.input), self.filter_namespaces, self.filter_articles))
 
         pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)
         try:
diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 6660542b48..e13e06ca36 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -769,6 +769,11 @@ def test_removed_table_markup(self):
         for word in table_markup:
             self.assertTrue(word not in text)
 
+    def test_get_stream(self):
+        wiki = self.corpus_class(self.enwiki)
+        sample_text_wiki = next(wiki.getstream()).decode()[1:14]
+        self.assertEqual(sample_text_wiki, "mediawiki xml")
+
     # #TODO: sporadic failure to be investigated
     # def test_get_texts_returns_generator_of_lists(self):
     #     corpus = self.corpus_class(self.enwiki)

From 3feeb2b8f3a6164964cde44d69c41d88d378c813 Mon Sep 17 00:00:00 2001
From: jshah02
Date: Sun, 16 Aug 2020 22:37:34 +0530
Subject: [PATCH 2/2] refactor: use property decorator for input

---
 gensim/corpora/wikicorpus.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 6ba40580bb..7bc52bcb75 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -620,7 +620,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.
 
         """
-        self.input = fname
+        self.fname = fname
         self.filter_namespaces = filter_namespaces
         self.filter_articles = filter_articles
         self.metadata = False
@@ -639,6 +639,10 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         else:
             self.dictionary = dictionary
 
+    @property
+    def input(self):
+        return self.fname
+
     def get_texts(self):
         """Iterate over the dump, yielding a list of tokens for each article that passed
         the length and namespace filtering.
@@ -677,7 +681,7 @@ def get_texts(self):
         texts = \
             ((text, self.lemmatize, title, pageid, tokenization_params)
              for title, text, pageid
-             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles))
+             in extract_pages(bz2.BZ2File(self.input), self.filter_namespaces, self.filter_articles))
 
         pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)
         try:
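
Usage sketch (not part of the patches): a minimal illustration of what the change enables, assuming gensim at the patched revision. `getstream()` is inherited from `TextCorpus` and reads from `self.input`, which is why the attribute rename in PATCH 1/2 and the `input` property in PATCH 2/2 make it usable on `WikiCorpus`. The dump path below is illustrative, not taken from the patch.

from gensim.corpora.wikicorpus import WikiCorpus

# Illustrative path to a bz2-compressed MediaWiki XML dump (assumption, not from the patch).
dump_path = "enwiki-latest-pages-articles.xml.bz2"

# Passing dictionary={} skips the vocabulary-building scan in __init__.
wiki = WikiCorpus(dump_path, dictionary={})

# With `input` resolving to the same file as `fname`, the inherited
# TextCorpus.getstream() can stream raw lines of the dump, as exercised
# by test_get_stream in PATCH 1/2.
first_line = next(wiki.getstream())
print(first_line.decode()[:14])  # expected to start with "<mediawiki xml"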