piskvorky · mpenkov · Aug 17, 2020 · Jul 27, 2020 · Aug 16, 2020 · piskvorky
diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
@@ -620,7 +620,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.
 
         """
-        self.fname = fname
+        self.input = fname
         self.filter_namespaces = filter_namespaces
         self.filter_articles = filter_articles
         self.metadata = False
@@ -677,7 +677,7 @@ def get_texts(self):
         texts = \
             ((text, self.lemmatize, title, pageid, tokenization_params)
              for title, text, pageid
-             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles))
+             in extract_pages(bz2.BZ2File(self.input), self.filter_namespaces, self.filter_articles))
         pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)
 
         try:

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
@@ -769,6 +769,11 @@ def test_removed_table_markup(self):
             for word in table_markup:
                 self.assertTrue(word not in text)
 
+    def test_get_stream(self):  # the first item yielded by getstream() should carry the expected opening text
+        wiki = self.corpus_class(self.enwiki)
+        sample_text_wiki = next(wiki.getstream()).decode()[1:14]  # decode first item; skip char 0, keep the next 13
+        self.assertEqual(sample_text_wiki, "mediawiki xml")  # presumably follows a leading "<" in the dump -- confirm
+
     # #TODO: sporadic failure to be investigated
     # def test_get_texts_returns_generator_of_lists(self):
     #     corpus = self.corpus_class(self.enwiki)