From 42d2df4278a0b37a658885e062996788dfdcbe7f Mon Sep 17 00:00:00 2001
From: jshah02
Date: Mon, 27 Jul 2020 10:51:39 +0530
Subject: [PATCH 1/2] bug fix: wikicorpus getstream from data file-path

Replace fname with input
---
 gensim/corpora/wikicorpus.py | 4 ++--
 gensim/test/test_corpora.py  | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 7214d6b2b0..6ba40580bb 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -620,7 +620,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.
 
         """
-        self.fname = fname
+        self.input = fname
         self.filter_namespaces = filter_namespaces
         self.filter_articles = filter_articles
         self.metadata = False
@@ -677,7 +677,7 @@ def get_texts(self):
         texts = \
             ((text, self.lemmatize, title, pageid, tokenization_params)
              for title, text, pageid
-             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles))
+             in extract_pages(bz2.BZ2File(self.input), self.filter_namespaces, self.filter_articles))
 
         pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)
         try:
diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 6660542b48..e13e06ca36 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -769,6 +769,11 @@ def test_removed_table_markup(self):
         for word in table_markup:
             self.assertTrue(word not in text)
 
+    def test_get_stream(self):
+        wiki = self.corpus_class(self.enwiki)
+        sample_text_wiki = next(wiki.getstream()).decode()[1:14]
+        self.assertEqual(sample_text_wiki, "mediawiki xml")
+
     # #TODO: sporadic failure to be investigated
     # def test_get_texts_returns_generator_of_lists(self):
     #     corpus = self.corpus_class(self.enwiki)

From 3feeb2b8f3a6164964cde44d69c41d88d378c813 Mon Sep 17 00:00:00 2001
From: jshah02
Date: Sun, 16 Aug 2020 22:37:34 +0530
Subject: [PATCH 2/2] refactor: use property decorator for input

---
 gensim/corpora/wikicorpus.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 6ba40580bb..7bc52bcb75 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -620,7 +620,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.
 
         """
-        self.input = fname
+        self.fname = fname
         self.filter_namespaces = filter_namespaces
         self.filter_articles = filter_articles
         self.metadata = False
@@ -639,6 +639,10 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         else:
             self.dictionary = dictionary
 
+    @property
+    def input(self):
+        return self.fname
+
     def get_texts(self):
         """Iterate over the dump, yielding a list of tokens for each article that passed
         the length and namespace filtering.
@@ -677,7 +681,7 @@ def get_texts(self):
         texts = \
             ((text, self.lemmatize, title, pageid, tokenization_params)
              for title, text, pageid
-             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles))
+             in extract_pages(bz2.BZ2File(self.input), self.filter_namespaces, self.filter_articles))
 
         pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)
         try:
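
Usage sketch (not part of the patches): a minimal illustration of what the change enables, assuming gensim at the patched revision. `getstream()` is inherited from `TextCorpus` and reads from `self.input`, which is why the attribute rename in PATCH 1/2 and the `input` property in PATCH 2/2 make it usable on `WikiCorpus`. The dump path below is illustrative, not taken from the patch.

from gensim.corpora.wikicorpus import WikiCorpus

# Illustrative path to a bz2-compressed MediaWiki XML dump (assumption, not from the patch).
dump_path = "enwiki-latest-pages-articles.xml.bz2"

# Passing dictionary={} skips the vocabulary-building scan in __init__.
wiki = WikiCorpus(dump_path, dictionary={})

# With `input` resolving to the same file as `fname`, the inherited
# TextCorpus.getstream() can stream raw lines of the dump, as exercised
# by test_get_stream in PATCH 1/2.
first_line = next(wiki.getstream())
print(first_line.decode()[:14])  # expected to start with "<mediawiki xml"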