Improvement: the WikiCorpus class can now receive multiple tokenizing functions, passed as a single callable, a list, or a tuple. #3553

Closed · wants to merge 14 commits
25 changes: 14 additions & 11 deletions gensim/corpora/wikicorpus.py
File mode changed: 100644 → 100755
@@ -25,13 +25,11 @@
# LXML isn't faster, so let's go with the built-in solution
from xml.etree.ElementTree import iterparse


from gensim import utils
# cannot import whole gensim.corpora, because that imports wikicorpus...
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.textcorpus import TextCorpus


logger = logging.getLogger(__name__)

ARTICLE_MIN_WORDS = 50
@@ -468,10 +466,10 @@ def process_article(
----------
args : (str, str, int)
Article text, article title, page identificator.
tokenizer_func : function
Function for tokenization (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`).
Needs to have interface:
tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
tokenizer_func : function OR list of functions, optional
Function(s) used for tokenization (default is :func:`~gensim.corpora.wikicorpus.tokenize`).
Each function needs to have the interface:
`tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`.
token_min_len : int
Minimal token length.
token_max_len : int
@@ -487,7 +485,11 @@
"""
text, title, pageid = args
text = filter_wiki(text)
result = tokenizer_func(text, token_min_len, token_max_len, lower)
tokenizers = [] if (tokenizer_func is None) \
else (list(tokenizer_func) if isinstance(tokenizer_func, (list, tuple)) else [tokenizer_func])
for tokenizer in tokenizers:
text = " ".join(tokenizer(text, token_min_len, token_max_len, lower))
result = text.split()
return result, title, pageid
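
The loop above chains the tokenizers: each function receives the space-joined output of the previous one, and the final token list is recovered with a plain `split()`. A minimal sketch of that composition, using two hypothetical toy tokenizers that only illustrate the expected interface:

# Toy tokenizers, hypothetical and for illustration only; both follow the
# required signature (text, token_min_len, token_max_len, lower) -> list of str.
def lowercase_tokenizer(text, token_min_len, token_max_len, lower):
    # First pass: split on whitespace, keep tokens within the length bounds, lowercase.
    return [token.lower() if lower else token
            for token in text.split()
            if token_min_len <= len(token) <= token_max_len]

def drop_numeric_tokenizer(text, token_min_len, token_max_len, lower):
    # Second pass: remove purely numeric tokens from the already-tokenized text.
    return [token for token in text.split() if not token.isdigit()]

# Same composition as in process_article: space-join each result and feed it
# to the next tokenizer, then split once at the end.
text = "Example 2023 Wiki ARTICLE text"
for tokenizer in (lowercase_tokenizer, drop_numeric_tokenizer):
    text = " ".join(tokenizer(text, 2, 15, True))
print(text.split())  # ['example', 'wiki', 'article', 'text']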


@@ -569,6 +571,7 @@ class WikiCorpus(TextCorpus):
>>> MmCorpus.serialize(corpus_path, wiki) # another 8h, creates a file in MatrixMarket format and mapping

"""

def __init__(
self, fname, processes=None, lemmatize=None, dictionary=None, metadata=False,
filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
@@ -590,10 +593,10 @@ def __init__(
**IMPORTANT: this needs a really long time**.
filter_namespaces : tuple of str, optional
Namespaces to consider.
tokenizer_func : function, optional
Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`.
If you inject your own tokenizer, it must conform to this interface:
`tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`
tokenizer_func : function OR list of functions, optional
Function(s) used for tokenization (default is :func:`~gensim.corpora.wikicorpus.tokenize`).
Each function needs to have the interface:
`tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`.
article_min_tokens : int, optional
Minimum tokens in article. Article will be ignored if number of tokens is less.
token_min_len : int, optional
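
For the `tokenizer_func` parameter documented above, a hedged usage sketch of how a caller might pass multiple tokenizers once this change is applied. The dump path and the second tokenizer are hypothetical; :func:`~gensim.corpora.wikicorpus.tokenize` is the library's default tokenizer, and the list is applied left to right.

from gensim.corpora.wikicorpus import WikiCorpus, tokenize

def drop_numeric_tokenizer(text, token_min_len, token_max_len, lower):
    # Hypothetical second-stage tokenizer: removes purely numeric tokens.
    return [token for token in text.split() if not token.isdigit()]

# Hypothetical path to a Wikipedia dump; tokenizer_func accepts a single
# function, a list, or a tuple with this change applied.
wiki = WikiCorpus(
    'enwiki-latest-pages-articles.xml.bz2',
    tokenizer_func=[tokenize, drop_numeric_tokenizer],
)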