-
Notifications
You must be signed in to change notification settings - Fork 239
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Speed up preprocessing module #124
Changes from 7 commits
3ab14fd
e0f02c5
2ea3caf
697a229
4bb9860
57c37c1
db4934e
5887485
addc23b
84652ee
3e7056b
16b775f
0b41020
efdc3c9
e627126
f4a91fa
d4d394b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,34 @@ | |
|
||
from typing import List, Callable | ||
|
||
# REGEX pattern constants
# Matches standalone runs of digits (word-bounded): "42" in "a 42 b" but not the
# digits inside "a42b".
PATTERN_REMOVE_DIGITS_BLOCK = r"\b\d+\b"
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably, the words PATTERN_REMOVE_DIGITS_BLOCK -> DIGITS_BLOCK |
||
# One or more consecutive ASCII punctuation characters (string.punctuation),
# matched as a single run so a replacement collapses "!!!" into one symbol.
PATTERN_REMOVE_PUNCTUATION = rf"([{string.punctuation}])+"

# Verbose tokenizer used by the stopword-replacement helpers: each match is a
# word (with optional internal hyphens), a run of whitespace, or one symbol.
PATTERN_STOPWORD_TOKENIZER = r"""(?x)                # Set flag to allow verbose regexps
      \w+(?:-\w+)*                                   # Words with optional internal hyphens
    | \s*                                            # Any space
    | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~]           # Any symbol
"""

# Bracketed content: the bracket pair and everything between (non-nested).
PATTERN_REMOVE_ROUND_BRACKETS = r"\([^()]*\)"
PATTERN_REMOVE_CURLY_BRACKETS = r"\{[^{}]*\}"
# Backward-compatible alias: the misspelled name ("PATERN") is kept because
# existing code still references it.
PATERN_REMOVE_CURLY_BRACKETS = PATTERN_REMOVE_CURLY_BRACKETS
PATTERN_REMOVE_SQUARE_BRACKETS = r"\[[^\[\]]*\]"
PATTERN_REMOVE_ANGLE_BRACKETS = r"<[^<>]*>"

# HTML tags (<...>) and character entities (&amp;, &#38;, &#x26;).
PATTERN_REMOVE_HTML_TAG = r"""(?x)                   # Turn on free-spacing
      <[^>]+>                                        # Remove <html> tags
    | &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6});    # Remove &nbsp;-style entities
"""
|
||
|
||
def GET_PATTERN_TOKENIZATION(punct: str) -> str:
    """
    Return the standard tokenization pattern for the given punctuation characters.

    The pattern captures a word character adjacent to one of the characters in
    *punct* (in either order), so callers can re-insert spaces between the word
    and the punctuation via group substitution (groups 2-5).

    NOTE(review): the upper-case name breaks PEP 8 (functions are lowercase);
    kept as-is because existing callers reference it by this name.
    """
    return rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"
|
||
|
||
# URLs: "http"/"https" followed by everything up to the next whitespace.
PATTERN_REPLACE_URLS = r"http\S+"
# @-mentions: "@" followed by alphanumerics (note: no underscore allowed here).
PATTERN_REPLACE_TAGS = r"@[a-zA-Z0-9]+"
# Hashtags: "#" followed by alphanumerics or underscore.
PATTERN_REPLACE_HASHTAGS = r"#[a-zA-Z0-9_]+"
|
||
# Ignore gensim annoying warnings | ||
import warnings | ||
|
@@ -92,8 +120,7 @@ def replace_digits(s: pd.Series, symbols: str = " ", only_blocks=True) -> pd.Ser | |
""" | ||
|
||
if only_blocks: | ||
pattern = r"\b\d+\b" | ||
return s.str.replace(pattern, symbols) | ||
return s.str.replace(PATTERN_REMOVE_DIGITS_BLOCK, symbols) | ||
else: | ||
return s.str.replace(r"\d+", symbols) | ||
|
||
|
@@ -158,7 +185,7 @@ def replace_punctuation(s: pd.Series, symbol: str = " ") -> pd.Series: | |
dtype: object | ||
""" | ||
|
||
return s.str.replace(rf"([{string.punctuation}])+", symbol) | ||
return s.str.replace(PATTERN_REMOVE_PUNCTUATION, symbol) | ||
|
||
|
||
def remove_punctuation(s: pd.Series) -> pd.Series: | ||
|
@@ -267,13 +294,10 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str: | |
|
||
""" | ||
|
||
pattern = r"""(?x) # Set flag to allow verbose regexps | ||
\w+(?:-\w+)* # Words with optional internal hyphens | ||
| \s* # Any space | ||
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol | ||
""" | ||
|
||
return "".join(t if t not in words else symbol for t in re.findall(pattern, text)) | ||
return "".join( | ||
t if t not in words else symbol | ||
for t in re.findall(PATTERN_STOPWORD_TOKENIZER, text) | ||
) | ||
|
||
|
||
def replace_stopwords( | ||
|
@@ -401,33 +425,14 @@ def _stem(text): | |
return s.str.split().apply(_stem) | ||
|
||
|
||
def get_default_pipeline() -> List[Callable[[pd.Series], pd.Series]]:
    """
    Return a list containing all the methods used in the default cleaning pipeline.

    Return a list with the following functions:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_punctuation`
     5. :meth:`texthero.preprocessing.remove_diacritics`
     6. :meth:`texthero.preprocessing.remove_stopwords`
     7. :meth:`texthero.preprocessing.remove_whitespace`
    """
    return [
        fillna,
        lowercase,
        remove_digits,
        remove_punctuation,
        remove_diacritics,
        remove_stopwords,
        remove_whitespace,
    ]
|
||
|
||
def clean(s: pd.Series, pipeline=None) -> pd.Series: | ||
""" | ||
Pre-process a text-based Pandas Series, by using the following default pipline. | ||
Pre-process a text-based Pandas Series. | ||
|
||
There are two options to use this function. You can either use this function by not specifying a pipeline. | ||
In this case the clean function will use a default pipeline, which was hardcoded, to gain 30% performance improvements, | ||
over the "pipe" method. | ||
If you specify your own cleaning pipeline, the clean function will use this one instead. | ||
|
||
Default pipeline: | ||
1. :meth:`texthero.preprocessing.fillna` | ||
|
@@ -438,6 +443,7 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series: | |
6. :meth:`texthero.preprocessing.remove_stopwords` | ||
7. :meth:`texthero.preprocessing.remove_whitespace` | ||
|
||
|
||
Parameters | ||
---------- | ||
s : Pandas Series | ||
|
@@ -458,13 +464,69 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series: | |
""" | ||
|
||
if not pipeline: | ||
pipeline = get_default_pipeline() | ||
return _optimised_default_clean(s) | ||
|
||
for f in pipeline: | ||
s = s.pipe(f) | ||
return s | ||
|
||
|
||
def _optimised_default_clean(s: pd.Series) -> pd.Series:
    """
    Run the default cleaning pipeline over ``s`` in a single pass per cell,
    which is about 30% faster than piping through the individual steps.

    Equivalent to applying, in order:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_punctuation`
     5. :meth:`texthero.preprocessing.remove_diacritics`
     6. :meth:`texthero.preprocessing.remove_stopwords`
     7. :meth:`texthero.preprocessing.remove_whitespace`
    """
    cleaned = s.apply(_optimised_default_clean_single_cell)
    return cleaned
|
||
|
||
def _optimised_default_clean_single_cell(text: str) -> str:
    """
    Applies the default clean pipeline to one cell.

    Default pipeline:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_punctuation`
     5. :meth:`texthero.preprocessing.remove_diacritics`
     6. :meth:`texthero.preprocessing.remove_stopwords`
     7. :meth:`texthero.preprocessing.remove_whitespace`
    """

    # fillna: missing values become the empty string.
    if pd.isna(text):
        return ""

    # lowercase
    text = text.lower()

    # remove digits and punctuation in one regex pass.
    # NOTE(review): this substitutes "" where replace_punctuation's default
    # symbol is " " — confirm the fused step matches the piped pipeline.
    pattern_mixed_remove = (
        PATTERN_REMOVE_DIGITS_BLOCK + "|" + PATTERN_REMOVE_PUNCTUATION
    )
    text = re.sub(pattern_mixed_remove, "", text)

    # remove diacritics (delegates to the module-level helper)
    text = _remove_diacritics(text)

    # remove stopwords using the default stopword set, dropping them entirely.
    text = _replace_stopwords(text, _stopwords.DEFAULT, "")

    # remove whitespace: \xa0 is a non-breaking space; split/join collapses
    # all remaining whitespace runs to single spaces and trims the ends.
    text = " ".join(re.sub("\xa0", " ", text).split())

    return text
|
||
|
||
def has_content(s: pd.Series) -> pd.Series: | ||
r""" | ||
Return a Boolean Pandas Series indicating if the rows have content. | ||
|
@@ -526,7 +588,7 @@ def remove_round_brackets(s: pd.Series) -> pd.Series: | |
:meth:`remove_square_brackets` | ||
|
||
""" | ||
return s.str.replace(r"\([^()]*\)", "") | ||
return s.str.replace(PATTERN_REMOVE_ROUND_BRACKETS, "") | ||
|
||
|
||
def remove_curly_brackets(s: pd.Series) -> pd.Series: | ||
|
@@ -550,7 +612,7 @@ def remove_curly_brackets(s: pd.Series) -> pd.Series: | |
:meth:`remove_square_brackets` | ||
|
||
""" | ||
return s.str.replace(r"\{[^{}]*\}", "") | ||
return s.str.replace(PATERN_REMOVE_CURLY_BRACKETS, "") | ||
|
||
|
||
def remove_square_brackets(s: pd.Series) -> pd.Series: | ||
|
@@ -575,7 +637,7 @@ def remove_square_brackets(s: pd.Series) -> pd.Series: | |
|
||
|
||
""" | ||
return s.str.replace(r"\[[^\[\]]*\]", "") | ||
return s.str.replace(PATTERN_REMOVE_SQUARE_BRACKETS, "") | ||
|
||
|
||
def remove_angle_brackets(s: pd.Series) -> pd.Series: | ||
|
@@ -599,7 +661,7 @@ def remove_angle_brackets(s: pd.Series) -> pd.Series: | |
:meth:`remove_square_brackets` | ||
|
||
""" | ||
return s.str.replace(r"<[^<>]*>", "") | ||
return s.str.replace(PATTERN_REMOVE_ANGLE_BRACKETS, "") | ||
|
||
|
||
def remove_brackets(s: pd.Series) -> pd.Series: | ||
|
@@ -652,12 +714,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series: | |
|
||
""" | ||
|
||
pattern = r"""(?x) # Turn on free-spacing | ||
<[^>]+> # Remove <html> tags | ||
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove | ||
""" | ||
|
||
return s.str.replace(pattern, "") | ||
return s.str.replace(PATTERN_REMOVE_HTML_TAG, "") | ||
|
||
|
||
def tokenize(s: pd.Series) -> pd.Series: | ||
|
@@ -681,12 +738,10 @@ def tokenize(s: pd.Series) -> pd.Series: | |
|
||
""" | ||
|
||
punct = string.punctuation.replace("_", "") | ||
# In regex, the metacharacter '\w' matches "a-z, A-Z, 0-9, including the _ (underscore) character." We therefore remove the underscore from the punctuation string as it is already included in \w | ||
punct = string.punctuation.replace("_", "") | ||
|
||
pattern = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))" | ||
|
||
return s.str.replace(pattern, r"\2 \3 \4 \5").str.split() | ||
return s.str.replace(GET_PATTERN_TOKENIZATION(punct), r"\2 \3 \4 \5").str.split() | ||
|
||
|
||
def tokenize_with_phrases( | ||
|
@@ -762,9 +817,7 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series: | |
|
||
""" | ||
|
||
pattern = r"http\S+" | ||
|
||
return s.str.replace(pattern, symbol) | ||
return s.str.replace(PATTERN_REPLACE_URLS, symbol) | ||
|
||
|
||
def remove_urls(s: pd.Series) -> pd.Series: | ||
|
@@ -813,8 +866,7 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series: | |
|
||
""" | ||
|
||
pattern = r"@[a-zA-Z0-9]+" | ||
return s.str.replace(pattern, symbol) | ||
return s.str.replace(PATTERN_REPLACE_TAGS, symbol) | ||
|
||
|
||
def remove_tags(s: pd.Series) -> pd.Series: | ||
|
@@ -860,8 +912,7 @@ def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series: | |
dtype: object | ||
|
||
""" | ||
pattern = r"#[a-zA-Z0-9_]+" | ||
return s.str.replace(pattern, symbol) | ||
return s.str.replace(PATTERN_REPLACE_HASHTAGS, symbol) | ||
|
||
|
||
def remove_hashtags(s: pd.Series) -> pd.Series: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure we really need so many tests for this part ...?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think those tests will help us cover all the different sections of the pipeline individually, so if something gets changed, we know which part is broken