Speed up preprocessing module #124

Closed
Changes from 7 commits
99 changes: 98 additions & 1 deletion tests/test_preprocessing.py
@@ -7,7 +7,6 @@
from texthero import preprocessing, stopwords
from . import PandasTestCase


"""
Test doctest
"""
@@ -113,6 +112,104 @@ def test_pipeline_stopwords(self):
pipeline = [preprocessing.lowercase, preprocessing.remove_stopwords]
self.assertEqual(preprocessing.clean(s, pipeline=pipeline), s_true)

"""
Test clean
"""

def _get_default_clean_pipeline(self):
Owner: Not sure we really need so many tests for this part ...?

Collaborator Author: I think these tests will help us cover each section of the pipeline individually, so if something changes we know which part is broken.

"""
Return a list containing all the methods used in the default cleaning pipeline.

Return a list with the following functions:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
3. :meth:`texthero.preprocessing.remove_digits`
4. :meth:`texthero.preprocessing.remove_punctuation`
5. :meth:`texthero.preprocessing.remove_diacritics`
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`
"""

return [
preprocessing.fillna,
preprocessing.lowercase,
preprocessing.remove_digits,
preprocessing.remove_punctuation,
preprocessing.remove_diacritics,
preprocessing.remove_stopwords,
preprocessing.remove_whitespace,
]

def test_clean(self):
s = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
s_true = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)

def test_clean_fillna(self):
s = pd.Series(np.NaN)
s_true = pd.Series(np.NaN)
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)

def test_clean_lowercase(self):
s = pd.Series("this text Is MiXed CasE")
s_true = pd.Series("this text Is MiXed CasE")
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)

def test_clean_digits(self):
s = pd.Series("Here are 42 digits blocks 89")
s_true = pd.Series("Here are 42 digits blocks 89")
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)

def test_clean_punctuation(self):
s = pd.Series("Some. wired, punctiation;.:!!!!")
s_true = pd.Series("Some. wired, punctiation;.:!!!")
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)

def test_clean_diacritics(self):
s = pd.Series("Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس")
s_true = pd.Series(
"Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس"
)
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)

def test_clean_stopwords(self):
s = pd.Series("some stopwords are here\nAnd on")
s_true = pd.Series("some stopwords are here\nAnd on")
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)

def test_clean_whitespaces(self):
s = pd.Series("hello world hello world \n there ")
s_true = pd.Series("hello world hello world \n there ")
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)

"""
Test stopwords.
"""
165 changes: 108 additions & 57 deletions texthero/preprocessing.py
@@ -17,6 +17,34 @@

from typing import List, Callable

# REGEX pattern constants
PATTERN_REMOVE_DIGITS_BLOCK = r"\b\d+\b"
Owner: Probably, the words REMOVE, REPLACE and PATTERN_ are not necessary:

PATTERN_REMOVE_DIGITS_BLOCK -> DIGITS_BLOCK
PATERN_REMOVE_CURLY_BRACKETS -> CURLY_BRACKETS
...

PATTERN_REMOVE_PUNCTUATION = rf"([{string.punctuation}])+"
PATTERN_STOPWORD_TOKENIZER = r"""(?x) # Set flag to allow verbose regexps
\w+(?:-\w+)* # Words with optional internal hyphens
| \s* # Any space
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""
PATTERN_REMOVE_ROUND_BRACKETS = r"\([^()]*\)"
PATERN_REMOVE_CURLY_BRACKETS = r"\{[^{}]*\}"
PATTERN_REMOVE_SQUARE_BRACKETS = r"\[[^\[\]]*\]"
PATTERN_REMOVE_ANGLE_BRACKETS = r"<[^<>]*>"
PATTERN_REMOVE_HTML_TAG = r"""(?x) # Turn on free-spacing
<[^>]+> # Remove <html> tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""


def GET_PATTERN_TOKENIZATION(punct: str) -> str:
Owner: Global functions are always lowercased.
"Returns the standart tokenisation pattern": not particularly meaningful.
"Returns" -> "Return"

Collaborator Author: Ah, thanks 🥇 I was unsure about the Python style guide. But now it is a private function.

"""
Returns the standart tokenisation pattern
"""
return rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"


PATTERN_REPLACE_URLS = r"http\S+"
PATTERN_REPLACE_TAGS = r"@[a-zA-Z0-9]+"
PATTERN_REPLACE_HASHTAGS = r"#[a-zA-Z0-9_]+"

# Ignore gensim annoying warnings
import warnings
@@ -92,8 +120,7 @@ def replace_digits(s: pd.Series, symbols: str = " ", only_blocks=True) -> pd.Series:
"""

if only_blocks:
pattern = r"\b\d+\b"
return s.str.replace(pattern, symbols)
return s.str.replace(PATTERN_REMOVE_DIGITS_BLOCK, symbols)
else:
return s.str.replace(r"\d+", symbols)

@@ -158,7 +185,7 @@ def replace_punctuation(s: pd.Series, symbol: str = " ") -> pd.Series:
dtype: object
"""

return s.str.replace(rf"([{string.punctuation}])+", symbol)
return s.str.replace(PATTERN_REMOVE_PUNCTUATION, symbol)


def remove_punctuation(s: pd.Series) -> pd.Series:
@@ -267,13 +294,10 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str:

"""

pattern = r"""(?x) # Set flag to allow verbose regexps
\w+(?:-\w+)* # Words with optional internal hyphens
| \s* # Any space
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""

return "".join(t if t not in words else symbol for t in re.findall(pattern, text))
return "".join(
t if t not in words else symbol
for t in re.findall(PATTERN_STOPWORD_TOKENIZER, text)
)
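As a small illustration of why the tokenizer also captures whitespace runs and symbols (a sketch with a made-up sample and stopword set; the real function uses whatever set replace_stopwords passes in):

import re

# Copy of PATTERN_STOPWORD_TOKENIZER from above, inlined so the snippet runs standalone.
PATTERN_STOPWORD_TOKENIZER = r"""(?x)               # Set flag to allow verbose regexps
    \w+(?:-\w+)*                                    # Words with optional internal hyphens
    | \s*                                           # Any space
    | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~]          # Any symbol
"""

sample = "a quick brown fox"
stopword_set = {"a", "the"}

tokens = re.findall(PATTERN_STOPWORD_TOKENIZER, sample)
print(tokens)   # ['a', ' ', 'quick', ' ', 'brown', ' ', 'fox', '']

cleaned = "".join(t if t not in stopword_set else "" for t in tokens)
print(cleaned)  # ' quick brown fox' -- leftover spaces are collapsed later by remove_whitespace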


def replace_stopwords(
@@ -401,33 +425,14 @@ def _stem(text):
return s.str.split().apply(_stem)


def get_default_pipeline() -> List[Callable[[pd.Series], pd.Series]]:
"""
Return a list contaning all the methods used in the default cleaning pipeline.

Return a list with the following functions:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
3. :meth:`texthero.preprocessing.remove_digits`
4. :meth:`texthero.preprocessing.remove_punctuation`
5. :meth:`texthero.preprocessing.remove_diacritics`
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`
"""
return [
fillna,
lowercase,
remove_digits,
remove_punctuation,
remove_diacritics,
remove_stopwords,
remove_whitespace,
]


def clean(s: pd.Series, pipeline=None) -> pd.Series:
"""
Pre-process a text-based Pandas Series, by using the following default pipline.
Pre-process a text-based Pandas Series.

There are two ways to use this function. If you do not specify a pipeline, the
clean function falls back to a hardcoded default pipeline, which gives roughly a
30% performance improvement over chaining the same steps with the "pipe" method.
If you pass your own cleaning pipeline, the clean function applies that one instead.

Default pipeline:
1. :meth:`texthero.preprocessing.fillna`
@@ -438,6 +443,7 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series:
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`


Parameters
----------
s : Pandas Series
@@ -458,13 +464,69 @@
"""

if not pipeline:
pipeline = get_default_pipeline()
return _optimised_default_clean(s)

for f in pipeline:
s = s.pipe(f)
return s
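A short usage sketch of the two call modes described in the docstring above (the example Series and the custom pipeline are made up):

import numpy as np
import pandas as pd
from texthero import preprocessing

s = pd.Series(["Montréal has 42 VERY nice cafés!!", np.nan])

# 1) No pipeline given: the hardcoded, optimised default pipeline is applied.
print(preprocessing.clean(s))

# 2) Explicit pipeline: only the listed functions are applied, via Series.pipe.
custom_pipeline = [preprocessing.fillna, preprocessing.lowercase]
print(preprocessing.clean(s, pipeline=custom_pipeline))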


def _optimised_default_clean(s: pd.Series) -> pd.Series:
"""
Apply the default clean pipeline to a Series in an optimised way
that is about 30% faster than piping the individual functions.

Default pipeline:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
3. :meth:`texthero.preprocessing.remove_digits`
4. :meth:`texthero.preprocessing.remove_punctuation`
5. :meth:`texthero.preprocessing.remove_diacritics`
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`
"""
return s.apply(_optimised_default_clean_single_cell)


def _optimised_default_clean_single_cell(text: str) -> str:
"""
Apply the default clean pipeline to a single cell (string).

Default pipeline:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
3. :meth:`texthero.preprocessing.remove_digits`
4. :meth:`texthero.preprocessing.remove_punctuation`
5. :meth:`texthero.preprocessing.remove_diacritics`
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`
"""

# fillna
if pd.isna(text):
return ""

# lowercase
text = text.lower()

# remove digits and punctuation; replace them with a space, as the default
# remove_digits and remove_punctuation steps do, so adjacent words stay separated
pattern_mixed_remove = (
PATTERN_REMOVE_DIGITS_BLOCK + "|" + PATTERN_REMOVE_PUNCTUATION
)
text = re.sub(pattern_mixed_remove, " ", text)

# remove diacritics
text = _remove_diacritics(text)

# remove stopwords
text = _replace_stopwords(text, _stopwords.DEFAULT, "")

# remove whitespace
text = " ".join(re.sub("\xa0", " ", text).split())

return text
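A rough way to sanity-check the claimed ~30% speedup of the optimised default path (a sketch; the corpus size, repeat count and sample text are made up, and timings depend on the machine):

import timeit

import pandas as pd
from texthero import preprocessing

s = pd.Series(["This serös 42 should bE CLeaned.! I am a stopword \n"] * 10_000)

explicit_pipeline = [
    preprocessing.fillna,
    preprocessing.lowercase,
    preprocessing.remove_digits,
    preprocessing.remove_punctuation,
    preprocessing.remove_diacritics,
    preprocessing.remove_stopwords,
    preprocessing.remove_whitespace,
]

optimised = timeit.timeit(lambda: preprocessing.clean(s), number=3)
piped = timeit.timeit(lambda: preprocessing.clean(s, pipeline=explicit_pipeline), number=3)
print(f"optimised default: {optimised:.2f}s, explicit pipeline: {piped:.2f}s")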


def has_content(s: pd.Series) -> pd.Series:
r"""
Return a Boolean Pandas Series indicating if the rows have content.
@@ -526,7 +588,7 @@ def remove_round_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"\([^()]*\)", "")
return s.str.replace(PATTERN_REMOVE_ROUND_BRACKETS, "")


def remove_curly_brackets(s: pd.Series) -> pd.Series:
@@ -550,7 +612,7 @@ def remove_curly_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"\{[^{}]*\}", "")
return s.str.replace(PATERN_REMOVE_CURLY_BRACKETS, "")


def remove_square_brackets(s: pd.Series) -> pd.Series:
@@ -575,7 +637,7 @@ def remove_square_brackets(s: pd.Series) -> pd.Series:


"""
return s.str.replace(r"\[[^\[\]]*\]", "")
return s.str.replace(PATTERN_REMOVE_SQUARE_BRACKETS, "")


def remove_angle_brackets(s: pd.Series) -> pd.Series:
@@ -599,7 +661,7 @@ def remove_angle_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"<[^<>]*>", "")
return s.str.replace(PATTERN_REMOVE_ANGLE_BRACKETS, "")


def remove_brackets(s: pd.Series) -> pd.Series:
@@ -652,12 +714,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series:

"""

pattern = r"""(?x) # Turn on free-spacing
<[^>]+> # Remove <html> tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""

return s.str.replace(pattern, "")
return s.str.replace(PATTERN_REMOVE_HTML_TAG, "")


def tokenize(s: pd.Series) -> pd.Series:
@@ -681,12 +738,10 @@ def tokenize(s: pd.Series) -> pd.Series:

"""

punct = string.punctuation.replace("_", "")
# In regex, the metacharacter 'w' is "a-z, A-Z, 0-9, including the _ (underscore) character." We therefore remove it from the punctuation string as this is already included in \w
punct = string.punctuation.replace("_", "")

pattern = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"

return s.str.replace(pattern, r"\2 \3 \4 \5").str.split()
return s.str.replace(GET_PATTERN_TOKENIZATION(punct), r"\2 \3 \4 \5").str.split()


def tokenize_with_phrases(
@@ -762,9 +817,7 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series:

"""

pattern = r"http\S+"

return s.str.replace(pattern, symbol)
return s.str.replace(PATTERN_REPLACE_URLS, symbol)


def remove_urls(s: pd.Series) -> pd.Series:
@@ -813,8 +866,7 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series:

"""

pattern = r"@[a-zA-Z0-9]+"
return s.str.replace(pattern, symbol)
return s.str.replace(PATTERN_REPLACE_TAGS, symbol)


def remove_tags(s: pd.Series) -> pd.Series:
@@ -860,8 +912,7 @@ def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series:
dtype: object

"""
pattern = r"#[a-zA-Z0-9_]+"
return s.str.replace(pattern, symbol)
return s.str.replace(PATTERN_REPLACE_HASHTAGS, symbol)


def remove_hashtags(s: pd.Series) -> pd.Series: