-
Notifications
You must be signed in to change notification settings - Fork 239
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Speed up preprocessing module #124
Changes from 7 commits
3ab14fd
e0f02c5
2ea3caf
697a229
4bb9860
57c37c1
db4934e
5887485
addc23b
84652ee
3e7056b
16b775f
0b41020
efdc3c9
e627126
f4a91fa
d4d394b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,34 @@ | |
|
||
from typing import List, Callable | ||
|
||
# REGEX pattern constants
# Matches standalone runs of digits (word-bounded): "42" in "a 42 b" but not the
# digits inside "a42b".
PATTERN_REMOVE_DIGITS_BLOCK = r"\b\d+\b"
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably, the words PATTERN_REMOVE_DIGITS_BLOCK -> DIGITS_BLOCK |
||
# One or more consecutive ASCII punctuation characters (string.punctuation),
# matched as a single run so a replacement collapses "!!!" into one symbol.
PATTERN_REMOVE_PUNCTUATION = rf"([{string.punctuation}])+"

# Verbose tokenizer used by the stopword-replacement helpers: each match is a
# word (with optional internal hyphens), a run of whitespace, or one symbol.
PATTERN_STOPWORD_TOKENIZER = r"""(?x)                # Set flag to allow verbose regexps
      \w+(?:-\w+)*                                   # Words with optional internal hyphens
    | \s*                                            # Any space
    | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~]           # Any symbol
"""

# Bracketed content: the bracket pair and everything between (non-nested).
PATTERN_REMOVE_ROUND_BRACKETS = r"\([^()]*\)"
PATTERN_REMOVE_CURLY_BRACKETS = r"\{[^{}]*\}"
# Backward-compatible alias: the misspelled name ("PATERN") is kept because
# existing code still references it.
PATERN_REMOVE_CURLY_BRACKETS = PATTERN_REMOVE_CURLY_BRACKETS
PATTERN_REMOVE_SQUARE_BRACKETS = r"\[[^\[\]]*\]"
PATTERN_REMOVE_ANGLE_BRACKETS = r"<[^<>]*>"

# HTML tags (<...>) and character entities (&amp;, &#38;, &#x26;).
PATTERN_REMOVE_HTML_TAG = r"""(?x)                   # Turn on free-spacing
      <[^>]+>                                        # Remove <html> tags
    | &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6});    # Remove &nbsp;-style entities
"""
|
||
|
||
def GET_PATTERN_TOKENIZATION(punct: str) -> str:
    """
    Return the standard tokenization pattern for the given punctuation characters.

    The pattern captures a word character adjacent to one of the characters in
    *punct* (in either order), so callers can re-insert spaces between the word
    and the punctuation via group substitution (groups 2-5).

    NOTE(review): the upper-case name breaks PEP 8 (functions are lowercase);
    kept as-is because existing callers reference it by this name.
    """
    return rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"
|
||
|
||
# URLs: "http"/"https" followed by everything up to the next whitespace.
PATTERN_REPLACE_URLS = r"http\S+"
# @-mentions: "@" followed by alphanumerics (note: no underscore allowed here).
PATTERN_REPLACE_TAGS = r"@[a-zA-Z0-9]+"
# Hashtags: "#" followed by alphanumerics or underscore.
PATTERN_REPLACE_HASHTAGS = r"#[a-zA-Z0-9_]+"
|
||
# Ignore gensim annoying warnings | ||
import warnings | ||
|
@@ -92,8 +120,7 @@ def replace_digits(s: pd.Series, symbols: str = " ", only_blocks=True) -> pd.Ser | |
""" | ||
|
||
if only_blocks: | ||
pattern = r"\b\d+\b" | ||
return s.str.replace(pattern, symbols) | ||
return s.str.replace(PATTERN_REMOVE_DIGITS_BLOCK, symbols) | ||
else: | ||
return s.str.replace(r"\d+", symbols) | ||
|
||
|
@@ -158,7 +185,7 @@ def replace_punctuation(s: pd.Series, symbol: str = " ") -> pd.Series: | |
dtype: object | ||
""" | ||
|
||
return s.str.replace(rf"([{string.punctuation}])+", symbol) | ||
return s.str.replace(PATTERN_REMOVE_PUNCTUATION, symbol) | ||
|
||
|
||
def remove_punctuation(s: pd.Series) -> pd.Series: | ||
|
@@ -267,13 +294,10 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str: | |
|
||
""" | ||
|
||
pattern = r"""(?x) # Set flag to allow verbose regexps | ||
\w+(?:-\w+)* # Words with optional internal hyphens | ||
| \s* # Any space | ||
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol | ||
""" | ||
|
||
return "".join(t if t not in words else symbol for t in re.findall(pattern, text)) | ||
return "".join( | ||
t if t not in words else symbol | ||
for t in re.findall(PATTERN_STOPWORD_TOKENIZER, text) | ||
) | ||
|
||
|
||
def replace_stopwords( | ||
|
@@ -401,33 +425,14 @@ def _stem(text): | |
return s.str.split().apply(_stem) | ||
|
||
|
||
def get_default_pipeline() -> List[Callable[[pd.Series], pd.Series]]:
    """
    Return a list containing all the methods used in the default cleaning pipeline.

    Return a list with the following functions:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_punctuation`
     5. :meth:`texthero.preprocessing.remove_diacritics`
     6. :meth:`texthero.preprocessing.remove_stopwords`
     7. :meth:`texthero.preprocessing.remove_whitespace`
    """
    return [
        fillna,
        lowercase,
        remove_digits,
        remove_punctuation,
        remove_diacritics,
        remove_stopwords,
        remove_whitespace,
    ]
|
||
|
||
def clean(s: pd.Series, pipeline=None) -> pd.Series: | ||
""" | ||
Pre-process a text-based Pandas Series, by using the following default pipline. | ||
Pre-process a text-based Pandas Series. | ||
|
||
There are two options to use this function. You can either use this function by not specifying a pipeline. | ||
In this case the clean function will use a default pipeline, which was hardcoded, to gain 30% performance improvements, | ||
over the "pipe" method. | ||
If you specify your own cleaning pipeline, the clean function will use this one instead. | ||
|
||
Default pipeline: | ||
1. :meth:`texthero.preprocessing.fillna` | ||
|
@@ -438,6 +443,7 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series: | |
6. :meth:`texthero.preprocessing.remove_stopwords` | ||
7. :meth:`texthero.preprocessing.remove_whitespace` | ||
|
||
|
||
Parameters | ||
---------- | ||
s : Pandas Series | ||
|
@@ -458,13 +464,69 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series: | |
""" | ||
|
||
if not pipeline: | ||
pipeline = get_default_pipeline() | ||
return _optimised_default_clean(s) | ||
|
||
for f in pipeline: | ||
s = s.pipe(f) | ||
return s | ||
|
||
|
||
def _optimised_default_clean(s: pd.Series) -> pd.Series:
    """
    Run the default cleaning pipeline over ``s`` in a single pass per cell,
    which is about 30% faster than piping through the individual steps.

    Equivalent to applying, in order:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_punctuation`
     5. :meth:`texthero.preprocessing.remove_diacritics`
     6. :meth:`texthero.preprocessing.remove_stopwords`
     7. :meth:`texthero.preprocessing.remove_whitespace`
    """
    cleaned = s.apply(_optimised_default_clean_single_cell)
    return cleaned
|
||
|
||
def _optimised_default_clean_single_cell(text: str) -> str:
    """
    Applies the default clean pipeline to one cell.

    Default pipeline:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_punctuation`
     5. :meth:`texthero.preprocessing.remove_diacritics`
     6. :meth:`texthero.preprocessing.remove_stopwords`
     7. :meth:`texthero.preprocessing.remove_whitespace`
    """

    # fillna: missing values become the empty string.
    if pd.isna(text):
        return ""

    # lowercase
    text = text.lower()

    # remove digits and punctuation in one regex pass.
    # NOTE(review): this substitutes "" where replace_punctuation's default
    # symbol is " " — confirm the fused step matches the piped pipeline.
    pattern_mixed_remove = (
        PATTERN_REMOVE_DIGITS_BLOCK + "|" + PATTERN_REMOVE_PUNCTUATION
    )
    text = re.sub(pattern_mixed_remove, "", text)

    # remove diacritics (delegates to the module-level helper)
    text = _remove_diacritics(text)

    # remove stopwords using the default stopword set, dropping them entirely.
    text = _replace_stopwords(text, _stopwords.DEFAULT, "")

    # remove whitespace: \xa0 is a non-breaking space; split/join collapses
    # all remaining whitespace runs to single spaces and trims the ends.
    text = " ".join(re.sub("\xa0", " ", text).split())

    return text
|
||
|
||
def has_content(s: pd.Series) -> pd.Series: | ||
r""" | ||
Return a Boolean Pandas Series indicating if the rows have content. | ||
|
@@ -526,7 +588,7 @@ def remove_round_brackets(s: pd.Series) -> pd.Series: | |
:meth:`remove_square_brackets` | ||
|
||
""" | ||
return s.str.replace(r"\([^()]*\)", "") | ||
return s.str.replace(PATTERN_REMOVE_ROUND_BRACKETS, "") | ||
|
||
|
||
def remove_curly_brackets(s: pd.Series) -> pd.Series: | ||
|
@@ -550,7 +612,7 @@ def remove_curly_brackets(s: pd.Series) -> pd.Series: | |
:meth:`remove_square_brackets` | ||
|
||
""" | ||
return s.str.replace(r"\{[^{}]*\}", "") | ||
return s.str.replace(PATERN_REMOVE_CURLY_BRACKETS, "") | ||
|
||
|
||
def remove_square_brackets(s: pd.Series) -> pd.Series: | ||
|
@@ -575,7 +637,7 @@ def remove_square_brackets(s: pd.Series) -> pd.Series: | |
|
||
|
||
""" | ||
return s.str.replace(r"\[[^\[\]]*\]", "") | ||
return s.str.replace(PATTERN_REMOVE_SQUARE_BRACKETS, "") | ||
|
||
|
||
def remove_angle_brackets(s: pd.Series) -> pd.Series: | ||
|
@@ -599,7 +661,7 @@ def remove_angle_brackets(s: pd.Series) -> pd.Series: | |
:meth:`remove_square_brackets` | ||
|
||
""" | ||
return s.str.replace(r"<[^<>]*>", "") | ||
return s.str.replace(PATTERN_REMOVE_ANGLE_BRACKETS, "") | ||
|
||
|
||
def remove_brackets(s: pd.Series) -> pd.Series: | ||
|
@@ -652,12 +714,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series: | |
|
||
""" | ||
|
||
pattern = r"""(?x) # Turn on free-spacing | ||
<[^>]+> # Remove <html> tags | ||
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove | ||
""" | ||
|
||
return s.str.replace(pattern, "") | ||
return s.str.replace(PATTERN_REMOVE_HTML_TAG, "") | ||
|
||
|
||
def tokenize(s: pd.Series) -> pd.Series: | ||
|
@@ -681,12 +738,10 @@ def tokenize(s: pd.Series) -> pd.Series: | |
|
||
""" | ||
|
||
punct = string.punctuation.replace("_", "") | ||
# In regex, the metacharacter '\w' matches "a-z, A-Z, 0-9, including the _ (underscore) character." We therefore remove the underscore from the punctuation string as it is already included in \w | ||
punct = string.punctuation.replace("_", "") | ||
|
||
pattern = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))" | ||
|
||
return s.str.replace(pattern, r"\2 \3 \4 \5").str.split() | ||
return s.str.replace(GET_PATTERN_TOKENIZATION(punct), r"\2 \3 \4 \5").str.split() | ||
|
||
|
||
def tokenize_with_phrases( | ||
|
@@ -762,9 +817,7 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series: | |
|
||
""" | ||
|
||
pattern = r"http\S+" | ||
|
||
return s.str.replace(pattern, symbol) | ||
return s.str.replace(PATTERN_REPLACE_URLS, symbol) | ||
|
||
|
||
def remove_urls(s: pd.Series) -> pd.Series: | ||
|
@@ -813,8 +866,7 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series: | |
|
||
""" | ||
|
||
pattern = r"@[a-zA-Z0-9]+" | ||
return s.str.replace(pattern, symbol) | ||
return s.str.replace(PATTERN_REPLACE_TAGS, symbol) | ||
|
||
|
||
def remove_tags(s: pd.Series) -> pd.Series: | ||
|
@@ -860,8 +912,7 @@ def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series: | |
dtype: object | ||
|
||
""" | ||
pattern = r"#[a-zA-Z0-9_]+" | ||
return s.str.replace(pattern, symbol) | ||
return s.str.replace(PATTERN_REPLACE_HASHTAGS, symbol) | ||
|
||
|
||
def remove_hashtags(s: pd.Series) -> pd.Series: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure we really need so many tests for this part ...?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think those tests will help us cover all the different sections of the pipeline individually, so if something gets changed, we know which part is broken