Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up preprocessing module #124

Closed
Closed
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,104 @@ def test_pipeline_stopwords(self):
pipeline = [preprocessing.lowercase, preprocessing.remove_stopwords]
self.assertEqual(preprocessing.clean(s, pipeline=pipeline), s_true)

"""
Test clean
"""

def _get_default_clean_pipeline(self):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure we really need so many tests for this part ...?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these tests will help us cover each section of the pipeline individually, so if something gets changed, we know which part is broken.

"""
Return a list containing all the methods used in the default cleaning pipeline.

Return a list with the following functions:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
3. :meth:`texthero.preprocessing.remove_digits`
4. :meth:`texthero.preprocessing.remove_punctuation`
5. :meth:`texthero.preprocessing.remove_diacritics`
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`
"""

return [
preprocessing.fillna,
preprocessing.lowercase,
preprocessing.remove_digits,
preprocessing.remove_punctuation,
preprocessing.remove_diacritics,
preprocessing.remove_stopwords,
preprocessing.remove_whitespace,
]

def test_clean(self):
    """
    Check ``clean`` with its default pipeline against the explicit one.

    The same two-row input (one mixed-content string and one missing
    value) is cleaned once without a ``pipeline`` argument and once with
    the list returned by ``_get_default_clean_pipeline``; both results
    must compare equal.
    """
    # ``np.nan`` is the canonical spelling; the ``np.NAN`` alias was
    # removed in NumPy 2.0.
    rows = ["This serös 42 should bE CLeaned.! I am a stopword \n", np.nan]
    s = pd.Series(rows)
    s_true = pd.Series(rows)

    actual = preprocessing.clean(s)
    expected = preprocessing.clean(s_true, self._get_default_clean_pipeline())
    self.assertEqual(actual, expected)

def test_clean_fillna(self):
    """
    Check ``clean`` on a Series holding only a missing value.

    The result of the default call must equal the result obtained with
    the pipeline returned by ``_get_default_clean_pipeline``.
    """
    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was
    # removed in NumPy 2.0.
    s = pd.Series(np.nan)
    s_true = pd.Series(np.nan)

    actual = preprocessing.clean(s)
    expected = preprocessing.clean(s_true, self._get_default_clean_pipeline())
    self.assertEqual(actual, expected)

def test_clean_lowercase(self):
    """
    Check ``clean`` on mixed-case text.

    The default call and the call with the explicit pipeline from
    ``_get_default_clean_pipeline`` must produce equal results.
    """
    text = "this text Is MiXed CasE"
    default_input = pd.Series(text)
    explicit_input = pd.Series(text)

    actual = preprocessing.clean(default_input)
    expected = preprocessing.clean(
        explicit_input, self._get_default_clean_pipeline()
    )
    self.assertEqual(actual, expected)

def test_clean_digits(self):
    """
    Check ``clean`` on text containing blocks of digits.

    The default call and the call with the explicit pipeline from
    ``_get_default_clean_pipeline`` must produce equal results.
    """
    text = "Here are 42 digits blocks 89"
    default_input = pd.Series(text)
    explicit_input = pd.Series(text)

    actual = preprocessing.clean(default_input)
    expected = preprocessing.clean(
        explicit_input, self._get_default_clean_pipeline()
    )
    self.assertEqual(actual, expected)

def test_clean_punctuation(self):
    """
    Check ``clean`` on text with heavy punctuation.

    The default call and the call with the explicit pipeline from
    ``_get_default_clean_pipeline`` must produce equal results.
    """
    # Both calls receive the very same text, as in the sibling tests; the
    # reference input previously carried one fewer "!" than the tested one.
    text = "Some. wired, punctiation;.:!!!!"
    s = pd.Series(text)
    s_true = pd.Series(text)

    actual = preprocessing.clean(s)
    expected = preprocessing.clean(s_true, self._get_default_clean_pipeline())
    self.assertEqual(actual, expected)

def test_clean_diacritics(self):
    """
    Check ``clean`` on text with accented Latin and Arabic characters.

    The default call and the call with the explicit pipeline from
    ``_get_default_clean_pipeline`` must produce equal results.
    """
    text = "Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس"
    default_input = pd.Series(text)
    explicit_input = pd.Series(text)

    actual = preprocessing.clean(default_input)
    expected = preprocessing.clean(
        explicit_input, self._get_default_clean_pipeline()
    )
    self.assertEqual(actual, expected)

def test_clean_stopwords(self):
    """
    Check ``clean`` on text made of common words split over two lines.

    The default call and the call with the explicit pipeline from
    ``_get_default_clean_pipeline`` must produce equal results.
    """
    text = "some stopwords are here\nAnd on"
    default_input = pd.Series(text)
    explicit_input = pd.Series(text)

    actual = preprocessing.clean(default_input)
    expected = preprocessing.clean(
        explicit_input, self._get_default_clean_pipeline()
    )
    self.assertEqual(actual, expected)

def test_clean_whitespaces(self):
    """
    Check ``clean`` on text with spaces and an embedded newline.

    The default call and the call with the explicit pipeline from
    ``_get_default_clean_pipeline`` must produce equal results.
    """
    text = "hello world hello world \n there "
    default_input = pd.Series(text)
    explicit_input = pd.Series(text)

    actual = preprocessing.clean(default_input)
    expected = preprocessing.clean(
        explicit_input, self._get_default_clean_pipeline()
    )
    self.assertEqual(actual, expected)

"""
Test stopwords.
"""
Expand Down
Loading