-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_cleanner.py
60 lines (44 loc) · 1.64 KB
/
text_cleanner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import re
import string
import unicodedata
from six import u
from stopwords import get_stopwords_by_language
# Taken from gensim
def to_string(text, encoding='utf8', errors='strict'):
    """Return `text` as a unicode ``str``.

    Args:
        text: Either a ``str`` (returned unchanged) or a bytes-like object
            to be decoded.
        encoding: Codec used to decode non-``str`` input.
        errors: Error-handling scheme passed to the decoder.

    Returns:
        The original object when it is already a ``str``; otherwise the
        result of decoding it with ``str(text, encoding, errors=errors)``.
    """
    already_text = isinstance(text, str)
    return text if already_text else str(text, encoding, errors=errors)
# Taken from gensim
# One or more consecutive characters from `string.punctuation` (ASCII only).
RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)


def strip_punctuation(s):
    """Replace every run of ASCII punctuation in `s` with a single space.

    The input is first converted with `to_string`, so the result is a
    ``str``.
    """
    return RE_PUNCT.sub(" ", to_string(s))
# Taken from gensim
# One or more consecutive ASCII digits.
RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)


def strip_numeric(s):
    """Delete every run of ASCII digits (0-9) from `s`.

    The input is first converted with `to_string`; digits are removed
    outright, with nothing inserted in their place.
    """
    return RE_NUMERIC.sub("", to_string(s))
# Whole words made of optional leading "a"s, one or more "ha" (or "ja")
# repetitions and an optional trailing "h" (or "j"), e.g. "haha", "ajajaj".
# No IGNORECASE flag is set, so only lowercase laughter matches.
RE_LAUGH = re.compile(r"\b(?:a*(?:ha)+h?|(?:a*(?:ja)+j?))\b")


def strip_laugh(s):
    """Delete written laughter ("haha", "jaja", ...) from `s`.

    The input is first converted with `to_string`. Matches are replaced by
    the empty string, so the whitespace around a removed word is kept.
    """
    return RE_LAUGH.sub("", to_string(s))
def remove_stopwords(sentence, language):
    """Drop stopwords from `sentence` and rejoin the rest with single spaces.

    Args:
        sentence: Text to filter; it is tokenised with ``str.split()``, so
            any run of whitespace separates tokens.
        language: Passed to `get_stopwords_by_language` to select the
            stopword collection (accepted values are defined by that
            helper, not here).

    Returns:
        The surviving tokens joined by a single space.
    """
    excluded = get_stopwords_by_language(language)
    kept = [token for token in sentence.split() if token not in excluded]
    return " ".join(kept)
# Taken from gensim
def deaccent(text):
    """Remove accentuation from the given string.

    Args:
        text: A unicode ``str``, or a UTF-8 encoded bytestring.

    Returns:
        The text as a ``str`` with all combining marks (Unicode category
        ``Mn``) removed, re-composed to NFC form.

    Raises:
        UnicodeDecodeError: If a bytestring input is not valid UTF-8.
    """
    if not isinstance(text, str):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    # NFD splits each accented character into its base character followed by
    # combining marks, so the marks can be filtered out individually.
    decomposed = unicodedata.normalize("NFD", text)
    # A plain '' literal is already a unicode str here; the six.u() shim was
    # a no-op on this code path and only added a third-party dependency.
    stripped = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    return unicodedata.normalize("NFC", stripped)
def clean_text(s, language):
    """Run the full cleaning pipeline on `s`.

    Steps, applied in this order:
      1. lowercase the input,
      2. remove accents (`deaccent`),
      3. replace punctuation with spaces (`strip_punctuation`),
      4. delete digits (`strip_numeric`),
      5. delete written laughter (`strip_laugh`),
      6. drop stopwords for `language` (`remove_stopwords`).

    Args:
        s: Text to clean.
        language: Forwarded to `remove_stopwords`.

    Returns:
        The cleaned text, as returned by `remove_stopwords`.
    """
    cleaned = s.lower()
    cleaned = deaccent(cleaned)
    cleaned = strip_punctuation(cleaned)
    cleaned = strip_numeric(cleaned)
    cleaned = strip_laugh(cleaned)
    return remove_stopwords(cleaned, language)