forked from snguyenthanh/better_profanity
better_profanity.py
# -*- coding: utf-8 -*-
from collections.abc import Iterable
from .constants import ALLOWED_CHARACTERS, ALLOWED_CONTANING_PROFANITY
from .utils import (
any_next_words_form_swear_word,
get_complete_path_of_file,
get_replacement_for_swear_word,
read_wordlist,
)
from .varying_string import VaryingString


class Profanity:
def __init__(self, words=None, whitelist=None):
"""
Args:
words (Iterable/str): Collection of words or file path for a list of
words to censor. `None` to use the default word list.
Raises:
TypeError: If `words` is not a valid type.
FileNotFoundError: If `words` is a `str` and is not a valid file path.
"""
if (
words is not None
and not isinstance(words, str)
and not isinstance(words, Iterable)
):
raise TypeError("words must be of type str, list, or None")
self.CENSOR_WORDSET = []
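        # Map each base letter to the characters it is commonly disguised as
        # (e.g. leetspeak substitutions); passed to VaryingString as `char_map`.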
self.CHARS_MAPPING = {
"a": ("a", "@", "*", "4"),
"i": ("i", "*", "l", "1"),
"o": ("o", "*", "0", "@"),
"u": ("u", "*", "v"),
"v": ("v", "*", "u"),
"l": ("l", "1"),
"e": ("e", "*", "3"),
"s": ("s", "$", "5"),
"t": ("t", "7"),
}
self.MAX_NUMBER_COMBINATIONS = 1
self.ALLOWED_CHARACTERS = ALLOWED_CHARACTERS
        self.whitelist = set(whitelist or [])
        self.whitelist.update(ALLOWED_CONTANING_PROFANITY)
self._default_wordlist_filename = get_complete_path_of_file(
"profanity_wordlist.txt"
)
        if isinstance(words, str):
self.load_censor_words_from_file(words)
else:
self.load_censor_words(custom_words=words)
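    # Minimal usage sketch (the custom words and file name below are illustrative only,
    # not part of the shipped defaults):
    #
    #     pf = Profanity()                      # default wordlist
    #     pf = Profanity(["darn", "heck"])      # custom iterable of words
    #     pf = Profanity("my_wordlist.txt")     # hypothetical path to a word-list file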

    ## PUBLIC ##
def censor(self, text, censor_char="*"):
"""Replace the swear words in the text with `censor_char`."""
if not isinstance(text, str):
text = str(text)
if not isinstance(censor_char, str):
censor_char = str(censor_char)
if not self.CENSOR_WORDSET:
self.load_censor_words()
return self._hide_swear_words(text, censor_char)
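    # Example sketch: `Profanity().censor("some text")` returns the text with each
    # matched word replaced by `get_replacement_for_swear_word(censor_char)` (a fixed
    # run of censor characters, four in the upstream better_profanity project).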

    def load_censor_words_from_file(self, filename, **kwargs):
words = read_wordlist(filename)
self._populate_words_to_wordset(words, **kwargs)

    def load_censor_words(self, custom_words=None, **kwargs):
"""Generate a set of words that need to be censored."""
# Replace the words from `profanity_wordlist.txt` with a custom list
custom_words = custom_words or read_wordlist(self._default_wordlist_filename)
self._populate_words_to_wordset(custom_words, **kwargs)

    def add_censor_words(self, custom_words):
if not isinstance(custom_words, (list, tuple, set)):
raise TypeError(
"Function 'add_censor_words' only accepts list, tuple or set."
)
for w in custom_words:
self.CENSOR_WORDSET.append(VaryingString(w, char_map=self.CHARS_MAPPING))

    def contains_profanity(self, text):
"""Return True if the input text has any swear words."""
return text != self.censor(text)
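    # Example sketch: the check simply compares the censored output against the input,
    # so any substitution made by `censor` marks the text as containing profanity.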

    ## PRIVATE ##
def _populate_words_to_wordset(self, words, *, whitelist_words=None):
if whitelist_words is not None and not isinstance(
whitelist_words, (list, set, tuple)
):
raise TypeError(
"The 'whitelist_words' keyword argument only accepts list, tuple or set."
)
# Validation
        whitelist_words = list(whitelist_words or [])
        for index, word in enumerate(whitelist_words):
            if not isinstance(word, str):
                raise ValueError(
                    "Each word in 'whitelist_words' must be 'str' type, "
                    "but '{word_type}' found.".format(word_type=type(word))
                )
            whitelist_words[index] = word.lower()

        # Populate the words into an internal wordset
        whitelist_words = set(whitelist_words)
        self.whitelist.update(whitelist_words)
all_censor_words = []
for word in set(words):
# All words in CENSOR_WORDSET must be in lowercase
word = word.lower()
if word in whitelist_words:
continue
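            # Multi-token entries (e.g. "hand job") contain separator characters; track
            # the largest such count so the scanner knows how many extra words to look ahead.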
num_of_non_allowed_chars = self._count_non_allowed_characters(word)
if num_of_non_allowed_chars > self.MAX_NUMBER_COMBINATIONS:
self.MAX_NUMBER_COMBINATIONS = num_of_non_allowed_chars
all_censor_words.append(VaryingString(word, char_map=self.CHARS_MAPPING))
# The default wordlist takes ~5MB+ of memory
self.CENSOR_WORDSET = all_censor_words

    def _count_non_allowed_characters(self, word):
count = 0
for char in iter(word):
if char not in self.ALLOWED_CHARACTERS:
count += 1
return count

    def _update_next_words_indices(self, text, words_indices, start_idx):
"""Return a list of next words_indices after the input index."""
if not words_indices:
words_indices = self._get_next_words(
text, start_idx, self.MAX_NUMBER_COMBINATIONS
)
else:
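            # Each next word is stored as two entries (with and without its leading
            # separators, see `_get_next_words`), so drop the pair already consumed.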
del words_indices[:2]
if words_indices and words_indices[-1][0] != "":
words_indices += self._get_next_words(text, words_indices[-1][1], 1)
return words_indices

    def _hide_swear_words(self, text, censor_char):
"""Replace the swear words with censor characters."""
censored_text = ""
cur_word = ""
skip_index = -1
next_words_indices = []
start_idx_of_next_word = self._get_start_index_of_next_word(text, 0)
# If there are no words in the text, return the raw text without parsing
if start_idx_of_next_word >= len(text) - 1:
return text
# Left strip the text, to avoid inaccurate parsing
if start_idx_of_next_word > 0:
censored_text = text[:start_idx_of_next_word]
text = text[start_idx_of_next_word:]
# Splitting each word in the text to compare with censored words
for index, char in iter(enumerate(text)):
if index < skip_index:
continue
if char in ALLOWED_CHARACTERS:
cur_word += char
continue
# Skip continuous non-allowed characters
if cur_word.strip() == "":
censored_text += char
cur_word = ""
continue
# Iterate the next words combined with the current one
# to check if it forms a swear word
next_words_indices = self._update_next_words_indices(
text, next_words_indices, index
)
contains_swear_word, end_index = any_next_words_form_swear_word(
cur_word, next_words_indices, self.CENSOR_WORDSET
)
if contains_swear_word:
cur_word = get_replacement_for_swear_word(censor_char)
skip_index = end_index
char = ""
next_words_indices = []
            # If the current word is a swear word
if cur_word.lower() in self.CENSOR_WORDSET:
cur_word = get_replacement_for_swear_word(censor_char)
censored_text += cur_word + char
cur_word = ""
# Final check
if cur_word != "" and skip_index < len(text) - 1:
if cur_word.lower() in self.CENSOR_WORDSET:
cur_word = get_replacement_for_swear_word(censor_char)
            # Check if removing letters from the end reveals a swear word
cur_word = self._check_for_profanity_within(cur_word, censor_char, [])
censored_text += cur_word
return censored_text
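    # Rough walk-through (a sketch, assuming "shit" is in the censor word list and the
    # replacement is four censor characters, as in the upstream project): for the input
    # "he is a sh1t poster", the scanner accumulates "sh1t", which matches the
    # VaryingString built from "shit" via CHARS_MAPPING ("1" stands in for "i"),
    # so the output becomes "he is a **** poster".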

    def _check_for_profanity_within(self, cur_word, censor_char, next_words_indices):
        """Censor profanity hidden at the start or end of `cur_word`; whitelisted words are left unchanged."""
if cur_word in self.CENSOR_WORDSET:
return cur_word
        if cur_word.lower() not in self.whitelist:
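            # First pass: censor a profane suffix, i.e. a censored word glued to the
            # end of an otherwise clean prefix.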
for idx, chr in iter(enumerate(cur_word)):
if cur_word[idx:].lower() in self.CENSOR_WORDSET:
cur_word = cur_word[:idx] + get_replacement_for_swear_word(censor_char)
break
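            # Second pass: censor a profane prefix by shrinking the candidate from the
            # right and keeping any trailing characters that follow the match.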
cur_check_word = cur_word + 'a'
for idx, chr in iter(enumerate(cur_word)):
if cur_check_word.lower() in self.CENSOR_WORDSET:
cur_word = get_replacement_for_swear_word(censor_char) + cur_word[len(cur_check_word):]
break
cur_check_word = cur_check_word[:-1]
return cur_word

    def _get_start_index_of_next_word(self, text, start_idx):
"""Return the index of the first character of the next word in the given text."""
start_idx_of_next_word = len(text)
for index in iter(range(start_idx, len(text))):
if text[index] not in self.ALLOWED_CHARACTERS:
continue
start_idx_of_next_word = index
break
return start_idx_of_next_word

    def _get_next_word_and_end_index(self, text, start_idx):
"""Return the next word in the given text, and the index of its last character."""
next_word = ""
index = start_idx
for index in iter(range(start_idx, len(text))):
char = text[index]
if char in self.ALLOWED_CHARACTERS:
next_word += char
continue
break
return next_word, index

    def _get_next_words(self, text, start_idx, num_of_next_words=1):
        """
        Return a list of (word, end_index) pairs for the next word(s), giving each word
        both without and with the separator characters that precede it.

        For example, in `hand_job` the next word after `hand` yields the pairs
        `job` and `_job`.
        """
# Find the starting index of the next word
start_idx_of_next_word = self._get_start_index_of_next_word(text, start_idx)
        # Return empty-word pairs if there are no other words
if start_idx_of_next_word >= len(text) - 1:
return [("", start_idx_of_next_word), ("", start_idx_of_next_word)]
# Combine the words into a list
next_word, end_index = self._get_next_word_and_end_index(
text, start_idx_of_next_word
)
words = [
(next_word, end_index),
("%s%s" % (text[start_idx:start_idx_of_next_word], next_word), end_index),
]
if num_of_next_words > 1:
words.extend(self._get_next_words(text, end_index, num_of_next_words - 1))
return words
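    # Example sketch: for "hand_job" with `start_idx` at the "_" separator, this returns
    # [("job", end_idx), ("_job", end_idx)]; with num_of_next_words > 1 the pairs for the
    # following word are appended recursively in the same format.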