From f80c55e21ce46b2904ef4c5ea5eaeb98a0cbc6ad Mon Sep 17 00:00:00 2001 From: ShowierData9978 <68120127+showierdata9978@users.noreply.github.com> Date: Tue, 27 Sep 2022 00:52:01 +0000 Subject: [PATCH 1/6] Fixed major issue with a limitation. added 2 tests for it --- better_profanity/better_profanity.py | 21 +++++++++++++++++++++ tests.py | 11 ++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/better_profanity/better_profanity.py b/better_profanity/better_profanity.py index 903dd7b..a32a795 100644 --- a/better_profanity/better_profanity.py +++ b/better_profanity/better_profanity.py @@ -197,10 +197,31 @@ def _hide_swear_words(self, text, censor_char): censored_text += cur_word + char cur_word = "" + + # Check if removeing letters from behind makes a swear word + + for idx, chr in iter(enumerate(cur_word)): + + if cur_word[idx-1:].lower() in self.CENSOR_WORDSET: + + cur_word.replace(cur_word[:idx-1], get_replacement_for_swear_word(censor_char)) + + break + # Final check if cur_word != "" and skip_index < len(text) - 1: if cur_word.lower() in self.CENSOR_WORDSET: cur_word = get_replacement_for_swear_word(censor_char) + + + for idx, chr in iter(enumerate(cur_word)): + + if cur_word[idx:].lower() in self.CENSOR_WORDSET: + + cur_word = cur_word[:idx] + get_replacement_for_swear_word(censor_char) + + break + censored_text += cur_word return censored_text diff --git a/tests.py b/tests.py index ded4afb..17ff150 100644 --- a/tests.py +++ b/tests.py @@ -67,7 +67,16 @@ def test_censorship_with_ending_swear_word(self): bad_text = "That wh0re gave m3 a very good H@nD j0b." censored_text = "That **** gave m3 a very good ****." self.assertEqual(profanity.censor(bad_text), censored_text) - + + def test_obstructing_letter_1(self): + bad_text = "Afoobar" + censored_text = "A****" + self.assertEqual(profanity.censor(bad_text), censored_text) + def test_obstructing_letter_multible(self): + bad_text = "AAAAAAAAAfoobar" + censored_text = "AAAAAAAAA****" + self.assertEqual(profanity.censor(bad_text), censored_text) + def test_censorship_empty_text(self): empty_text = "" self.assertEqual(profanity.censor(empty_text), empty_text) From 0ea0821c271f99374bfea0d7e397176eca922e2b Mon Sep 17 00:00:00 2001 From: ShowierData9978 <68120127+showierdata9978@users.noreply.github.com> Date: Tue, 27 Sep 2022 01:02:43 +0000 Subject: [PATCH 2/6] noticed difference between final check and per char check --- better_profanity/better_profanity.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/better_profanity/better_profanity.py b/better_profanity/better_profanity.py index a32a795..8e44065 100644 --- a/better_profanity/better_profanity.py +++ b/better_profanity/better_profanity.py @@ -199,12 +199,11 @@ def _hide_swear_words(self, text, censor_char): # Check if removeing letters from behind makes a swear word - for idx, chr in iter(enumerate(cur_word)): - if cur_word[idx-1:].lower() in self.CENSOR_WORDSET: - - cur_word.replace(cur_word[:idx-1], get_replacement_for_swear_word(censor_char)) + if cur_word[idx:].lower() in self.CENSOR_WORDSET: + + cur_word = cur_word[:idx] + get_replacement_for_swear_word(censor_char) break From 99acdac639721903728ef06f7b93837b9e9bf313 Mon Sep 17 00:00:00 2001 From: ShowierData9978 <68120127+showierdata9978@users.noreply.github.com> Date: Tue, 27 Sep 2022 19:11:48 +0000 Subject: [PATCH 3/6] Fixed End part Added new constant --- better_profanity/better_profanity.py | 67 ++++++++++++++++++-------- better_profanity/constants.py | 6 +++ better_profanity/contaning_allowed.txt | 4 ++ replit.nix | 19 ++++++++ tests.py | 16 +++++- 5 files changed, 89 insertions(+), 23 deletions(-) create mode 100644 better_profanity/contaning_allowed.txt create mode 100644 replit.nix diff --git a/better_profanity/better_profanity.py b/better_profanity/better_profanity.py index 8e44065..adcde49 100644 --- a/better_profanity/better_profanity.py +++ b/better_profanity/better_profanity.py @@ -2,7 +2,7 @@ from collections.abc import Iterable -from .constants import ALLOWED_CHARACTERS +from .constants import ALLOWED_CHARACTERS, ALLOWED_CONTANING_PROFANITY from .utils import ( any_next_words_form_swear_word, get_complete_path_of_file, @@ -13,7 +13,7 @@ class Profanity: - def __init__(self, words=None): + def __init__(self, words=None, whitelist=None): """ Args: words (Iterable/str): Collection of words or file path for a list of @@ -43,6 +43,11 @@ def __init__(self, words=None): } self.MAX_NUMBER_COMBINATIONS = 1 self.ALLOWED_CHARACTERS = ALLOWED_CHARACTERS + + self.whitelist = whitelist or set([]) + self.whitelist = set(self.whitelist) + self.whitelist.update(ALLOWED_CONTANING_PROFANITY) + self._default_wordlist_filename = get_complete_path_of_file( "profanity_wordlist.txt" ) @@ -89,7 +94,7 @@ def contains_profanity(self, text): ## PRIVATE ## - def _populate_words_to_wordset(self, words, *, whitelist_words=None): + def _populate_words_to_wordset(self, words, *, whitelist_words=None,): if whitelist_words is not None and not isinstance( whitelist_words, (list, set, tuple) ): @@ -98,7 +103,9 @@ def _populate_words_to_wordset(self, words, *, whitelist_words=None): ) # Validation - whitelist_words = whitelist_words or [] + whitelist_words = whitelist_words or set([]) + self.whitelist.update(whitelist_words) + for index, word in enumerate(whitelist_words): if not isinstance(word, str): raise ValueError( @@ -176,11 +183,15 @@ def _hide_swear_words(self, text, censor_char): cur_word = "" continue + # Iterate the next words combined with the current one # to check if it forms a swear word next_words_indices = self._update_next_words_indices( text, next_words_indices, index ) + + cur_word = self._check_for_profanity_within(cur_word, censor_char, next_words_indices) + contains_swear_word, end_index = any_next_words_form_swear_word( cur_word, next_words_indices, self.CENSOR_WORDSET ) @@ -194,36 +205,50 @@ def _hide_swear_words(self, text, censor_char): if cur_word.lower() in self.CENSOR_WORDSET: cur_word = get_replacement_for_swear_word(censor_char) + + censored_text += cur_word + char cur_word = "" + - - # Check if removeing letters from behind makes a swear word - for idx, chr in iter(enumerate(cur_word)): - if cur_word[idx:].lower() in self.CENSOR_WORDSET: - - cur_word = cur_word[:idx] + get_replacement_for_swear_word(censor_char) - - break - + # Final check if cur_word != "" and skip_index < len(text) - 1: if cur_word.lower() in self.CENSOR_WORDSET: cur_word = get_replacement_for_swear_word(censor_char) - for idx, chr in iter(enumerate(cur_word)): - - if cur_word[idx:].lower() in self.CENSOR_WORDSET: - - cur_word = cur_word[:idx] + get_replacement_for_swear_word(censor_char) - - break - + # Check if removeing letters from behind makes a swear word + cur_word = self._check_for_profanity_within(cur_word, censor_char, []) censored_text += cur_word + return censored_text + def _check_for_profanity_within(self, cur_word, censor_char, next_words_indices): + """Checks if there is profanity within """ + + if cur_word in self.CENSOR_WORDSET: + return cur_word + + if not cur_word.lower() in self.whitelist: + for idx, chr in iter(enumerate(cur_word)): + if cur_word[idx:].lower() in self.CENSOR_WORDSET: + cur_word = cur_word[:idx] + get_replacement_for_swear_word(censor_char) + + break + + cur_check_word = cur_word + 'a' + + for idx, chr in iter(enumerate(cur_word)): + if cur_check_word.lower() in self.CENSOR_WORDSET: + cur_word = get_replacement_for_swear_word(censor_char) + cur_word[len(cur_check_word):] + break + + cur_check_word = cur_check_word[:-1] + + return cur_word + def _get_start_index_of_next_word(self, text, start_idx): """Return the index of the first character of the next word in the given text.""" start_idx_of_next_word = len(text) diff --git a/better_profanity/constants.py b/better_profanity/constants.py index 5085d46..e368646 100644 --- a/better_profanity/constants.py +++ b/better_profanity/constants.py @@ -13,3 +13,9 @@ # Pre-load the unicode characters with open(get_complete_path_of_file("alphabetic_unicode.json"), "r") as json_file: ALLOWED_CHARACTERS.update(load(json_file)) + + + +with open(get_complete_path_of_file("contaning_allowed.txt"), "r") as txt_file: + ALLOWED_CONTANING_PROFANITY = [a.strip() for a in txt_file.readlines()] + \ No newline at end of file diff --git a/better_profanity/contaning_allowed.txt b/better_profanity/contaning_allowed.txt new file mode 100644 index 0000000..6625229 --- /dev/null +++ b/better_profanity/contaning_allowed.txt @@ -0,0 +1,4 @@ +night +nightmare +laborum +hello \ No newline at end of file diff --git a/replit.nix b/replit.nix new file mode 100644 index 0000000..b9f7eea --- /dev/null +++ b/replit.nix @@ -0,0 +1,19 @@ +{ pkgs }: { + deps = [ + pkgs.python39Packages.pip + pkgs.python38Full + ]; + env = { + PYTHON_LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [ + # Needed for pandas / numpy + pkgs.stdenv.cc.cc.lib + pkgs.zlib + # Needed for pygame + pkgs.glib + # Needed for matplotlib + pkgs.xorg.libX11 + ]; + PYTHONBIN = "${pkgs.python38Full}/bin/python3.8"; + LANG = "en_US.UTF-8"; + }; +} \ No newline at end of file diff --git a/tests.py b/tests.py index 17ff150..a3246c1 100644 --- a/tests.py +++ b/tests.py @@ -72,11 +72,22 @@ def test_obstructing_letter_1(self): bad_text = "Afoobar" censored_text = "A****" self.assertEqual(profanity.censor(bad_text), censored_text) + def test_obstructing_letter_multible(self): - bad_text = "AAAAAAAAAfoobar" + bad_text = "AAAAAAAAAfoobar" censored_text = "AAAAAAAAA****" self.assertEqual(profanity.censor(bad_text), censored_text) - + + def test_end_letter_obstructing(self): + bad_text = "foobarAAAAAAAAA" + censored_text = "****AAAAAAAAA" + self.assertEqual(profanity.censor(bad_text), censored_text) + + def test_clean_word_that_contains(self): + clean_text = "night" + self.assertEqual(profanity.censor(clean_text), clean_text) + + def test_censorship_empty_text(self): empty_text = "" self.assertEqual(profanity.censor(empty_text), empty_text) @@ -214,6 +225,7 @@ def setUp(self): def test_whitelist_words(self): bad_text = "I have boobs" censored_text = "I have ****" + self.assertEqual(profanity.censor(bad_text), censored_text) # Whitelist the word `boobs` From 1cd62d631bb3d8da0823f2a233333e5be1529ad4 Mon Sep 17 00:00:00 2001 From: ShowierData9978 <68120127+showierdata9978@users.noreply.github.com> Date: Tue, 27 Sep 2022 19:21:30 +0000 Subject: [PATCH 4/6] single comma broke it bruh --- better_profanity/better_profanity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/better_profanity/better_profanity.py b/better_profanity/better_profanity.py index adcde49..1f0e0b1 100644 --- a/better_profanity/better_profanity.py +++ b/better_profanity/better_profanity.py @@ -94,7 +94,7 @@ def contains_profanity(self, text): ## PRIVATE ## - def _populate_words_to_wordset(self, words, *, whitelist_words=None,): + def _populate_words_to_wordset(self, words, *, whitelist_words=None): if whitelist_words is not None and not isinstance( whitelist_words, (list, set, tuple) ): From cd0960c40c5981e207be82908e88c53b4aefa75e Mon Sep 17 00:00:00 2001 From: ShowierData9978 Date: Fri, 30 Sep 2022 10:39:35 -0500 Subject: [PATCH 5/6] bro, I put this in gitignore i thought --- replit.nix | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 replit.nix diff --git a/replit.nix b/replit.nix deleted file mode 100644 index b9f7eea..0000000 --- a/replit.nix +++ /dev/null @@ -1,19 +0,0 @@ -{ pkgs }: { - deps = [ - pkgs.python39Packages.pip - pkgs.python38Full - ]; - env = { - PYTHON_LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [ - # Needed for pandas / numpy - pkgs.stdenv.cc.cc.lib - pkgs.zlib - # Needed for pygame - pkgs.glib - # Needed for matplotlib - pkgs.xorg.libX11 - ]; - PYTHONBIN = "${pkgs.python38Full}/bin/python3.8"; - LANG = "en_US.UTF-8"; - }; -} \ No newline at end of file From 28fbc4f7a86f00b3a724c87caf5dd2e318ad5d81 Mon Sep 17 00:00:00 2001 From: ShowierData9978 Date: Thu, 27 Oct 2022 20:24:26 -0500 Subject: [PATCH 6/6] Made it that mutch faster. --- better_profanity/better_profanity.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/better_profanity/better_profanity.py b/better_profanity/better_profanity.py index 1f0e0b1..3a66554 100644 --- a/better_profanity/better_profanity.py +++ b/better_profanity/better_profanity.py @@ -190,8 +190,6 @@ def _hide_swear_words(self, text, censor_char): text, next_words_indices, index ) - cur_word = self._check_for_profanity_within(cur_word, censor_char, next_words_indices) - contains_swear_word, end_index = any_next_words_form_swear_word( cur_word, next_words_indices, self.CENSOR_WORDSET )