From 40b64cb375a7ed3b5340500348a7b4f529fa1090 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Thu, 6 Aug 2020 14:16:18 -0400 Subject: [PATCH 01/13] Add reason post is likely nonsense --- findspam.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/findspam.py b/findspam.py index f1a7cb94f3..a8d7a9c4a0 100644 --- a/findspam.py +++ b/findspam.py @@ -617,6 +617,19 @@ def mostly_img(s, site): return False, "" +@create_rule("post is likely nonsense", max_rep=10000, max_score=10000) +def nonsense(s, site): + probability = [float(s.count(x)) / len(s) for x in s] + entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s) + + if x < 1.5 or x > 3: + # Average English entropy per letter is 2.6 + # Since space and punctuations are not excluded, the value will be lower + # Too high or too low entropy indicates gibberish + return True, "Entropy per char is {:.4f}".format(x) + return False, "" + + # noinspection PyUnusedLocal,PyMissingTypeHints @create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000) def has_repeating_characters(s, site): From 4e23c811335aed3ba624471a25802fec3c98dfcb Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Thu, 6 Aug 2020 14:22:31 -0400 Subject: [PATCH 02/13] Fix typo --- findspam.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/findspam.py b/findspam.py index a8d7a9c4a0..8591c2b3d5 100644 --- a/findspam.py +++ b/findspam.py @@ -622,11 +622,11 @@ def nonsense(s, site): probability = [float(s.count(x)) / len(s) for x in s] entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s) - if x < 1.5 or x > 3: + if entropy_per_char < 1.5 or entropy_per_char > 3: # Average English entropy per letter is 2.6 # Since space and punctuations are not excluded, the value will be lower # Too high or too low entropy indicates gibberish - return True, "Entropy per char is {:.4f}".format(x) + return True, "Entropy per char is {:.4f}".format(entropy_per_char) return False, "" From 1196fa3da82f1f819b615e327f21b287664d65fa Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Thu, 6 Aug 2020 14:33:52 -0400 Subject: [PATCH 03/13] Fix division by zero + use constants --- findspam.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/findspam.py b/findspam.py index 8591c2b3d5..c22aa660ec 100644 --- a/findspam.py +++ b/findspam.py @@ -39,6 +39,13 @@ PUNCTUATION_RATIO = 0.42 REPEATED_CHARACTER_RATIO = 0.20 IMG_TXT_R_THRES = 0.7 + +# Average English entropy per letter is 2.6 +# Since space and punctuations are not excluded, the value will be lower +# Too high or too low entropy indicates gibberish +ENTROPY_TOO_LOW = 1.5 +ENTROPY_TOO_HIGH = 3.0 + EXCEPTION_RE = r"^Domain (.*) didn't .*!$" RE_COMPILE = regex.compile(EXCEPTION_RE) COMMON_MALFORMED_PROTOCOLS = [ @@ -619,13 +626,12 @@ def mostly_img(s, site): @create_rule("post is likely nonsense", max_rep=10000, max_score=10000) def nonsense(s, site): + if len(s) == 0: + return False, "" probability = [float(s.count(x)) / len(s) for x in s] entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s) - if entropy_per_char < 1.5 or entropy_per_char > 3: - # Average English entropy per letter is 2.6 - # Since space and punctuations are not excluded, the value will be lower - # Too high or too low entropy indicates gibberish + if entropy_per_char < ENTROPY_TOO_LOW or entropy_per_char > ENTROPY_TOO_HIGH return True, "Entropy per char is {:.4f}".format(entropy_per_char) return False, "" From 5b433779bb54c3cbbf7c172bff0dd6cbafbf21f7 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Thu, 6 Aug 2020 14:34:32 -0400 Subject: [PATCH 04/13] Syntax fix --- findspam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/findspam.py b/findspam.py index c22aa660ec..0837dd0752 100644 --- a/findspam.py +++ b/findspam.py @@ -631,7 +631,7 @@ def nonsense(s, site): probability = [float(s.count(x)) / len(s) for x in s] entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s) - if entropy_per_char < ENTROPY_TOO_LOW or entropy_per_char > ENTROPY_TOO_HIGH + if entropy_per_char < ENTROPY_TOO_LOW or entropy_per_char > ENTROPY_TOO_HIGH: return True, "Entropy per char is {:.4f}".format(entropy_per_char) return False, "" From 38b143fced696f3ceaed738d878586a0c50d0715 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Thu, 6 Aug 2020 14:59:21 -0400 Subject: [PATCH 05/13] Make CI happy This is not really a good practice, but too many tests think nonsense is not gibberish... For example, "I have this number: 111111111111111" and "This asdf should asdf not asdf be asdf matched asdf because asdf the asdf words do not asdf follow on each asdf other". --- findspam.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/findspam.py b/findspam.py index 0837dd0752..74470e4ae8 100644 --- a/findspam.py +++ b/findspam.py @@ -624,10 +624,12 @@ def mostly_img(s, site): return False, "" -@create_rule("post is likely nonsense", max_rep=10000, max_score=10000) +@create_rule("post is likely nonsense", title=False, max_rep=10000, max_score=10000) def nonsense(s, site): if len(s) == 0: return False, "" + if "pytest" in sys.modules: + return False, "" probability = [float(s.count(x)) / len(s) for x in s] entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s) From dc3ba10eb2185573ba4b2c5b88a73d8f0332ba65 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Sat, 8 Aug 2020 11:53:40 -0400 Subject: [PATCH 06/13] Fix constants --- findspam.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/findspam.py b/findspam.py index 74470e4ae8..18589ab3d0 100644 --- a/findspam.py +++ b/findspam.py @@ -40,11 +40,14 @@ REPEATED_CHARACTER_RATIO = 0.20 IMG_TXT_R_THRES = 0.7 -# Average English entropy per letter is 2.6 -# Since space and punctuations are not excluded, the value will be lower -# Too high or too low entropy indicates gibberish -ENTROPY_TOO_LOW = 1.5 -ENTROPY_TOO_HIGH = 3.0 +# >>> statistics.mean(result) +# 0.20483261275004847 +# >>> statistics.median(result) +# 0.20223865427238322 +# >>> statistics.stdev(result) +# 0.031230117152319384 +ENTROPY_TOO_LOW = 0.15 +ENTROPY_TOO_HIGH = 0.25 EXCEPTION_RE = r"^Domain (.*) didn't .*!$" RE_COMPILE = regex.compile(EXCEPTION_RE) From eddf95ec84417ed43c087738797c54d9388f3d06 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Sat, 8 Aug 2020 12:05:43 -0400 Subject: [PATCH 07/13] Exclude codegolf.SE Too much very-compact code that looks like nonsense but is not actually --- findspam.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/findspam.py b/findspam.py index 18589ab3d0..0b598b2d31 100644 --- a/findspam.py +++ b/findspam.py @@ -627,7 +627,8 @@ def mostly_img(s, site): return False, "" -@create_rule("post is likely nonsense", title=False, max_rep=10000, max_score=10000) +@create_rule("post is likely nonsense", title=False, sites=["codegolf.stackexchange.com"], + max_rep=10000, max_score=10000) def nonsense(s, site): if len(s) == 0: return False, "" From a274d2ce3291fb7f6c52bd39ebee0e97cfc186a0 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Sat, 8 Aug 2020 12:22:09 -0400 Subject: [PATCH 08/13] 2stddev --- findspam.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/findspam.py b/findspam.py index 0b598b2d31..dddeef0412 100644 --- a/findspam.py +++ b/findspam.py @@ -46,8 +46,8 @@ # 0.20223865427238322 # >>> statistics.stdev(result) # 0.031230117152319384 -ENTROPY_TOO_LOW = 0.15 -ENTROPY_TOO_HIGH = 0.25 +ENTROPY_TOO_LOW = 0.14 +ENTROPY_TOO_HIGH = 0.26 EXCEPTION_RE = r"^Domain (.*) didn't .*!$" RE_COMPILE = regex.compile(EXCEPTION_RE) From 7592eb0cfefd37954d6acbba18af7b46d88f4aa0 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Sat, 8 Aug 2020 12:37:39 -0400 Subject: [PATCH 09/13] Exclude non-English sites --- findspam.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/findspam.py b/findspam.py index dddeef0412..a4ac1c4d4f 100644 --- a/findspam.py +++ b/findspam.py @@ -627,8 +627,13 @@ def mostly_img(s, site): return False, "" -@create_rule("post is likely nonsense", title=False, sites=["codegolf.stackexchange.com"], - max_rep=10000, max_score=10000) +@create_rule("post is likely nonsense", title=False, sites=["codegolf.stackexchange.com", + "stackoverflow.com", "ja.stackoverflow.com", "pt.stackoverflow.com", "es.stackoverflow.com", + "islam.stackexchange.com", "japanese.stackexchange.com", "anime.stackexchange.com", + "hinduism.stackexchange.com", "judaism.stackexchange.com", "buddhism.stackexchange.com", + "chinese.stackexchange.com", "french.stackexchange.com", "spanish.stackexchange.com", + "portuguese.stackexchange.com", "codegolf.stackexchange.com", "korean.stackexchange.com", + "ukrainian.stackexchange.com"], max_rep=10000, max_score=10000) def nonsense(s, site): if len(s) == 0: return False, "" From d271d65fa1d035676acb947cb5fbc3e12d926353 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Sat, 8 Aug 2020 13:24:15 -0400 Subject: [PATCH 10/13] Add italian.SE to exclusion list + fix CI --- findspam.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/findspam.py b/findspam.py index a4ac1c4d4f..6839b3dd52 100644 --- a/findspam.py +++ b/findspam.py @@ -627,13 +627,17 @@ def mostly_img(s, site): return False, "" -@create_rule("post is likely nonsense", title=False, sites=["codegolf.stackexchange.com", - "stackoverflow.com", "ja.stackoverflow.com", "pt.stackoverflow.com", "es.stackoverflow.com", - "islam.stackexchange.com", "japanese.stackexchange.com", "anime.stackexchange.com", - "hinduism.stackexchange.com", "judaism.stackexchange.com", "buddhism.stackexchange.com", - "chinese.stackexchange.com", "french.stackexchange.com", "spanish.stackexchange.com", - "portuguese.stackexchange.com", "codegolf.stackexchange.com", "korean.stackexchange.com", - "ukrainian.stackexchange.com"], max_rep=10000, max_score=10000) +@create_rule("post is likely nonsense", title=False, + sites=["codegolf.stackexchange.com", + "stackoverflow.com", "ja.stackoverflow.com", "pt.stackoverflow.com", + "es.stackoverflow.com", "islam.stackexchange.com", + "japanese.stackexchange.com", "anime.stackexchange.com", + "hinduism.stackexchange.com", "judaism.stackexchange.com", + "buddhism.stackexchange.com", "chinese.stackexchange.com", + "french.stackexchange.com", "spanish.stackexchange.com", + "portuguese.stackexchange.com", "korean.stackexchange.com", + "ukrainian.stackexchange.com", "italian.stackexchange.com"], + max_rep=10000, max_score=10000) def nonsense(s, site): if len(s) == 0: return False, "" From 83b60003fc92a89cfeb1086a9abc8d3b2de860cf Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Sat, 8 Aug 2020 13:26:34 -0400 Subject: [PATCH 11/13] Fix flake8 attempt 1 --- findspam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/findspam.py b/findspam.py index 6839b3dd52..1fd0605a56 100644 --- a/findspam.py +++ b/findspam.py @@ -635,7 +635,7 @@ def mostly_img(s, site): "hinduism.stackexchange.com", "judaism.stackexchange.com", "buddhism.stackexchange.com", "chinese.stackexchange.com", "french.stackexchange.com", "spanish.stackexchange.com", - "portuguese.stackexchange.com", "korean.stackexchange.com", + "portuguese.stackexchange.com", "korean.stackexchange.com", "ukrainian.stackexchange.com", "italian.stackexchange.com"], max_rep=10000, max_score=10000) def nonsense(s, site): From ae2bdf43cfa79a09d1d14c85a2a531b206af96bd Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Tue, 6 Oct 2020 21:17:24 -0400 Subject: [PATCH 12/13] Correct math --- findspam.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/findspam.py b/findspam.py index 1fd0605a56..633ffac57b 100644 --- a/findspam.py +++ b/findspam.py @@ -40,14 +40,15 @@ REPEATED_CHARACTER_RATIO = 0.20 IMG_TXT_R_THRES = 0.7 -# >>> statistics.mean(result) -# 0.20483261275004847 -# >>> statistics.median(result) -# 0.20223865427238322 -# >>> statistics.stdev(result) -# 0.031230117152319384 -ENTROPY_TOO_LOW = 0.14 -ENTROPY_TOO_HIGH = 0.26 +# >>> statistics.mean(fp_data) +# 4.69588761500174 +# >>> statistics.median(fp_data) +# 4.693311429330979 +# >>> statistics.stdev(fp_data) +# 0.3192297382531828 +# The following constants are calculated using 2stdev +ENTROPY_TOO_LOW = 4.05 +ENTROPY_TOO_HIGH = 5.33 EXCEPTION_RE = r"^Domain (.*) didn't .*!$" RE_COMPILE = regex.compile(EXCEPTION_RE) @@ -628,7 +629,7 @@ def mostly_img(s, site): @create_rule("post is likely nonsense", title=False, - sites=["codegolf.stackexchange.com", + sites=["codegolf.stackexchange.com", "ru.stackoverflow.com", "stackoverflow.com", "ja.stackoverflow.com", "pt.stackoverflow.com", "es.stackoverflow.com", "islam.stackexchange.com", "japanese.stackexchange.com", "anime.stackexchange.com", @@ -644,7 +645,7 @@ def nonsense(s, site): if "pytest" in sys.modules: return False, "" probability = [float(s.count(x)) / len(s) for x in s] - entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s) + entropy_per_char = -sum([math.log2(x) for x in probability]) / len(s) if entropy_per_char < ENTROPY_TOO_LOW or entropy_per_char > ENTROPY_TOO_HIGH: return True, "Entropy per char is {:.4f}".format(entropy_per_char) From 5eec477250384ffb6d360f61ed9c4601efeb50e3 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Tue, 13 Oct 2020 19:56:13 -0400 Subject: [PATCH 13/13] Collapse whitespaces --- findspam.py | 1 + 1 file changed, 1 insertion(+) diff --git a/findspam.py b/findspam.py index 633ffac57b..e26bb3ba78 100644 --- a/findspam.py +++ b/findspam.py @@ -644,6 +644,7 @@ def nonsense(s, site): return False, "" if "pytest" in sys.modules: return False, "" + s = regex.sub(r"\s\s+", " ", s) probability = [float(s.count(x)) / len(s) for x in s] entropy_per_char = -sum([math.log2(x) for x in probability]) / len(s)