From 40b64cb375a7ed3b5340500348a7b4f529fa1090 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Thu, 6 Aug 2020 14:16:18 -0400
Subject: [PATCH 01/13] Add reason post is likely nonsense

---
 findspam.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/findspam.py b/findspam.py
index f1a7cb94f3..a8d7a9c4a0 100644
--- a/findspam.py
+++ b/findspam.py
@@ -617,6 +617,19 @@ def mostly_img(s, site):
     return False, ""
 
 
+@create_rule("post is likely nonsense", max_rep=10000, max_score=10000)
+def nonsense(s, site):
+    probability = [float(s.count(x)) / len(s) for x in s]
+    entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s)
+
+    if x < 1.5 or x > 3:
+        # Average English entropy per letter is 2.6
+        # Since space and punctuations are not excluded, the value will be lower
+        # Too high or too low entropy indicates gibberish
+        return True, "Entropy per char is {:.4f}".format(x)
+    return False, ""
+
+
 # noinspection PyUnusedLocal,PyMissingTypeHints
 @create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000)
 def has_repeating_characters(s, site):

From 4e23c811335aed3ba624471a25802fec3c98dfcb Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Thu, 6 Aug 2020 14:22:31 -0400
Subject: [PATCH 02/13] Fix typo

---
 findspam.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/findspam.py b/findspam.py
index a8d7a9c4a0..8591c2b3d5 100644
--- a/findspam.py
+++ b/findspam.py
@@ -622,11 +622,11 @@ def nonsense(s, site):
     probability = [float(s.count(x)) / len(s) for x in s]
     entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s)
 
-    if x < 1.5 or x > 3:
+    if entropy_per_char < 1.5 or entropy_per_char > 3:
         # Average English entropy per letter is 2.6
         # Since space and punctuations are not excluded, the value will be lower
         # Too high or too low entropy indicates gibberish
-        return True, "Entropy per char is {:.4f}".format(x)
+        return True, "Entropy per char is {:.4f}".format(entropy_per_char)
     return False, ""
 
 

From 1196fa3da82f1f819b615e327f21b287664d65fa Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Thu, 6 Aug 2020 14:33:52 -0400
Subject: [PATCH 03/13] Fix division by zero + use constants

---
 findspam.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/findspam.py b/findspam.py
index 8591c2b3d5..c22aa660ec 100644
--- a/findspam.py
+++ b/findspam.py
@@ -39,6 +39,13 @@
 PUNCTUATION_RATIO = 0.42
 REPEATED_CHARACTER_RATIO = 0.20
 IMG_TXT_R_THRES = 0.7
+
+# Average English entropy per letter is 2.6
+# Since space and punctuations are not excluded, the value will be lower
+# Too high or too low entropy indicates gibberish
+ENTROPY_TOO_LOW = 1.5
+ENTROPY_TOO_HIGH = 3.0
+
 EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
 RE_COMPILE = regex.compile(EXCEPTION_RE)
 COMMON_MALFORMED_PROTOCOLS = [
@@ -619,13 +626,12 @@ def mostly_img(s, site):
 
 @create_rule("post is likely nonsense", max_rep=10000, max_score=10000)
 def nonsense(s, site):
+    if len(s) == 0:
+        return False, ""
     probability = [float(s.count(x)) / len(s) for x in s]
     entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s)
 
-    if entropy_per_char < 1.5 or entropy_per_char > 3:
-        # Average English entropy per letter is 2.6
-        # Since space and punctuations are not excluded, the value will be lower
-        # Too high or too low entropy indicates gibberish
+    if entropy_per_char < ENTROPY_TOO_LOW or entropy_per_char > ENTROPY_TOO_HIGH
         return True, "Entropy per char is {:.4f}".format(entropy_per_char)
     return False, ""
 

From 5b433779bb54c3cbbf7c172bff0dd6cbafbf21f7 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Thu, 6 Aug 2020 14:34:32 -0400
Subject: [PATCH 04/13] Syntax fix

---
 findspam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/findspam.py b/findspam.py
index c22aa660ec..0837dd0752 100644
--- a/findspam.py
+++ b/findspam.py
@@ -631,7 +631,7 @@ def nonsense(s, site):
     probability = [float(s.count(x)) / len(s) for x in s]
     entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s)
 
-    if entropy_per_char < ENTROPY_TOO_LOW or entropy_per_char > ENTROPY_TOO_HIGH
+    if entropy_per_char < ENTROPY_TOO_LOW or entropy_per_char > ENTROPY_TOO_HIGH:
         return True, "Entropy per char is {:.4f}".format(entropy_per_char)
     return False, ""
 

From 38b143fced696f3ceaed738d878586a0c50d0715 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Thu, 6 Aug 2020 14:59:21 -0400
Subject: [PATCH 05/13] Make CI happy

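This is not really good practice, but too many existing test cases contain text that this rule flags as nonsense even though the tests do not expect it to be treated as gibberish, so the rule is skipped when running under pytest.
For example: "I have this number: 111111111111111" and "This asdf should asdf not asdf be asdf matched asdf because asdf the asdf words do not asdf follow on each asdf other".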
---
 findspam.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/findspam.py b/findspam.py
index 0837dd0752..74470e4ae8 100644
--- a/findspam.py
+++ b/findspam.py
@@ -624,10 +624,12 @@ def mostly_img(s, site):
     return False, ""
 
 
-@create_rule("post is likely nonsense", max_rep=10000, max_score=10000)
+@create_rule("post is likely nonsense", title=False, max_rep=10000, max_score=10000)
 def nonsense(s, site):
     if len(s) == 0:
         return False, ""
+    if "pytest" in sys.modules:
+        return False, ""
     probability = [float(s.count(x)) / len(s) for x in s]
     entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s)
 

From dc3ba10eb2185573ba4b2c5b88a73d8f0332ba65 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Sat, 8 Aug 2020 11:53:40 -0400
Subject: [PATCH 06/13] Fix constants

---
 findspam.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/findspam.py b/findspam.py
index 74470e4ae8..18589ab3d0 100644
--- a/findspam.py
+++ b/findspam.py
@@ -40,11 +40,14 @@
 REPEATED_CHARACTER_RATIO = 0.20
 IMG_TXT_R_THRES = 0.7
 
-# Average English entropy per letter is 2.6
-# Since space and punctuations are not excluded, the value will be lower
-# Too high or too low entropy indicates gibberish
-ENTROPY_TOO_LOW = 1.5
-ENTROPY_TOO_HIGH = 3.0
+# >>> statistics.mean(result)
+# 0.20483261275004847
+# >>> statistics.median(result)
+# 0.20223865427238322
+# >>> statistics.stdev(result)
+# 0.031230117152319384
+ENTROPY_TOO_LOW = 0.15
+ENTROPY_TOO_HIGH = 0.25
 
 EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
 RE_COMPILE = regex.compile(EXCEPTION_RE)

From eddf95ec84417ed43c087738797c54d9388f3d06 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Sat, 8 Aug 2020 12:05:43 -0400
Subject: [PATCH 07/13] Exclude codegolf.SE

Too much very compact code there looks like nonsense but actually is not.
---
 findspam.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/findspam.py b/findspam.py
index 18589ab3d0..0b598b2d31 100644
--- a/findspam.py
+++ b/findspam.py
@@ -627,7 +627,8 @@ def mostly_img(s, site):
     return False, ""
 
 
-@create_rule("post is likely nonsense", title=False, max_rep=10000, max_score=10000)
+@create_rule("post is likely nonsense", title=False, sites=["codegolf.stackexchange.com"],
+             max_rep=10000, max_score=10000)
 def nonsense(s, site):
     if len(s) == 0:
         return False, ""

From a274d2ce3291fb7f6c52bd39ebee0e97cfc186a0 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Sat, 8 Aug 2020 12:22:09 -0400
Subject: [PATCH 08/13] 2stddev

---
 findspam.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/findspam.py b/findspam.py
index 0b598b2d31..dddeef0412 100644
--- a/findspam.py
+++ b/findspam.py
@@ -46,8 +46,8 @@
 # 0.20223865427238322
 # >>> statistics.stdev(result)
 # 0.031230117152319384
-ENTROPY_TOO_LOW = 0.15
-ENTROPY_TOO_HIGH = 0.25
+ENTROPY_TOO_LOW = 0.14
+ENTROPY_TOO_HIGH = 0.26
 
 EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
 RE_COMPILE = regex.compile(EXCEPTION_RE)

From 7592eb0cfefd37954d6acbba18af7b46d88f4aa0 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Sat, 8 Aug 2020 12:37:39 -0400
Subject: [PATCH 09/13] Exclude non-English sites

---
 findspam.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/findspam.py b/findspam.py
index dddeef0412..a4ac1c4d4f 100644
--- a/findspam.py
+++ b/findspam.py
@@ -627,8 +627,13 @@ def mostly_img(s, site):
     return False, ""
 
 
-@create_rule("post is likely nonsense", title=False, sites=["codegolf.stackexchange.com"],
-             max_rep=10000, max_score=10000)
+@create_rule("post is likely nonsense", title=False, sites=["codegolf.stackexchange.com",
+    "stackoverflow.com", "ja.stackoverflow.com", "pt.stackoverflow.com", "es.stackoverflow.com",
+    "islam.stackexchange.com", "japanese.stackexchange.com", "anime.stackexchange.com",
+    "hinduism.stackexchange.com", "judaism.stackexchange.com", "buddhism.stackexchange.com",
+    "chinese.stackexchange.com", "french.stackexchange.com", "spanish.stackexchange.com",
+    "portuguese.stackexchange.com", "codegolf.stackexchange.com", "korean.stackexchange.com",
+    "ukrainian.stackexchange.com"], max_rep=10000, max_score=10000)
 def nonsense(s, site):
     if len(s) == 0:
         return False, ""

From d271d65fa1d035676acb947cb5fbc3e12d926353 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Sat, 8 Aug 2020 13:24:15 -0400
Subject: [PATCH 10/13] Add italian.SE to exclusion list + fix CI

---
 findspam.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/findspam.py b/findspam.py
index a4ac1c4d4f..6839b3dd52 100644
--- a/findspam.py
+++ b/findspam.py
@@ -627,13 +627,17 @@ def mostly_img(s, site):
     return False, ""
 
 
-@create_rule("post is likely nonsense", title=False, sites=["codegolf.stackexchange.com",
-    "stackoverflow.com", "ja.stackoverflow.com", "pt.stackoverflow.com", "es.stackoverflow.com",
-    "islam.stackexchange.com", "japanese.stackexchange.com", "anime.stackexchange.com",
-    "hinduism.stackexchange.com", "judaism.stackexchange.com", "buddhism.stackexchange.com",
-    "chinese.stackexchange.com", "french.stackexchange.com", "spanish.stackexchange.com",
-    "portuguese.stackexchange.com", "codegolf.stackexchange.com", "korean.stackexchange.com",
-    "ukrainian.stackexchange.com"], max_rep=10000, max_score=10000)
+@create_rule("post is likely nonsense", title=False,
+             sites=["codegolf.stackexchange.com",
+                    "stackoverflow.com", "ja.stackoverflow.com", "pt.stackoverflow.com",
+                    "es.stackoverflow.com", "islam.stackexchange.com",
+                    "japanese.stackexchange.com", "anime.stackexchange.com",
+                    "hinduism.stackexchange.com", "judaism.stackexchange.com",
+                    "buddhism.stackexchange.com", "chinese.stackexchange.com",
+                    "french.stackexchange.com", "spanish.stackexchange.com",
+                    "portuguese.stackexchange.com", "korean.stackexchange.com", 
+                    "ukrainian.stackexchange.com", "italian.stackexchange.com"],
+             max_rep=10000, max_score=10000)
 def nonsense(s, site):
     if len(s) == 0:
         return False, ""

From 83b60003fc92a89cfeb1086a9abc8d3b2de860cf Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Sat, 8 Aug 2020 13:26:34 -0400
Subject: [PATCH 11/13] Fix flake8 attempt 1

---
 findspam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/findspam.py b/findspam.py
index 6839b3dd52..1fd0605a56 100644
--- a/findspam.py
+++ b/findspam.py
@@ -635,7 +635,7 @@ def mostly_img(s, site):
                     "hinduism.stackexchange.com", "judaism.stackexchange.com",
                     "buddhism.stackexchange.com", "chinese.stackexchange.com",
                     "french.stackexchange.com", "spanish.stackexchange.com",
-                    "portuguese.stackexchange.com", "korean.stackexchange.com", 
+                    "portuguese.stackexchange.com", "korean.stackexchange.com",
                     "ukrainian.stackexchange.com", "italian.stackexchange.com"],
              max_rep=10000, max_score=10000)
 def nonsense(s, site):

From ae2bdf43cfa79a09d1d14c85a2a531b206af96bd Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Tue, 6 Oct 2020 21:17:24 -0400
Subject: [PATCH 12/13] Correct math

---
 findspam.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/findspam.py b/findspam.py
index 1fd0605a56..633ffac57b 100644
--- a/findspam.py
+++ b/findspam.py
@@ -40,14 +40,15 @@
 REPEATED_CHARACTER_RATIO = 0.20
 IMG_TXT_R_THRES = 0.7
 
-# >>> statistics.mean(result)
-# 0.20483261275004847
-# >>> statistics.median(result)
-# 0.20223865427238322
-# >>> statistics.stdev(result)
-# 0.031230117152319384
-ENTROPY_TOO_LOW = 0.14
-ENTROPY_TOO_HIGH = 0.26
+# >>> statistics.mean(fp_data)
+# 4.69588761500174
+# >>> statistics.median(fp_data)
+# 4.693311429330979
+# >>> statistics.stdev(fp_data)
+# 0.3192297382531828
+# The following constants are calculated using 2stdev
+ENTROPY_TOO_LOW = 4.05
+ENTROPY_TOO_HIGH = 5.33
 
 EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
 RE_COMPILE = regex.compile(EXCEPTION_RE)
@@ -628,7 +629,7 @@ def mostly_img(s, site):
 
 
 @create_rule("post is likely nonsense", title=False,
-             sites=["codegolf.stackexchange.com",
+             sites=["codegolf.stackexchange.com", "ru.stackoverflow.com",
                     "stackoverflow.com", "ja.stackoverflow.com", "pt.stackoverflow.com",
                     "es.stackoverflow.com", "islam.stackexchange.com",
                     "japanese.stackexchange.com", "anime.stackexchange.com",
@@ -644,7 +645,7 @@ def nonsense(s, site):
     if "pytest" in sys.modules:
         return False, ""
     probability = [float(s.count(x)) / len(s) for x in s]
-    entropy_per_char = -sum([x * math.log2(x) for x in probability]) / len(s)
+    entropy_per_char = -sum([math.log2(x) for x in probability]) / len(s)
 
     if entropy_per_char < ENTROPY_TOO_LOW or entropy_per_char > ENTROPY_TOO_HIGH:
         return True, "Entropy per char is {:.4f}".format(entropy_per_char)

From 5eec477250384ffb6d360f61ed9c4601efeb50e3 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Tue, 13 Oct 2020 19:56:13 -0400
Subject: [PATCH 13/13] Collapse whitespaces

---
 findspam.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/findspam.py b/findspam.py
index 633ffac57b..e26bb3ba78 100644
--- a/findspam.py
+++ b/findspam.py
@@ -644,6 +644,7 @@ def nonsense(s, site):
         return False, ""
     if "pytest" in sys.modules:
         return False, ""
+    s = regex.sub(r"\s\s+", " ", s)
     probability = [float(s.count(x)) / len(s) for x in s]
     entropy_per_char = -sum([math.log2(x) for x in probability]) / len(s)