-
Notifications
You must be signed in to change notification settings - Fork 3
/
mixed_unicode.py
87 lines (75 loc) · 2.87 KB
/
mixed_unicode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import collections, enum
from src import ModuleManager, utils
class Script(enum.Enum):
    """Coarse writing-system buckets a single character can be sorted into.

    ``Unknown`` is pinned to 0 and the remaining members are numbered
    consecutively (1, 2, 3, ...) by ``enum.auto()``.  Append new members at
    the end so the values of existing members do not shift.
    """

    Unknown = 0
    Latin = enum.auto()
    Cyrillic = enum.auto()
    Greek = enum.auto()
    Armenian = enum.auto()
    FullWidth = enum.auto()
    Coptic = enum.auto()
    Cherokee = enum.auto()
    TaiLe = enum.auto()
class ScoreReason(enum.Enum):
    """Reasons that each add one point to a message's mixed-unicode score.

    ``ScriptChange`` is pinned to 0; the other members follow consecutively
    via ``enum.auto()``.
    """

    # A character's script differs from the previous recognised script.
    ScriptChange = 0
    # The same change, with no word separator immediately before it.
    ScriptChangeInWord = enum.auto()
    # Recorded once for every distinct script beyond the first in a message.
    AdditionalScript = enum.auto()
# Characters treated as word boundaries when scanning a message.  The
# misspelled name is kept as-is because it is a module-level (public) name.
WORD_SEPERATORS = [
    ",",
    " ",
    "\t",
    ".",
]

# NOTE(review): not referenced anywhere in the visible part of this module;
# its intended use cannot be determined from here - confirm before removing.
SCORE_LENGTH = 100
class Module(ModuleManager.BaseModule):
    """Score channel messages by how much they mix writing systems.

    Every character is sorted into a coarse ``Script`` by code-point range.
    A message's score is the number of ``ScoreReason`` entries it collects;
    non-zero scores are written to the trace log and nothing else is done
    with them here.
    """

    def _detect_script(self, char):
        """Return the ``Script`` whose code-point range contains ``char``.

        ``char`` must be a single character (it is passed to ``ord``).
        Code points outside every range below yield ``Script.Unknown``.

        NOTE(review): the Latin range starts at U+0000, so ASCII digits,
        punctuation and control characters are all reported as Latin.  A
        "!" directly after Cyrillic text therefore registers as a script
        change.  Confirm whether such characters should be script-neutral.
        """
        point = ord(char)

        # NULL .. LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
        if 0x0000 <= point <= 0x02AF:
            return Script.Latin
        # GREEK CAPITAL LETTER HETA ..
        # GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL
        if 0x0370 <= point <= 0x03FF:
            return Script.Greek
        # CYRILLIC CAPITAL LETTER IE WITH GRAVE ..
        # CYRILLIC SMALL LETTER EL WITH DESCENDER
        if 0x0400 <= point <= 0x052F:
            return Script.Cyrillic
        # ARMENIAN CAPITAL LETTER AYB .. ARMENIAN HYPHEN
        if 0x0531 <= point <= 0x058A:
            return Script.Armenian
        # FULLWIDTH EXCLAMATION MARK .. FULLWIDTH RIGHT WHITE PARENTHESIS
        if 0xFF01 <= point <= 0xFF60:
            return Script.FullWidth
        # COPTIC CAPITAL LETTER ALFA .. COPTIC MORPHOLOGICAL DIVIDER
        if 0x2C80 <= point <= 0x2CFF:
            return Script.Coptic
        # CHEROKEE LETTER A .. CHEROKEE SMALL LETTER MV
        if 0x13A0 <= point <= 0x13FD:
            return Script.Cherokee
        # TAI LE LETTER KA .. U+197F
        if 0x1950 <= point <= 0x197F:
            return Script.TaiLe

        return Script.Unknown

    @utils.hook("received.message.channel")
    def channel_message(self, event):
        """Compute and trace-log the mixed-unicode score of a message.

        Walks ``event["message"]`` character by character and records:

        * ``ScriptChange`` whenever a recognised script differs from the
          previously recognised one;
        * ``ScriptChangeInWord`` in addition, when the character just
          before the change was not a word separator;
        * ``AdditionalScript`` once for every distinct script beyond the
          first.

        Characters whose script is ``Script.Unknown`` never count as a
        script and never trigger a change.  Returns ``None``.
        """
        last_script = None
        last_was_separator = False
        reasons = []
        scripts = set()

        for char in event["message"]:
            if char in WORD_SEPERATORS:
                last_was_separator = True
                continue

            script = self._detect_script(char)
            if script != Script.Unknown:
                scripts.add(script)
                # The first recognised script has nothing to differ from.
                if last_script is not None and script != last_script:
                    reasons.append(ScoreReason.ScriptChange)
                    if not last_was_separator:
                        reasons.append(ScoreReason.ScriptChangeInWord)
                last_script = script

            # Any non-separator character ends the "just saw a separator"
            # state, whether or not its script was recognised.
            last_was_separator = False

        if len(scripts) > 1:
            reasons.extend(
                [ScoreReason.AdditionalScript] * (len(scripts) - 1))

        score = len(reasons)
        if score == 0:
            return

        reasons_s = [
            "%s: %s" % (reason, count)
            for reason, count in collections.Counter(reasons).items()
        ]
        # The logger is handed the format arguments as a single list,
        # exactly as the original call did.
        self.log.trace(
            "Message given a mixed-unicode score of %s (reasons: %s)",
            [score, ", ".join(reasons_s)])