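"""KannadaLitmusEngine.py

Mines candidate ಪ-/ಹ- (p-/h-) word pairs from a Kannada Wiktionary dump
and, optionally, a larger gzip-compressed text corpus, writing the results
out as CSV files under out/<corpus>/.
"""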
import xml.etree.ElementTree as ET
import sys
import re
import gzip
import os
from io import StringIO
from collections import Counter
class KannadaLitmusEngine:
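    """Collects words beginning with ಪ (pa) and ಹ (ha) and pairs those that
    share a common suffix. Such pairs are candidate doublets, presumably
    reflecting the historical p- > h- sound change in Kannada
    (e.g. Old Kannada ಪಾಲ್ > modern ಹಾಲು, "milk").
    """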
    def __init__(self):
        # Wiki section headings such as ==ಪದ== or ===ಅರ್ಥ=== count as noise.
        self.noise_pattern = re.compile(r'===?[^=]+=?==')
        self.nonword_pattern = re.compile(r'\W')
        # ಪ, optionally followed by a dependent vowel sign; bare ^ಪ is used
        # when stripping the initial consonant off a matched word.
        self.p_initials = re.compile('^(ಪ|ಪಾ|ಪಿ|ಪೀ|ಪು|ಪೂ|ಪೆ|ಪೇ|ಪೈ|ಪೊ|ಪೋ|ಪೌ)')
        self.pa = re.compile('^ಪ')
        # Likewise for ಹ.
        self.h_initials = re.compile('^(ಹ|ಹಾ|ಹಿ|ಹೀ|ಹು|ಹೂ|ಹೆ|ಹೇ|ಹೈ|ಹೊ|ಹೋ|ಹೌ)')
        self.ha = re.compile('^ಹ')
        # Engine-wide tallies: every ಪ-/ಹ-initial word seen, and their suffixes.
        self.pertinent_words = Counter()
        self.hertinent_words = Counter()
        self.psuffixes = Counter()
        self.hsuffixes = Counter()
        # self.load_titles()
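    # Optional: load the dump's page-title list into a set (unused unless
    # the call in __init__ is uncommented).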
    def load_titles(self):
        with open("data/knwiktionary-20210401-all-titles-in-ns0", "r") as title_f:
            self.titles = set(title_f.read().split('\n'))
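    # Parse the Wiktionary XML dump, strip namespace prefixes from tags,
    # and yield only main-namespace (<ns>0</ns>) pages.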
    def parse_xml(self, f):
        with open(f) as xml_f:
            xml = xml_f.read()
        it = ET.iterparse(StringIO(xml))
        for _, el in it:
            prefix, has_namespace, postfix = el.tag.partition('}')
            if has_namespace:
                el.tag = postfix  # strip all namespaces
        # iterparse exposes the document root once iteration has finished.
        root = it.root
        for page in root.findall(".//page[ns='0']"):
            yield page
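    # Pull the <title> and <text> elements out of a <page>.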
    def get_candidates(self, doc):
        title = doc.findall('./title')
        text = doc.findall('.//text')
        return (title, text)
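    # From one page's title plus body, return the (ಪ-word, ಹ-word) pairs
    # whose suffixes coincide within that page.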
    def get_synonyms(self, title, text):
        if text:
            text = text[0].text
            if text:
                if title:
                    title = title[0].text
                    text = title + ' ' + text
                text = self.preprocess_text(text)
                pwords, hwords = self.get_phwords(text)
                phsynonyms = list(map(lambda suffix: (
                    'ಪ' + suffix, 'ಹ' + suffix), pwords & hwords))
                return phsynonyms
        return []
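    # Count ಪ- and ಹ-initial words in `text`, record each word's suffix
    # (the word minus its bare initial ಪ/ಹ, vowel sign retained), and fold
    # the counts into the engine-wide counters.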
    def get_phwords(self, text):
        words = text.split()
        psuffixes = Counter()
        hsuffixes = Counter()
        for word in words:
            if self.p_initials.match(word):
                self.pertinent_words[word] += 1
                suffix = re.sub(self.pa, '', word, 1)
                psuffixes[suffix] += 1
            elif self.h_initials.match(word):
                self.hertinent_words[word] += 1
                suffix = re.sub(self.ha, '', word, 1)
                hsuffixes[suffix] += 1
        self.psuffixes.update(psuffixes)
        self.hsuffixes.update(hsuffixes)
        return psuffixes, hsuffixes
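    # Lower-case the text and blank out wiki headings, Latin letters, and
    # markup punctuation, leaving whitespace-separated Kannada words.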
    def preprocess_text(self, text):
        text = self.noise_pattern.sub(' ', text)
        text = re.sub(
            r'([a-z]|[\{\}\<\>\[\]\|\-\:\'\;\)\(\.\,])+', ' ', text.lower())
        return text
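# Walk every main-namespace page of the Wiktionary dump, collecting the
# per-page ಪ/ಹ pairs and the set of words that took part in any pair.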
def process_kn_wiktionary(e, f=r"data/knwiktionary-20210401-pages-articles.xml"):
    pairs = set()
    pairedwords = set()
    for doc in e.parse_xml(f):
        title, text = e.get_candidates(doc)
        syns = e.get_synonyms(title, text)
        if len(syns) > 0:
            for pair in syns:
                pairs.add(",".join(pair))
                pairedwords.add(pair[0])
                pairedwords.add(pair[1])
    return pairs, pairedwords
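# Expected invocation (an assumption, inferred from the sys.argv usage):
#   python KannadaLitmusEngine.py <corpus.txt.gz> <corpus-name>
# Passing "wikt" as <corpus-name> scores suffixes from the Wiktionary dump
# alone; any other name also streams the gzip corpus into the counters.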
if __name__ == "__main__":
    e = KannadaLitmusEngine()
    infile, corpus = sys.argv[1:3]
    pairs, pairedwords = process_kn_wiktionary(e)
    outpath = 'out/' + corpus + '/'
    if not os.path.isdir(outpath):
        os.makedirs(outpath)
    with open(outpath + "phsynonyms.csv", "w") as synf:
        for pair in pairs:
            print(pair, file=synf)
    # For a non-Wiktionary corpus, stream the gzip file line by line so its
    # words also feed the suffix counters.
    if corpus != "wikt":
        with gzip.open(infile, 'rt') as corpus_f:
            for doc in corpus_f:
                text = e.preprocess_text(doc)
                e.get_phwords(text)
    # Suffixes attested after both ಪ and ಹ; the last column is the count
    # skew (pcount - hcount) / (pcount + hcount), in [-1, 1].
    with open(outpath + "presumable_synonyms.csv", "w") as psynf:
        presumable_synonyms = e.psuffixes & e.hsuffixes
        for suffix in presumable_synonyms:
            pcount = e.psuffixes[suffix]
            hcount = e.hsuffixes[suffix]
            print(','.join(('ಪ' + suffix, str(pcount), 'ಹ' + suffix, str(hcount),
                            str(round((pcount - hcount) / (pcount + hcount), 5)))), file=psynf)
    # Full ಪ- and ಹ-word inventories, plus words that never found a partner.
    with open(outpath + "puniverse.csv", "w") as punif, \
            open(outpath + "huniverse.csv", "w") as hunif, \
            open(outpath + "unpaired.csv", "w") as unpairedf:
        for word, count in e.pertinent_words.most_common():
            suffix = re.sub(e.pa, '', word)
            print(word + ',' + suffix + ',' + str(count), file=punif)
            if word not in pairedwords and suffix not in presumable_synonyms:
                print(word + ',' + str(count), file=unpairedf)
        for word, count in e.hertinent_words.most_common():
            suffix = re.sub(e.ha, '', word)
            print(word + ',' + suffix + ',' + str(count), file=hunif)
            if word not in pairedwords and suffix not in presumable_synonyms:
                print(word + ',' + str(count), file=unpairedf)