-
Notifications
You must be signed in to change notification settings - Fork 0
/
mcr2free.awk
38 lines (37 loc) · 896 Bytes
/
mcr2free.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# https://github.com/ekaf/ski/mcr2free.awk (c) 2017-21 Eric Kafe
# license CC BY 4.0, https://creativecommons.org/licenses/by/4.0/
# Main rule, run for every input record.
# Field 1 is split at "#" into a word and a tag; the tag is split at ":" into
# a language code and a trailing part.  Field 2 is stripped of everything up
# to and including its last ":".  The record is then filed under the key
#   <language>:<stripped field 2>-<trailing tag part>
# either in dic (word made only of accepted characters) or in typo.
{
  split($1, wordTag, "#")
  split(wordTag[2], tagPart, ":")

  # The language code "po" is rewritten to "pt" (first match only).
  code = tagPart[1]
  sub("po", "pt", code)

  # Greedy match: removes through the last ":" of field 2.
  sub("^.*:", "", $2)
  id = code ":" $2 "-" tagPart[2]

  # Freeling converts all words to lower case:
  lower = tolower(wordTag[1])

  if (lower ~ "[^\xC2\xB7\xC2\xB07\xC2\xA1/&![:lower:][:digit:]\\'._\\-]") {
    # At least one character outside the accepted set: collect the word as a
    # typo, space-separated, per key.
    typo[id] = (id in typo) ? typo[id] " " lower : lower
  } else {
    # Accepted word: the index itself carries the data, so duplicates collapse.
    dic[id "=" lower] = 1
  }
}
# END rule: write the collected typos and the per-key word lists.
#
# Reads the global arrays filled by the main rule:
#   dic["<lang>:<rest>=<word>"]  accepted words (the index carries the data)
#   typo["<lang>:<rest>"]        space-separated rejected words
# Uses the variables od and v, which are not assigned in this file and are
# presumably supplied on the command line with -v (od as output directory,
# v as file-name suffix) -- TODO confirm against the caller.
# Each output line is "<rest> <words...>", piped through sort into
#   od/<lang>/mcr-typos<v>.txt   (typos)
#   od/<lang>/senses<v>.src      (accepted words)
# asorti() is a gawk extension, so this script requires gawk.
END{
  # Sort the "key=word" indices into dic1[1..n]; asorti returns the count.
  n = asorti(dic, dic1)
  delete dic

  for (k in typo) {
    split(k, a, ":")
    print a[2], typo[k] | "sort>" od "/" a[1] "/mcr-typos" v ".txt"
  }

  # Walk dic1 by position.  A for (k in dic1) loop visits elements in an
  # unspecified order, which would throw away the ordering asorti just
  # produced and make the word order on each output line arbitrary.
  for (i = 1; i <= n; i++) {
    split(dic1[i], a, "=")
    k2 = a[1]
    if (k2 in dic2) dic2[k2] = dic2[k2] " " a[2]
    else dic2[k2] = a[2]
  }
  delete dic1

  # Line order does not matter here: every line goes through sort.
  for (k in dic2) {
    split(k, a, ":")
    lang = a[1]
    print a[2], dic2[k] | "sort>" od "/" lang "/senses" v ".src"
  }
}