-
Notifications
You must be signed in to change notification settings - Fork 0
/
mutinfo
executable file
·47 lines (43 loc) · 1.65 KB
/
mutinfo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/bin/bash
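# Usage: $ ./mutinfo
# With every step at the bottom of this script enabled, it produces:
# SEAN.mutual.txt (Eng-Ir lexicon induced from mutual info on pre '44 corpus)
# NUA.mutual.txt (Eng-Ir lexicon induced from mutual info on post '44 corpus)
# UPDATES.txt (list of spelling updates obtained by merging the previous two,
#              filtered from the intermediate file ALLUPDATES.txt)
# As the script stands, only the NUA.mutual.txt step is uncommented.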
# Directory whose files do_it selects by name and tokenizes.
CORPUSDIR="${HOME}/gaeilge/diolaim/comp"
# Directory containing the commontok and ppfaigh helper programs.
LIBEXECPATH='/usr/local/libexec'
# Directory containing veicteoir.db, the database queried through ppfaigh.
DATAPATH='/usr/local/share/corpas'
#######################################
# do_it PATTERN
#
# Builds two "word: hits" tables for the part of the corpus selected by
# PATTERN and feeds them to mutinfo.pl, whose output goes to stdout.
#
# Globals (read):
#   CORPUSDIR    directory holding the corpus files
#   LIBEXECPATH  directory holding the commontok and ppfaigh programs
#   DATAPATH     directory holding veicteoir.db
#   HOME         used to locate gaeilge/corpas/corpas/mutinfo.pl
# Arguments:
#   $1  extended regexp matched against the start of each filename in
#       CORPUSDIR. Files named <match>-b are handled as the English side;
#       every other matching file is handled as the Irish side.
# Outputs:
#   Whatever mutinfo.pl prints for the two tables.
# Returns:
#   1 if a temp file cannot be created or CORPUSDIR cannot be entered,
#   otherwise the exit status of mutinfo.pl.
#
# The caller's working directory is left untouched, and both temp files are
# removed on every return path.
#######################################
do_it()
{
  local pattern="${1}"
  # Filename filters: English files end in "-b", Irish files are the rest.
  local en_re="^${pattern}-b\$"
  local ga_re="^${pattern}"
  local bearla_tmp gaeilge_tmp name word hits status

  bearla_tmp=$(mktemp) || return 1
  if ! gaeilge_tmp=$(mktemp); then
    rm -f -- "${bearla_tmp}"
    return 1
  fi

  # Run in a subshell so the cd stays private to this call; redirections the
  # caller resolves after do_it returns are then still relative to the
  # directory the script was started from.
  (
    cd "${CORPUSDIR}" || exit 1

    # English side: strip the first space-delimited field of every line,
    # tokenize, and look each distinct token up in the database.
    for name in *; do
      [[ -e ${name} && ${name} =~ ${en_re} ]] || continue
      sed 's/^[^ ]* //' < "${name}" | "${LIBEXECPATH}/commontok"
    done | sort -u |
      while read -r word; do
        # Keep only hits whose label is a selected "-b" file, then drop the
        # "-b" marker so the labels line up with the Irish side.
        # NOTE(review): assumes ppfaigh prints space-separated "file:..."
        # hits -- confirm against ppfaigh itself.
        hits=$(printf '%s\n' "${word}" |
          "${LIBEXECPATH}/ppfaigh" "${DATAPATH}/veicteoir.db" n |
          tr " " "\n" | grep -E '.' | grep -E -e "^${pattern}-b:" |
          sed 's/-b:/:/' | sort -u | tr "\n" " ")
        printf '%s: %s\n' "${word}" "${hits}"
      done > "${bearla_tmp}"

    # Irish side: same idea, except tokens go through "mutate -d" before
    # de-duplication and through "mutate" again before the lookup.
    # NOTE(review): mutate is an external tool; presumably it removes and
    # re-generates initial mutations -- confirm.
    for name in *; do
      [[ -e ${name} && ${name} =~ ${ga_re} && ${name} != *-b ]] || continue
      sed 's/^[^ ]* //' < "${name}" | "${LIBEXECPATH}/commontok"
    done | mutate -d | sort -u |
      while read -r word; do
        hits=$(printf '%s\n' "${word}" | mutate |
          "${LIBEXECPATH}/ppfaigh" "${DATAPATH}/veicteoir.db" n |
          tr " " "\n" | grep -E '.' | grep -E -e "^${pattern}" |
          grep -v -e '-b:' | sort -u | tr "\n" " ")
        printf '%s: %s\n' "${word}" "${hits}"
      done > "${gaeilge_tmp}"

    perl "${HOME}/gaeilge/corpas/corpas/mutinfo.pl" \
      "${bearla_tmp}" "${gaeilge_tmp}"
  )
  status=$?

  rm -f -- "${bearla_tmp}" "${gaeilge_tmp}"
  return "${status}"
}
# Older half of the corpus: filenames starting 1920-1943. Currently disabled.
# do_it '19([23][0-9]|4[0-3]).*' > SEAN.mutual.txt
# Newer half of the corpus: filenames starting 1945-1999. Names starting 1944
# are matched by neither pattern. This is the only step that currently runs.
do_it '19([5-9][0-9]|4[5-9]).*' > NUA.mutual.txt
# Disabled follow-up: report.pl combines the two lexicons into a sorted
# ALLUPDATES.txt, and the filter then drops every line of the form
# "X -> X (" (a word mapped to itself) to give UPDATES.txt.
# perl report.pl SEAN.mutual.txt NUA.mutual.txt | sort > ALLUPDATES.txt
# cat ALLUPDATES.txt | egrep -v '^([^ ]+) -> \1 \(' > UPDATES.txt