-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert-xml-format-to-wordpair-format.py
52 lines (47 loc) · 1.29 KB
/
convert-xml-format-to-wordpair-format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import re
import time
import io
import sys
import xml.dom.minidom
from collections import defaultdict
# Command line: INPUT_XML OUTPUT_FILE [upper|lower]
#
# Fail early with a usage message (exit status 2) instead of an IndexError
# traceback when one of the two mandatory paths is missing.
if len(sys.argv) < 3:
    sys.stderr.write(
        'usage: {0} INPUT_XML OUTPUT_FILE [upper|lower]\n'.format(sys.argv[0]))
    sys.exit(2)

# The input is parsed before the output is opened, so a parse failure does
# not create or truncate the output file.
doc = xml.dom.minidom.parse(sys.argv[1])
out = io.open(sys.argv[2], encoding='utf8', mode='w')

# The optional third argument selects the case conversion used by
# textToM2m(); any other value (or no value) leaves both flags False.
case_option = sys.argv[3] if len(sys.argv) > 3 else None
upper = case_option == 'upper'
lower = case_option == 'lower'
def textToM2m(text):
    """Spell *text* out as whitespace-delimited single-character tokens.

    Every character becomes one token padded with a space on each side, so
    neighbouring tokens end up two spaces apart; a literal space character
    is written as the marker ``<space>``.  Leading and trailing whitespace
    of the assembled string is then stripped.

    The module-level flags ``upper`` and ``lower`` choose an optional case
    conversion of the whole result (``upper`` wins if both are set).
    NOTE: the conversion is applied after the markers are inserted, so it
    also changes the case of ``<space>``.

    Presumably the output is meant for an m2m-style aligner -- confirm
    against the consumer of this format.
    """
    padded_tokens = []
    for ch in text:
        token = '<space>' if ch == ' ' else ch
        padded_tokens.append(' ' + token + ' ')
    spelled = ''.join(padded_tokens).strip()

    if upper:
        return spelled.upper()
    if lower:
        return spelled.lower()
    return spelled
def _node_text(node):
    """Return the text directly under *node*, joining text and CDATA children."""
    return u''.join(
        child.data for child in node.childNodes
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE))


# Write one "source ||| target" line per (SourceName, TargetName) pair found
# under each <Name> element of the TransliterationCorpus document.
# The try/finally guarantees the output file is closed on every exit path,
# including the sys.exit() below and any unexpected exception.
try:
    # documentElement is the root element even when the file begins with a
    # comment, DOCTYPE or processing instruction; childNodes[0] would be
    # that leading node instead.
    corpus = doc.documentElement
    if corpus.nodeName != 'TransliterationCorpus':
        # Diagnostics go to stderr; write() works on both Python 2 and 3,
        # unlike the print statement.
        sys.stderr.write(
            'Surprising xml node!{0}\n'.format(corpus.nodeName))
        sys.exit(1)

    # Names are written verbatim: the original script had its textToM2m()
    # calls commented out, and that is preserved here.
    for namedEntity in corpus.getElementsByTagName('Name'):
        # find source name
        srcNodes = namedEntity.getElementsByTagName('SourceName')
        srcText = _node_text(srcNodes[0]) if srcNodes else u''
        if not srcText:
            # A pair without a source side is unusable; report and move on
            # rather than aborting the whole conversion with an IndexError.
            sys.stderr.write('Skipping Name element without SourceName text\n')
            continue

        # find each target name
        for tgtNode in namedEntity.getElementsByTagName('TargetName'):
            tgtText = _node_text(tgtNode)
            if not tgtText:
                sys.stderr.write('Skipping empty TargetName element\n')
                continue
            out.write(u'{0} ||| {1}\n'.format(srcText, tgtText))
finally:
    out.close()