-
Notifications
You must be signed in to change notification settings - Fork 18
/
parse.py
executable file
·102 lines (85 loc) · 3.76 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import xml.etree.ElementTree as ET
import re
import os
from os import listdir
from os.path import isfile, join
# Text encoding handed to open() when the output XML files are written
# (despite the name, this module only uses it on the output side).
INPUT_CHARSET = "utf-8"
########## Functions #############################################
def saveFile(OutputFolder,OutputHeader,name,content):
    """Write one section to ``<OutputFolder>/<name>.xml`` wrapped in header and footer.

    Args:
        OutputFolder: Directory that receives the file; it must already exist.
        OutputHeader: Text written before the section. Every two-character
            backslash-n sequence in it is written as a carriage return.
        name: File name without the ``.xml`` extension.
        content: The section to write. Callers in this module pass
            ``str(ET.tostring(element))``, i.e. the ``b'...'`` repr of the
            serialised bytes.

    Raises:
        OSError: If the file cannot be created or written.
    """
    import ast  # local import: ast is not imported at file level

    outputFooter = '</div>\n</body>\n</text>\n</TEI>'

    # Slicing the repr with [2:-1] leaves Python escape sequences (\' \t \\ ...)
    # in the XML. Evaluating the repr back into bytes recovers the exact
    # serialised text instead.
    body = None
    try:
        raw = ast.literal_eval(content)
        if isinstance(raw, bytes):
            # Line breaks are written as carriage returns, exactly as the
            # previous implementation did.
            # NOTE(review): bare CR line endings look accidental - confirm.
            body = raw.decode(INPUT_CHARSET).replace("\n", "\r")
    except (ValueError, TypeError, SyntaxError):
        # Not a bytes literal (or not decodable): use the legacy handling below.
        pass
    if body is None:
        body = content.replace("\\n", "\r")[2:-1]

    # The context manager closes the handle even when a write fails.
    with open(OutputFolder + "/" + name + ".xml", "w", encoding=INPUT_CHARSET) as foutput:
        foutput.write(OutputHeader.replace("\\n", "\r"))
        foutput.write(body)
        foutput.write(outputFooter)
def saveFileWITHOUTHEADER(OutputFolder,name,content):
    """Write one section to ``<OutputFolder>/<name>.xml`` with no header or footer.

    Args:
        OutputFolder: Directory that receives the file; it must already exist.
        name: File name without the ``.xml`` extension.
        content: The section to write. Callers in this module pass
            ``str(ET.tostring(element))``, i.e. the ``b'...'`` repr of the
            serialised bytes.

    Raises:
        OSError: If the file cannot be created or written.
    """
    import ast  # local import: ast is not imported at file level

    # Slicing the repr with [2:-1] leaves Python escape sequences (\' \t \\ ...)
    # in the XML. Evaluating the repr back into bytes recovers the exact
    # serialised text instead.
    body = None
    try:
        raw = ast.literal_eval(content)
        if isinstance(raw, bytes):
            # Line breaks are written as carriage returns, exactly as the
            # previous implementation did.
            # NOTE(review): bare CR line endings look accidental - confirm.
            body = raw.decode(INPUT_CHARSET).replace("\n", "\r")
    except (ValueError, TypeError, SyntaxError):
        # Not a bytes literal (or not decodable): use the legacy handling below.
        pass
    if body is None:
        body = content.replace("\\n", "\r")[2:-1]

    # The context manager closes the handle even when the write fails.
    with open(OutputFolder + "/" + name + ".xml", "w", encoding=INPUT_CHARSET) as foutput:
        foutput.write(body)
def listallFilesinFolder(mypath,extension):
    """Return the names of the regular files in *mypath* that have *extension*.

    Args:
        mypath: Directory to scan (not recursive).
        extension: File extension, matched case-insensitively. It may be given
            with or without the leading dot (``"xml"`` or ``".xml"``).

    Returns:
        Sorted list of bare file names (no directory part). Sub-directories are
        excluded even when their name carries the extension.

    Raises:
        OSError: If *mypath* cannot be listed.
    """
    # Normalise so that "xml", "XML" and ".xml" all select the same files.
    suffix = "." + extension.lower().lstrip(".")
    # os.listdir() returns entries in arbitrary order; sort for reproducible runs.
    return sorted(
        f for f in listdir(mypath)
        if f.lower().endswith(suffix) and isfile(join(mypath, f))
    )
def _safe_file_stem(title):
    """Replace path separators in *title* so the file stays inside the output folder."""
    for separator in ("/", os.sep, os.altsep):
        if separator:
            title = title.replace(separator, "_")
    return title


def ParseFile(OutputFolder,path):
    """Split one TEI file into a separate XML file per top-level section.

    The children of ``root[1][0][0]`` are scanned; every TEI ``div`` that has a
    ``subtype`` attribute is written to *OutputFolder*:

    * ``preface``    -> ``Introduction.xml``
    * ``work``       -> ``<n>.xml`` (``UNKNOWN<k>.xml`` when ``@n`` is missing or blank)
    * ``index``      -> ``INDEX<k>.xml``
    * ``mss``        -> ``MMS<k>.xml`` (written without header/footer)
    * ``addenda``    -> ``ADDENDA<k>.xml``
    * ``toc``        -> ``TOC<k>.xml``
    * ``fragmenta``  -> ``FRAGMENTA<k>.xml``
    * ``dedication`` -> ``Dedication<k>.xml`` (written without header/footer)

    Divs with any other subtype are ignored.

    Args:
        OutputFolder: Existing directory that receives the section files.
        path: Path of the TEI XML file to split.

    Raises:
        xml.etree.ElementTree.ParseError: If *path* is not well-formed XML.
        IndexError: If the document lacks the ``root[0]`` / ``root[1][0][0]``
            elements this function indexes into.
        OSError: If *path* cannot be read or an output file cannot be written.

    NOTE(review): several ``preface`` divs, or ``work`` divs sharing the same
    ``@n``, write to the same file name, so later ones overwrite earlier ones.
    """
    ET.register_namespace('', "http://www.tei-c.org/ns/1.0")
    root = ET.parse(path).getroot()

    # Decode the serialised header instead of slicing its b'...' repr: the repr
    # turns apostrophes, tabs and backslashes into Python escape sequences that
    # would end up verbatim in every output file. Line breaks become carriage
    # returns, which is what saveFile() produced from the repr before.
    header_xml = ET.tostring(root[0]).decode("ascii").replace("\n", "\r")
    OutputHeader = (
        '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
        + header_xml
        + '<text>\n<body>\n<div type="edition">\n'
    )

    div_tag = "{http://www.tei-c.org/ns/1.0}div"
    # subtype -> (file-name prefix, wrap in header/footer?)
    # [add a new entry here for a new numbered subtype] e.g. "tbl": ("TBL", True)
    numbered = {
        "index": ("INDEX", True),
        "mss": ("MMS", False),
        "addenda": ("ADDENDA", True),
        "toc": ("TOC", True),
        "fragmenta": ("FRAGMENTA", True),
        "dedication": ("Dedication", False),
    }
    counters = dict.fromkeys(numbered, 0)
    unknown = 0

    # NOTE(review): presumably root[1][0][0] is TEI/text/body/div[@type="edition"]
    # - confirm against the input files.
    for child in root[1][0][0]:
        print(child.attrib)
        if child.tag != div_tag or 'subtype' not in child.attrib:
            continue
        subtype = child.attrib['subtype']

        if subtype == "preface":  # Introduction
            # The save helpers expect the b'...' repr of the serialised bytes.
            saveFile(OutputFolder, OutputHeader, "Introduction", str(ET.tostring(child)))
        elif subtype == "work":  # Work
            title = child.attrib.get('n', '')
            if not title.strip():
                # A missing or blank @n would otherwise produce ".xml".
                unknown += 1
                title = "UNKNOWN" + str(unknown)
            # @n is free text; a "/" in it would point outside OutputFolder.
            saveFile(OutputFolder, OutputHeader, _safe_file_stem(title), str(ET.tostring(child)))
        elif subtype in numbered:
            prefix, with_wrapper = numbered[subtype]
            counters[subtype] += 1
            file_name = prefix + str(counters[subtype])
            if with_wrapper:
                saveFile(OutputFolder, OutputHeader, file_name, str(ET.tostring(child)))
            else:
                saveFileWITHOUTHEADER(OutputFolder, file_name, str(ET.tostring(child)))
##################################################################
####################### Code starts here #########################
import sys

# Folder that holds the CSEL XML files. When the module is run as a script, the
# first command-line argument (if given) overrides this default.
folderPath="/u/stoyanova/Desktop/csel-dev"
if __name__ == "__main__" and len(sys.argv) > 1:
    folderPath = sys.argv[1]

# 1- List all XML files in the folder
files=listallFilesinFolder(folderPath,"xml")
# 2- Split each file into its own "<file name>_Output" folder
for f in files:
    print(f)
    OutputFolder=folderPath+"/"+f+"_Output"
    filepath=folderPath+"/"+f
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(OutputFolder, exist_ok=True)
    ParseFile(OutputFolder,filepath)