-
Notifications
You must be signed in to change notification settings - Fork 6
/
converted_text_to_akoma_ntoso.py
62 lines (52 loc) · 1.62 KB
/
converted_text_to_akoma_ntoso.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import re
from xml.etree import ElementTree as xml
from volume import Volume
def get_line_details(line):
    """Split a line into its leading position marker and the remaining text.

    A line shaped like ``(<digits>.<digits>,<digits>.<digits>) <text>`` is
    split into the coordinate pair (without the parentheses) and the text
    that follows the single separating space.

    Args:
        line: One line of input text.

    Returns:
        A dict with two keys:
          * "pos"  -- the ``x,y`` pair as a string, or "" when the line has
            no position prefix.
          * "text" -- the text after the prefix, or the whole unmodified
            line when there is no prefix.
    """
    # Raw string: "\(" and "\d" are regex escapes, not string escapes.
    # "." does not match a newline, so a trailing "\n" is dropped from the
    # captured text of a matching line (a non-matching line keeps it).
    match = re.match(r"\((\d+\.\d+,\d+\.\d+)\) (.*)", line)
    if match is None:
        return {"pos": "", "text": line}
    return {"pos": match.group(1), "text": match.group(2)}
def split_on_volume(path='files/pdfminer/output/ferguson_grand_jury_testimony.txt'):
    """Split the transcript text file into a list of Volume objects.

    A new Volume is started at every line whose text begins with
    ``VOLUME <roman numeral>`` (letters I, V, X only). That heading line and
    every following line, up to the next heading, are appended to the
    volume's ``full_text`` list as the dicts produced by
    ``get_line_details``. Lines before the first heading are discarded.

    Args:
        path: Path of the text file to read. Defaults to the transcript
            location this script has always used.

    Returns:
        A list of Volume objects in the order their headings appear; empty
        when the file contains no volume heading.
    """
    volumes = []
    current = None  # volume currently being filled; None until the first heading
    with open(path) as transcript:
        # Iterate the file lazily instead of loading every line up front.
        for line in transcript:
            line_details = get_line_details(line)
            # re.match anchors at the start only, so trailing text after the
            # numeral is tolerated.
            heading = re.match("(VOLUME [IVX]+)", line_details["text"])
            if heading:
                current = Volume()
                current.debateSection(heading.group(1))
                volumes.append(current)
            # The heading line itself is recorded too, along with the rest.
            if current is not None:
                current.full_text.append(line_details)
    return volumes
if __name__ == "__main__":
    # Parse the transcript into one Volume per "VOLUME <numeral>" heading.
    volumes = split_on_volume()

    # Run every volume through the processing steps; the order is kept
    # exactly as written because later steps may rely on earlier ones.
    for volume in volumes:
        volume.remove_cover_pages()
        volume.get_speakers()
        volume.get_speeches()
        volume.fix_indented_qna_speeches()
        volume.remove_pos()
        volume.build_speeches()
        volume.indent(volume.akoma_ntoso)

    # Tests: every volume heading must start with "VOLUME <numeral>".
    for volume in volumes:
        assert re.match("(VOLUME [IVX]+)", volume.heading.text), \
            "unexpected volume heading: %r" % (volume.heading.text,)

    # Print out files: one XML document per volume, named after its heading.
    for volume in volumes:
        out_path = "files/akoma_ntoso/" + volume.heading.text + ".xml"
        # tostring() returns bytes on Python 3 (and a byte str on Python 2),
        # so write it in binary mode with a single write(). writelines() on
        # a text-mode file fails on Python 3 because iterating bytes yields
        # ints.
        with open(out_path, "wb") as out:
            out.write(xml.tostring(volume.akoma_ntoso))