This repository has been archived by the owner on Dec 21, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathgenerate_json.py
152 lines (120 loc) · 4.31 KB
/
generate_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# Encoding: utf-8
from BeautifulSoup import BeautifulSoup
import json
def find_child(children, sub):
    """Return the first node in *children* whose 'sub_key' equals *sub*.

    children -- iterable of node dicts, each carrying a 'sub_key' entry.
    sub      -- the sub key to look for.

    Returns the matching node dict itself (not a copy), or None when no
    child matches.
    """
    # An explicit loop works the same on Python 2 and 3 (calling len() on
    # the result of filter() fails on Python 3, where filter() is lazy)
    # and stops at the first hit instead of scanning every child.
    for child in children:
        if child['sub_key'] == sub:
            return child
    return None
def get_parent_key(key):
    """Return *key* without its last dot-separated segment.

    "1.2.3" gives "1.2"; a key with no dot gives the empty string.
    """
    # rpartition splits on the last dot; when there is no dot the head
    # (index 0) is the empty string.
    parent, _dot, _last = key.rpartition('.')
    return parent
def find_node(root={}, prop_key=None):
    """Walk the tree from *root* along the dotted *prop_key*.

    root     -- tree root: a dict with a 'children' list of node dicts.
    prop_key -- dotted key such as "2.0.1"; each segment is matched
                against the 'sub_key' of the children at that level.

    A missing segment whose integer value is 0 is created on the fly as
    an empty placeholder node (via create_node) so the walk can go on.

    Returns the node reached by the last segment, or None as soon as a
    segment is missing and cannot be created as a placeholder.
    """
    node = None
    parent_node = root
    walked = []
    for sub_key in prop_key.split('.'):
        walked.append(sub_key)
        node = find_child(parent_node['children'], sub_key)
        if node is None:
            # Only "zero" segments may be invented; a non-numeric segment
            # is simply not a zero rather than an error.
            try:
                is_zero = int(sub_key) == 0
            except ValueError:
                is_zero = False
            if not is_zero:
                # Stop at the first missing segment: carrying on would
                # look up the remaining segments under the wrong parent.
                return None
            # parent_node is already the node for the path walked so far,
            # so the placeholder can be attached to it directly.
            node = create_node(full_key=".".join(walked), sub_key=sub_key,
                               empty=True, parent=parent_node)
        # The node just resolved is the parent for the next segment.
        parent_node = node
    return node
def create_node(full_key=None, empty=False, sub_key=0, parent=None, proposition=None):
    """Build a tree node and attach it to *parent*.

    full_key    -- complete dotted key, stored under 'key'.
    empty       -- True for a placeholder node that carries no content.
    sub_key     -- last segment of the key, stored under 'sub_key' and
                   used to detect an already existing sibling.
    parent      -- node dict whose 'children' list receives the new node.
    proposition -- mapping copied into the node's 'content' unless the
                   node is empty; None is treated as no content.

    Returns the node that is attached to *parent* for *sub_key*: the new
    node, or the sibling that was already there.
    """
    node = {
        'sub_key': sub_key,
        'key': full_key,
        'empty': empty,
        'children': [],
        'content': {},
    }
    if not empty and proposition:
        node['content'].update(proposition)

    existing_node = find_child(parent['children'], sub_key)
    if existing_node is None:
        parent['children'].append(node)
        return node

    # A sibling with this sub key is already attached. Hand that one back
    # so children added by the caller end up in the tree, not on a
    # detached dict. If the sibling is an empty placeholder and real
    # content is now available, fill the placeholder in.
    if existing_node.get('empty') and not empty:
        existing_node['empty'] = False
        existing_node.setdefault('content', {}).update(node['content'])
    return existing_node
def create_nodes(propositions=None):
    """Build the proposition tree from a flat mapping.

    propositions -- mapping of dotted key (e.g. "2.0.1") to the content
                    mapping for that proposition. None or an empty
                    mapping gives a root without children.

    Returns the root node: a dict with 'key' set to '0' and a 'children'
    list holding the top-level nodes.

    Raises ValueError when the parent of a key cannot be resolved.
    """
    root_node = {'key': '0', 'children': []}
    if not propositions:
        return root_node

    # Keys are handled in sorted order; a parent key is a string prefix
    # of its children's keys and therefore sorts before them.
    for key, prop in sorted(propositions.items()):
        segments = key.split('.')
        sub_key = segments[-1]
        if len(segments) > 1:
            parent_node = find_node(root=root_node,
                                    prop_key=get_parent_key(key))
        else:
            parent_node = root_node
        if parent_node is None:
            # Fail with the offending key instead of stopping in a
            # debugger and then crashing on None further down.
            raise ValueError("no parent node found for proposition %r" % (key,))
        create_node(parent=parent_node, full_key=key, sub_key=sub_key,
                    proposition=prop)
    return root_node
################################################################################
#
# HTML processing utility functions
#
################################################################################
def get_prop_content(prop):
    """Return the contents of *prop* wrapped in a single <p> element.

    prop -- an element exposing a ``contents`` sequence, or None.

    Each item of ``prop.contents`` is passed through the ``toEncoding``
    method of the module-level ``soup`` object and the results are
    concatenated. Returns an empty string when *prop* is None.
    """
    if prop is None:
        return ""
    # NOTE(review): relies on the global ``soup`` being defined before
    # this function is called.
    pieces = (soup.toEncoding(child) for child in prop.contents)
    return u"<p>{prop}</p>".format(prop=u''.join(pieces))
def atomize_key(key):
    """Turn a proposition number into a key with one character per level.

    Every '*' is removed first. The part after the first dot is then
    split into single characters joined by dots, so "2.01" becomes
    "2.0.1". A key without a dot is returned as is (minus any '*').
    Only the first two dot-separated parts are used.
    """
    parts = key.replace('*', '').split('.')
    if len(parts) <= 1:
        return parts[0]
    # Joining over a string puts the separator between its characters.
    return "{n}.{m}".format(n=parts[0], m=".".join(parts[1]))
def extract_propositions(page):
    """Collect the propositions found in the second table below *page*.

    page -- parsed HTML element; the table used is the one returned by
            ``page.table.findNext('table')`` and its first row is
            skipped.

    For every remaining row the 'pnum' cell gives the proposition number
    (normalised with atomize_key) and the 'ger' and 'pmc' cells give the
    texts stored under 'de' and 'en'. A row whose 'pnum' cell is empty
    continues the previous proposition: its texts are appended to it.

    Returns a dict mapping each key to {'de': ..., 'en': ...}.
    """
    propositions = {}
    table = page.table.findNext('table')
    rows = table.findChildren('tr')[1:]
    # Initialised up front so that a first row without a 'pnum' cell is
    # skipped instead of raising UnboundLocalError.
    prop_key = None
    last_prop_key = None
    for tr in rows:
        pnum_td = tr.find('td', {'class': 'pnum'})
        if pnum_td and pnum_td.text == '':
            prop_key = last_prop_key
        elif pnum_td and pnum_td.text != '':
            prop_key = atomize_key(pnum_td.text)
        # With no 'pnum' cell at all, prop_key keeps its previous value.
        if not prop_key:
            continue
        last_prop_key = prop_key
        prop_de = get_prop_content(tr.find('td', {'class': 'ger'}))
        prop_en = get_prop_content(tr.find('td', {'class': 'pmc'}))
        proposition = propositions.get(prop_key)
        if proposition is not None:
            # Continuation row: extend the texts gathered so far.
            proposition['de'] += prop_de
            proposition['en'] += prop_en
        else:
            propositions[prop_key] = {
                'de': prop_de,
                'en': prop_en,
            }
    return propositions
# Script body: parse the exported HTML, build the proposition tree and
# write it out as JSON. ``soup`` stays a module-level name because
# get_prop_content() reads it.
with open('html_export/tractatus.html') as html_file:
    # The markup is read while the soup is built, so the file can be
    # closed as soon as the constructor returns.
    soup = BeautifulSoup(html_file)
propositions = extract_propositions(soup.body)
root = create_nodes(propositions)
json_content = json.dumps(root, indent=4, sort_keys=True)
# The with-block guarantees the output is flushed and closed.
with open('assets/data/tractatus.json', 'w+') as f:
    f.write(json_content)