forked from Taiwanese-Corpus/Khin-hoan_2010_pojbh
-
Notifications
You must be signed in to change notification settings - Fork 0
/
html2json.py
107 lines (87 loc) · 2.66 KB
/
html2json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import json
from os.path import join
from posix import listdir
import re
from bs4 import BeautifulSoup
from bs4.element import Comment
def tsanpootong():
    """Yield ``(article_id, path)`` for every article page on disk.

    Lists the ``pojbh.lib.ntnu.edu.tw/script`` directory (relative to the
    current working directory) in sorted file-name order.  For each entry
    whose name starts with ``artical-<digits>.htm`` it yields the digit
    string (as text, not an int) together with the joined path to that file.
    Entries that do not match the pattern are skipped.
    """
    boklok = join('pojbh.lib.ntnu.edu.tw', 'script')
    # Raw string: "\d" must reach the regex engine as an escape instead of
    # being an invalid string escape.  The dot is escaped so that only a
    # literal "." is accepted before "htm".
    buntsiunn = re.compile(r'artical-(\d+)\.htm')
    for mia in sorted(listdir(boklok)):
        # match() anchors at the start of the name only, so longer suffixes
        # such as ".html" are still accepted, as before.
        tuiing = buntsiunn.match(mia)
        if tuiing:
            yield tuiing.group(1), join(boklok, mia)
def tsuliau():
    """Yield one record dict per article page found by ``tsanpootong``.

    Each page is parsed with BeautifulSoup (``lxml`` parser).  The text
    pieces extracted by ``chhue`` from the elements with id
    ``artical_tailo`` and ``artical_content`` are stored under the keys
    ``'tailo'`` and ``'hanlo'``; the article id goes under ``'pianho'``.
    These three keys are added on top of the dict returned by ``laiiong``.

    When the two extracted lists differ in length, the file path and both
    lengths are printed as a diagnostic; the record is still yielded.
    """
    for article_id, page_path in tsanpootong():
        with open(page_path) as page_file:
            soup = BeautifulSoup(page_file, 'lxml')

        tailo_lines = [*chhue(soup.find(id='artical_tailo'))]
        hanlo_lines = [*chhue(soup.find(id='artical_content'))]

        tailo_count, hanlo_count = len(tailo_lines), len(hanlo_lines)
        if tailo_count != hanlo_count:
            # Report pages whose two versions do not line up one-to-one.
            print(page_path)
            print(tailo_count, hanlo_count)

        record = laiiong(soup)
        record.update(
            pianho=article_id,
            tailo=tailo_lines,
            hanlo=hanlo_lines,
        )
        yield record
def laiiong(soup):
    """Collect the ``name:value`` cells of a parsed page into a dict.

    Every cell produced by ``laiiong_td(soup)`` is read with ``get_text()``
    and split at its first ``':'``; the part before the colon becomes the
    key and everything after it the value (neither part is stripped).
    Cells that are empty or contain only whitespace are skipped.

    Args:
        soup: the parsed page, passed straight through to ``laiiong_td``.

    Returns:
        dict mapping each cell's name to its value; a later cell with the
        same name overwrites an earlier one.

    Raises:
        ValueError: if a non-blank cell contains no ``':'`` separator.
    """
    chu = {}
    for td in laiiong_td(soup):
        kiatko = td.get_text()
        if not kiatko.strip():
            # Blank cells carry no field; whitespace-only text used to reach
            # the split below and fail there.
            continue
        na, hun, iong = kiatko.partition(':')
        if not hun:
            # Same exception type as the former tuple-unpacking failure, but
            # the message now shows the offending cell.
            raise ValueError(
                'metadata cell has no ":" separator: {!r}'.format(kiatko)
            )
        chu[na] = iong
    return chu
def laiiong_td(soup):
    """Yield the ``td`` cells of the page's left-aligned table area.

    First come all ``td`` elements found inside the first ``table`` whose
    ``style`` attribute is exactly ``text-align:left``.  After those, the
    ``td`` elements of every ``tr`` that follows the table's grandparent
    element as a sibling (presumably the row enclosing the table -- confirm
    against the page markup) are yielded, row by row.
    """
    table = soup.find("table", attrs={"style": "text-align:left"})
    for cell in table.find_all('td'):
        yield cell

    # The sibling rows are looked up only after the table's own cells have
    # been handed out.
    grandparent = table.parent.parent
    for row in grandparent.find_next_siblings('tr'):
        yield from row.find_all('td')
def chhue(span):
    """Yield the non-empty, whitespace-stripped pieces of ``span``.

    Pieces come from ``chhue_p(span)``; each one is stripped and those that
    end up empty are dropped.
    """
    stripped_pieces = (piece.strip() for piece in chhue_p(span))
    # filter(None, ...) keeps only truthy items, i.e. non-empty strings.
    yield from filter(None, stripped_pieces)
def chhue_p(span):
    """Yield the text pieces found under each direct child of ``span``.

    A child without a ``name`` attribute is treated as bare text: its
    stripped value is printed (prefixed with ``p``) and yielded as a single
    piece.  Any other child is flattened with ``chhue_p_ete``; the fragments
    it yields are concatenated, and a new piece is started at every ``True``
    marker.  Whatever has accumulated after the last marker is yielded too,
    even when it is the empty string.
    """
    for child in span.children:
        if not hasattr(child, 'name'):
            bare_text = child.strip()
            print('p', bare_text)
            yield bare_text
            continue

        fragments = []
        for piece in chhue_p_ete(child):
            if piece is True:
                # Break marker: flush the current piece and start a new one.
                yield ''.join(fragments)
                fragments = []
            else:
                fragments.append(piece)
        yield ''.join(fragments)
def chhue_p_ete(p):
    """Recursively flatten node ``p`` into text fragments and break markers.

    Yields, in document order, the leaf values found under ``p``.  The
    literal ``True`` is yielded as a break marker in two situations: after
    the contents of a node whose ``name`` is ``'p'`` or ``'br'``, and in
    place of a leaf that equals a single newline character.  ``Comment``
    nodes yield nothing at all.
    """
    try:
        # Comments are dropped entirely.
        if isinstance(p, Comment):
            return
        # Accessing .children (or .name below) on an object that lacks it
        # raises AttributeError, which routes to the leaf handling in the
        # except branch -- presumably this is how text nodes end up there.
        for ete in p.children:
            yield from chhue_p_ete(ete)
        # The marker comes after the node's own contents have been yielded.
        if p.name in['p', 'br']:
            yield True
    except AttributeError:
        # Leaf: a lone newline becomes a break marker, anything else is
        # yielded unchanged.
        if '\n' == p:
            yield True
        else:
            yield p
if __name__ == '__main__':
    # Build every record before touching the output file, so a failure
    # during conversion does not leave a truncated pojbh.json behind.
    kiatko = list(tsuliau())
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of the locale's default.
    with open('pojbh.json', 'w', encoding='utf-8') as tong:
        json.dump(
            kiatko, tong,
            ensure_ascii=False, sort_keys=True, indent=2
        )