-
Notifications
You must be signed in to change notification settings - Fork 26
/
CX_DB8_PoS.py
110 lines (103 loc) · 4.56 KB
/
CX_DB8_PoS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import glob
import os
from bs4 import BeautifulSoup
import bs4
import string
def emit_tokens(out, words, label):
    """Write one token per line as "<word> <label>" (CoNLL-style).

    Each word is stripped of non-alphanumeric characters before being
    written. Words of 20 or more characters (a cheap heuristic for web
    links) and words that are empty after cleaning produce a bare blank
    line instead of a labelled token — matching the original output
    format exactly.

    Parameters:
        out:   writable text file-like object.
        words: iterable of raw word strings (already whitespace-split).
        label: tag appended after each word ("u" = underlined, "n" = normal).
    """
    for word in words:
        # Heuristic to drop weblinks: a real word should be < 20 chars.
        if word and len(word) < 20:
            cleaned = ''.join(ch for ch in word if ch.isalnum())
            # NOTE: the original also tested .isspace(), but a string of
            # only alphanumerics can never be whitespace — dead check removed.
            if cleaned:
                out.write(cleaned + ' ' + label)
            out.write('\n')
        else:
            out.write('\n')


def main():
    """Convert exported debate-card HTML files into token-per-line data.

    Walks every *.html5 file in the hard-coded card directory. For each
    <h4> (a card's tag line), the siblings up to the next <h4> form one
    card: the first <p> is assumed to be the tag text and skipped,
    h1/h2/h3 headings (pocket/hat/block) are ignored, and every other
    <p> is emitted as a "-DOCSTART- -X- -X- -X- O" document whose words
    are labelled "u" (inside a nested tag, i.e. underlined) or "n"
    (plain text). Appends to card_data_pos.txt.
    """
    num_of_cards = 0
    # TODO(review): hard-coded machine-specific path — parameterize if reused.
    os.chdir("/home/lain/manjaro-windows-shared/debate_cards-2014-2017")
    with open('card_data_pos.txt', 'a') as f:
        for file in glob.glob("*.html5"):
            print("File parsed: " + file)
            with open(file) as fp:
                soup = BeautifulSoup(fp, "lxml")
            for h4 in soup.find_all('h4'):
                counter = 0
                num_of_cards += 1
                for content in h4.next_siblings:
                    if content.name == "h4":
                        # Next card's tag line: this card is finished.
                        break
                    elif content.name == "p" and counter == 0:
                        # First <p> after the <h4> is likely the tag — skip it.
                        counter += 1
                    elif content.name in ("h1", "h2", "h3"):
                        # Pocket/hat/block headings — not card body text.
                        pass
                    elif content.name == "p":
                        # Card body paragraph: one CoNLL-style document.
                        f.write("-DOCSTART- -X- -X- -X- O")
                        f.write('\n')
                        for stuff in content:
                            if isinstance(stuff, bs4.element.Tag):
                                # Nested tag (e.g. underline span): label "u".
                                emit_tokens(f, stuff.text.split(), "u")
                            else:
                                # Plain NavigableString: label "n".
                                emit_tokens(f, stuff.split(), "n")
                    else:
                        # Anything else (whitespace strings, other tags).
                        pass
    print("Number of cards processed: " + str(num_of_cards))


if __name__ == "__main__":
    main()