# -*- coding: utf-8 -*-
'''
Addon: Daijirin Definition Scraper
Copyright: (c) Jesse Barkdoll 2017-2019 <https://github.com/barkdoll>

This addon was created to grab and parse definitions from the
大辞林 (daijirin) dictionary hosted on <https://weblio.jp/>

License: GNU AGPLv3 or later <https://www.gnu.org/licenses/agpl.html>
'''
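
# Command-line usage, based on the argument handling at the bottom of
# this file (the search terms shown are examples):
#
#     python jisho_scraper.py 犬 猫      # scrape Daijirin definitions
#     python jisho_scraper.py 犬 --wiki  # use Wikipedia instead
#     python jisho_scraper.py list       # print saved definitions
#     python jisho_scraper.py cut        # copy definitions.txt to the
#                                        #   clipboard, then clear it
#     python jisho_scraper.py clear      # empty definitions.txt
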
import os
import sys
import requests
from bs4 import BeautifulSoup
import re
import pyperclip
from jisho_config import jisho_config
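
# The imported jisho_config is assumed to be a plain dict keyed by
# dictionary id (it lives in jisho_config.py, which is not shown here).
# A minimal sketch, with hypothetical dictCode values:
#
#     jisho_config = {
#         'daijirin':  {'name': '大辞林',    'url_id': 'ssdjj'},
#         'wikipedia': {'name': 'Wikipedia', 'url_id': 'wkpja'},
#     }
#
# scrape() upper-cases 'url_id' for Weblio's dictCode URL parameter and
# also matches it against the dictionary's category link.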

# Uncomment this to hide tracebacks; leave it commented for debugging
# sys.tracebacklimit = None

class Scraper:

    def __init__(self, term, jisho='daijirin'):
        self.term = term
        self.jisho = jisho
        self.jisho_name = jisho_config[self.jisho]['name']
        self.url_id = jisho_config[self.jisho]['url_id']
        self.data = self.scrape()

    def scrape(self):
        # Fetch the initial page source
        url = 'https://www.weblio.jp/content/{0}?dictCode={1}'.format(
            self.term, self.url_id.upper())
        print('searching at ' + url)
        sauce = requests.get(url).content
        soup = BeautifulSoup(sauce, 'html.parser')

        # Find the header of the selected dictionary
        header_url = soup.find('a', href=re.compile(
            ".+/cat/dictionary/{}.*".format(self.url_id))
        )

        # header_url is None when the page has no section for this
        # dictionary, in which case find_parent() raises AttributeError
        try:
            header = header_url.find_parent('div', class_='pbarT')
        except AttributeError:
            header = None

        if header is None:
            return None

        # Find the following element containing
        # the chosen dictionary's definitions
        entry = header.find_next_sibling('div', class_='kijiWrp')

        # Outputs Daijirin header(s) to a list for the user to choose from
        def parse_daijirin_def():
            data = {}
            entry_heads = entry.find_all('div', class_='NetDicHead')

            def choose_header(header_list):
                if len(header_list) > 1:
                    # If there is more than one entry head,
                    # the user must choose one from the console.
                    print(
                        "Choose which one you would like by typing " +
                        "the entry's number and pressing Enter:\n"
                    )
                    for q, choices in enumerate(header_list, 1):
                        print(u'{0}. '.format(q) + choices.text)

                    # The extra space looks clean :)
                    print('')

                    # Checks that the user's input is a number
                    # that is actually on the list
                    while True:
                        try:
                            chosen = header_list[int(input()) - 1]
                            break
                        except (ValueError, IndexError):
                            print("Error: enter a number that's on the list.")
                            continue

                # If there is only one header, it is selected
                # automatically for extracting definitions
                else:
                    chosen = header_list[0]

                return chosen

            # Runs the above function to get the proper header
            chosen_head = choose_header(entry_heads)
            chosen_body = chosen_head.find_next_sibling(
                'div', class_='NetDicBody'
            )

            # Takes multi-definition entries and generates a list for output
            defs = chosen_body.find_all('span', style="text-indent:0;")

            data['yomigana'] = chosen_head.find('b').text
            # Omits repetitive yomigana if the term is strictly in hiragana
            if data['yomigana'] == self.term:
                data['yomigana'] = ''

            # Handle multiple definitions and render them as an HTML list
            if len(defs) > 1:
                defs = [d.text for d in defs]
                # Removes extra whitespace inside the definition strings
                stripped = ["".join(piece.split()) for piece in defs]
                data['body'] = "\n".join(
                    ['<ol>'] +
                    [('<li>' + d + '</li>') for d in stripped] +
                    ['</ol>']
                )
            # A single definition is parsed straight into the HTML
            else:
                single_def = chosen_body.select_one("div div div").text
                data['body'] = '<br>\n' + "".join(single_def.split())

            return data

        def parse_wiki_def():
            data = {}
            chosen_head = entry.find('h2', class_='midashigo')
            chosen_body = chosen_head\
                .find_next_sibling('div', class_="Wkpja")\
                .find('p', class_=None)\
                .text

            data['yomigana'] = ''
            data['body'] = '<br>\n' + chosen_body
            return data

        def parse_action(dictionary):
            return {
                'daijirin': parse_daijirin_def,
                'wikipedia': parse_wiki_def
            }[dictionary]()

        definition = parse_action(self.jisho)

        html = '【{0}】 {1}{2}'.format(
            self.term, definition['yomigana'], definition['body']
        )
        return html
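
# Each successful Scraper(term).data is a single HTML string along these
# lines (a multi-definition Daijirin entry; definition text illustrative
# only), which write_txt_file() below appends to definitions.txt:
#
#     【犬】 いぬ<ol>
#     <li>イヌ科の哺乳類。…</li>
#     <li>ひそかに人の様子を探る者。…</li>
#     </ol>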

# Pushes the completed entry into the final output text file
def write_txt_file(txt):
    with open('definitions.txt', 'ab') as text_file:
        # Checks whether definitions.txt is empty or not
        if os.stat("definitions.txt").st_size == 0:
            text_file.write(txt.encode('utf-8'))
        else:
            text_file.write(
                ('\n\n<div>' + txt + '</div>').encode('utf-8')
            )


# Clears the text file if the 'clear' argument is passed
def clear():
    with open('definitions.txt', 'w') as clear_file:
        clear_file.write('')

# Initialize!
args = sys.argv[1:]

if len(args) == 0:
    raise ValueError('no terms given. I need a search term, pal.')
else:
    if any("list" in a for a in args):
        if os.stat("definitions.txt").st_size == 0:
            print("\nThere are no definitions to show!\n")
        else:
            with open('definitions.txt', 'rb') as read_file:
                print('\n', read_file.read().decode('utf-8'), '\n')

    elif any("cut" in a for a in args):
        # Reads the file as bytes ('r+b')
        with open('definitions.txt', 'r+b') as copy_file:
            contents = copy_file.read()
        # Decodes the bytes as UTF-8 and copies the text to the clipboard
        pyperclip.copy(contents.decode('utf-8'))
        # Clears the contents of the file
        clear()

    elif any("clear" in a for a in args):
        clear()

    else:
        call_jisho = 'daijirin'
        if any("--wiki" in a for a in args):
            call_jisho = 'wikipedia'
            args = [a for a in args if a != "--wiki"]

        accumulator = []
        for term in args:
            item = Scraper(term, call_jisho).data
            if item:
                write_txt_file(item)
                accumulator.append(item)
            else:
                print(
                    "\nNo " + jisho_config[call_jisho]['name'] +
                    " definitions found for '" + term +
                    "'.\nCheck your input or try another dictionary.\n"
                )

        print('\n' + '\n\n'.join(accumulator) + '\n')