-
Notifications
You must be signed in to change notification settings - Fork 3
/
webscraping.py
99 lines (79 loc) · 3.1 KB
/
webscraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import re, json, time, requests, os
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
# Base URL of the Bible REST API; the request paths used in this file
# ("/verses/..." and "/books/...") are appended to it.
API_URL = "https://www.abibliadigital.com.br/api"

# API token sent as "Authorization: Bearer <token>" on every API request.
# The value below is a placeholder to be replaced before running the script.
token = "YOUR TOKEN HERE"

# Book abbreviations to scrape; the main loop iterates over this list, so
# nothing is scraped while it is empty.
book_list = []
class PageLoaded(object):
    """Wait condition that holds once the browser shows a different document.

    Instances are callables taking the browser, which is the shape
    ``WebDriverWait.until`` expects. The condition is true when the id of the
    current ``<html>`` element is no longer the one recorded at construction.
    """

    def __init__(self, oldId):
        """Store the ``<html>`` element id observed before navigating.

        oldId: value of ``.id`` of the ``<html>`` element of the previous page.
        """
        self.oldId = oldId

    def __call__(self, browser):
        """Return True when the current ``<html>`` element id differs from ``oldId``.

        browser: object exposing ``find_element(by, value)``.
        """
        # "tag name" is the locator strategy string behind selenium's
        # By.TAG_NAME. find_element(by, value) is used instead of
        # find_element_by_tag_name because the find_element_by_* helpers were
        # removed in Selenium 4.3, while find_element exists in 3.x and 4.x.
        return self.oldId != browser.find_element("tag name", "html").id
def waitToLoad(browser, oldId):
    """Block until the browser has moved on from the page identified by ``oldId``.

    Waits up to 10 seconds for the ``PageLoaded(oldId)`` condition to hold.

    browser: the webdriver instance to poll.
    oldId: id of the ``<html>`` element captured before navigation.

    Returns True when the condition was met. Any exception raised while
    waiting (including the wait timing out) is printed and reported as False.
    """
    try:
        WebDriverWait(browser, 10).until(PageLoaded(oldId))
    except Exception as err:
        print(err)
        loaded = False
    else:
        loaded = True
    return loaded
def versesTotal(abbrev, chapter, token, timeout=30):
    """Return the number of verses in a chapter, as reported by the Bible API.

    Sends ``GET {API_URL}/verses/ra/{abbrev}/{chapter}`` with ``token`` as a
    Bearer token and returns ``["chapter"]["verses"]`` from the JSON body.

    abbrev: book abbreviation placed in the URL.
    chapter: chapter number placed in the URL.
    token: API token for the Authorization header.
    timeout: seconds to wait for the server before giving up (default 30).

    Raises ``requests.HTTPError`` on a 4xx/5xx response and a
    ``requests.Timeout`` subclass when the server does not answer in time.
    """
    response = requests.get(
        f"{API_URL}/verses/ra/{abbrev}/{chapter}",
        headers={"Authorization": "Bearer " + token},
        # requests has no default timeout; without one a stalled server would
        # block the whole scraping run indefinitely.
        timeout=timeout,
    )
    # Report a rejected request (bad token, rate limit, unknown book) as an
    # HTTP error instead of a KeyError on the missing "chapter" key below.
    response.raise_for_status()
    return response.json()["chapter"]["verses"]
# ---------------------------------------------------------------------------
# Script body: for every book in book_list, open each chapter page in Chrome,
# collect the verse texts and write them to chapters/<abbrev>.json.
# ---------------------------------------------------------------------------

# Locator strategy strings understood by find_element()/find_elements(); they
# are the values behind selenium's By.TAG_NAME and By.CSS_SELECTOR. The
# find_element_by_* / find_elements_by_* helpers were removed in Selenium 4.3.
_BY_TAG_NAME = "tag name"
_BY_CSS_SELECTOR = "css selector"

_VERSE_SELECTOR = "li[class=versiculoTexto]"
_SCROLL_TO_END = "window.scrollBy(0, document.body.scrollHeight)"
_SCROLL_PAUSE = 0.6        # seconds to wait after each scroll
_MAX_SCROLL_RETRIES = 20   # extra scroll attempts while verses are missing
_REQUEST_TIMEOUT = 30      # seconds; requests has no timeout by default
_OUTPUT_DIR = "chapters"

# Leading verse number, whitespace, then the verse text (may span lines).
_VERSE_RE = re.compile(r"\s*\d+\s+([\w\W]*)")

opcoes = ChromeOptions()
opcoes.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
# NOTE(review): passing the driver path positionally is the Selenium 3 / early
# Selenium 4 calling convention - confirm the installed version accepts it.
browser = Chrome("./chromedriver.exe", options=opcoes)

# try/finally so the browser (and its driver process) is closed even when a
# request, a page load or the parsing fails part-way through.
try:
    for abbrev in book_list:
        books_response = requests.get(
            f"{API_URL}/books/{abbrev}/",
            headers={"Authorization": "Bearer " + token},
            timeout=_REQUEST_TIMEOUT,
        )
        # Surface a rejected request as an HTTP error instead of a KeyError.
        books_response.raise_for_status()
        totalChapters = books_response.json()["chapters"]

        book = {}
        for chapter in range(1, totalChapters + 1):
            print(f"Capturing the chapter {chapter} with ", end="")

            # Open the chapter and wait until the document has been replaced.
            oldId = browser.find_element(_BY_TAG_NAME, "html").id
            browser.get(f"https://pesquisa.biblia.com.br/pt-BR/RA/{abbrev}/{chapter}")
            if not waitToLoad(browser, oldId):
                print("Script interrupted")
                break

            # Scroll to the end so the full page gets loaded.
            browser.execute_script(_SCROLL_TO_END)
            time.sleep(_SCROLL_PAUSE)

            # Collect the verses, scrolling again while the page shows a
            # different number of verses than the API reports.
            total = versesTotal(abbrev, chapter, token)
            verses = browser.find_elements(_BY_CSS_SELECTOR, _VERSE_SELECTOR)
            cont = 0
            while len(verses) != total and cont < _MAX_SCROLL_RETRIES:
                browser.execute_script(_SCROLL_TO_END)
                time.sleep(_SCROLL_PAUSE)
                verses = browser.find_elements(_BY_CSS_SELECTOR, _VERSE_SELECTOR)
                cont += 1

            # Strip the leading verse number from each element's text. An
            # element whose text does not start with a number is kept as-is
            # instead of aborting the whole run.
            book[chapter] = []
            for verse in verses:
                raw_text = verse.text
                match = _VERSE_RE.match(raw_text)
                verse_text = match.group(1) if match else raw_text
                book[chapter].append(verse_text.strip())

            print(len(book[chapter]), "verses")
            if len(book[chapter]) != total:
                # The retries ran out; make the incomplete chapter visible.
                print(f"Warning: expected {total} verses in {abbrev} {chapter}")
        else:
            # Runs only when no chapter failed to load (the loop did not
            # break): write the book, creating the output directory if needed.
            print("Creating File")
            os.makedirs(_OUTPUT_DIR, exist_ok=True)
            with open(os.path.join(_OUTPUT_DIR, f"{abbrev}.json"), "w",
                      encoding="utf-8") as file:
                json.dump(book, file, ensure_ascii=False, indent=2)

    print("Files created!")
finally:
    browser.quit()