-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrawler.py
62 lines (50 loc) · 1.79 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString
from bs4 import Tag
def parse_g1_new(new):
    """Extract headline, link and thumbnail from one G1 search-result element.

    Args:
        new: A BeautifulSoup element for a single result (``get_next_g1``
            passes the ``<li class="widget--info">`` node it found).

    Returns:
        A 4-tuple ``(title, url, image, new)``: the stripped headline text,
        the ``href`` of the headline anchor, the thumbnail ``src`` (or
        ``"not_found.png"`` when the result contains no ``<img>``), and the
        element itself, unchanged.

    Raises:
        IndexError: If the expected anchor, or the ``<div>`` inside it, is
            missing from the element.
    """
    # Check for a thumbnail explicitly instead of wrapping the lookup in a
    # bare ``except``: only "no <img> present" should select the placeholder;
    # any other failure must stay visible to the caller.
    images = new.findChildren("img")
    if images:
        image_src = images[0].get("src")
        # With a thumbnail present the headline is read from the second
        # anchor; presumably the first one wraps the image -- TODO confirm
        # against the live markup.
        anchor_index = 1
    else:
        image_src = "not_found.png"
        anchor_index = 0

    anchor = new.findChildren("a")[anchor_index]
    url = anchor.get("href")
    title = anchor.findChildren("div")[0].text
    return (title.strip(), url, image_src, new)
def parse_fsp_new(new):
    """Extract headline, link and thumbnail from one Folha search-result element.

    Args:
        new: A BeautifulSoup element for a single result (``get_next_fsp``
            passes the ``<li class="c-headline">`` node it found).

    Returns:
        A 4-tuple ``(title, url, image, new)``: the stripped headline text,
        the ``href`` of the headline anchor, the thumbnail ``src`` (or
        ``"not_found.png"`` when the result contains no ``<img>``), and the
        element itself, unchanged.

    Raises:
        IndexError: If the expected anchor, or the ``<h2>`` inside it, is
            missing from the element.
    """
    # Check for a thumbnail explicitly instead of wrapping the lookup in a
    # bare ``except``: only "no <img> present" should select the placeholder;
    # any other failure must stay visible to the caller.
    images = new.findChildren("img")
    if images:
        image_src = images[0].get("src")
        # With a thumbnail present the headline is read from the second
        # anchor; presumably the first one wraps the image -- TODO confirm
        # against the live markup.
        anchor_index = 1
    else:
        image_src = "not_found.png"
        anchor_index = 0

    anchor = new.findChildren("a")[anchor_index]
    url = anchor.get("href")
    title = anchor.findChildren("h2")[0].text
    return (title.strip(), url, image_src, new)
def get_search_url(text):
    """Build the G1 and Folha search URLs for a free-text query.

    The query is URL-encoded with ``quote_plus``: spaces become ``+`` (as
    before) and reserved characters such as ``&``, ``#``, ``+`` or ``%`` are
    percent-escaped so they cannot truncate or alter the query string.

    Args:
        text: The raw (unencoded) search terms.

    Returns:
        A 2-tuple ``(g1_url, folha_url)`` of search-page URLs.
    """
    query = quote_plus(text)
    g1_url = "https://g1.globo.com/busca/?q=" + query
    folha_url = "https://search.folha.uol.com.br/?q=" + query + "&site=todos"
    return g1_url, folha_url
def get_search_tree(url, timeout=10):
    """Download *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without a timeout a stalled connection
            would block the caller indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the raw response bytes using
        the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    # The HTTP status is deliberately not checked: an error page is parsed
    # like any other response, exactly as before.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
def get_next_g1(res: NavigableString | Tag | BeautifulSoup) -> tuple | None:
    """Return the parsed G1 result found in, or after, *res*.

    Args:
        res: Either a whole parsed page (``BeautifulSoup``), in which case the
            first ``<li class="widget--info">`` inside it is used, or an
            element of that page (``Tag`` / ``NavigableString``), in which
            case the next such ``<li>`` following it in the document is used.

    Returns:
        The tuple produced by ``parse_g1_new`` for the located element, or
        ``None`` when no matching element exists.

    Raises:
        TypeError: If *res* is not one of the supported BeautifulSoup types.
    """
    # BeautifulSoup must be tested first: it is itself a Tag subclass, and a
    # whole document is searched with find() rather than find_next().
    if isinstance(res, BeautifulSoup):
        noticia = res.find("li", class_="widget--info")
    elif isinstance(res, (NavigableString, Tag)):
        noticia = res.find_next("li", class_="widget--info")
    else:
        # Previously this path fell through with `noticia` unbound and failed
        # with an opaque UnboundLocalError.
        raise TypeError(
            "res must be a BeautifulSoup, Tag or NavigableString, not "
            + type(res).__name__
        )

    if noticia is None:
        return None
    return parse_g1_new(noticia)
def get_next_fsp(res: NavigableString | Tag | BeautifulSoup) -> tuple | None:
    """Return the parsed Folha result found in, or after, *res*.

    Args:
        res: Either a whole parsed page (``BeautifulSoup``), in which case the
            first ``<li class="c-headline">`` inside it is used, or an
            element of that page (``Tag`` / ``NavigableString``), in which
            case the next such ``<li>`` following it in the document is used.

    Returns:
        The tuple produced by ``parse_fsp_new`` for the located element, or
        ``None`` when no matching element exists.

    Raises:
        TypeError: If *res* is not one of the supported BeautifulSoup types.
    """
    # BeautifulSoup must be tested first: it is itself a Tag subclass, and a
    # whole document is searched with find() rather than find_next().
    if isinstance(res, BeautifulSoup):
        noticia = res.find("li", class_="c-headline")
    elif isinstance(res, (NavigableString, Tag)):
        noticia = res.find_next("li", class_="c-headline")
    else:
        # Previously this path fell through with `noticia` unbound and failed
        # with an opaque UnboundLocalError.
        raise TypeError(
            "res must be a BeautifulSoup, Tag or NavigableString, not "
            + type(res).__name__
        )

    if noticia is None:
        return None
    return parse_fsp_new(noticia)