This repository has been archived by the owner on Nov 30, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
caesb-scraping.py
88 lines (68 loc) · 2.79 KB
/
caesb-scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import re
import json
import requests
from bs4 import BeautifulSoup
from utils.twitter import post_tweet, message_tweet
from utils.mongodb import inputDB, updateDB, db_cei
def get_JSESSIONID():
    """Open a session on the CAESB water-outage page and return its cookies.

    Returns:
        A ``(jsessionid, bigip)`` tuple. ``jsessionid`` is the value of the
        ``JSESSIONID`` cookie. ``bigip`` is the value of the first cookie
        whose name starts with ``BIGipServer``; if no such cookie was set,
        the value of the first cookie in the jar is returned instead.

    Raises:
        KeyError: if the server did not set a ``JSESSIONID`` cookie.
        requests.RequestException: on connection failure or timeout.
    """
    url = 'https://www.caesb.df.gov.br/portal-servicos/app/publico/consultarfaltadagua?execution=e1s1'
    # The context manager releases the session's pooled connections.
    with requests.Session() as session:
        # Without a timeout, requests waits indefinitely on a stalled server.
        session.get(url, timeout=30)
        dict_cookies = session.cookies.get_dict()

    jsessionid = dict_cookies['JSESSIONID']
    # Select the cookie by name rather than by position, so JSESSIONID is not
    # returned twice when it happens to come first in the jar.
    bigip = next(
        (
            value
            for name, value in dict_cookies.items()
            if name.startswith('BIGipServer')
        ),
        list(dict_cookies.values())[0],
    )
    return jsessionid, bigip
# Fetch the session cookies once at module load; they are passed to
# exec_request() further down.
cookie_JSESSIONID, cookie_BIGipServerPOOL = get_JSESSIONID()
def exec_request(cookie_JSESSIONID, cookie_BIGipServerPOOL):
    """POST the partial-ajax form of the water-outage page and return the body.

    Args:
        cookie_JSESSIONID: value sent as the ``JSESSIONID`` cookie.
        cookie_BIGipServerPOOL: value sent as the
            ``BIGipServerPOOL_ESTABILIZACAO`` cookie.

    Returns:
        The response text when the server answers with status 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    url = 'https://www.caesb.df.gov.br/portal-servicos/app/publico/consultarfaltadagua?execution=e1s1'
    cookies = {
        'JSESSIONID': cookie_JSESSIONID,
        'BIGipServerPOOL_ESTABILIZACAO': cookie_BIGipServerPOOL,
    }
    headers = {
        'Host': 'www.caesb.df.gov.br',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }
    # Pre-encoded form body; sent verbatim.
    data = 'javax.faces.partial.ajax=true&javax.faces.source=j_idt44%3Aj_idt45&javax.faces.partial.execute=%40all&javax.faces.partial.render=formFaltaDeAgua&j_idt44%3Aj_idt45=j_idt44%3Aj_idt45&j_idt44=j_idt44&javax.faces.ViewState=e1s1'

    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.post(
        url, headers=headers, cookies=cookies, data=data, timeout=30
    )
    if response.status_code != 200:
        # Callers receive None for any non-200 answer.
        return None
    return response.text
# Run the POST with the cookies obtained earlier and keep the response body.
# NOTE(review): exec_request() yields None on a non-200 status; that case is
# not handled before parsing -- confirm whether a guard is wanted here.
response = exec_request(cookie_JSESSIONID, cookie_BIGipServerPOOL)
soup = BeautifulSoup(response, 'lxml')
# Collect every <update> element of the response.
updateXML = soup.find_all('update')
# Re-parse the first <update> element's markup on its own; get_listaGERAL()
# reads its table from this object. Raises IndexError when no <update> exists.
soup2 = BeautifulSoup(str(updateXML[0]), 'html.parser')
# Non-greedy pattern matching one <...> tag; cleanhtml() uses it to strip markup.
cleanr = re.compile('<.*?>')
def cleanhtml(raw_html):
    """Return ``raw_html`` with every match of the module-level ``cleanr`` pattern removed."""
    return cleanr.sub('', raw_html)
def get_listaGERAL():
    """Return the first table of ``soup2`` as rows of six cleaned cell texts.

    Every ``<td>`` under the ``<tbody>`` of the first ``<table>`` in the
    module-level ``soup2`` is passed through ``cleanhtml`` and the results
    are grouped, in document order, into lists of six items.

    Returns:
        A list of six-item lists of strings. Trailing cells that do not fill
        a complete group of six are dropped.

    Raises:
        IndexError: if ``soup2`` contains no ``<table>``.
        AttributeError: if the first table has no ``<tbody>``.
    """
    row_width = 6
    # find_all is the current spelling of the legacy findAll alias and matches
    # the call style already used elsewhere in this file.
    cells = [
        cleanhtml(str(td))
        for td in soup2.find_all('table')[0].tbody.find_all('td')
    ]
    # Only emit complete rows.
    complete = len(cells) - len(cells) % row_width
    return [cells[start:start + row_width] for start in range(0, complete, row_width)]
# Parse the outage table once; get_cei() reads this module-level list.
listaGERAL = get_listaGERAL()
# Handle returned by utils.mongodb.db_cei(); it is queried with .find() below.
# NOTE(review): presumably a MongoDB collection -- confirm in utils.mongodb.
infos_ceilandia = db_cei()
def get_cei(regioes=None):
    """Select the outage rows whose first field contains ``cei``.

    Args:
        regioes: iterable of indexable rows whose items 0-5 are mapped, in
            order, to the keys ``RA``, ``Áreas Afetadas``, ``Início``,
            ``Normalização``, ``Tipo de Falta de Água`` and
            ``Motivo da Falta de Água``. When omitted, the module-level
            ``listaGERAL`` is used, as before.

    Returns:
        A list with one dict per matching row, in input order. The match on
        the first field is case-insensitive.

    Raises:
        IndexError: if a matching row has fewer than six items.
    """
    if regioes is None:
        regioes = listaGERAL
    campos = (
        'RA',
        'Áreas Afetadas',
        'Início',
        'Normalização',
        'Tipo de Falta de Água',
        'Motivo da Falta de Água',
    )
    return [
        {campo: regiao[pos] for pos, campo in enumerate(campos)}
        for regiao in regioes
        if 'cei' in regiao[0].lower()
    ]
# Build the list of matching rows, then either update the stored records or,
# when the store returns nothing, insert each row and tweet about it.
cei_list = get_cei()
if list(infos_ceilandia.find()):
    # Records already exist: hand the whole list to updateDB.
    updateDB(cei_list)
else:
    # Empty store: insert each row and post one tweet per row.
    for new_data in cei_list:
        inputDB(new_data)
        post_tweet(message_tweet(new_data))