-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapper.py
97 lines (73 loc) · 2.97 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import json
import requests
from bs4 import BeautifulSoup
import re
from src.common.database import database
def extract_subjects_info(table):
tr_elements = table.find_all("tr")
subject_type = "Obrigatória"
period = 1
subjects = []
for tr in tr_elements:
# encontra o tipo de disciplina
if tr.get("bgcolor") == "#658CCF":
text = tr.td.get_text(strip=True).strip()
clean_text = re.sub(r'\s+', ' ', text)
subject_type = clean_text.split(" ")[1][:-1]
# encontra o periodo
if tr.get("bgcolor") == "#CCCCCC":
td_text = tr.find("td").get_text(strip=True)
period = int(re.search(r"\d+", td_text).group())
# encontra as disciplinas
if tr.find("a", class_="link_gray"):
td_elements = tr.find_all("td")
code = re.sub(r'\s+', ' ', td_elements[0].get_text(strip=True))
name = re.sub(r'\s+', ' ', td_elements[1].get_text(strip=True))
subjects.append({
"subject_type": subject_type,
"period": period,
"code": code,
"name": name,
})
return subjects
def get_programs():
BASE_URL = "https://uspdigital.usp.br/jupiterweb/"
PROGRAMS_URL = f"{BASE_URL}jupCursoLista?codcg=3&tipo=N"
page = requests.get(PROGRAMS_URL)
soup = BeautifulSoup(page.content, "html5lib")
tables = soup.select("table")
tables = [tables[3], tables[5], tables[7]]
# obtem todos os cursos da Poli
tr_data = []
for table in tables:
tr_elements = table.find_all("tr")[1:] # Ignore the first <tr> element
for tr in tr_elements:
td_elements = tr.find_all("td")
if len(td_elements) >= 2:
# Get the href from the first <td>
first_td = td_elements[0]
a_tag = first_td.find("a", class_="link_gray")
tr_href = a_tag["href"] if a_tag else None
# Get the text from the second <td>
second_td = td_elements[1]
tr_text = second_td.get_text(strip=True)
tr_data.append({"text": tr_text, "href": tr_href})
# remove entradas que nao tem um link para a pagina de grade curricular
tr_data = [d for d in tr_data if d.get("href") is not None]
# vai a cada grade curricular para obter as disciplinas
for data in tr_data:
page = requests.get(f"{BASE_URL}{data.get('href')}")
soup = BeautifulSoup(page.content, "html5lib")
try:
target_table = soup.select("table")[-2]
except IndexError:
data["subjects"] = []
continue
subjects = extract_subjects_info(target_table)
data["subjects"] = subjects
# with open('json_data.json', 'w', encoding='utf-8') as outfile:
# json.dump(tr_data, outfile)
programs = database["programs"]
programs.insert_many(tr_data)
if __name__ == "__main__":
get_programs()