parser.py
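# Googles a Turkish query for industrial engineering undergraduate curricula,
# keeps only results from edu.tr domains, scrapes every HTML <table> on each
# page with BeautifulSoup, and writes the rows to one CSV per university under
# the curriculums/ folder.
# Assumed dependencies (not pinned in the source): beautifulsoup4, requests,
# lxml, tldextract, and the old `google` package on PyPI, which provides this
# `googlesearch.search(query, tld=..., num=..., stop=..., pause=...)` signature.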
from bs4 import BeautifulSoup
import requests
from googlesearch import search
from tldextract import extract as extractTLD
import csv
import os
query="endüstri mühendisliği lisans ders programı"
urls = []
print("Retrieving urls from google..")
# num: how many urls are fetched per request, stop: total number of urls to collect,
# pause: seconds to wait between requests; 2 is safe, lower values risk being blocked by Google
for url in search(query, tld="com.tr", num=10, stop=100, pause=2):
    urls.append(url)
print("Retrieved " + str(len(urls)) + " urls.")
print("Starting to iterate through urls..")
os.makedirs('curriculums', exist_ok=True)  # Make sure the output folder exists before writing
for url in urls:
    rowsToPrint = []  # Rows to write to this site's CSV
    if extractTLD(url).suffix != "edu.tr":
        print(str(url) + " does not include 'edu.tr' suffix, skipping url..")
        continue
    try:
        html = requests.get(url).text
    except requests.exceptions.RequestException:  # In case of a communication failure we want the script to continue
        continue  # Skip this url; otherwise html would be stale or undefined
    soup = BeautifulSoup(html, "lxml")
    fileName = extractTLD(url).domain  # just the domain name, for a short file name
    tables = soup.find_all('table')
    print(str(fileName) + ": " + str(url))  # short name: url
    for table in tables:
        for row in table.find_all('tr'):
            course = []
            for cell in row.find_all('td'):
                course.append(cell.get_text().replace('\xa0', ''))
            rowsToPrint.append(course)
    with open('curriculums/' + str(fileName) + '.csv', 'w', newline='') as file:  # One CSV per site under curriculums/
        writer = csv.writer(file)
        writer.writerows(rowsToPrint)