-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
101 lines (80 loc) · 5.27 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from bs4 import BeautifulSoup
import requests
import html
import pandas as pd
import csv
from datetime import datetime
from datetime import timedelta
import re
#Version 5.2
# Scrapes the search results of debatgemist.tweedekamer.nl for a search term,
# collects every statement found in the listed debates together with metadata
# from the debate page itself, and writes the result to a .csv file.

SEARCH_URL = "https://debatgemist.tweedekamer.nl/zoeken"
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# Two consecutive fragments by the same speaker are merged when their start
# offsets (the trailing number of the statement URL) are at most this far apart.
MERGE_WINDOW = 10
DEFAULT_FUNCTION = 'Kamerlid'
COLUMNS = ['Date', 'Name', 'Party', 'Function', 'Statement', 'Debate_Subject',
           'Type_of_debate', 'Particular_committee', 'Speaker', 'URL', 'ID']

# Compiled once, as raw strings, because they are used inside the inner loops.
NON_WORD_RE = re.compile(r'\W')
WHITESPACE_RE = re.compile(r'\s+')
STATEMENT_SPLIT_RE = re.compile(r'[\W](?<!\d)[.,](?!\d)')
SPEAKER_SPLIT_RE = re.compile(r'\s|(?<!\d)[.](?!\d)')


def fetch_soup(url, params=None):
    """Download *url* (with optional query *params*) and parse it with lxml."""
    response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, "lxml")


def split_listing(text):
    """Split the text of one search-result item into name, party and pieces.

    The item text holds the politician's name, the party and the statement in
    one string. Whitespace runs are collapsed to a single space, the text is
    split into pieces on a non-word character followed by '.' or ',' (unless a
    digit follows), and the first piece is split into tokens on whitespace or
    a '.' that is not next to a digit.

    Returns a tuple ``(name, party, pieces)``. ``party`` is the last token of
    the first piece. ``name`` joins the tokens after the first one, leaving
    out the trailing '-' (when the party token itself is '-') or the trailing
    two tokens otherwise.
    """
    pieces = STATEMENT_SPLIT_RE.split(WHITESPACE_RE.sub(" ", text))
    tokens = SPEAKER_SPLIT_RE.split(pieces[0])
    party = tokens[-1]
    name_tokens = tokens[1:-1] if party == '-' else tokens[1:-2]
    return " ".join(name_tokens), party, pieces


def lookup_function(ministers, name, debate_date):
    """Return the function *name* held on *debate_date*.

    *ministers* is a DataFrame with the columns 'Name', 'Function',
    'start_date' and 'end_date'. When no row matches the name with
    start_date < debate_date < end_date, 'Kamerlid' is returned.
    """
    in_office = (
        (ministers['Name'] == name)
        & (ministers['start_date'] < debate_date)
        & (ministers['end_date'] > debate_date)
    )
    matches = ministers.loc[in_office, 'Function']
    if matches.empty:
        return DEFAULT_FUNCTION
    return matches.to_string(index=False).strip()


def fetch_statement_details(s_url):
    """Load the debate page behind *s_url* and extract its metadata.

    Some data is only written on the page of the debate itself, so that page
    is loaded and read here. Returns a dict with the debate date (as datetime
    under 'date' and formatted under 'Date'), 'Speaker', 'Type_of_debate',
    'Particular_committee' and 'ID'.
    """
    st_soup = fetch_soup(s_url)
    meta = st_soup.find('div', class_="meta")
    date_parts = NON_WORD_RE.split(meta.time['datetime'])
    debate_date = datetime(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))

    spans = meta.find_all('span')
    typ_deb = spans[-1].text
    typ_com = spans[0].text
    if typ_com == typ_deb:
        typ_com = 'Not a particular committee'

    # The ID combines the date with fixed token positions of the second
    # "option" label (presumably its start and end time -- TODO confirm).
    time_parts = NON_WORD_RE.split(st_soup.find_all("label", class_="option")[1].text)
    ident_num = (''.join(date_parts[0:3]) + '-'
                 + ''.join(time_parts[6:8]) + ''.join(time_parts[10:12]))

    return {
        'date': debate_date,
        'Date': debate_date.strftime("%b %d %Y"),
        'Speaker': st_soup.find_all('option')[1].text,
        'Type_of_debate': typ_deb,
        'Particular_committee': typ_com,
        'ID': ident_num,
    }


def scrape(search_term, page_count, ministers):
    """Scrape *page_count* result pages for *search_term*.

    Returns a DataFrame with the columns listed in COLUMNS, one row per
    statement. *ministers* is the lookup table passed to lookup_function().
    """
    # Rows are collected in a list and turned into a DataFrame once at the
    # end; DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []
    prev_name, prev_start = "", 0

    for page in range(page_count):
        # Passing the term through ``params`` lets requests URL-encode it.
        soup = fetch_soup(SEARCH_URL, {"search_api_views_fulltext": search_term,
                                       "page": page})
        debates = soup.find_all('div', class_="data")
        print("Reading page", page + 1, "of", page_count)

        # Every listed debate in which the site found the search term.
        for debate in debates:
            debate_sub = debate.div.h2.text

            # Every particular statement from a politician within the debate.
            for item in debate.find_all('li'):
                s_url = item.a['href']
                # Only links to debates are archived; skip anything else
                # before trying to parse it.
                if 'debatten' not in s_url:
                    continue

                start = int(NON_WORD_RE.split(s_url)[-1])
                name, party, pieces = split_listing(item.text)
                fragment = pieces[-3]

                # Spoken text is split over several items. Re-unite it when it
                # belongs together, which also saves opening the debate page
                # for information that was already found.
                gap = start - prev_start
                if rows and name == prev_name and 0 < gap <= MERGE_WINDOW:
                    rows[-1]['Statement'] += fragment
                    prev_start = start
                    continue

                prev_name, prev_start = name, start
                details = fetch_statement_details(s_url)
                # The function of a minister or secretary is kept per name in
                # a separate dataset, because it could not easily be retrieved
                # from the web pages.
                rows.append({
                    'Date': details['Date'],
                    'Name': name,
                    'Party': party,
                    'Function': lookup_function(ministers, name, details['date']),
                    'Statement': fragment,
                    'Debate_Subject': debate_sub,
                    'Type_of_debate': details['Type_of_debate'],
                    'Particular_committee': details['Particular_committee'],
                    'Speaker': details['Speaker'],
                    'URL': s_url,
                    'ID': details['ID'],
                })

    return pd.DataFrame(rows, columns=COLUMNS)


def main():
    """Ask for the search term and page count, scrape, and write the .csv."""
    print("What is the word you want to search?")
    Searchterm = input()
    print("How many pages do you want to scrape?")
    c_pages = int(input())  # Number of result pages to go through, from the first.
    begin_time = datetime.now()

    # Missing dates in the sheet are filled with the current time, so a
    # function without an end date counts as still being held.
    Minis = pd.read_excel('Ministers.xlsx').fillna(begin_time)

    data = scrape(Searchterm, c_pages, Minis)

    end_time = datetime.now()
    completion_time = end_time - begin_time
    current_time = end_time.strftime("%d%m%Y-%H%M")
    # Ultimately the data is meant to be analysed, so it is written to a .csv
    # file automatically.
    data.to_csv(Searchterm + '-' + current_time + '.csv')
    print("It took me", completion_time, "to search for", Searchterm, "on", c_pages, "page(s).")


if __name__ == "__main__":
    main()