-
Notifications
You must be signed in to change notification settings - Fork 1
/
Webscrap_IMDB_Library.py
184 lines (140 loc) · 7.02 KB
/
Webscrap_IMDB_Library.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
""" Fortgeschrittene Softwaretechnik """
""" Teodor Chiaburu, 900526 """
""" Library containing classes for creating the webscraper and testing it """
# relevant libraries
import requests
import pandas as pd
import unittest
from numpy import isnan
from bs4 import BeautifulSoup
# helper used while parsing the film rows: strips round brackets from a string
def replace_brackets(s):
    """Return ``s`` with every "(" and ")" character removed.

    All other characters, including whitespace, are left untouched.
    """
    return s.replace("(", "").replace(")", "")
class Webscraper:
    """Class for creating webscraping objects.

    Attributes:
        top_url: address of the website to scrape
        html: the response returned by ``requests.get`` for ``top_url``
        soup: object of type BeautifulSoup used to scrape the data
        list_csv: list where the interesting rows of data are stored,
            one dictionary per film

    Methods:
        get_table(): get the data from the web page as a table
        iterate_films(action): apply the operation ``action`` iteratively
            on each row of data
        add_films(dict_row, row_pop): add the relevant features of a film
            to ``list_csv``; the features found in ``row_pop`` are first
            stored in the dictionary ``dict_row``
        convert_to_df(path): convert ``list_csv`` into a data frame and
            save it as a csv file
    """

    # The source file's original comment states that, inside the "title"
    # attribute of the rating cell, the number of votes starts at index 13
    # and ends at the next white space.
    # NOTE(review): this depends on the exact wording of that attribute --
    # confirm against the live page.
    _VOTES_START = 13

    def __init__(self, top_url, timeout=30):
        """Download ``top_url`` and prepare it for parsing.

        Parameters:
            top_url: address of the IMDB page with the most popular 100 films
            timeout: seconds to wait for the server before ``requests``
                gives up; without a timeout the request can block forever
        """
        self.top_url = top_url
        # get html code of the page
        self.html = requests.get(
            top_url,
            headers={'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'},
            timeout=timeout,
        )
        # create BeautifulSoup object to parse the html
        self.soup = BeautifulSoup(self.html.content, "html.parser")
        # store the rows for the csv file in a list
        self.list_csv = []

    def get_table(self):
        """Return the ``<tbody class="lister-list">`` element holding the films.

        The result is whatever ``soup.find`` returns, i.e. ``None`` when the
        page contains no such element.
        """
        return self.soup.find("tbody", {"class": "lister-list"})

    def iterate_films(self, action):
        """Call ``action(dict_row, row_pop)`` once for every table row.

        Parameters:
            action: callable taking a fresh, empty dictionary and the
                ``<tr>`` element of one film (e.g. ``self.add_films``)
        """
        table = self.get_table()
        # each new film is shown on a table row -> tag <tr>
        for row_pop in table.find_all("tr"):
            # every row gets its own dictionary so rows never share state
            action({}, row_pop)

    def add_films(self, dict_row, row_pop):
        """Extract the features of one film and append them to ``list_csv``.

        Fills ``dict_row`` with the keys "Title", "Year", "Popularity",
        "Rating" and "Votes" (in this order) and appends it to
        ``self.list_csv``.

        Parameters:
            dict_row: dictionary that receives the film's features
            row_pop: the ``<tr>`` element describing the film
        """
        # section with title, year and increase/drop in popularity
        title_col = row_pop.find("td", {"class": "titleColumn"})
        dict_row["Title"] = title_col.find("a").text

        # spans containing release year and popularity increase/drop
        year_and_pop = title_col.find_all("span")
        dict_row["Year"] = replace_brackets(year_and_pop[0].text)

        try:
            pop_span = year_and_pop[1]
            pop = int(
                replace_brackets(pop_span.text).replace("\n", " ").replace(",", "")
            )
        except (IndexError, ValueError):
            # no popularity span (IndexError) or no number inside it
            # (ValueError): the popularity is treated as constant
            pop = 0
        else:
            # pop is an absolute value; if the span holds no element with
            # class 'up', the change is a drop in popularity
            if pop_span.find("span", {"class": "up"}) is None:
                pop = -pop
        dict_row["Popularity"] = pop

        # section with IMDB score and number of ratings
        rating_col = row_pop.find("td", {"class": "imdbRating"})
        try:
            rating = float(rating_col.text)
            # number of voting users is mentioned in the title attribute
            title_attribute = rating_col.find("strong")["title"]
            end_index = title_attribute.find(" ", self._VOTES_START)
            votes = int(
                title_attribute[self._VOTES_START:end_index].replace(",", "")
            )
        except (AttributeError, TypeError, KeyError, ValueError):
            # missing cell / <strong> / title attribute, or unparsable
            # numbers: fall back to zero for both values together
            rating, votes = 0.0, 0
        dict_row["Rating"] = rating
        dict_row["Votes"] = votes

        # add newly created dictionary to the list
        self.list_csv.append(dict_row)

    def convert_to_df(self, path="imdb_top100.csv"):
        """Convert ``list_csv`` to a data frame and save it as a csv file.

        Parameters:
            path: where the csv file is written; defaults to
                "imdb_top100.csv" in the current working directory

        Returns:
            the data frame, indexed from 1 with the index named "Rank"
        """
        df = pd.DataFrame(self.list_csv)
        # start the id column at 1 and give it a name
        df.index += 1
        df.index.name = "Rank"
        df.to_csv(path)
        return df
class TestWebscraper(unittest.TestCase):
    """Testing class for Webscraper, inherited from unittest.TestCase.

    Each check prints its outcome instead of raising, so all checks can be
    run one after another on the same data frame.

    Attributes:
        df: movie data frame to be tested (``None`` until attached)

    Methods:
        add_dataframe(df): adds the data frame to be tested to the object
        test_shape(): check whether the data frame has the expected number
            of examples and features
        test_isnan(): check if the columns 'Popularity', 'Rating' and
            'Votes' have NaNs
        test_start_index(): check if the first index in the data frame was
            correctly set to 1
    """

    # columns that must not contain NaNs, checked in this order
    _NUMERIC_COLUMNS = ('Popularity', 'Rating', 'Votes')

    # no data frame is attached until add_dataframe() is called
    df = None

    def add_dataframe(self, df):
        """Set attribute ``df`` to the given data frame."""
        self.df = df

    def _require_dataframe(self):
        """Skip the current test when no data frame has been attached.

        A test runner instantiates the class itself and never calls
        add_dataframe(); skipping reports that clearly instead of failing
        with an AttributeError.
        """
        if self.df is None:
            self.skipTest('no data frame attached; call add_dataframe() first')

    def test_shape(self):
        """Check the dimensions of the data: 100 rows and 5 columns."""
        self._require_dataframe()
        try:
            self.assertEqual(self.df.shape, (100, 5))
        except AssertionError as err:
            print('Shape test failed!')
            print(str(err))
        else:
            print('Shape test passed!')

    def test_isnan(self):
        """Check whether there are NaNs in the numeric columns.

        Stops at the first column containing a NaN and reports its name.
        """
        self._require_dataframe()
        for current_col in self._NUMERIC_COLUMNS:
            # pandas' isna() also works on object-dtype columns, where
            # numpy's isnan would raise a TypeError
            if self.df[current_col].isna().any():
                print('NaN test failed!')
                print(current_col + ' column has NaN values!')
                return
        print('NaN test passed!')

    def test_start_index(self):
        """Check if the starting index was set to 1."""
        self._require_dataframe()
        try:
            # reads the 'start' attribute of the index, as the original did
            # NOTE(review): presumably only a pandas RangeIndex provides it -- confirm
            self.assertEqual(self.df.index.start, 1)
        except AssertionError as err:
            print('Start index test failed!')
            print(str(err))
        else:
            print('Start index test passed!')