import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline
# Column buffers filled by the scraping loop below. A movie is appended to
# all seven lists at once, so index n in every list refers to the same film.
movie_names = []
movie_years = []
imdb_ratings = []
meta_ratings = []
movie_earnings = []
movie_genres = []
runtimes = []

pages = 10  # 10 pages per year.
start_year = 2000
end_year = 2018

# NOTE(review): the search-URL prefix is an empty string in this file, so
# urlopen() cannot resolve the address as written. Fill in the IMDb
# advanced-search prefix (scheme, host, path, and the query parameter that
# the "<year>-01-01,<year>-12-31" date range belongs to) before running.
imdb_url_prefix = ''

# range() below yields end_year - start_year + 1 years, so the progress bar
# total must include the final year as well.
for i in tqdm_notebook(range(start_year, end_year + 1),
                       total=end_year - start_year + 1, unit='year'):
    for j in range(1, pages + 1):
        # The 'start' query parameter advances by 50 per page: 1, 51, 101, ...
        k = 1 + 50 * (j - 1)
        imdb_url = imdb_url_prefix + str(i) + '-01-01,' + str(i) + '-12-31&sort=num_votes,desc&start=' + str(k) + '&ref_=adv_nxt'

        # Parse inside the `with` so the HTTP response is always closed.
        with urllib.request.urlopen(imdb_url) as page_unparsed:
            page_parsed = BeautifulSoup(page_unparsed, 'html.parser')

        movie_divs = page_parsed.find_all('div', class_='lister-item mode-advanced')
        for movie in movie_divs:
            meta_rating_unparsed = movie.find('div', class_='inline-block ratings-metascore')
            # find_all() returns a (possibly empty) list, never None, so the
            # length test is the only meaningful check on it.
            gross_unparsed = movie.find_all('span', attrs={'name': 'nv'})
            if meta_rating_unparsed is None or len(gross_unparsed) != 2:
                # Without a Metascore the parse below would fail on None;
                # entries without exactly two 'nv' spans are skipped too.
                continue

            # Extract every field first and append afterwards, so a parse
            # error cannot leave the lists with different lengths.
            meta = int(meta_rating_unparsed.text.replace(" ", "").split("\n")[1])
            genre = movie.find('span', attrs={'class': 'genre'}).text.strip().split(",")[0]
            imdb = float(movie.find('div', class_='inline-block ratings-imdb-rating').text)
            name = movie.find('h3', class_='lister-item-header').find('a').text

            # NOTE(review): reconstructed - the statement that filled
            # movie_earnings is not present in this file. Assumes the second
            # 'nv' span holds the gross as text like "$187.71M"; confirm
            # against the page markup.
            gross = float(gross_unparsed[1].text.strip().lstrip('$').rstrip('M'))

            # NOTE(review): reconstructed - the body of the original
            # `if len(year) == 1:` branch is not present in this file.
            # Assumes the span text is "(2000)" or "(I) (2000)", so the last
            # space-separated token carries the year inside parentheses.
            year = movie.find('span', class_='lister-item-year text-muted unbold').text.split(" ")
            release_year = int(year[-1][1:5])

            # strip('min') removes the trailing unit; int() tolerates the
            # remaining whitespace.
            runtime = int(movie.find('span', class_='runtime').text.strip('min'))

            movie_names.append(name)
            movie_years.append(release_year)
            movie_genres.append(genre)
            imdb_ratings.append(imdb)
            meta_ratings.append(meta)
            movie_earnings.append(gross)
            runtimes.append(runtime)
# How the cached CSV was produced from the scraped lists (kept for reference):
# movies = pd.DataFrame(list(zip(movie_names, movie_years, movie_genres, imdb_ratings, meta_ratings, movie_earnings, runtimes)), columns =['name', 'year', 'genre', 'imdb', 'meta', 'gross', 'runtime'])

# Load the cached scrape, discard the "Unnamed: 0" column (presumably the
# index written by the to_csv call below), and remove the rows labelled 170
# and 1001.
# NOTE(review): the reason for dropping rows 170 and 1001 is not recorded here.
movies = (
    pd.read_csv("movies.csv")
    .drop(columns="Unnamed: 0")
    .drop(index=[170, 1001])
)
# movies.to_csv("movies.csv")
imdb | meta | gross | runtime | |
count | 4877.000000 | 4877.000000 | 4877.000000 | 4877.000000 |
mean | 6.517572 | 56.017839 | 37.897115 | 107.052696 |
std | 0.971941 | 17.860327 | 68.181217 | 18.182487 |
min | 1.500000 | 1.000000 | 0.000000 | 61.000000 |
25% | 6.000000 | 43.000000 | 0.720000 | 95.000000 |
50% | 6.600000 | 57.000000 | 11.440000 | 104.000000 |
75% | 7.200000 | 69.000000 | 45.170000 | 116.000000 |
max | 9.000000 | 100.000000 | 936.660000 | 366.000000 |
name | year | genre | imdb | meta | gross | runtime | |
0 | Gladiator | 2000 | Action | 8.5 | 67 | 187.71 | 155 |
1 | Memento | 2000 | Mystery | 8.5 | 80 | 25.54 | 113 |
2 | Snatch | 2000 | Comedy | 8.3 | 55 | 30.33 | 102 |
3 | Requiem for a Dream | 2000 | Drama | 8.3 | 68 | 3.64 | 102 |
4 | X-Men | 2000 | Action | 7.4 | 64 | 157.30 | 104 |
# matplotlib's style module provides the named style sheets used below.
import matplotlib.style as style

# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-*'; pick whichever spelling this installation lists.
_poster_style = 'seaborn-poster' if 'seaborn-poster' in style.available else 'seaborn-v0_8-poster'
style.use(_poster_style) #sets the size of the charts
Palettes :
# Scatter plot of IMDb vs. Metascore with a fitted regression line
# (small markers, since there are several thousand points).
sns.lmplot(x='imdb', y='meta', data=movies, scatter_kws={"s": 10})

# Bivariate density of the same two scores. The variables are passed by
# keyword (x=/y=) and the contours are filled with fill=True, which is the
# form current seaborn releases accept; a second positional vector and the
# old `shade` flag are no longer supported.
sns.kdeplot(
    data=movies,
    x='imdb',
    y='meta',
    cmap=sns.cubehelix_palette(light=1, as_cmap=True),
    fill=True,
)
There is a high positive correlation between both the scoring metrics.
**Inference 1:** People and critics tend to have the same view on movies. Both have the same opinions on movies with a rating of 6 (or 60).
# Overlay the two score distributions on one axis. Metascores run 0-100,
# so they are divided by 10 to share the IMDb scale.
imdb_scores = movies['imdb']
meta_scores_scaled = movies['meta'] / 10

fig, ax = plt.subplots()
sns.kdeplot(imdb_scores, ax=ax)
sns.kdeplot(meta_scores_scaled, ax=ax)
**Inference 2:** Meta ratings follow a normal distribution, while IMDb ratings tend to favour the 5 - 8 range.
# One column of IMDb ratings per genre, in order of first appearance. The
# Series keep their original row labels, so the resulting frame holds NaN
# wherever a movie belongs to a different genre.
dicts = {
    genre: movies.loc[movies['genre'] == genre, 'imdb']
    for genre in movies['genre'].unique()
}
temp = pd.DataFrame(dicts)

# Very wide, short canvas for the per-genre view.
fig, ax = plt.subplots(figsize=(100, 5))
**Inference 3:** Biography movies tend to have high ratings.
# Box plot of IMDb rating for every genre on a wide canvas.
fig, ax = plt.subplots(figsize=(21, 8))
ax = sns.boxplot(data=movies, x='genre', y='imdb')
**Inference 4:** In the world of movies, there are a lot of outliers. I don't think a machine learning model could work well on datasets like this (to predict the likeability of a movie) - unless we have more attributes.
# movie with lowest imdb rating
name Beyond the Lights
year 2014
genre Drama
imdb 6.9
meta 73
gross 14.62
runtime 116
Name: 3993, dtype: object
# movie with highest imdb rating
name Iron Man
year 2008
genre Action
imdb 7.9
meta 79
gross 318.41
runtime 126
Name: 2174, dtype: object
# movie with lowest meta rating
name Welcome to Marwen
year 2018
genre Biography
imdb 6
meta 40
gross 10.76
runtime 116
Name: 4871, dtype: object
# movie with highest meta rating
name Maleficent
year 2014
genre Action
imdb 7
meta 56
gross 241.41
runtime 97
Name: 3797, dtype: object
**Inference 5 - 8:**
- IMDb:
- Highest rated: The Dark Knight (2008). Score: 9
- Lowest rated: Saving Christmas (2014). Score: 1.5
- Metacritic:
- Highest rated: Boyhood (2014). Score: 100
- Lowest rated: Death of a Nation (2018). Score: 1
Each movie had anywhere from 1 to 3 genres. To simplify the process, I kept only the first genre listed for each movie, assuming it is the most representative. Looking at the dataset, only 9 genres occur frequently; the remaining ones appear in just a handful of movies.
# Alternative views that were tried and left disabled:
# sns.pairplot(movies, hue = 'genre')
# sns.lmplot(x = 'imdb', y = 'meta', data = movies, hue = 'genre')

# Regression of gross on IMDb rating, drawn on a random 1500-row sample.
sampled_movies = movies.sample(1500)
sns.lmplot(data=sampled_movies, x='imdb', y='gross', scatter_kws={"s": 5})
**Inference 9:** No correlation between IMDb scores and movie box office. The same can be inferred for meta scores.
# By default, barplot shows the mean. The graph below shows that family
# movies had the highest gross per movie.
fig, ax = plt.subplots(figsize=(18, 4))
ax = sns.barplot(data=movies, x='genre', y='gross', palette="Blues_d")
# Tilt the genre names so neighbouring labels do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=30)
ax.set_title('mean gross per genre')
Hmmm. Notice the black line (the error bar) for family movies. It tells us that there is a high chance of this average being erroneous.
Comedy 1297
Drama 1041
Action 962
Crime 351
Biography 321
Animation 256
Adventure 242
Horror 180
Documentary 176
Mystery 15
Fantasy 14
Romance 8
Thriller 6
Musical 2
Family 2
Music 2
Sci-Fi 1
War 1
Name: genre, dtype: int64
# Show every movie whose (first) genre is 'Family'.
movies.loc[movies['genre'].eq('Family')]
name | year | genre | imdb | meta | gross | runtime | |
1201 | Raise Your Voice | 2004 | Family | 5.9 | 33 | 10.41 | 103 |
4502 | Beauty and the Beast | 2017 | Family | 7.2 | 65 | 504.01 | 129 |
# Per-genre totals of the numeric measures. The columns are selected
# explicitly so that text columns such as 'name' are never aggregated,
# whatever the pandas version's default for non-numeric columns is.
temp = movies.groupby('genre')[['imdb', 'meta', 'gross', 'runtime']].sum()

# 'ppm': total gross of the genre divided by its total runtime in minutes.
temp['ppm'] = temp['gross'] / temp['runtime']

# Five genres with the highest gross per minute of runtime.
temp.sort_values('ppm', ascending=False).head()
imdb | meta | gross | runtime | ppm | |
genre | |||||
Family | 13.1 | 98 | 514.42 | 232 | 2.217328 |
Animation | 1720.9 | 15449 | 24947.05 | 23922 | 1.042850 |
Action | 6054.4 | 47835 | 67386.31 | 108042 | 0.623705 |
Adventure | 1570.0 | 13489 | 16524.08 | 26933 | 0.613525 |
Mystery | 98.8 | 832 | 949.80 | 1638 | 0.579853 |
**Inference 12:** Family movies had the best profit per movie value (514.42 million for 2 movies!) and earned a whopping 2.2 million dollars per minute of screen time. This amount was largely due to Beauty and the Beast.
Inference 13: The most profitable movie genres are Family, Animation, and Action.
# Bar height is the total (not the mean) gross of each genre, because the
# estimator is the built-in sum.
fig, ax = plt.subplots(figsize=(18, 4))
ax = sns.barplot(data=movies, x='genre', y='gross', estimator=sum, palette="Blues_d")
ax.set_title('sum of grosses per genre')
**Inference 14:** Action movies had the most profits (67k million), followed by comedy movies (34k million) and then animation movies (25k million).
# Number of movies in each genre.
fig, ax = plt.subplots(figsize=(18, 4))
ax = sns.countplot(data=movies, x='genre', palette="Blues_d")
# Tilt the genre names so neighbouring labels do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=30)
ax.set_title('number of movies per genre')
Inference 14: Comedy movies were the most frequently released, followed by drama and action movies.
# Mean IMDb rating of each genre (barplot's default estimator is the mean).
fig, ax = plt.subplots(figsize=(18, 4))
ax = sns.barplot(data=movies, x='genre', y='imdb', palette="Blues_d")
# Tilt the genre names so neighbouring labels do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=30)
ax.set_title('average imdb rating per genre')
**Inference 15:** For IMDb ratings, on average, horror movies got the lowest ratings while war, documentary, and musical movies seem to get the highest.
Inference 16: IMDb movies tend to approximately get the same ratings regardless of genre.
# Mean Metascore of each genre (barplot's default estimator is the mean).
fig, ax = plt.subplots(figsize=(18, 4))
ax = sns.barplot(data=movies, x='genre', y='meta', palette="Blues_d")
# Tilt the genre names so neighbouring labels do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=30)
ax.set_title('average meta rating per genre')
Inference 17: For meta ratings, documentary movies get the highest average rating.
Inference 18: Unlike IMDb ratings, mean Metacritic ratings vary noticeably from genre to genre.
Inference 19: Horror movies are the most disliked.
Inference 20: Documentary movies are the most liked.
# Same overlay of the two score distributions as earlier: Metascores are
# divided by 10 so both curves share the IMDb scale.
imdb_scores = movies['imdb']
meta_scores_scaled = movies['meta'] / 10

fig, ax = plt.subplots()
sns.kdeplot(imdb_scores, ax=ax)
sns.kdeplot(meta_scores_scaled, ax=ax)
# Mean runtime of each genre (barplot's default estimator is the mean).
fig, ax = plt.subplots(figsize=(18, 4))
ax = sns.barplot(data=movies, x='genre', y='runtime', palette="Blues_d")
# Tilt the genre names so neighbouring labels do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=30)
ax.set_title('average runtime per genre')
imdb 7.20
meta 57.50
gross 0.08
runtime 90.50
Name: Musical, dtype: float64
imdb 7.036449
meta 63.781931
gross 25.985452
runtime 117.529595
Name: Biography, dtype: float64
Inference 21: Biography movies had the longest average duration (117 minutes). Musical movies had the shortest (90 minutes).
name Ghosts of the Abyss
year 2003
genre Documentary
imdb 6.9
meta 67
gross 17.09
runtime 61
Name: 1020, dtype: object
name The Best of Youth
year 2003
genre Drama
imdb 8.5
meta 89
gross 0.25
runtime 366
Name: 946, dtype: object
Inference 22: The longest movie was 'The Best of Youth', which ran for 366 minutes (about 6 hours). The shortest movie was 'Ghosts of the Abyss', which ran for 61 minutes (about 1 hour).
# Joint view of runtime vs. gross: regression fit in the centre, marginal
# distributions on the sides, tiny markers to reduce overplotting.
sns.jointplot(
    data=movies,
    x='runtime',
    y='gross',
    kind='reg',
    scatter_kws={"s": 3},
)
Inference 23: Runtime and box office are slightly correlated, meaning that longer movies tend to earn somewhat more (with a correlation of about 0.3).
imdb | meta | gross | runtime | |
year | ||||
2000 | 1644.9 | 13595 | 7413.40 | 27113 |
2001 | 1728.3 | 14157 | 7956.77 | 28360 |
2002 | 1774.5 | 15066 | 9072.36 | 28557 |
2003 | 1705.4 | 14524 | 8771.08 | 28026 |
2004 | 1753.3 | 14558 | 9186.20 | 28528 |
2005 | 1748.4 | 14947 | 8807.64 | 28870 |
2006 | 1897.1 | 16516 | 9157.57 | 31077 |
2007 | 1846.1 | 15657 | 9321.09 | 29943 |
2008 | 1732.3 | 14605 | 9663.57 | 28448 |
2009 | 1734.7 | 14517 | 10388.39 | 28545 |
2010 | 1718.8 | 14939 | 10085.55 | 27970 |
2011 | 1757.8 | 15134 | 9947.34 | 28678 |
2012 | 1633.6 | 14207 | 10321.71 | 26685 |
2013 | 1794.9 | 15628 | 10685.90 | 29671 |
2014 | 1689.4 | 14648 | 10544.93 | 27719 |
2015 | 1480.1 | 13127 | 10497.37 | 24504 |
2016 | 1546.5 | 13733 | 11253.12 | 25662 |
2017 | 1363.3 | 12606 | 10532.35 | 22917 |
2018 | 1228.9 | 10953 | 11213.83 | 20714 |
# Mean gross of the movies released in each year.
fig, ax = plt.subplots(figsize=(18, 4))
ax = sns.barplot(data=movies, x='year', y='gross', palette="Blues_d")
# Tilt the year labels so neighbouring labels do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=30)
ax.set_title('average gross per year')
Hmm. The black bars (error bars) indicate that there's a high chance of error. Guess this is what happens with seemingly unpredictable data like movies.
Inference 24: Average earning for movies increased as time went on.
# Number of movies in the dataset for each release year, with the year
# labels rotated for legibility.
sns.countplot(data=movies, x='year', palette="Blues_d")
plt.xticks(rotation=45)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18]), <a list of 19 Text xticklabel objects>)
# IMDb vs. Metascore again, this time with a separate colour and
# regression line for every release year.
sns.lmplot(
    data=movies,
    x='imdb',
    y='meta',
    hue='year',
    scatter_kws={"s": 10},
)
<seaborn.axisgrid.FacetGrid at 0x24a6aab2b70>
imdb | meta | gross | runtime | |
year | ||||
2000 | 6.475984 | 53.523622 | 29.186614 | 106.744094 |
2001 | 6.497368 | 53.221805 | 29.912669 | 106.616541 |
2002 | 6.476277 | 54.985401 | 33.110803 | 104.222628 |
2003 | 6.534100 | 55.647510 | 33.605670 | 107.379310 |
2004 | 6.566667 | 54.524345 | 34.405243 | 106.846442 |
2005 | 6.451661 | 55.154982 | 32.500517 | 106.531365 |
2006 | 6.474744 | 56.368601 | 31.254505 | 106.064846 |
2007 | 6.523322 | 55.325088 | 32.936714 | 105.805654 |
2008 | 6.463806 | 54.496269 | 36.058097 | 106.149254 |
2009 | 6.472761 | 54.167910 | 38.762649 | 106.511194 |
2010 | 6.535361 | 56.802281 | 38.348099 | 106.349810 |
2011 | 6.486347 | 55.845018 | 36.706052 | 105.822878 |
2012 | 6.482540 | 56.376984 | 40.959167 | 105.892857 |
2013 | 6.550730 | 57.036496 | 38.999635 | 108.288321 |
2014 | 6.573541 | 56.996109 | 41.030856 | 107.856031 |
2015 | 6.578222 | 58.342222 | 46.654978 | 108.906667 |
2016 | 6.580851 | 58.438298 | 47.885617 | 109.200000 |
2017 | 6.585990 | 60.898551 | 50.880918 | 110.710145 |
2018 | 6.571658 | 58.572193 | 59.967005 | 110.770053 |