import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook
import seaborn as sns
sns.set_style("whitegrid")
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline
'''
movie_names = []
movie_years = []
imdb_ratings = []
meta_ratings = []
movie_earnings = []
movie_genres = []
runtimes = []

pages = 10  # 10 pages of 50 titles per year
start_year = 2000
end_year = 2018

for i in tqdm_notebook(range(start_year, end_year + 1), total=end_year - start_year + 1, unit='year'):
    k = 1  # 1-based offset of the first title on the current page
    for j in range(1, pages + 1):
        imdb_url = ('https://www.imdb.com/search/title?release_date=' + str(i) + '-01-01,'
                    + str(i) + '-12-31&sort=num_votes,desc&start=' + str(k) + '&ref_=adv_nxt')
        page_unparsed = urllib.request.urlopen(imdb_url)
        page_parsed = BeautifulSoup(page_unparsed, 'html.parser')
        k += 50

        movie_divs = page_parsed.find_all('div', class_='lister-item mode-advanced')
        for movie in movie_divs:
            meta_rating_unparsed = movie.find('div', class_='inline-block ratings-metascore')
            # find_all returns a (possibly empty) list, never None
            votes_and_gross = movie.find_all('span', attrs={'name': 'nv'})

            # skip titles that are missing a Metascore or a gross figure
            if meta_rating_unparsed is None or len(votes_and_gross) != 2:
                continue

            movie_earnings.append(float(votes_and_gross[1].string.strip('$').strip('M')))
            meta_ratings.append(int(meta_rating_unparsed.text.replace(' ', '').split('\n')[1]))
            movie_genres.append(movie.find('span', attrs={'class': 'genre'}).text.strip().split(',')[0])
            imdb_ratings.append(float(movie.find('div', class_='inline-block ratings-imdb-rating').text))
            movie_names.append(movie.find('h3', class_='lister-item-header').find('a').text)

            # the year renders as "(2008)" or "(I) (2008)"
            year = movie.find('span', class_='lister-item-year text-muted unbold').text.split(' ')
            if len(year) == 1:
                movie_years.append(year[0][1:5])
            else:
                movie_years.append(year[1][1:5])

            runtimes.append(int(movie.find('span', class_='runtime').text.strip('min')))
'''
# movies = pd.DataFrame(list(zip(movie_names, movie_years, movie_genres, imdb_ratings, meta_ratings, movie_earnings, runtimes)), columns =['name', 'year', 'genre', 'imdb', 'meta', 'gross', 'runtime'])
movies = pd.read_csv("movies.csv")                 # load the cached scrape
movies.drop("Unnamed: 0", axis=1, inplace=True)    # drop the saved index column
movies.drop([170, 1001], axis=0, inplace=True)     # drop two rows by index label
# movies.to_csv("movies.csv")
# files.download('movies.csv')
movies.describe()
| | imdb | meta | gross | runtime |
|---|---|---|---|---|
| count | 4877.000000 | 4877.000000 | 4877.000000 | 4877.000000 |
| mean | 6.517572 | 56.017839 | 37.897115 | 107.052696 |
| std | 0.971941 | 17.860327 | 68.181217 | 18.182487 |
| min | 1.500000 | 1.000000 | 0.000000 | 61.000000 |
| 25% | 6.000000 | 43.000000 | 0.720000 | 95.000000 |
| 50% | 6.600000 | 57.000000 | 11.440000 | 104.000000 |
| 75% | 7.200000 | 69.000000 | 45.170000 | 116.000000 |
| max | 9.000000 | 100.000000 | 936.660000 | 366.000000 |
movies.head()
| | name | year | genre | imdb | meta | gross | runtime |
|---|---|---|---|---|---|---|---|
| 0 | Gladiator | 2000 | Action | 8.5 | 67 | 187.71 | 155 |
| 1 | Memento | 2000 | Mystery | 8.5 | 80 | 25.54 | 113 |
| 2 | Snatch | 2000 | Comedy | 8.3 | 55 | 30.33 | 102 |
| 3 | Requiem for a Dream | 2000 | Drama | 8.3 | 68 | 3.64 | 102 |
| 4 | X-Men | 2000 | Action | 7.4 | 64 | 157.30 | 104 |
import matplotlib.style as style
style.available
['bmh',
'classic',
'dark_background',
'fast',
'fivethirtyeight',
'ggplot',
'grayscale',
'seaborn-bright',
'seaborn-colorblind',
'seaborn-dark-palette',
'seaborn-dark',
'seaborn-darkgrid',
'seaborn-deep',
'seaborn-muted',
'seaborn-notebook',
'seaborn-paper',
'seaborn-pastel',
'seaborn-poster',
'seaborn-talk',
'seaborn-ticks',
'seaborn-white',
'seaborn-whitegrid',
'seaborn',
'Solarize_Light2',
'tableau-colorblind10',
'_classic_test']
style.use('seaborn-poster') #sets the size of the charts
style.use('ggplot')
Available palettes:
Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, Vega10, Vega10_r, Vega20, Vega20_r, Vega20b, Vega20b_r, Vega20c, Vega20c_r, Wistia, Wistia_r, YlGn, YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cool, cool_r, coolwarm, coolwarm_r, copper, copper_r, cubehelix, cubehelix_r, flag, flag_r, gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, ocean, ocean_r, pink, pink_r, plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spectral, spectral_r, spring, spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, terrain, terrain_r, viridis, viridis_r, vlag, vlag_r, winter, winter_r
sns.lmplot(x = 'imdb', y = 'meta', data = movies, scatter_kws={"s": 10})
<seaborn.axisgrid.FacetGrid at 0x24a65eb9080>
sns.kdeplot(movies['imdb'], movies['meta'], cmap = sns.cubehelix_palette(light = 1, as_cmap = True), shade = True)
<matplotlib.axes._subplots.AxesSubplot at 0x24a67f75cc0>
There is a strong positive correlation between the two scoring metrics.
**Inference 1:** Audiences and critics tend to share the same view of a movie: a film rated around 6 on IMDb typically scores around 60 on Metacritic.
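As a quick sanity check (my addition, not part of the original run), the correlation can be computed directly:
# pearson correlation between the two rating scales (value not shown in the original notebook)
print(movies['imdb'].corr(movies['meta']))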
fig, ax = plt.subplots()
sns.kdeplot(movies['imdb'], ax = ax)
sns.kdeplot(movies['meta']/10, ax = ax)
<matplotlib.axes._subplots.AxesSubplot at 0x24a6800f080>
**Inference 2:** Meta ratings roughly follow a normal distribution, while IMDb ratings cluster in the 5-8 range.
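One way to quantify that (an added check; scipy is an extra dependency the notebook doesn't import): skewness should be near zero for the roughly normal meta ratings and noticeably negative for the IMDb ratings, though I haven't run this against the data.
from scipy import stats  # extra dependency, not imported in the original
# skewness of each rating distribution; near 0 suggests symmetry
print(stats.skew(movies['imdb']), stats.skew(movies['meta']))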
dicts = {}
for genre in movies['genre'].unique():
    dicts[genre] = movies[movies['genre'] == genre]['imdb']
temp = pd.DataFrame(dicts)
fig, ax = plt.subplots(figsize=(100, 5))
sns.violinplot(data=temp, ax=ax)
plt.show()
**Inference 3:** Biography movies tend to have the highest ratings.
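A quick check on that claim (my addition): median IMDb rating per genre, highest first.
# median imdb rating per genre, highest first (output not part of the original run)
print(movies.groupby('genre')['imdb'].median().sort_values(ascending=False).head())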
fig, ax = plt.subplots(figsize=(21,8))
ax = sns.boxplot(x = 'genre', y = 'imdb', data = movies)
plt.show()
**Inference 4:** In the world of movies there are a lot of outliers. I don't think a machine learning model could predict the likeability of a movie from a dataset like this unless we had more attributes.
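To make "a lot of outliers" concrete, here is a minimal sketch (my addition) that counts outliers per genre using the usual 1.5 x IQR fence, the same rule the boxplot above uses for its whiskers:
def iqr_outliers(s):
    # count points beyond the 1.5 * IQR whiskers
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    return ((s < q1 - fence) | (s > q3 + fence)).sum()

print(movies.groupby('genre')['imdb'].apply(iqr_outliers))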
# movie with lowest imdb rating
# use .loc, not .iloc: rows were dropped above, so the index has gaps and idxmin() returns a label
movies.loc[movies['imdb'].idxmin()]
name    Saving Christmas
year    2014
imdb    1.5
...
# movie with highest imdb rating
movies.loc[movies['imdb'].idxmax()]
name    The Dark Knight
year    2008
imdb    9.0
...
# movie with lowest meta rating
movies.loc[movies['meta'].idxmin()]
name    Death of a Nation
year    2018
meta    1
...
# movie with highest meta rating
movies.loc[movies['meta'].idxmax()]
name    Boyhood
year    2014
meta    100
...
**Inferences 5-8:**
- IMDb:
  - Highest rated: The Dark Knight (2008). Score: 9
  - Lowest rated: Saving Christmas (2014). Score: 1.5
- Metacritic:
  - Highest rated: Boyhood (2014). Score: 100
  - Lowest rated: Death of a Nation (2018). Score: 1
Each movie carries anywhere from one to three genres. To simplify things, I kept only the first listed genre for each movie, which I figured would be the most representative. Looking at the dataset, only the first nine genres appear in large numbers.
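For reference, the scraper above keeps the first genre with a plain string split; a minimal sketch on a made-up genre string:
raw = 'Action, Adventure, Sci-Fi'         # hypothetical genre string as scraped
first_genre = raw.strip().split(',')[0]   # keep only the first listed genre
print(first_genre)                        # Action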
# sns.pairplot(movies, hue = 'genre')
# sns.lmplot(x = 'imdb', y = 'meta', data = movies, hue = 'genre')
sns.lmplot(x = 'imdb', y = 'gross', data = movies.sample(1500), scatter_kws={"s": 5})
<seaborn.axisgrid.FacetGrid at 0x24a6828e4a8>
**Inference 9:** There is no correlation between IMDb scores and box-office gross. The same can be inferred for meta scores.
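A quick numeric check of that claim (my addition; values not shown in the original run):
# correlations between each rating scale and gross; both should be close to 0
print(movies['imdb'].corr(movies['gross']), movies['meta'].corr(movies['gross']))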
# by default, barplot shows the mean. The graph below shows that family movies had the highest mean gross per movie.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'gross', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('mean gross per genre')
plt.show()
Hmmm. Notice the black line for Family movies: such a wide interval tells us there is a high chance the estimate is unreliable.
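Those black lines are the bootstrapped 95% confidence intervals seaborn draws on each bar. A sketch of the same plot with them turned off, assuming the older seaborn API this notebook appears to use (`ci=` was replaced by `errorbar=` in seaborn 0.12):
# same plot without the bootstrapped confidence intervals (older seaborn ci= API)
fig, ax = plt.subplots(figsize=(18, 4))
sns.barplot(x='genre', y='gross', data=movies, ci=None, palette='Blues_d', ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
plt.title('mean gross per genre')
plt.show()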
movies['genre'].value_counts()
Comedy 1297
Drama 1041
Action 962
Crime 351
Biography 321
Animation 256
Adventure 242
Horror 180
Documentary 176
Mystery 15
Fantasy 14
Romance 8
Thriller 6
Musical 2
Family 2
Music 2
Sci-Fi 1
War 1
Name: genre, dtype: int64
movies[movies['genre'] == 'Family']
| | name | year | genre | imdb | meta | gross | runtime |
|---|---|---|---|---|---|---|---|
| 1201 | Raise Your Voice | 2004 | Family | 5.9 | 33 | 10.41 | 103 |
| 4502 | Beauty and the Beast | 2017 | Family | 7.2 | 65 | 504.01 | 129 |
temp = movies.groupby('genre').sum()
temp['ppm'] = temp['gross'] / temp['runtime']  # gross (in $M) per minute of total runtime
temp.sort_values('ppm', ascending=False).head()
| genre | imdb | meta | gross | runtime | ppm |
|---|---|---|---|---|---|
| Family | 13.1 | 98 | 514.42 | 232 | 2.217328 |
| Animation | 1720.9 | 15449 | 24947.05 | 23922 | 1.042850 |
| Action | 6054.4 | 47835 | 67386.31 | 108042 | 0.623705 |
| Adventure | 1570.0 | 13489 | 16524.08 | 26933 | 0.613525 |
| Mystery | 98.8 | 832 | 949.80 | 1638 | 0.579853 |
**Inference 12:** Family movies had the best gross-per-movie figure ($514.42 million from just 2 movies!) and earned a whopping $2.2 million per minute of screentime. That figure is almost entirely due to Beauty and the Beast.
**Inference 13:** Measured by gross per minute of runtime, the top genres are Family, Animation, and Action.
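Since the Family number rests on just two titles, a fairer comparison (my addition) is mean gross per movie:
# mean gross per movie by genre, highest first (not computed in the original run)
print(movies.groupby('genre')['gross'].mean().sort_values(ascending=False).head())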
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'gross', data = movies, estimator = sum, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
plt.title('sum of grosses per genre')
plt.show()
**Inference 14:** Action movies grossed the most in total (about $67 billion), followed by comedy (about $34 billion) and animation (about $25 billion).
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.countplot(x = 'genre', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels() ,rotation = 30)
plt.title('number of movies per genre')
plt.show()
**Inference 15:** Comedy movies were the most frequently released, followed by drama and action movies.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'imdb', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average imdb rating per genre')
plt.show()
**Inference 16:** For IMDb ratings, on average, horror movies get the lowest scores while war, documentary, and musical movies get the highest.
**Inference 17:** Even so, average IMDb ratings are roughly the same regardless of genre.
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'meta', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average meta rating per genre')
plt.show()
**Inference 18:** For meta ratings, documentary movies get the highest average rating.
**Inference 19:** Mean Metacritic ratings vary from genre to genre much more than IMDb ratings do.
**Inference 20:** Horror movies are the most disliked.
**Inference 21:** Documentary movies are the most liked.
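One way to quantify Inference 19 (my addition): put both scales on 0-10 and compare the spread of the per-genre means.
# standard deviation of per-genre mean ratings on a common 0-10 scale
genre_means = movies.groupby('genre')[['imdb', 'meta']].mean()
genre_means['meta'] = genre_means['meta'] / 10  # rescale meta from 0-100 to 0-10
print(genre_means.std())  # meta should vary more across genres than imdb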
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'genre', y = 'runtime', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average runtime per genre')
plt.show()
movies.groupby(['genre']).mean().sort_values('runtime').iloc[0]
imdb 7.20
meta 57.50
gross 0.08
runtime 90.50
Name: Musical, dtype: float64
movies.groupby(['genre']).mean().sort_values('runtime').iloc[-1]
imdb 7.036449
meta 63.781931
gross 25.985452
runtime 117.529595
Name: Biography, dtype: float64
**Inference 22:** Biography movies had the longest average runtime (117 minutes); musicals had the shortest (90 minutes).
movies.loc[movies['runtime'].idxmin()]
name Ghosts of the Abyss
year 2003
genre Documentary
imdb 6.9
meta 67
gross 17.09
runtime 61
Name: 1020, dtype: object
movies.loc[movies['runtime'].idxmax()]
name The Best of Youth
year 2003
genre Drama
imdb 8.5
meta 89
gross 0.25
runtime 366
Name: 946, dtype: object
**Inference 23:** The longest movie was 'The Best of Youth', which ran for 366 minutes (over six hours). The shortest was 'Ghosts of the Abyss', at 61 minutes.
sns.jointplot(x = 'runtime', y = 'gross', data = movies, kind = 'reg', scatter_kws={"s": 3})
plt.show()
**Inference 24:** Runtime and box-office gross are weakly positively correlated (a correlation coefficient of roughly 0.3): longer movies tend to earn somewhat more.
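A check on that number (my addition; scipy is an extra dependency the notebook doesn't import): Pearson's r together with its p-value.
from scipy import stats  # extra dependency, not imported in the original
# pearson r and p-value for runtime vs gross; r should be around 0.3
print(stats.pearsonr(movies['runtime'], movies['gross']))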
movies.groupby('year').sum()
| year | imdb | meta | gross | runtime |
|---|---|---|---|---|
| 2000 | 1644.9 | 13595 | 7413.40 | 27113 |
| 2001 | 1728.3 | 14157 | 7956.77 | 28360 |
| 2002 | 1774.5 | 15066 | 9072.36 | 28557 |
| 2003 | 1705.4 | 14524 | 8771.08 | 28026 |
| 2004 | 1753.3 | 14558 | 9186.20 | 28528 |
| 2005 | 1748.4 | 14947 | 8807.64 | 28870 |
| 2006 | 1897.1 | 16516 | 9157.57 | 31077 |
| 2007 | 1846.1 | 15657 | 9321.09 | 29943 |
| 2008 | 1732.3 | 14605 | 9663.57 | 28448 |
| 2009 | 1734.7 | 14517 | 10388.39 | 28545 |
| 2010 | 1718.8 | 14939 | 10085.55 | 27970 |
| 2011 | 1757.8 | 15134 | 9947.34 | 28678 |
| 2012 | 1633.6 | 14207 | 10321.71 | 26685 |
| 2013 | 1794.9 | 15628 | 10685.90 | 29671 |
| 2014 | 1689.4 | 14648 | 10544.93 | 27719 |
| 2015 | 1480.1 | 13127 | 10497.37 | 24504 |
| 2016 | 1546.5 | 13733 | 11253.12 | 25662 |
| 2017 | 1363.3 | 12606 | 10532.35 | 22917 |
| 2018 | 1228.9 | 10953 | 11213.83 | 20714 |
fig, ax = plt.subplots(figsize=(18,4))
ax = sns.barplot(x = 'year', y = 'gross', data = movies, palette=("Blues_d"))
ax.set_xticklabels(ax.get_xticklabels(), rotation = 30)
plt.title('average gross per year')
plt.show()
Hmm. The black bars again mark wide confidence intervals, indicating a high chance of error. I guess this is what happens with seemingly unpredictable data like movie grosses.
**Inference 25:** The average gross per movie increased as time went on.
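A plain line plot of the yearly means (my addition) makes the trend easier to read than the bars:
# mean gross per year as a line plot to highlight the upward trend
movies.groupby('year')['gross'].mean().plot(figsize=(10, 4), marker='o', title='mean gross per year')
plt.show()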
sns.countplot(x = 'year', data = movies, palette=("Blues_d"))
plt.xticks(rotation = 45)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18]), <a list of 19 Text xticklabel objects>)
sns.lmplot(x = 'imdb', y = 'meta', data = movies, scatter_kws={"s": 10}, hue = 'year')
<seaborn.axisgrid.FacetGrid at 0x24a6aab2b70>
movies.groupby('year').mean()
| year | imdb | meta | gross | runtime |
|---|---|---|---|---|
| 2000 | 6.475984 | 53.523622 | 29.186614 | 106.744094 |
| 2001 | 6.497368 | 53.221805 | 29.912669 | 106.616541 |
| 2002 | 6.476277 | 54.985401 | 33.110803 | 104.222628 |
| 2003 | 6.534100 | 55.647510 | 33.605670 | 107.379310 |
| 2004 | 6.566667 | 54.524345 | 34.405243 | 106.846442 |
| 2005 | 6.451661 | 55.154982 | 32.500517 | 106.531365 |
| 2006 | 6.474744 | 56.368601 | 31.254505 | 106.064846 |
| 2007 | 6.523322 | 55.325088 | 32.936714 | 105.805654 |
| 2008 | 6.463806 | 54.496269 | 36.058097 | 106.149254 |
| 2009 | 6.472761 | 54.167910 | 38.762649 | 106.511194 |
| 2010 | 6.535361 | 56.802281 | 38.348099 | 106.349810 |
| 2011 | 6.486347 | 55.845018 | 36.706052 | 105.822878 |
| 2012 | 6.482540 | 56.376984 | 40.959167 | 105.892857 |
| 2013 | 6.550730 | 57.036496 | 38.999635 | 108.288321 |
| 2014 | 6.573541 | 56.996109 | 41.030856 | 107.856031 |
| 2015 | 6.578222 | 58.342222 | 46.654978 | 108.906667 |
| 2016 | 6.580851 | 58.438298 | 47.885617 | 109.200000 |
| 2017 | 6.585990 | 60.898551 | 50.880918 | 110.710145 |
| 2018 | 6.571658 | 58.572193 | 59.967005 | 110.770053 |