This project's goal is to analyze data about Uber rides while using various data visualization frameworks that are available for Python. Data on Uber rides in New York from April to September of 2014 are included in the dataset. The information was gathered by FiveThirtyEight and is accessible on Kaggle.
The variables in the dataset are:
name | description |
---|---|
Date/Time | The date and time of the Uber pickup |
Lat | The Latitude of the Uber pickup |
Lon | The longitude of the Uber pickup |
Base | The TLC (Taxi and Limousine Commission) base company code affiliated with the Uber pickup |
import pandas as pd

# Monthly pickup files (April-September 2014), listed in calendar order.
monthly_files = [
    './data/uber-raw-data-apr14.csv',  # Pickups in April
    './data/uber-raw-data-may14.csv',  # Pickups in May
    './data/uber-raw-data-jun14.csv',  # Pickups in June
    './data/uber-raw-data-jul14.csv',  # Pickups in July
    './data/uber-raw-data-aug14.csv',  # Pickups in August
    './data/uber-raw-data-sep14.csv',  # Pickups in September
]

# Concatenate all the monthly subsets into a single frame.
# ignore_index=True gives the result a unique 0..n-1 index; without it each
# file keeps its own 0-based labels, so the combined index contains duplicates
# and label-based lookups match one row per month.
data = pd.concat([pd.read_csv(path) for path in monthly_files], ignore_index=True)

# String to datetime (the explicit format avoids slow per-row format inference).
data['Date/Time'] = pd.to_datetime(data['Date/Time'], format='%m/%d/%Y %H:%M:%S')

# Split the timestamp into its calendar/clock components.
pickup_time = data['Date/Time'].dt
data['day'] = pickup_time.day                  # Day of the month
data['month'] = pickup_time.month              # Month number
data['year'] = pickup_time.year                # Year
data['day_of_week'] = pickup_time.day_of_week  # Day of the week (Monday = 0)
data['hour'] = pickup_time.hour                # Hour
data['minute'] = pickup_time.minute            # Minute
data['second'] = pickup_time.second            # Second
data.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Date/Time | Lat | Lon | Base | day | month | year | day_of_week | hour | minute | second | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2014-04-01 00:11:00 | 40.7690 | -73.9549 | B02512 | 1 | 4 | 2014 | 1 | 0 | 11 | 0 |
1 | 2014-04-01 00:17:00 | 40.7267 | -74.0345 | B02512 | 1 | 4 | 2014 | 1 | 0 | 17 | 0 |
2 | 2014-04-01 00:21:00 | 40.7316 | -73.9873 | B02512 | 1 | 4 | 2014 | 1 | 0 | 21 | 0 |
3 | 2014-04-01 00:28:00 | 40.7588 | -73.9776 | B02512 | 1 | 4 | 2014 | 1 | 0 | 28 | 0 |
4 | 2014-04-01 00:33:00 | 40.7594 | -73.9722 | B02512 | 1 | 4 | 2014 | 1 | 0 | 33 | 0 |
# Lookup tables that turn the numeric day_of_week / month codes into labels.
# Day codes start at 0 for Monday; month codes cover April (4) to September (9).
days_names = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))
months_names = dict(zip(
    range(4, 10),
    ['April', 'May', 'June', 'July', 'August', 'September'],
))
# Total pickups per hour of day (index: hour, single column 'Total').
hour_data = data.groupby('hour')['hour'].count().to_frame().rename(columns={'hour': 'Total'})

import matplotlib.pyplot as plt

# Figures quoted in the annotations are derived from the data rather than
# typed in by hand, so the two halves of the day cannot overlap or drift.
hourly_total = hour_data['Total']
split_hour = 15                                                # Boundary used in the chart's headline
trips_before = int(hourly_total.loc[:split_hour - 1].sum())    # 00:00-14:59 (label slice is inclusive)
trips_after = int(hourly_total.loc[split_hour:].sum())         # 15:00-23:59
split_total = hourly_total.loc[split_hour]                     # Scalar, used to place the text
peak_hour = int(hourly_total.idxmax())
peak_trips = int(hourly_total.loc[peak_hour])

fig, ax = plt.subplots(figsize=(11, 6), dpi=100)

ax.plot(hourly_total.index, hourly_total.values, color='#09091a')
ax.set_xlim(0, 23)
ax.set_xticks(range(24))
ax.set_ylim(0, hourly_total.max() + 5000)
ax.spines[['top', 'right']].set_visible(False)
ax.set_xlabel('Hour', fontsize=14, color='#222233')

plt.suptitle('Number of trips during the day', fontsize=18, color='#09091a',
             x=0.123, y=1.05, ha='left')
ax.set_title('Most trips in 2014 took place after 15:00',
             fontsize=14, loc='left', color='#1fbad6', y=1.1, ha='left')

# Vertical divider at the split hour, with one summary on each side of it.
ax.vlines(split_hour, ymin=0, ymax=hourly_total.max(), color='#c0c0c8')
ax.text(split_hour + 0.3, split_total / 1.2,
        f'{trips_after:,} trips \nwere recorded between\n 15:00 and 23:00 hours.',
        color='#222233')
ax.text(split_hour - 0.2, split_total / 3,
        f'{trips_before:,} trips \nwere recorded between\n 00:00 and 15:00.',
        horizontalalignment='right', color='#222233')

# Mark and label the busiest hour.
ax.plot(peak_hour, peak_trips, 'o', color='#222233')
ax.text(peak_hour, peak_trips * 1.02,
        f'Pickups peak at {peak_hour:02d}:00 with {peak_trips:,} trips.')
plt.show()
# Pickup counts with one row per hour and one column per month; the numeric
# month codes in the column header are swapped for month names.
data_hour_month = pd.crosstab(index=data['hour'], columns=data['month']).rename(columns=months_names)
data_hour_month
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
month | April | May | June | July | August | September |
---|---|---|---|---|---|---|
hour | ||||||
0 | 11910 | 13875 | 14514 | 17953 | 21451 | 24133 |
1 | 7769 | 8186 | 9167 | 11527 | 14471 | 16107 |
2 | 4935 | 5372 | 6189 | 8562 | 10105 | 10702 |
3 | 5040 | 5946 | 6937 | 9199 | 10376 | 10789 |
4 | 6095 | 6945 | 7701 | 10040 | 11774 | 12675 |
5 | 9476 | 10789 | 11955 | 14932 | 16525 | 20262 |
6 | 18498 | 21015 | 22030 | 23456 | 24907 | 33307 |
7 | 24924 | 27413 | 30834 | 32545 | 34064 | 43314 |
8 | 22843 | 25460 | 29771 | 33387 | 34566 | 44477 |
9 | 17939 | 20507 | 24298 | 28486 | 30195 | 38542 |
10 | 17865 | 20801 | 23584 | 28558 | 30706 | 37634 |
11 | 18774 | 22055 | 24155 | 30120 | 31778 | 38821 |
12 | 19425 | 23595 | 25233 | 30900 | 32106 | 39193 |
13 | 22603 | 27699 | 28937 | 35832 | 35764 | 45042 |
14 | 27190 | 34363 | 34428 | 41357 | 40644 | 52643 |
15 | 35324 | 43087 | 41586 | 46053 | 48197 | 61219 |
16 | 42003 | 49127 | 48162 | 52403 | 53481 | 68224 |
17 | 45475 | 51508 | 50452 | 58260 | 57122 | 73373 |
18 | 43003 | 48965 | 45013 | 57268 | 55390 | 75040 |
19 | 38923 | 42387 | 38203 | 52332 | 53008 | 69660 |
20 | 36244 | 40731 | 40108 | 51859 | 51674 | 63988 |
21 | 36964 | 42217 | 40791 | 49528 | 51354 | 60606 |
22 | 30645 | 35556 | 35614 | 42218 | 46008 | 51817 |
23 | 20649 | 24836 | 24182 | 29346 | 33609 | 36568 |
# Stacked bars: one bar per hour, one segment per month. The first five months
# are shades of grey; the last one (September) carries the accent colour.
month_colors = ['#d9d9d9', '#999999', '#747474', '#5d5d5d', '#3f3f3f', '#1fbad6']

fig, ax = plt.subplots(figsize=(11, 6), dpi=100)
data_hour_month.plot(kind='bar', stacked=True, color=month_colors, ax=ax)

ax.spines[['top', 'right']].set_visible(False)
ax.set_xlabel('Hour', fontsize=14, color='#222233')
plt.legend(title='Month')

plt.suptitle('Number of trips by month and hour', fontsize=18, color='#09091a',
             x=0.123, y=1.05, ha='left')
ax.set_title('In the month of September, more trips were registered',
             fontsize=14, loc='left', color='#1fbad6', y=1.1, ha='left')

# Select the scalar total for hour 17: `.loc[17]` on its own returns a
# one-element Series, which is not a valid plain-number coordinate.
peak_total = hour_data.loc[17, 'Total']
ax.text(17, peak_total, 'Peak hours are \nthe same every month.',
        horizontalalignment='right', color='#222233')
plt.show()
# Trips per day of the month, and the (rounded) average across those days.
daily_trips = data.day.value_counts()
trips_avg = round(daily_trips.mean(), 0)
days_above_avg = daily_trips[daily_trips > trips_avg].to_frame().sort_index()

# One bar colour per day of the month (1-31), in plotting order:
#   day 30 -> accent colour, day 31 -> lightest grey,
#   any other day above the average -> dark grey, the rest -> mid grey.
highlighted_days = {30: '#1fbad6', 31: '#d9d9d9'}
above_avg_days = set(days_above_avg.index)
palette = [
    highlighted_days.get(day, '#3f3f3f' if day in above_avg_days else '#999999')
    for day in range(1, 32)
]
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(rc={'figure.figsize': (10, 6),
            'axes.facecolor': 'white',
            'figure.facecolor': 'white'})

# One bar per day of the month, coloured with the precomputed palette.
ax = sns.countplot(data=data, x='day', palette=palette)

# The x axis holds days of the month, so it is labelled 'Day'.
ax.set_xlabel('Day', fontsize=12)
ax.set_ylabel('')

plt.suptitle('Number of trips by day and month', fontsize=18, color='#09091a',
             x=0.123, y=1.05, ha='left')

# Derive the headline figures from the data so the title always matches the bars.
days_above = int((daily_trips > trips_avg).sum())
ax.set_title(f'{days_above} out of {len(daily_trips)} days are above average trips.',
             fontsize=14, loc='left', color='#1fbad6', y=1.1, ha='left')

# Dashed reference line at the average, labelled at its right end.
ax.hlines(trips_avg, xmin=-0.5, xmax=31, ls='--', colors='k')
ax.text(31, trips_avg, f"Average = {int(trips_avg)}", va='center')

# Bars sit at positions 0..30, so day 31 is at x=30 and day 30 at x=29.
ax.text(30, daily_trips.loc[31], f"{daily_trips.loc[31]} trips")
ax.text(29, daily_trips.loc[30], f"{daily_trips.loc[30]} trips", color='#1fbad6', weight='bold')
ax;
# Work on a copy whose month / day_of_week codes are replaced by their names.
data2 = data.copy()
data2 = data2.replace({'month': months_names, 'day_of_week': days_names})

import plotly.express as px

palette = ['#0d47a1', '#1565c0', '#1976d2', '#1e88e5', '#2196f3', '#42a5f5', '#64b5f6', '#90caf9']

# Explicit Monday-to-Sunday order for the grouped bars and the legend. Every
# label must match a value present in data2.day_of_week, otherwise Plotly
# cannot place that category where intended.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

px.histogram(data2, x='month', color='day_of_week', barmode='group',
             labels={'month': 'Months', 'day_of_week': 'Day of week'},
             title='Trips by week day and month',
             color_discrete_sequence=palette,
             category_orders={'day_of_week': weekday_order}
             ).update_layout(yaxis_title='',
                             plot_bgcolor='rgb(255, 255, 255)')
from plotnine import ggplot
from plotnine import *
import plotnine as p9

# Total pickups per month, as a two-column frame: month, Total.
trips = data.groupby('month').size().reset_index(name='Total')

# Every month uses the same fill colour.
palette = ('#2d9dff',) * 6

p9.options.figure_size = (10, 6)

# Horizontal bar chart (columns + flipped coordinates) with the total printed
# at the end of each bar and month numbers relabelled with month names.
(
    ggplot(trips)
    + aes(x='month', y='Total', fill='factor(month)')
    + geom_col()
    + coord_flip()
    + geom_text(aes(label='Total'), ha='right')
    + labs(y='Trips', x='Months', title='Trips by month')
    + theme_minimal()
    + theme(legend_position='none')
    + scale_x_continuous(
        breaks=[4, 5, 6, 7, 8, 9],
        labels=['April', 'May', 'June', 'July', 'August', 'September'],
    )
    + scale_fill_manual(values=palette)
)
<ggplot: (167300155338)>
# Total pickups per base, as a two-column frame: Base, Total.
base_trips = data.groupby('Base').size().reset_index(name='Total')
base_trips
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Base | Total | |
---|---|---|
0 | B02512 | 205673 |
1 | B02598 | 1393113 |
2 | B02617 | 1458853 |
3 | B02682 | 1212789 |
4 | B02764 | 263899 |
import altair as alt

# Horizontal bars: one per base, length = total trips.
bars = (
    alt.Chart(base_trips, title='Trips by Base')
    .mark_bar()
    .encode(x='Total', y="Base")
)

# White value labels, right-aligned just inside the end of each bar.
label_style = dict(align='right', baseline='middle', dx=-3, color='#ffffff')
text = bars.mark_text(**label_style).encode(text='Total')

# Layer the labels over the bars.
alt.layer(bars, text).properties(height=200)
# Pickup counts with one row per base and one column per (named) month.
month_base_trips = pd.crosstab(index=data['Base'], columns=data['month']).rename(columns=months_names)
month_base_trips
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
month | April | May | June | July | August | September |
---|---|---|---|---|---|---|
Base | ||||||
B02512 | 35536 | 36765 | 32509 | 35021 | 31472 | 34370 |
B02598 | 183263 | 260549 | 242975 | 245597 | 220129 | 240600 |
B02617 | 108001 | 122734 | 184460 | 310160 | 355803 | 377695 |
B02682 | 227808 | 222883 | 194926 | 196754 | 173280 | 197138 |
B02764 | 9908 | 9504 | 8974 | 8589 | 48591 | 178333 |
from bokeh.io import show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure

# Nested categorical axis: every (base, month) pair is one factor, listed
# base by base, with its trip count looked up from the crosstab.
x = [(base, mes) for base in month_base_trips.index.values for mes in month_base_trips.columns]
counts = [month_base_trips.loc[pair] for pair in x]

source = ColumnDataSource(data={'x': x, 'counts': counts})

p = figure(
    title="Trips by base and month",
    x_range=FactorRange(*x),
    plot_height=350,
    tools="",
    toolbar_location=None,
)
p.vbar(x='x', top='counts', width=0.9, source=source)

# Bars start at zero; pad the axis, tilt the month labels, hide the x grid.
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None

show(p)
# Copy of the data with month / day_of_week codes replaced by their names.
name_maps = {'month': months_names, 'day_of_week': days_names}
data2 = data.copy().replace(name_maps)

# Pickup counts with one row per base and one column per week-day name.
base_days_week_trips = pd.crosstab(index=data2['Base'], columns=data2['day_of_week'])
base_days_week_trips
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
day_of_week | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
---|---|---|---|---|---|---|---|
Base | |||||||
B02512 | 33319 | 25460 | 26773 | 20490 | 35032 | 31670 | 32929 |
B02598 | 229908 | 163542 | 198832 | 146652 | 235157 | 202378 | 216644 |
B02617 | 234379 | 176416 | 206554 | 164452 | 240216 | 214167 | 222669 |
B02682 | 201594 | 143372 | 170160 | 126511 | 205091 | 176198 | 189863 |
B02764 | 41939 | 32682 | 43795 | 32075 | 39649 | 39376 | 34383 |
import pygal
from pygal.style import LightenStyle

dark_lighten_style = LightenStyle('#336676')

bar_chart = pygal.Bar(style=dark_lighten_style, height=250)
bar_chart.title = 'Trips by Base and day of week'

# One group of bars per base (x labels), one series per week day.
bar_chart.x_labels = base_days_week_trips.index.values

# The crosstab's columns are in alphabetical order, so the series are added
# explicitly in calendar order.
week_order = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
for column in week_order:
    bar_chart.add(column, base_days_week_trips[column])

bar_chart.render_to_file('trips_base_week_day.svg')
# Pickups per (hour, day of month), expressed in thousands of trips.
trips = pd.crosstab(data.hour, data.day) / 1_000

import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots(figsize=(10, 10))

# Discrete 13-step colour map over 0-13 thousand trips.
im = ax.imshow(trips, cmap=plt.get_cmap("Blues", 13), vmin=0, vmax=13)

# Label every cell column / row with its day / hour.
ax.set_xticks(np.arange(len(trips.columns)), labels=trips.columns, fontsize=10)
ax.set_yticks(np.arange(len(trips.index)), labels=trips.index, fontsize=10)

ax.set_title("Trips by hour and day", fontsize=20)

cbar = ax.figure.colorbar(im, ticks=np.arange(14), fraction=0.035, ax=ax)
cbar.ax.set_ylabel("Trips in thousands", rotation=-90, va="bottom", fontsize=12)

# Replace the frame with a white grid drawn on the minor ticks (cell borders).
ax.spines[:].set_visible(False)
ax.set_xticks(np.arange(trips.shape[1] + 1) - .5, minor=True)
ax.set_yticks(np.arange(trips.shape[0] + 1) - .5, minor=True)
ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
ax.tick_params(which="minor", bottom=False, left=False)

ax.set_xlabel('Day', fontsize=12)
ax.set_ylabel('Hour', fontsize=12)

plt.show()
import seaborn as sns

# Pickups per (month, day of month), expressed in thousands of trips.
trips = pd.crosstab(data.month, data.day) / 1_000

# Same table with the month numbers on the index replaced by month names;
# this labelled version is the one that gets plotted.
corridas_plot = trips.rename(index=months_names)

fig, ax = plt.subplots(figsize=(20, 7))

# Plot the renamed frame so the rows are labelled April..September rather
# than 4..9.
sns.heatmap(corridas_plot,
            vmin=0,
            vmax=45,
            cmap=plt.get_cmap("Blues", 9),
            ax=ax,
            linewidths=2)

ax.set_title('Trips by month and day', fontsize=20)
ax.set_xlabel('Day', fontsize=12)
ax.set_ylabel('', fontsize=12)
ax.collections[0].colorbar.set_label('Trips in thousands', fontsize=12)
# Pickups per (month, week day) in thousands, with named rows and columns.
trips = (pd.crosstab(data.month, data.day_of_week) / 1_000).rename(
    index=months_names, columns=days_names
)

import plotly.graph_objs as go

# Heatmap trace: week days on x, months on y, small gaps between the cells,
# colour scale fixed to the 0-165 thousand range.
heatmap_options = dict(
    z=trips.values,
    x=trips.columns,
    y=trips.index,
    colorscale='Blues',
    xgap=2,
    ygap=2,
    zmin=0,
    zmax=165,
    colorbar=dict(title='Trips in thousands'),
)
plot = go.Heatmap(**heatmap_options)

layout = go.Layout(title='Trips by month and week day')

fig = go.Figure(data=plot, layout=layout)
fig.show()
# Trips per (Base, month): count rows, name the count column 'Total', swap
# month numbers for names, and express the totals in thousands (2 decimals).
trips = data.groupby(['Base', 'month'])['hour'].count().rename('Total').reset_index()
trips = trips.replace({'month': months_names})
trips['Total'] = (trips['Total'] / 1000).round(2)

from plotnine import *
import plotnine as p9

p9.options.figure_size = (10, 6)

# Tile heatmap with the value printed in each tile; months are forced into
# calendar order on the x axis.
(
    ggplot(trips)
    + aes(x='month', y='Base', fill='Total')
    + geom_tile(aes(width=.95, height=.95))
    + geom_text(aes(label='Total'), size=10)
    + labs(y='Base', x='', title='Trips by Base and month')
    + theme_minimal()
    + scale_fill_gradient(low='#cbe7ff', high='#08306b')
    + scale_x_discrete(limits=('April', 'May', 'June', 'July', 'August', 'September'))
)
<ggplot: (167257035978)>
# Trips per (Base, week day): count rows, name the count column 'Total', swap
# day codes for names, and express the totals in thousands (2 decimals).
trips = data.groupby(['Base', 'day_of_week'])['hour'].count().rename('Total').reset_index()
trips = trips.replace({'day_of_week': days_names})
trips['Total'] = (trips['Total'] / 1000).round(2)
trips.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Base | day_of_week | Total | |
---|---|---|---|
0 | B02512 | Monday | 25.46 |
1 | B02512 | Tuesday | 31.67 |
2 | B02512 | Wednesday | 32.93 |
3 | B02512 | Thursday | 35.03 |
4 | B02512 | Friday | 33.32 |
import altair as alt

# Calendar order for the x axis.
weekday_sort = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday']

# Rectangle heatmap: week day on x, base on y, colour = total trips.
x_channel = alt.X('day_of_week', axis=alt.Axis(title='Week day'), sort=weekday_sort)
color_channel = alt.Color('Total', scale=alt.Scale(scheme='blues'))

(
    alt.Chart(trips, title='Trips by Base and week day')
    .mark_rect()
    .encode(x=x_channel, y='Base', color=color_channel)
    .properties(height=300, width=300)
)
import pandas as pd
import numpy as np
from bokeh.plotting import figure
from bokeh.tile_providers import get_provider, WIKIMEDIA
from bokeh.io import output_notebook, show
from pyproj import Transformer
import warnings

# The figure below uses mercator axes and a web tile layer, so the pickup
# coordinates (longitude/latitude, EPSG:4326) are projected to Web Mercator
# (EPSG:3857). always_xy=True fixes the axis order to (lon, lat) -> (x, y).
# A single reusable Transformer replaces the deprecated Proj(init=...) /
# pyproj.transform pair, so no blanket warnings filter is needed.
to_mercator = Transformer.from_crs('EPSG:4326', 'EPSG:3857', always_xy=True)

# Project each distinct pickup location once, in one vectorised call.
unique_points = data[['Lon', 'Lat']].drop_duplicates()
lons, lats = to_mercator.transform(unique_points['Lon'].to_numpy(),
                                   unique_points['Lat'].to_numpy())

data_map = pd.DataFrame({'MercatorX': lons, 'MercatorY': lats})
data_map.head()

wikimedia = get_provider(WIKIMEDIA)

# Map extent: south-west corner first, north-east corner second, so both
# ranges passed to the figure run from the smaller to the larger coordinate.
ny_lon1, ny_lat1 = to_mercator.transform(-74.15, 40.58)
ny_lon2, ny_lat2 = to_mercator.transform(-73.7, 40.92)

p = figure(plot_width=900, plot_height=700,
           x_range=(ny_lon1, ny_lon2), y_range=(ny_lat1, ny_lat2),
           x_axis_type="mercator", y_axis_type="mercator",
           title="Uber rides in NY")
p.add_tile(wikimedia)

# One small translucent dot per distinct pickup location.
p.circle(x="MercatorX", y="MercatorY",
         size=2,
         fill_color="dodgerblue", line_color="dodgerblue",
         fill_alpha=0.3,
         source=data_map)

show(p)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from cartopy import crs as ccrs
from cartopy import feature as cfeature

# Set the domain for defining the plot region.
latN = 40.92
latS = 40.58
lonW = -74.15
lonE = -73.7
cLat = (latN + latS) / 2
cLon = (lonW + lonE) / 2

# One marker colour per base code.
base_colors = {'B02512': 'red', 'B02598': 'green', 'B02617': 'blue', 'B02682': 'yellow', 'B02764': 'gray'}
bases = data.Base.unique()

# Projection centred on the middle of the plot region.
proj = ccrs.LambertConformal(central_longitude=cLon, central_latitude=cLat)
res = '10m'  # Finest Natural Earth scale (slowest to draw); coarser options are '50m' and '110m'.

fig = plt.figure(figsize=(18, 12))
ax = plt.subplot(1, 1, 1, projection=proj)
ax.set_extent([lonW, lonE, latS, latN])
ax.add_feature(cfeature.OCEAN.with_scale(res))
ax.add_feature(cfeature.COASTLINE.with_scale(res))
ax.set_title('New York Map on Uber rides during 2014 (Apr-Sep) by Base')

for base in bases:
    # Filter the pickups of this base once and reuse the result for both
    # coordinates instead of scanning the whole frame twice.
    pickups = data.loc[data.Base == base, ['Lon', 'Lat']]
    # The coordinates are plain lon/lat, hence the PlateCarree transform.
    ax.scatter(pickups.Lon, pickups.Lat, s=9, c=base_colors[base],
               edgecolor=None, alpha=0.75,
               transform=ccrs.PlateCarree(), label=base)

plt.legend()
plt.show()