eda.py

# -*- coding: utf-8 -*-
"""EDA.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1A2Uw-sUxi6iu-OmeodPQaW5vRTfpMQwS

#**3 EXPLORATORY DATA ANALYSIS**

##**3.1 WHAT IS HAPPENING IN ST. PETERSBURG?**
"""

df2 = df.copy()
# Total alcohol consumption per row. Column-wise addition is vectorised and,
# like a plain element-wise sum, yields NaN when any of the drinks is missing.
df2['total_alcohol'] = (
    df2['wine'] + df2['vodka'] + df2['beer'] + df2['champagne'] + df2['brandy']
)
# Segment Saint Petersburg; `year` becomes the index so it lands on the x axis
spb = df2[df2['region'] == 'Saint Petersburg'].set_index('year')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5), tight_layout=True)

# Left panel: total consumption over time
sns.lineplot(data=spb['total_alcohol'], ax=ax1)
ax1.set_title('Alcohol consumption in St Petersburg', fontsize=20)
ax1.set(xlabel='year', ylabel='litre per capita', xlim=(1998, 2016))
# NOTE(review): presumably meant to keep the single-series panel legend-free
ax1.legend('')

# Right panel: one line per drink. Each line carries its own column name as
# label, so the legend stays correct whatever the order of `drinks` is.
for col in drinks:
    sns.lineplot(data=spb[col], ax=ax2, label=col)
ax2.set_title('Consumption by drinks in St Petersburg', fontsize=20)
# The plotted values are the raw per-drink figures, not shares of the total,
# so the y axis uses the same unit label as the left panel.
ax2.set(xlabel='year', ylabel='litre per capita', xlim=(1998, 2016))
ax2.legend()

spb.head()

"""**Conclusion**:


*   Alcohol consumption has dropped massively in St. Petersburg since 2009. Citizens of St. Petersburg don't drink as much beer as before. Vodka has lost some of its popularity as well, but not as much. Interestingly, wine consumption has seen an unprecedented increase in the last decade. Brandy, probably because of its higher price and limited accessibility, has remained a niche drink among the people of St. Petersburg.

##**3.2 ST. PETERSBURG vs RUSSIA**
"""

# Segment the rest of Russia
rest = df2[df2['region'] != 'Saint Petersburg']
# Average consumption per year over the remaining regions. numeric_only=True
# keeps the string `region` column out of the mean; pandas >= 2.0 raises a
# TypeError on such columns instead of silently dropping them.
rest_avg = rest.groupby('year').mean(numeric_only=True)

fig, axes = plt.subplots(2, 3, figsize=(25, 15))
panels = axes.flatten()
# One panel per drink: Saint Petersburg against the average of the other regions
for drink, ax in zip(drinks, panels):
    sns.lineplot(ax=ax, data=spb[drink], label='Saint Petersburg')
    sns.lineplot(ax=ax, data=rest_avg[drink], label='Russia')
    ax.set_title(drink.title() + ' consumption, litres per capita')
    ax.set(xlabel='year', xlim=(1998, 2016))
# Remove whichever panels of the 2x3 grid did not receive a drink
for unused in panels[len(drinks):]:
    fig.delaxes(unused)

"""###**3.2.1 COMPARING BY PERCENTAGE**"""

df3 = df2.copy()
# Share of each drink in total alcohol consumption, in percent. The column
# names are derived from `drinks` so the two lists cannot drift apart, and the
# division is vectorised (rows with a zero total come out as inf/NaN).
percentage = [drink + '-percentage' for drink in drinks]
for drink, share_col in zip(drinks, percentage):
    df3[share_col] = (df3[drink] / df3['total_alcohol']) * 100

# Segment Saint Petersburg and the rest of Russia
is_spb = df3['region'] == 'Saint Petersburg'
spb2 = df3[is_spb].set_index('year')
rest2 = df3[~is_spb]
# Yearly average share of each drink in the rest of Russia
avg_russ = rest2.groupby('year')[percentage].mean()
# Yearly average total alcohol consumption in the rest of Russia
russ_alc = rest2.groupby('year')['total_alcohol'].mean()

fig, axes = plt.subplots(3, 2, figsize=(14, 8), tight_layout=True)

# One panel per drink share; labels are attached to the lines themselves so
# the legend does not depend on the order in which they were drawn.
for drink, share_col, ax in zip(drinks, percentage, axes.flatten()):
    sns.lineplot(data=spb2[share_col], ax=ax, label='St Petersburg')
    sns.lineplot(data=avg_russ[share_col], ax=ax, label='Russia')
    # Title uses the drink name; the column name already ends in "-percentage"
    ax.set_title(drink.title() + ' as % of alcohol')
    ax.set(xlabel='year', xlim=(1998, 2016))
    ax.legend()

# Bottom-right panel: total alcohol consumption, same comparison
total_ax = axes[2, 1]
sns.lineplot(data=spb2['total_alcohol'], ax=total_ax, label='Saint Petersburg')
g = sns.lineplot(data=russ_alc, ax=total_ax, label='Russia')
g.set(xlabel='year', xlim=(1998, 2016))

"""**Conclusion**:


*   Percentage-wise, residents of St. Petersburg used to drink less wine than other parts of Russia. This trend has changed since 2012. Based on the latest report, wine makes up around 15% of Saint Petersburg's drink basket. As we saw above, beer has been losing its popularity in St. Petersburg over time. The rest of Russia does not follow in St. Petersburg's footsteps: there, beer is getting more popular almost every day. While you may believe vodka is widely popular in Russia, the data shows the opposite. Vodka consumption has dropped massively in both Russia and St. Petersburg. Our analysis is in line with the recent report from [VTB capital research](https://intellinews.com/vodka-drinking-on-the-decline-in-russia-222886/). Vodka consumption in Russia has decreased from 35% in 1998 to 10% in 2016. In St. Petersburg, too, vodka makes up only 10% of alcohol consumption. Unlike the rest of Russia, the people of Saint Petersburg, who earn more on average, drink more brandy and champagne these days. Generally speaking, residents of St. Petersburg drink less than the average Russian.

##**3.3 CORRELATION BETWEEN DRINKS**
"""

def mult_correlation_matrix(df1, df2, df3, data_set=('', '', '')):
    """Draw three lower-triangle Pearson correlation heatmaps side by side.

    Each data frame gets its own panel. All panels use the same colour map
    pinned to the [-1, 1] range, so a single colour bar serves all of them.

    Args:
        df1, df2, df3: Data frames to correlate. Only numeric and boolean
            columns take part; any other column is ignored.
        data_set: Three panel titles, in the same order as the data frames.
    """
    fig, axs = plt.subplots(1, 3, figsize=(15, 20))

    # Loop-invariant pieces are built once: the diverging colour map and the
    # dedicated axes that hosts the shared colour bar.
    cmap = sns.diverging_palette(230, 20, as_cmap=True)
    cbar_ax = fig.add_axes([.9, .4, .03, .2])

    for i, frame in enumerate((df1, df2, df3)):
        # Select the numeric data explicitly: pandas >= 2.0 raises on string
        # columns in corr() instead of silently dropping them.
        numeric = frame.select_dtypes(include=['number', 'bool'])
        correlation = numeric.corr(method='pearson')

        # Hide the upper triangle (diagonal included); it mirrors the lower one
        mask = np.triu(np.ones_like(correlation, dtype=bool))

        sns.heatmap(
            correlation,
            mask=mask,
            cmap=cmap,
            vmax=1,                      # Upper end of the colour scale
            vmin=-1,                     # Lower end of the colour scale
            center=0,                    # Value mapped to the palette's midpoint
            square=True,                 # Ensure perfect squares
            linewidths=1.5,              # Gap between squares
            cbar_kws={"shrink": .9},     # Colour bar size
            annot=True,                  # Print the values inside the squares
            ax=axs[i],
            cbar=(i == 0),               # Identical scale everywhere: draw the bar once
            cbar_ax=cbar_ax,
        )
        axs[i].tick_params(labelrotation=45)   # Rotate tick labels on both axes
        axs[i].set_title(f'{data_set[i]}', y=1.05, fontsize=15)
    fig.suptitle('Correlation Matrix', x=0.5, y=0.64, fontsize=20)

# `year` is dropped so that it does not appear as a variable in the matrices;
# drop() already returns a new frame, so no separate copy is needed.
df4 = df.drop(columns='year')
spb4 = df4[df4['region'] == 'Saint Petersburg']
rest4 = df4[df4['region'] != 'Saint Petersburg']
# `region` is a string label that is only needed for the split above. It is
# removed before correlating because pandas >= 2.0 raises on string columns in
# corr() instead of silently ignoring them.
mult_correlation_matrix(
    df4.drop(columns='region'),
    spb4.drop(columns='region'),
    rest4.drop(columns='region'),
    ['Russia', 'Saint Petersburg', 'Rest of Russia'],
)

"""**Conclusion**:


*   Based on Pearson’s correlation coefficient, we can see that wine consumption in Saint Petersburg has a strong linear relationship with brandy consumption. For Russia as a whole, the same relationship is still moderately positive, which indicates that most wine is consumed in regions that also sell large amounts of brandy. We can draw a similar conclusion about wine and champagne consumption. The strength
of the relationship between these expensive drinks may be an economic effect: people of Saint Petersburg earn more money than the average Russian.
"""