-
Notifications
You must be signed in to change notification settings - Fork 1
/
MBA_App.py
867 lines (642 loc) · 33.4 KB
/
MBA_App.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
from pyparsing import col
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('agg')
from wordcloud import WordCloud
import calendar
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
#lets import our data from the AWS RDS MySQL DataBase
#db info
from sqlalchemy import create_engine
# Database connection parameters pulled from Streamlit's secrets store
# (.streamlit/secrets.toml locally, or the app settings on Streamlit Cloud).
host = st.secrets["host"]
user = st.secrets["user"]
password = st.secrets["password"]
port = st.secrets["port"]
database = st.secrets["database"]
# Silence the "st.pyplot() called without a figure" deprecation warning.
# NOTE(review): this option was removed in newer Streamlit releases — confirm the pinned version.
st.set_option('deprecation.showPyplotGlobalUse', False)
# Must run before any other st.* page command: sets tab title, icon, wide layout.
st.set_page_config( page_title="Market Basket Analysis App",
                    page_icon= "random",
                    layout="wide"
)
# ---- Page header: centred title, tagline and hero image ----
col1, col2, col3 = st.columns((.1,1,.1))
with col1:
    st.write("")
with col2:
    st.markdown(" <h1 style='text-align: center;'>Market Basket Analysis on Online Retail Data:</h1>", unsafe_allow_html=True)
    st.markdown("<p style='text-align: center;'><i><b>Providing a Retail Business with a strategy which helps improve their "
                "product sales, inventory management, and customer retention, which in turn would improve the profitability of the business.</b></i></p>", unsafe_allow_html=True)
    st.markdown("<center><img src='https://github.com/kkrusere/Market-Basket-Analysis-on-the-Online-Retail-Data/blob/main/Assets/MBA.jpg?raw=1' width=600/></center>", unsafe_allow_html=True)
with col3:
    st.write("")
st.markdown("----")
# ---- Two-column intro: contributors + project pitch (left), data description (right) ----
col1, col2,col3 = st.columns((1,0.1,1))
with col1:
    st.markdown("### ***Project Contributors:***")
    st.markdown("Kuzi Rusere")
    st.markdown("### **Project Introduction**")
    st.markdown("***Business Proposition:*** This project aims to provide a Retail "
                "Business with a strategy that helps improve their product sales, "
                "inventory management, and customer retention, which in turn would "
                "improve the profitability of the business. In the retail environment, "
                "profitability and the `bottom line` is at the focal point of any "
                "organization or business in this space. From product sales, through "
                "inventory management to customer acquisition and retention all this one "
                "way or the other affects the business' profits and net revenue. Transaction "
                "data from the POS (point of sale) systems for a retail business is a treasure "
                "trove of insights that can be used to better understand the products, customer "
                "purchase behavior, and sales together with the relationship and patterns in "
                "between. This project explores the different ways of deriving these insights, "
                "patterns, and relationships, and how these can be used in designing, developing, "
                "and implementing a strategy to improve the retail business' profits, revenue, "
                "and overall operations")
    st.markdown("***Methodology:*** Data Mining, Analysis and Visualization of Retail "
                "Sales Data.")
    # Bare string literal: rendered as markdown by Streamlit "magic".
    """
1. Market Basket Analysis (MBA), which aims to find relationship and establishing pattens within the retail sales data.
2. Customer Segmentation
> * RFM (recency, frequency, monetary) Analysis
3. Product Recomendation (people who bought this also bought)
    """
    st.markdown("In addition, we created this `Streamlit` interactive data visualization "
                "tool that allows users interact with the data and analytics.")
with col2:
    pass
with col3:
    st.markdown("### ***Data Collection:***")
    # Rendered via Streamlit magic.
    """
**General Information About the Data**
This is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.
**Information about the Attributes/Columns in the Dataset**
* ***InvoiceNo:*** Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.
* ***StockCode:*** Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.
* ***Description:*** Product (item) name. Nominal.
* ***Quantity:*** The quantities of each product (item) per transaction. Numeric.
* ***InvoiceDate:*** Invice Date and time. Numeric, the day and time when each transaction was generated.
* ***UnitPrice:*** Unit price. Numeric, Product price per unit in sterling.
* ***CustomerID:*** Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.
* ***Country:*** Country name. Nominal, the name of the country where each customer resides.
###### **The data source:**
    """
    st.image("Assets/UCI_ML_REPO.png", caption="https://archive.ics.uci.edu/ml/datasets/online+retail")
st.markdown("----")
@st.cache(allow_output_mutation=True, ttl= 1200)
def load_data():
    """
    Load the online-retail transactions from the AWS RDS MySQL database.

    Returns:
        pandas.DataFrame with the full ``MBA_Online_Retail_Data`` table,
        or None if the query fails (the error is printed to stdout).
    """
    data = None
    # Credentials come from st.secrets at module level. `port` is unused:
    # SQLAlchemy falls back to MySQL's default port.
    engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}/{database}")
    try:
        # Plain string — the original f-prefix interpolated nothing.
        query = "SELECT * FROM MBA_Online_Retail_Data"
        data = pd.read_sql(query, engine)
    except Exception as e:
        # Best-effort: keep the app alive and return None on DB failure.
        print(str(e))
    finally:
        # Release pooled connections so Streamlit reruns don't exhaust the DB.
        engine.dispose()
    return data
#loading the data
df = load_data()
# NOTE(review): load_data() returns None on DB failure, so df.shape below
# would raise — consider guarding with st.stop().
st.markdown("#### ***Lets take a look at the data:***")
# Rendered via Streamlit magic.
"""
We are going to use the pandas `.shape` function/method to the total number of columns and rows of the dataframe. We can see that our dataframe contains 481313 rows and 16 columns
We'll use the pandas `.info()` function so see the general infomation (data types, null value count, etc.) about the data.
"""
st.markdown(f"###### ***The shape of the data***: {df.shape}")
col1, col2,col3 = st.columns((1, 0.01,.5))
# Pre-exported snapshots of df.head() / df.info() shipped with the repo
# (cheaper than recomputing them on every rerun).
df_head = pd.read_csv("df_head.csv")
with col1:
    st.markdown("***The below is the first 5 rows of the cleaed dataset***")
    st.dataframe(df_head)
with col2:
    pass
df_info = pd.read_csv("df_info.csv", index_col=0)
with col3:
    st.markdown("***The below is the info of the data***")
    st.dataframe(df_info)
st.success("If you want to take a look at how the data was cleaned, you "
           "can go check out the jupyter notebook of this project at: "
           "https://github.com/kkrusere/Market-Basket-Analysis-on-the-Online-Retail-Data/blob/main/MBA_Online-Retail_Data.ipynb")
######################functions############################
@st.cache(allow_output_mutation=True)
def group_Quantity_and_SalesRevenue(df,string):
    """
    Aggregate total Quantity and Sales Revenue per value of one column.

    Args:
        df: transactions DataFrame containing `string`, 'Quantity' and
            'Sales Revenue' columns.
        string: name of the column to group by (e.g. 'Country', 'Month').

    Returns:
        DataFrame with one row per group, sorted by 'Sales Revenue'
        descending, with a fresh 0..n index.
    """
    # The original wrapped `string` in redundant f-strings; plain use is equivalent.
    grouped = (
        df[[string, 'Quantity', 'Sales Revenue']]
        .groupby(string)
        .sum()
        .sort_values(by='Sales Revenue', ascending=False)
        .reset_index()
    )
    return grouped
@st.cache(allow_output_mutation=True)
def choose_country(country = "All", data = df):
    """
    Return the rows of `data` belonging to one country.

    Args:
        country: country name to filter on; "All" returns `data` unchanged.
        data: DataFrame to filter. Defaults to the module-level `df`
              (bound at definition time).

    Returns:
        Filtered DataFrame with a fresh 0..n index.

    NOTE(review): this definition is shadowed by an identical redefinition
    further down the file — one of the two should be removed.
    """
    if country == "All":
        return data
    else:
        temp_df = data[data["Country"] == country]
        temp_df.reset_index(drop= True, inplace= True)
        return temp_df
def wordcloud_of_Description(df, title):
    """
    Draw a word cloud from a two-column DataFrame.

    Args:
        df: DataFrame whose first column holds the words (product
            descriptions) and second column their weights
            (e.g. Quantity or Sales Revenue).
        title: title rendered above the cloud.

    Draws onto matplotlib's current global figure; callers display it
    with st.pyplot() immediately afterwards.
    """
    plt.rcParams["figure.figsize"] = (20, 20)
    # (description, weight) row pairs -> {description: weight} frequencies.
    frequencies = dict(tuple(row) for row in df.values)
    wordcloud = WordCloud(max_font_size=100, background_color="white").generate_from_frequencies(frequencies)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.title(title, fontsize=27)
    # plt.show() was removed: it is a no-op under the 'agg' backend and
    # st.pyplot() is what actually renders the figure in the app.
# Country options for the EDA selectbox: "All" first, then countries ordered
# by row count (value_counts() is sorted descending).
country_list = ["All"] + list(dict(df['Country'].value_counts()).keys())
@st.cache(allow_output_mutation=True)
def choose_country(country="All", data=df):
    """
    Return the rows of `data` belonging to one country.

    This redefinition shadows the identical helper defined earlier in the
    file; the `country="All"` default is restored here so both signatures
    match (the earlier definition had it, this one had dropped it).

    Args:
        country: country name to filter on; "All" returns `data` unchanged.
        data: DataFrame to filter (defaults to the module-level `df`).

    Returns:
        Filtered DataFrame with a fresh 0..n index.
    """
    if country == "All":
        return data
    filtered = data[data["Country"] == country].reset_index(drop=True)
    return filtered
##################################################################################
st.markdown("---")
st.markdown(" <h3 style='text-align: center;'>Exploratory Data Analysis <i>(EDA)</i>:</h3>", unsafe_allow_html=True)
# Centre the EDA intro text in the wide middle column.
col1, col2, col3= st.columns((.1,1,.1))
with col1:
    pass
with col2:
    # Rendered via Streamlit magic.
    """
* Exploratory data analysis is an approach/practice of analyzing data sets to summarize their main characteristics, often using statistical graphics and other ***data visualization***. It is a critical process of performing initial ***investigations to discover*** patterns, detect outliers and anomalies, and gain some new, hidden, insights into the data.
* Investigating questions like the total volume of purchases per month, week, day of the week, time of the day right to the hour. We will look at customers more later when we get into the ***Recency, Frequency and Monetary Analysis (RFM)*** in the Customer Segmentation section of the project.
    """
with col3:
    pass
############################################
# Orders per country: the UK dwarfs everything else, so show the top-10
# twice — with and without the UK.
col1, col2, col3= st.columns((1,.1,1))
with col1:
    # Unique invoice count per country == number of orders.
    Country_Data = df.groupby("Country")["InvoiceNo"].nunique().sort_values(ascending = False).reset_index().head(10)
    fig = px.bar(Country_Data, x= "InvoiceNo", y='Country', title= "Top 10 Number of orders per country with the UK")
    #fig.show(renderer='png', height=700, width=1000)
    #fig.show( height=700, width=1000)
    st.plotly_chart(fig)
    st.markdown("UK has more number of orders witk 16k Invoice numbers")
with col2:
    pass
with col3:
    Country_Data = df[df['Country'] != "United Kingdom"].groupby("Country")["InvoiceNo"].nunique().sort_values(ascending = False).reset_index().head(10)
    fig = px.bar(Country_Data, x= "InvoiceNo", y='Country', title= "Top 10 Number of orders per country without the UK")
    #fig.show(renderer='png', height=700, width=1000)
    #fig.show( height=700, width=1000)
    st.plotly_chart(fig)
########################################################
col1, col2, col3= st.columns((.1,1,.1))
with col1:
    pass
with col2:
    # Rendered via Streamlit magic.
    """
The above charts show that the UK by far has more invoices, just as suspected, with invoices surpassing 15K. Germany in in second place, with approximately 30 time less invoices. The retail store management can start possing question of why this is the case, especially when this is a Online retail store. Question like, what is the traffic like to the store web page, or should they start thinking of ***Search Engine Optimization (SEO)***, which is the process of improving the quality and quantity of website traffic to a website or a web page from search engines. Many other questions can be raised from the 2 charts above.
Below, we can take a look at how the countries fare up with regards to the ***Quantity sold*** and ***Sales Revenue***.The first plot is going to be for Quantity sold and the second will be for Sales Revenue both for the whole year of 2011.
    """
with col3:
    pass
####################
# Quantity sold per country, with and without the UK.
col1, col2, col3= st.columns((1,.1,1))
with col1:
    #choice = st.radio("", ("Top 10", "Bottom 10"))
    temp_df = group_Quantity_and_SalesRevenue(df,'Country')
    fig = px.bar(temp_df, x= "Quantity", y='Country', title= "Quantity of orders per country with the UK")
    #fig.show(renderer='png', height=700, width=1000)
    #fig.show( height=700, width=1000)
    st.plotly_chart(fig)
with col2:
    pass
with col3:
    temp_df = group_Quantity_and_SalesRevenue(df,'Country')
    fig = px.bar(temp_df[temp_df['Country'] != "United Kingdom"], x= "Quantity", y='Country', title= "Quantity of orders per country without the UK")
    #fig.show(renderer='png', height=700, width=1000)
    #fig.show( height=700, width=1000)
    st.plotly_chart(fig)
####################
col1, col2, col3= st.columns((.1,1,.1))
with col1:
    pass
with col2:
    # Rendered via Streamlit magic.
    """
Just as expected, the UK has high volumes of Quantitly sold and the below charts should show that the UK has high sales as well. However, unlike the number of invoices, the Netherlands has the second highest volume of Quantity sold at approximately 200K.
    """
with col3:
    pass
####################
# Sales revenue per country, with and without the UK.
col1, col2, col3= st.columns((1,.1,1))
with col1:
    #choice = st.radio("", ("Top 10", "Bottom 10"))
    temp_df = group_Quantity_and_SalesRevenue(df,'Country')
    fig = px.bar(temp_df, x= "Sales Revenue", y='Country', title= "Sales Revenue of orders per country with the UK")
    #fig.show(renderer='png', height=700, width=1000)
    #fig.show( height=700, width=1000)
    st.plotly_chart(fig)
with col2:
    pass
with col3:
    temp_df = group_Quantity_and_SalesRevenue(df,'Country')
    fig = px.bar(temp_df[temp_df['Country'] != "United Kingdom"], x= "Sales Revenue", y='Country', title= "Sales Revenue of orders per country without the UK")
    #fig.show(renderer='png', height=700, width=1000)
    #fig.show( height=700, width=1000)
    st.plotly_chart(fig)
####################
col1, col2, col3= st.columns((.1,1,.1))
with col1:
    pass
with col2:
    # Rendered via Streamlit magic.
    """
The sales revenue of Netherlands and Germany is quite close. It would be interesting to see this broken down by time periods: 'Month', 'Week', 'Day of the Week', 'Time of Day' ,or 'Hour'.
We now going to look at the products, which ones have high Quantity sold, or which product has high Sales Revenue. But first the below chart is a wordcloud of the product descriptions. A wordcloud is a visual representations of words that give greater prominence to words that appear more frequently, in this case the frequency is the 'Quantity'
    """
with col3:
    pass
####################
# Country selector driving all the per-country EDA below.
col1, col2, col3= st.columns((3))
with col1:
    option = st.selectbox(
        'Please Choose a country to Analyze',
        country_list)
    if option == "All":
        # (fixed wording: the original read "We will at data from ...")
        st.markdown("We will look at data from All the countries")
    else:
        st.markdown(f"We will be looking at data from {option}")
# Filter the working frame once; every chart below uses `dataframe`.
dataframe = choose_country(country=option)
st.markdown("###### **We can create a word cloud of the Product Descriptions per Quantity & Product Descriptions per Sales Revenue**")
# Side-by-side word clouds: descriptions weighted by Quantity vs Sales Revenue.
col1, col2, col3= st.columns((1,.3,1))
with col1:
    temp_df = pd.DataFrame(dataframe.groupby('Description')['Quantity'].sum()).reset_index()
    title = "Product Description per Quantity"
    wordcloud_of_Description(temp_df, title)
    # st.pyplot() with no args picks up matplotlib's current global figure.
    st.pyplot()
with col2:
    pass
with col3:
    temp_df = pd.DataFrame(dataframe.groupby('Description')['Sales Revenue'].sum()).reset_index()
    title = "Product Description per Sales Revenue"
    wordcloud_of_Description(temp_df, title)
    st.pyplot()
###############################################
st.markdown("##### **Monthly Stats:**")
# Rendered via Streamlit magic.
"""
Below are the monthly analysis of the Sales and the Quantity of iterms sold
"""
col1, col2, col3= st.columns((1,.3,1))
with col1:
    temp_df = group_Quantity_and_SalesRevenue(dataframe,'Month')
    fig = make_subplots(rows=1, cols=2, shared_yaxes=False,
                        subplot_titles=("Quantity", "Sales Revenue")
                        )
    fig.add_trace(go.Bar(x=temp_df['Month'], y=temp_df['Quantity'],name = 'Quantity'),1, 1)
    fig.add_trace(go.Bar(x=temp_df['Month'], y=temp_df['Sales Revenue'],name = 'Sales Revenue'),1, 2)
    # NOTE(review): "Revanue" typo is preserved in chart titles throughout the app.
    fig.update_layout(showlegend=False, title_text="Monthly Sales Revanue and Quantity")
    st.plotly_chart(fig)
    """
The above graphs show the monthly trend of Quantity of products ordered(left) and Sales Revenue(right).
    """
with col2:
    pass
with col3:
    # Same monthly aggregates as percentage pie charts.
    fig = make_subplots(rows=1, cols=2,
                        specs=[[{"type": "pie"}, {"type": "pie"}]],
                        subplot_titles=("Quantity per Month", "Sales Revenue per Month")
                        )
    fig.add_trace(
        go.Pie(values = temp_df['Quantity'], labels = temp_df['Month'],
               name = 'Quantity'),
        row=1, col=1
    )
    fig.add_trace(
        go.Pie(values = temp_df['Sales Revenue'], labels = temp_df['Month'],
               name = 'Sales Revenue'),
        row=1, col=2
    )
    fig.update_layout(title_text="Percentage pie charts for Monthly Sales Revanue and Quantity")
    st.plotly_chart(fig)
    """
The above pie charts depicts the quantity of products ordered and sales revenue per month.
    """
##############################
st.markdown("##### **Weekly Stats:**")
"""
The below are the weekly analysis of the Sales and the Quantity of iterms sold
"""
# BUG FIX: the original unpacked into `ccol1` (typo), so the `with col1:`
# below silently re-entered the stale col1 from the Monthly Stats layout.
col1, col2, col3= st.columns((.5,1,.5))
with col1:
    pass
with col2:
    temp_df = group_Quantity_and_SalesRevenue(dataframe,'Week of the Year')
    fig = make_subplots(rows=1, cols=2, shared_yaxes=False,
                        subplot_titles=("Quantity", "Sales Revenue")
                        )
    fig.add_trace(go.Bar(x=temp_df['Week of the Year'], y=temp_df['Quantity'],name = 'Quantity'),1, 1)
    fig.add_trace(go.Bar(x=temp_df['Week of the Year'], y=temp_df['Sales Revenue'],name = 'Sales Revenue'),1, 2)
    fig.update_layout(showlegend=False, title_text="Weekly Sales Revanue and Quantity")
    st.plotly_chart(fig)
    """
The above graphs shows the weekly trend of sales revenue and the quantity of products ordered.
    """
with col3:
    pass
##############################
st.markdown("##### **Daily Stats:**")
# Rendered via Streamlit magic.
"""
The below are the daily analysis of the Sales and the Quantity of iterms sold
"""
col1, col2, col3= st.columns((1,.3,1))
with col1:
    temp_df = group_Quantity_and_SalesRevenue(dataframe,'Day of Week')
    fig = make_subplots(rows=1, cols=2, shared_yaxes=False,
                        subplot_titles=("Quantity", "Sales Revenue")
                        )
    fig.add_trace(go.Bar(x=temp_df['Day of Week'], y=temp_df['Quantity'],name = 'Quantity'),1, 1)
    fig.add_trace(go.Bar(x=temp_df['Day of Week'], y=temp_df['Sales Revenue'],name = 'Sales Revenue'),1, 2)
    fig.update_layout(coloraxis=dict(colorscale='Greys'), showlegend=False, title_text="Day of the Week Sales Revanue and Quantity")
    #fig.show(renderer='png', height=700, width=1200)
    #fig.show( height=700, width=1000)
    st.plotly_chart(fig)
    st.markdown("The above graphs depict the daily trend of Sales revenue and quantity.")
with col2:
    pass
with col3:
    # Same day-of-week aggregates as percentage pie charts.
    fig = make_subplots(rows=1, cols=2,
                        specs=[[{"type": "pie"}, {"type": "pie"}]],
                        subplot_titles=("Quantity", "Sales Revenue")
                        )
    fig.add_trace(
        go.Pie(values = temp_df['Quantity'], labels = temp_df['Day of Week'],
               name = 'Quantity'),
        row=1, col=1
    )
    fig.add_trace(
        go.Pie(values = temp_df['Sales Revenue'], labels = temp_df['Day of Week'],
               name = 'Sales Revenue'),
        row=1, col=2
    )
    fig.update_layout(title_text="Percentage pie charts for Day of the Week Sales Revanue and Quantity")
    #fig.show(renderer='png', height=700, width=1200)
    #fig.show( height=700, width=1000)
    st.plotly_chart(fig)
    st.markdown("The above pie charts shows the daily trend of sales revenue and quantity of products ordered.")
###############################
# Time-of-day breakdown (pie charts only), centred.
col1, col2, col3= st.columns((.5,1,.5))
with col1:
    pass
with col2:
    temp_df = group_Quantity_and_SalesRevenue(dataframe,'Time of Day')
    fig = make_subplots(rows=1, cols=2,
                        specs=[[{"type": "pie"}, {"type": "pie"}]],
                        subplot_titles=("Quantity", "Sales Revenue")
                        )
    fig.add_trace(
        go.Pie(values = temp_df['Quantity'], labels = temp_df['Time of Day'],
               name = 'Quantity'),
        row=1, col=1
    )
    fig.add_trace(
        go.Pie(values = temp_df['Sales Revenue'], labels = temp_df['Time of Day'],
               name = 'Sales Revenue'),
        row=1, col=2
    )
    fig.update_layout(title_text="Percentage pie charts for Time of Day Sales Revanue and Quantity")
    st.plotly_chart(fig)
    st.markdown("The above piecharts shows the breakdown of Quantity of orders(left) and Sales revenue(right) by time of the day.")
with col3:
    pass
###############################
# Hourly breakdowns. Selecting the target column *before* aggregating avoids
# summing/counting every other column (groupby(...).sum() over non-numeric
# columns is deprecated and errors on modern pandas).
col1, col2, col3= st.columns((1,.1,1))
with col1:
    #we can also look at the volume of Invoice Numbers hourly data
    Hourly_Sales = dataframe.groupby('Hour')["Quantity"].sum().reset_index()
    fig = px.bar(Hourly_Sales, x='Hour', y='Quantity', title='Hourly Volume of quantity sold')
    st.plotly_chart(fig)
with col2:
    pass
with col3:
    #we can also look at the volume quantity sold hourly data
    Hourly_Sales = dataframe.groupby('Hour')["InvoiceNo"].count().reset_index()
    fig = px.bar(Hourly_Sales, x='Hour', y='InvoiceNo', title='Hourly sale using the Invoice Numbers')
    st.plotly_chart(fig)
###############################
st.markdown("##### ***Customers:***")
# Top customers by number of distinct invoices; the "Guest Customer"
# placeholder aggregates all anonymous checkouts, so show it both ways.
col1, col2, col3= st.columns((1,.1,1))
with col1:
    data = dataframe.groupby("CustomerID")["InvoiceNo"].nunique().sort_values(ascending = False).reset_index().head(11)
    fig = px.bar(data, x='CustomerID', y='InvoiceNo', title='Graph of top ten customer with respect to the invoice number')
    #fig.show(renderer='png', height=700, width=1000)
    #fig.show(height=700, width=1000)
    st.plotly_chart(fig)
with col2:
    pass
with col3:
    temp_df = dataframe[dataframe["CustomerID"] != "Guest Customer"]
    data = temp_df.groupby("CustomerID")["InvoiceNo"].nunique().sort_values(ascending = False).reset_index().head(11)
    fig = px.bar(data, x='CustomerID', y='InvoiceNo', title='Graph of top ten customer with respect to the invoice number without the Guest Customer')
    #fig.show(renderer='png', height=700, width=1000)
    #fig.show(height=700, width=1000)
    st.plotly_chart(fig)
#################################################
# Product league tables: per-description totals, split into a Quantity-only
# and a Sales-Revenue-only frame so each can be ranked independently.
temp_df = group_Quantity_and_SalesRevenue(dataframe, 'Description')
Quantity_tempA = temp_df.sort_values(ascending=False, by = "Quantity").reset_index(drop=True)
Sales_Revenue_tempA = temp_df.sort_values(ascending=False, by = "Sales Revenue").reset_index(drop=True)
Quantity_tempA.drop('Sales Revenue', axis=1, inplace=True)
Sales_Revenue_tempA.drop('Quantity', axis=1, inplace=True)
colspace, col1, col2, col3= st.columns((.45,1,.01,1))
with col1:
    qchoice = st.radio("Choose Either Top or Bottom of Product Description by Quantity", ("Top 10", "Bottom 10"))
    qchoice_dict = {"Top 10":Quantity_tempA.head(10), "Bottom 10": Quantity_tempA.tail(10)}
    st.markdown(f"{qchoice} Description by Quantity")
    st.dataframe(qchoice_dict.get(qchoice))
with col3:
    schoice = st.radio("Choose Either Top or Bottom of Product Description by Sales Revenue", ("Top 10", "Bottom 10"))
    schoice_dict = {"Top 10":Sales_Revenue_tempA.head(10), "Bottom 10": Sales_Revenue_tempA.tail(10)}
    st.markdown(f"{schoice} Description by Sales Revenue")
    st.dataframe(schoice_dict.get(schoice))
##################################################
# Bar-chart view of whichever Top/Bottom selections the radios hold.
col1, col2, col3= st.columns((.5,1,.5))
with col1:
    pass
with col2:
    fig = make_subplots(rows=1, cols=2, shared_yaxes=False,
                        subplot_titles=(f"{qchoice} Products by Quantity", f"{schoice} Products by Sales Revenue")
                        )
    fig.add_trace(go.Bar(x=qchoice_dict.get(qchoice)['Description'], y=qchoice_dict.get(qchoice)['Quantity'],name = f"{qchoice}"),1, 1)
    fig.add_trace(go.Bar(x=schoice_dict.get(schoice)['Description'], y=schoice_dict.get(schoice)['Sales Revenue'],name = f"{schoice}"),1, 2)
    fig.update_layout(height=600)
    fig.update_layout(showlegend=False, title_text="Product Description by Quantity and Sales Revenue")
    #fig.show(renderer='png', height=700, width=1200)
    #fig.show(height=700, width=1000)
    st.plotly_chart(fig)
with col3:
    pass
#####################################################
#####################################################
st.markdown("----")
#########################################################################
st.markdown(" <h3 style='text-align: center;'>Market Basket Analysis <i>(MBA)</i>:</h3>", unsafe_allow_html=True)
# Raw string so the LaTeX backslashes survive; rendered via Streamlit magic.
r"""
**What is Market Basket Analysis?:**
Market Basket Analysis (MBA) is a data mining technique that is mostly used in the Retail Industry to uncover customer purchasing patterns and product relationships. The techniques used in MBA identify the patterns, associations, and relationships (revealing product groupings and which products are likely to be purchased together) in in frequently purchased items by customers in large transaction datasets collected/registered at the point of sale. The results of the Market Basket Analysis can be used by retailers or marketers to design and develop marketing and operation strategies for a retail business or organization.<br>
Market basket analysis mainly utilize Association Rules {IF} -> {THEN}. However, MBA assigns Business outcomes and scenarios to the rules, for example,{IF X is bought} -> {THEN Y is also bought}, so X,Y could be sold together. <br>
Definition: **Association Rule**
Let $I$= \{$i_{1},i_{2},\ldots ,i_{n}$\} be an itemset.
Let $D$= \{$t_{1},t_{2},\ldots ,t_{m}$\} be a database of transactions $t$. Where each transaction $t$ is a nonempty itemset such that ${t \subseteq I}$
Each transaction in D has a unique transaction ID and contains a subset of the items in I.
A rule is defined as an implication of the form:
$X\Rightarrow Y$, where ${X,Y\subseteq I}$.
The rule ${X \Rightarrow Y}$ holds in the dataset of transactions $D$ with support $s$, where $s$ is the percentage of transactions in $D$ that contain ${X \cup Y}$ (that is the union of set $X$ and set $Y$, or, both $X$ and $Y$). This is taken as the probability, ${P(X \cup Y)}$. Rule ${X \Rightarrow Y}$ has confidence $c$ in the transaction set $D$, where $c$ is the percentage of transactions in $D$ containing $X$ that also contains $Y$. This is taken to be the conditional probability, like ${P(Y | X)}$. That is,
* support ${(X \Rightarrow Y)}$ = ${P(X \cup Y)}$
* confidence ${(X \Rightarrow Y)}$ = ${P(X|Y)}$
The lift of the rule ${(X \Rightarrow Y)}$ is the confidence of the rule divided by the expected confidence, assuming that the itemsets $X$ and $Y$ are independent of each other.The expected confidence is the confidence divided by the frequency of ${Y}$.
* lift ${(X \Rightarrow Y)}$ = ${ \frac {\mathrm {supp} (X\cap Y)}{\mathrm {supp} (X)\times \mathrm {supp} (Y)}}$
Lift value near 1 indicates ${X}$ and ${Y}$ almost often appear together as expected, greater than 1 means they appear together more than expected and less than 1 means they appear less than expected.Greater lift values indicate stronger association
"""
# Empty magic string: renders nothing, acts as vertical spacing.
"""
"""
"""
##### ***Now the Implementation of the MBA***
"""
# Countries with enough transactions for meaningful itemset mining.
mba_country_list = [
    'United Kingdom',
    'Germany',
    'France',
    'EIRE',
    'Spain',
    'Netherlands',
    'Switzerland',
    'Belgium',
    'Portugal',
    'Australia']
col1, col2, col3= st.columns((3))
with col1:
    option = st.selectbox(
        'Please Choose a country for the Market Basket Analysis',
        mba_country_list)
    if option == "All":
        # (fixed wording: the original read "We will at data from ...")
        # NOTE(review): "All" is not in mba_country_list, so this branch
        # is currently unreachable.
        st.markdown("We will look at data from All the countries")
    else:
        st.markdown(f"We will be looking at data from {option}")
MBA_df = choose_country(country=option)
# Rendered via Streamlit magic.
"""
We are going to use the Apriori Algorithm for the association rule mining/analysis. Apriori is an algorithm for frequent item set mining and association rule learning over relational dataset. It proceeds by identifying the frequent individual items in the dataset and extending them to larger and larger item sets as long as those item sets appear sufficiently often in the dataset. The frequent item sets determined by Apriori can be used to determine association rules which highlight general trends, pattern, and relationships in the dataset.
"""
#we are going to rearrage the dataframe having the 'InvoiceNo' column the index, so that each row contains all the items purchased under the same invoice
basket = (MBA_df.groupby(['InvoiceNo', 'Description'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))
st.markdown("Below is the one-hot encoded basket with the InvoiceNo #s being the index")
def change_dtype_to_list(x):
    """Coerce any iterable (e.g. a frozenset of rule items) into a plain list."""
    return list(x)
def encoder(x):
    """
    One-hot encode a basket quantity: 1 if the item appears on the invoice
    with a positive quantity, else 0.

    The original version returned None for fractional values in (0, 1);
    any positive quantity now maps to 1 so the encoding is total. Integer
    inputs (the normal case after the groupby-sum) behave exactly as before.
    """
    return 1 if x > 0 else 0
col1, col2, col3= st.columns((.1,1,.1))
with col1:
    pass
with col2:
    with st.spinner("One-Hot Encoding the basket..."):
        #now we encode
        # NOTE(review): DataFrame.applymap is deprecated on pandas >= 2.1
        # (renamed to DataFrame.map) — confirm the pinned pandas version.
        basket = basket.applymap(encoder)
        st.dataframe(basket.head())
        st.success('Done!')
    # (fixed wording: min_support=0.01 below is 1%, not 10%, and
    # association_rules reports 'support', not 'score')
    st.markdown("The next step will be to generate the frequent itemsets that have a support of at "
                "least 1% using the MLxtend Apriori function which returns frequent itemsets from a "
                "one-hot DataFrame. And then can look at the rules of association using the "
                "`MLxtend association_rules(), The function generates a DataFrame of association "
                "rules including the metrics 'support', 'confidence', and 'lift'")
    with st.spinner("Generating the Frequent Itemsets and Association Rules..."):
        rules = association_rules(apriori(basket, min_support=0.01, use_colnames=True), metric="lift", min_threshold=1)
        # Sort values based on lift
        rules = rules.sort_values("lift",ascending=False).reset_index(drop= True)
        # frozensets -> lists so Streamlit can serialize and display them.
        rules["antecedents"] = rules["antecedents"].apply(change_dtype_to_list)
        rules["consequents"] = rules["consequents"].apply(change_dtype_to_list)
        st.success('Done!')
    """Association Rules"""
    st.dataframe(rules.head())
with col3:
    pass
st.markdown("----")
st.markdown(" <h3 style='text-align: center;'>Customer Segmentation:</h3>", unsafe_allow_html=True)
# Rendered via Streamlit magic.
"""
Customer segmentation is the process of dividing customers into groups based on common characteristics so companies can market to each group effectively and appropriately.
* RFM (recency, frequency, monetary) Analysis
"""
"""
**RFM (recency, frequency, monetary) Analysis**
The “RFM” in RFM analysis stands for recency, frequency and monetary value. RFM analysis is a way to use data based on existing customer behavior to predict how a new customer is likely to act in the future.
1. How recently a customer has transacted with a brand
2. How frequently they’ve engaged with a brand
3. How much money they’ve spent on a brand’s products and services
RFM analysis enables marketers to increase revenue by targeting specific groups of existing customers (i.e., customer segmentation) with messages and offers that are more likely to be relevant based on data about a particular set of behaviors.
"""
# Countries with enough transactions for a meaningful RFM breakdown.
rfm_country_list = [
    'United Kingdom',
    'Germany',
    'France',
    'EIRE',
    'Spain',
    'Netherlands',
    'Switzerland',
    'Belgium',
    'Portugal',
    'Australia']
col1, col2, col3= st.columns((3))
with col1:
    option = st.selectbox(
        'Please Choose a country for the Recency, Frequency, Monetary Analysis',
        rfm_country_list)
    if option == "All":
        # (fixed wording: the original read "We will at data from ...")
        # NOTE(review): "All" is not in rfm_country_list, so this branch
        # is currently unreachable.
        st.markdown("We will look at data from All the countries")
    else:
        st.markdown(f"We will be looking at data from {option}")
RFM_df = choose_country(country=option)
#the first thing that we are going to need is the reference date
#in this case the day after the last recorded date in the dataset plus a day
# NOTE(review): ref_date is computed but the RFM table itself is never built
# or rendered below — this section appears unfinished.
ref_date = RFM_df['InvoiceDate'].max() + dt.timedelta(days=1)
st.markdown("----")
# NOTE(review): "recomendation" / "recomentation" typos in the displayed text below.
st.markdown(" <h3 style='text-align: center;'>Product recomendation <i>(people who bought this also bought)</i>:</h3>", unsafe_allow_html=True)
col1, col2, col3= st.columns((.1,1,.1))
with col1:
    pass
with col2:
    # Rendered via Streamlit magic.
    """
The product recommendation part of this project is going to make use of the Association Rules that where uncovered in the MBA section. Product recomentation is basically one of the advantages of Market Basket Analysis where you can recommend to customers products that are in the same itemsets as the customer's current products.
    """
with col3:
    pass
st.markdown("---")
st.markdown("---")
st.markdown("### ***Conclusion***")
st.markdown("This was a very interesting project which covered a wide array"
            " of topics that seem different but quite similar. For further "
            "research or analysis, it would be interesting to do the market "
            "basket analysis on the clusters that resulted (with the help of the"
            " K-means clustering) from the recency, frequency, and monetary value "
            "analysis and do market basket analysis on each of them")