forked from ML-boot-camp/ratebeer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
exploratory_data_analysis.py
136 lines (109 loc) · 2.84 KB
/
exploratory_data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# %%
import pandas as pd
import html
# %%
def parse(path):
    """Lazily yield one parsed record per non-blank line of the file at *path*.

    Each line must hold a single Python literal (dict, list, string,
    number, ...). Lines are read and parsed on demand, so the whole file is
    never held in memory by this function.

    Args:
        path: Path of the text file to read.

    Yields:
        The Python object described by each non-blank line, in file order.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    # Function-scope import so the block does not depend on the module header.
    import ast

    with open(path, "r") as f:
        for line in f:
            # A blank line (e.g. a trailing newline at end of file) carries
            # no record; skip it instead of failing to parse it.
            if not line.strip():
                continue
            # SECURITY: literal_eval instead of eval(). eval() would execute
            # any expression found in the data file; literal_eval only
            # accepts literals and raises on anything else.
            yield ast.literal_eval(line)
# %%
filename = "ratebeer.json"
# Build the raw frame from the parsed records, reading at most 100000 of them.
df_raw = pd.DataFrame.from_records(parse(filename), nrows=100000)
# Swap "/" for "_" in every column name (presumably so names such as
# "beer/style" become attribute-friendly, e.g. `df.beer_style` -- confirm
# against the data file).
df_raw.columns = [name.replace("/", "_") for name in df_raw.columns]
# %%
# Print a summary of the raw frame (columns, dtypes, non-null counts).
df_raw.info()
# %%
# Use plotly for every pandas `.plot(...)` call made after this point.
pd.options.plotting.backend = "plotly"
# %%
def clean_integer_features(df, columns):
    """Return a copy of *df* whose *columns* hold the integer before the first "/".

    Every value in the listed columns is split on "/", the first piece is
    kept and cast to ``int`` (e.g. "13/20" becomes 13). The input frame is
    not modified.

    Args:
        df: Frame containing string-valued columns.
        columns: Names of the columns to convert.

    Returns:
        A new DataFrame with the listed columns converted to integers.
    """
    cleaned = df.copy()
    for name in columns:
        # "13/20" -> ["13", "20"] -> "13"
        leading_part = cleaned[name].str.split("/").str.get(0)
        cleaned[name] = leading_part.astype(int)
    return cleaned
def clean_float_features(df, columns):
    """Return a copy of *df* whose *columns* are cast to float.

    Values equal to the string "-" are replaced by -1 before the cast
    ("-" presumably marks a missing value in the source data -- confirm).
    The input frame is not modified.

    Args:
        df: Frame containing the columns to convert.
        columns: Names of the columns to convert.

    Returns:
        A new DataFrame with the listed columns converted to floats.
    """
    cleaned = df.copy()
    for name in columns:
        # Substitute the "-" placeholder first so the float cast cannot
        # fail on it.
        without_dash = cleaned[name].replace("-", -1)
        cleaned[name] = without_dash.astype(float)
    return cleaned
def clean_categorical_features(df, columns):
    """Return a copy of *df* with HTML entities decoded in *columns*.

    ``html.unescape`` is applied to every value of the listed columns
    (e.g. "A &amp; B" becomes "A & B"). The input frame is not modified.

    Args:
        df: Frame containing string-valued columns.
        columns: Names of the columns to decode.

    Returns:
        A new DataFrame with the listed columns unescaped.
    """
    cleaned = df.copy()
    for name in columns:
        decoded = cleaned[name].apply(html.unescape)
        cleaned[name] = decoded
    return cleaned
# Columns handled by clean_integer_features (split on "/", keep first part).
integer_features = [
    "review_appearance",
    "review_aroma",
    "review_palate",
    "review_taste",
    "review_overall",
]
# Columns handled by clean_float_features ("-" -> -1, then cast to float).
float_features = ["beer_ABV"]
# Columns handled by clean_categorical_features (HTML entities decoded).
categorical_features = ["beer_name", "beer_style"]

# Run the three cleaners one after another; each returns a new frame, so
# df_raw itself is left untouched.
df = clean_integer_features(df_raw, integer_features)
df = clean_float_features(df, float_features)
df = clean_categorical_features(df, categorical_features)
df
# %%
from scipy.stats import chi2_contingency

# Contingency table of beer_style (rows) against review_overall (columns),
# built once and shared by the test and the plot below.
style_by_overall = pd.crosstab(df.beer_style, df.review_overall)
# Element 1 of chi2_contingency's result is the p-value of the
# chi-square test of independence on the table.
print(chi2_contingency(style_by_overall)[1])
# Show the table as an image (uses the plotting backend selected above).
style_by_overall.plot(kind="imshow")
# %%
# Per-beer_style mean / std / count of review_overall after subtracting the
# global mean of review_overall, rendered as a colour-graded table.
(
    df
    .assign(
        review_overall=lambda frame: frame.review_overall - frame.review_overall.mean()
    )
    .groupby("beer_style", as_index=False)["review_overall"]
    .agg(["mean", "std", "count"])
    .add_prefix("review_overall_")
    .style.background_gradient(cmap="RdYlBu")
)
# %% [markdown]
# 1. continuous features:
#     1. preprocessing
#         - scaling
#         - transformation
#     1. plot:
#         - scatter (continuous target)
#         - histogram / kde (categorical target)
#     1. stat test:
# 1. categorical features:
#     1. preprocessing
#         - label encoding
#         - one-hot encoding
#         - target encoding
#     1. plot:
#         - ...
#     1. stat test:
#         - ...
#
# %% [markdown]
# Variable types:
# | Data Type | Shorthand Code | Description |
# | --- | --- | --- |
# | quantitative | Q | a continuous real-valued quantity |
# | ordinal | O | a discrete ordered quantity |
# | nominal | N | a discrete unordered category |
# | temporal | T | a time or date value |
# | geojson | G | a geographic shape |
#
# | Data type | Object | Feature type |
# |---|---|---|
# | Numerical / Continuous | float | Ratio |
# | Numerical / Continuous | float | Interval |
# | Categorical / Discrete | string / int | Nominal |
# | Categorical / Discrete | int | Ordinal |
#
# Complex data type:
#
# | Array[Numerical] | |
# | Array[Numerical] | |
# %% [markdown]
# | | Quantitative | Ordinal |
# |---|---|---|
# | a | a | a |
# | a | a| a |