-
Notifications
You must be signed in to change notification settings - Fork 0
/
ML_Lab_01.py
142 lines (114 loc) · 3.96 KB
/
ML_Lab_01.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-
"""ML Lab 01.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1ZWm7-sDnLNC_ZFnM0gjxAjtTKn2Hl1LH
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# 01
# Load the insurance dataset from the current working directory.
df = pd.read_csv('insurance.csv')

# 02
# Preview both ends of the table: first the leading 5 rows, then the
# trailing 5 rows (head()/tail() default to 5).
for preview in (df.head(), df.tail()):
    print(preview)

# 03
# Report the table dimensions; DataFrame.shape is a (rows, columns) tuple.
row_count, column_count = df.shape
print(f"Number of rows: {row_count}, Number of columns: {column_count}")
# 04
# Split the columns into numerical and categorical groups and count each.
#
# The generic 'number' kind is used instead of the exact 'int64' / 'float64'
# names so that other numeric widths (int32, float32, ...) are not silently
# dropped.  Every non-numeric column is treated as categorical, so text
# columns are found whether they are stored as object, category or a
# dedicated string dtype.
# NOTE: this also places bool/datetime columns in the categorical group; for
# a frame that only holds int64/float64/object columns the result is the same
# as selecting those dtypes by name.
numerical_cols = df.select_dtypes(include='number').columns
categorical_cols = df.select_dtypes(exclude='number').columns
print(f"Number of numerical columns: {len(numerical_cols)}")
print(f"Number of categorical columns: {len(categorical_cols)}")
# 05
# Print the minimum, maximum and mode of every numerical column, followed by
# a blank separator line.
for col in numerical_cols:
    column = df[col]
    print(f"Column: {col}")
    # mode() may return several equally frequent values; only the first one
    # it returns is reported.
    print(f"Min: {column.min()}, Max: {column.max()}, Mode: {column.mode().values[0]}")
    print()
# 06
# List the columns that contain at least one missing value.
has_missing = df.isnull().any()  # one boolean per column
null_cols = df.columns[has_missing]
print(f"Columns with null values: {null_cols}")
# 07
# Summary statistics for the numerical columns.  describe() reports count,
# mean and std in addition to the five-number summary
# (min, 25%, 50%, 75%, max).
summary = df[numerical_cols].describe()
print(summary)

# Compare the summary against one box plot per numerical column.
# layout=(-1, 3) fixes three plots per row and lets pandas derive the number
# of rows from the number of columns plotted, so the call no longer fails
# when there are more than six numerical columns (a fixed 2x3 grid would).
df[numerical_cols].plot(kind='box', subplots=True, layout=(-1, 3), figsize=(12, 8))
plt.show()
# 08
# Report outliers per numerical column: rows whose absolute z-score exceeds
# the threshold below.
Z_SCORE_THRESHOLD = 3
for col in numerical_cols:
    # nan_policy='omit' keeps missing values out of the mean/std computation.
    # With the default policy a single NaN turns every z-score of the column
    # into NaN, so no row would ever be flagged.  Rows whose own value is
    # missing still get a NaN score, which compares False and is not flagged.
    z_scores = stats.zscore(df[col], nan_policy='omit')
    outliers = df[np.abs(z_scores) > Z_SCORE_THRESHOLD]
    print(f"Outliers in column {col}: {outliers.shape[0]}")
    print(outliers)
    print()
# 09
# Pairwise correlation between the numerical columns.
corr_matrix = df[numerical_cols].corr()

# Flatten the matrix into one value per unordered feature pair, taken from
# the strict upper triangle (k=1).  This drops the self-correlation diagonal
# and the mirrored duplicates by position, so a genuinely perfect correlation
# (+1 or -1) between two different features is still reported.  Masking the
# full matrix instead would print a mostly-NaN grid with each pair twice.
row_pos, col_pos = np.triu_indices(len(corr_matrix), k=1)
pair_corr = pd.Series(
    corr_matrix.to_numpy()[row_pos, col_pos],
    index=pd.MultiIndex.from_arrays(
        [corr_matrix.index[row_pos], corr_matrix.columns[col_pos]]
    ),
    dtype=float,
)

# Each result is a Series indexed by (feature, feature).  Pairs whose
# correlation is NaN fail every comparison and are left out of all three.

# Display features with high positive correlation
high_pos_corr = pair_corr[pair_corr > 0.7]
print(f"Features with high positive correlation: {high_pos_corr}")

# Display features with high negative correlation
high_neg_corr = pair_corr[pair_corr < -0.7]
print(f"Features with high negative correlation: {high_neg_corr}")

# Display features with no correlation (|r| below 0.3)
no_corr = pair_corr[pair_corr.abs() < 0.3]
print(f"Features with no correlation: {no_corr}")
# 10
# seaborn is imported here, mid-script, where the notebook export placed it;
# the plotting sections that follow rely on this `sns` name.
import seaborn as sns

# Plot a histogram of every numerical feature to inspect its skewness.
# sns.histplot replaces sns.distplot, which seaborn has deprecated and
# scheduled for removal; kde=False keeps the plot to histogram bars only.
for col in numerical_cols:
    sns.histplot(df[col], kde=False)
    plt.title(f"Distribution of {col}")
    plt.show()

# Classify the features by sample skewness: above 1 is reported as right
# skew, below -1 as left skew, and anything in [-1, 1] as no skew.
skewness = df[numerical_cols].skew()
right_skew = skewness[skewness > 1]
left_skew = skewness[skewness < -1]
no_skew = skewness[(skewness >= -1) & (skewness <= 1)]
print(f"Features with right skew: {right_skew.index}")
print(f"Features with left skew: {left_skew.index}")
print(f"Features with no skew: {no_skew.index}")
# 11
# Univariate analysis of the categorical variables: one bar plot of category
# counts per column, each shown in its own figure.
for col in categorical_cols:
    count_axes = sns.countplot(data=df, x=col)
    count_axes.set_title(f"Univariate analysis of {col}")
    plt.show()
# 12
# Univariate analysis of the continuous variables: for every numerical column
# draw a swarm plot and then a violin plot, each in its own figure.
for col in numerical_cols:
    swarm_axes = sns.swarmplot(data=df, x=col)
    swarm_axes.set_title(f"Univariate analysis of {col} (Swarm Plot)")
    plt.show()

    violin_axes = sns.violinplot(data=df, x=col)
    violin_axes.set_title(f"Univariate analysis of {col} (Violin Plot)")
    plt.show()
# 13
# Scatter plot of two continuous variables: 'age' on x against 'bmi' on y.
scatter_axes = sns.scatterplot(data=df, x='age', y='bmi')
scatter_axes.set_title("Relationship between age and bmi")
plt.show()
# 14
# Bivariate analysis of a categorical against a continuous variable:
# box plots of 'charges' for each 'smoker' category.
# catplot is figure-level and returns a grid; with no faceting variables the
# grid holds a single axes, which is where the title is placed.
box_grid = sns.catplot(data=df, x='smoker', y='charges', kind='box')
box_grid.ax.set_title("Bivariate analysis of smoker and charges")
plt.show()
# 15
# Bar plot of the number of observations in each 'region' category.
region_axes = sns.countplot(data=df, x='region')
region_axes.set_title("Counts of observations for region")
plt.show()
# 16
# Multivariate analysis: grid of pairwise plots over the numerical columns,
# with a kernel density estimate on the diagonal.
numeric_frame = df[numerical_cols]
sns.pairplot(numeric_frame, diag_kind='kde')
plt.show()
#Crafted With Love By Sam Naveenkumar .V