# ML_Lab_01.py

# -*- coding: utf-8 -*-
"""ML Lab 01.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ZWm7-sDnLNC_ZFnM0gjxAjtTKn2Hl1LH
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

#01
# Load insurance.csv into a DataFrame
df = pd.read_csv('insurance.csv')

#02
# Preview the data: the leading rows first, then the trailing rows
# (head() and tail() return 5 rows when called without an argument)
for preview in (df.head(), df.tail()):
    print(preview)

#03
# Report the size of the table; df.shape is (rows, columns)
row_count, column_count = df.shape
print(f"Number of rows: {row_count}, Number of columns: {column_count}")

#04
# Display the number of categorical and numerical columns
# 'number' selects every numeric dtype (e.g. int32, float32, unsigned ints),
# not only int64/float64, so narrower numeric columns are no longer missed.
numerical_cols = df.select_dtypes(include='number').columns
# Columns stored with the pandas 'category' dtype are categorical as well,
# so they are counted together with the object (string) columns.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print(f"Number of numerical columns: {len(numerical_cols)}")
print(f"Number of categorical columns: {len(categorical_cols)}")

#05
# Report the smallest value, largest value and mode of every numerical column
for col in numerical_cols:
    print(f"Column: {col}")
    column = df[col]
    lowest = column.min()
    highest = column.max()
    # mode() may return several values when there is a tie; the first is reported
    most_frequent = column.mode().values[0]
    print(f"Min: {lowest}, Max: {highest}, Mode: {most_frequent}")
    print()

#06
# Find the columns that contain at least one missing value
has_missing = df.isna().any()
null_cols = df.columns[has_missing]
print(f"Columns with null values: {null_cols}")

#07
# Calculate 5-number summary
# describe() reports min, 25%, 50%, 75% and max (the five-number summary)
# together with count, mean and std for each numerical column.
summary = df[numerical_cols].describe()
print(summary)

# Correlate with box plot
# One box plot per numerical column. layout=(-1, 3) fixes three plots per row
# and lets pandas work out how many rows are needed, so the grid fits any
# number of numerical columns (a hard-coded 2x3 grid raises beyond six).
df[numerical_cols].plot(kind='box', subplots=True, layout=(-1, 3), figsize=(12, 8))
plt.show()

#08
# Display outlier values using Z-score
# A row is reported as an outlier for a column when the absolute z-score of
# its value in that column is greater than 3.
for col in numerical_cols:
    # nan_policy='omit' computes the mean and standard deviation from the
    # non-missing values only. With the default ('propagate') a single missing
    # value turns every z-score of the column into NaN, which silently hides
    # all outliers. Rows whose value is missing still get a NaN score and are
    # therefore never flagged.
    z_scores = stats.zscore(df[col], nan_policy='omit')
    is_outlier = np.abs(z_scores) > 3
    outliers = df[is_outlier]
    print(f"Outliers in column {col}: {outliers.shape[0]}")
    print(outliers)
    print()

#09
# Calculate correlation matrix
corr_matrix = df[numerical_cols].corr()

# Every feature correlates perfectly with itself, so the diagonal is excluded
# by position. Excluding it by value (corr != 1) would also discard pairs of
# distinct features that are perfectly correlated, and it relies on the
# diagonal being exactly 1.0 in floating point.
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)

# Display features with high positive correlation
# Cells that do not satisfy the condition are shown as NaN.
high_pos_corr = corr_matrix[(corr_matrix > 0.7) & off_diagonal]
print(f"Features with high positive correlation: {high_pos_corr}")

# Display features with high negative correlation
# The diagonal is never negative, so nothing needs to be excluded here; a
# perfect negative correlation (-1) is a valid result and is kept.
high_neg_corr = corr_matrix[corr_matrix < -0.7]
print(f"Features with high negative correlation: {high_neg_corr}")

# Display features with no correlation
# "No correlation" is taken as a coefficient strictly between -0.3 and 0.3.
no_corr = corr_matrix[(corr_matrix < 0.3) & (corr_matrix > -0.3)]
print(f"Features with no correlation: {no_corr}")

#10
import seaborn as sns

# Analyze skewness of features
# One histogram per numerical column to inspect the shape of its distribution.
for col in numerical_cols:
    # histplot replaces distplot, which is deprecated and scheduled for
    # removal from seaborn. Like distplot(kde=False) it draws a count
    # histogram without a density curve.
    sns.histplot(df[col], kde=False)
    plt.title(f"Distribution of {col}")
    plt.show()

# Display features with right skew, left skew, and no skew
# Skewness above 1 is reported as right skew, below -1 as left skew, and
# anything in [-1, 1] as no skew.
skewness = df[numerical_cols].skew()
right_skew = skewness[skewness > 1]
left_skew = skewness[skewness < -1]
no_skew = skewness[(skewness >= -1) & (skewness <= 1)]
print(f"Features with right skew: {right_skew.index}")
print(f"Features with left skew: {left_skew.index}")
print(f"Features with no skew: {no_skew.index}")

#11
# Univariate analysis of the categorical variables: one bar plot of value
# counts per categorical column
for col in categorical_cols:
    chart_title = f"Univariate analysis of {col}"
    sns.countplot(data=df, x=col)
    plt.title(chart_title)
    plt.show()

#12
# Univariate analysis of the continuous variables: for every numerical column
# draw a swarm plot first and then a violin plot, each in its own figure
for col in numerical_cols:
    for draw_plot, plot_label in (
        (sns.swarmplot, "Swarm Plot"),
        (sns.violinplot, "Violin Plot"),
    ):
        draw_plot(x=col, data=df)
        plt.title(f"Univariate analysis of {col} ({plot_label})")
        plt.show()

#13
# Relationship between two continuous variables: bmi plotted against age
scatter_ax = sns.scatterplot(data=df, x='age', y='bmi')
scatter_ax.set_title("Relationship between age and bmi")
plt.show()

#14
# Bivariate analysis of a continuous variable (charges) against a categorical
# variable (smoker), drawn as one box per category
box_title = "Bivariate analysis of smoker and charges"
sns.catplot(data=df, x='smoker', y='charges', kind='box')
# catplot creates its own figure; the title goes on the current axes
plt.title(box_title)
plt.show()

#15
# Number of observations in each category of the region column
region_ax = sns.countplot(data=df, x='region')
region_ax.set_title("Counts of observations for region")
plt.show()

#16
# Multivariate analysis: pairwise plots of all numerical columns, with a
# kernel density estimate on the diagonal
numeric_frame = df[numerical_cols]
sns.pairplot(numeric_frame, diag_kind='kde')
plt.show()

#Crafted With Love By Sam Naveenkumar .V