-
Notifications
You must be signed in to change notification settings - Fork 0
/
Feature_Engg_Clean.py
125 lines (72 loc) · 3.11 KB
/
Feature_Engg_Clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import date
WBK = 'KPMG_Data.xlsx'
Book = pd.read_excel(WBK, sheet_name=None)
print(Book.keys())
Demo_DF = pd.read_excel(WBK, sheet_name="CustomerDemographic")
Addr_DF= pd.read_excel(WBK, sheet_name="CustomerAddress")
Tranx_DF= pd.read_excel(WBK, sheet_name="Transactions")
New_Cust_DF = pd.read_excel(WBK, sheet_name="NewCustomerList")
Demo_DF.head(3)
def col_name_format(*df):
DF_ls = []
for c in df:
new_col = c.iloc[0]
c = c[1:]
c.columns = new_col
DF_ls.append(c)
return DF_ls
Demo_DF, Addr_DF, Tranx_DF, New_Cust_DF = col_name_format(Demo_DF, Addr_DF, Tranx_DF, New_Cust_DF)
Demo_DF = Demo_DF.merge(Addr_DF, how = "left", on="customer_id")
Demo_DF.where(Demo_DF.notnull(), np.nan)
def nan_pcent(df):
points = df.shape[0]
for col in df.columns:
print("* {} has {}% nan values\n".format(col,((df[col].isna().sum())/points)*100))
print("Demo_DF shape: {}".format(Demo_DF.shape))
print()
nan_pcent(Demo_DF)
Demo_DF.drop(columns=["first_name", "last_name", "default", "deceased_indicator", "address", "country"], inplace=True)
Demo_DF["gender"] = Demo_DF["gender"].apply(lambda x: "Female" if x=="F" else "Female" if x=="Femal" else "Male"
if x=="M" else "NA")
print(list(set(Demo_DF["gender"])))
def age_calculation(dob):
dob = pd.to_datetime(dob)
today = date.today()
return (today.year - 3) - dob.year
Demo_DF['Age'] = Demo_DF['DOB'].apply(age_calculation)
Demo_DF['Age'] = Demo_DF['Age'].replace(np.nan, "NA")
Demo_DF['Age'] = Demo_DF['Age'].apply(lambda x: int(x) if isinstance(x, float) else "NA")
print(set(Demo_DF["Age"].to_list()))
Demo_DF.drop(columns="DOB",inplace=True)
print("Tranx_DF shape: {}".format(Tranx_DF.shape))
print()
nan_pcent(Tranx_DF)
Tranx_DF.drop(columns=["transaction_date", "online_order", "order_status", "product_id", "transaction_id",
"list_price", "standard_cost", "product_first_sold_date"], inplace=True)
Tranx_DF.head()
def prod_class(*trnx_df):
df = Tranx_DF
for column in trnx_df:
print("Col \'{}\' items: {}".format(column, set(df[column].to_list())))
print()
prod_class("brand", "product_line", "product_class", "product_size")
Demo_DF_old = Demo_DF
New_Cust_DF['Age'] = New_Cust_DF['DOB'].apply(age_calculation)
New_Cust_DF['Age'] = New_Cust_DF['Age'].replace(np.nan, "NA")
New_Cust_DF['Age'] = New_Cust_DF['Age'].apply(lambda x: int(x) if isinstance(x, float) else "NA")
print(set(New_Cust_DF["Age"].to_list()))
print([col for col in New_Cust_DF.columns.to_list()])
print()
New_Cust_DF = New_Cust_DF.loc[:, New_Cust_DF.columns.notnull()]
New_Cust_DF["customer_id"] = [Demo_DF.shape[0]+1+i for i in range(len(New_Cust_DF))]
New_Cust_DF = New_Cust_DF[[col for col in Demo_DF.columns.to_list()]]
New_Cust_DF = Demo_DF.append(New_Cust_DF, ignore_index = True)
New_Cust_DF
Demo_DF_old = Demo_DF_old.merge(Tranx_DF, how="inner", on="customer_id")
Demo_DF_old.head()
# *By:* `Adipta Biwas`