-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessor_lib.py
204 lines (163 loc) · 7.38 KB
/
preprocessor_lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python
# coding: utf-8
# Author: Mandis Beigi
# Copyright (c) 2022 Medidata Solutions, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import logging
from sas7bdat import SAS7BDAT
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import math
#checks whether a given float contains decimals
def contains_decimal(x):
    """Return True when ``x`` has a non-zero fractional part.

    Args:
        x: A real number accepted by ``math.modf``.

    Returns:
        bool: True for values such as ``1.5`` or ``-1.5``; False for
        integral values (``2``, ``-3.0``), infinities and NaN.
    """
    frac, _ = math.modf(x)
    # math.modf gives the fractional part the same sign as x, so test its
    # magnitude; a plain ``frac > 0`` would miss negative non-integers.
    # NaN still yields False because every comparison with NaN is False.
    return abs(frac) > 0
#checks whether the given vector of numbers contains floats, otherwise all are assumed to be ints
def contains_floats(x_vec):
    """Report whether any number in ``x_vec`` has a fractional part.

    Args:
        x_vec: Iterable of numbers; each one is passed to
            ``contains_decimal``.

    Returns:
        bool: True if at least one element is reported as fractional by
        ``contains_decimal``; False otherwise, including for an empty input.
    """
    # Build the full list first so that every element is inspected, rather
    # than stopping at the first fractional value.
    fractional_flags = [contains_decimal(value) for value in x_vec]
    return any(fractional_flags)
def get_boolean_and_nonboolean_columns(df):
    """Split the columns of ``df`` into boolean-like and other columns.

    A column is treated as boolean when every value it holds is 0 or 1, or
    when its name is a string containing ``'|'`` (the separator used for
    one-hot encoded columns in this module). A column holding NaN is not a
    subset of {0, 1}, so it only counts as boolean through the naming rule.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        tuple[list, list]: ``(boolean_columns, nonboolean_columns)``, both
        listed in the order the columns appear in ``df``.
    """
    zero_one = {0, 1}
    boolean_columns = []
    for c in df.columns:
        # Only string labels can carry the one-hot separator; guarding the
        # check keeps integer or tuple labels from raising TypeError.
        is_one_hot_name = isinstance(c, str) and '|' in c
        if set(df.loc[:, c]).issubset(zero_one) or is_one_hot_name:
            boolean_columns.append(c)

    # Preserve column order instead of relying on arbitrary set ordering.
    boolean_lookup = set(boolean_columns)
    nonboolean_columns = [c for c in df.columns if c not in boolean_lookup]
    return (boolean_columns, nonboolean_columns)
#LabelEncoding: converts a dataframe containing categorical values to numbers and returns the new dataframe as well as the dictionary of the mappings
def label_encoding_encode(df):
    """Replace every object-dtype column of ``df`` with integer codes.

    Example input:
        pd.DataFrame({'value': ["a", "b", "c", "a"], 'num': [1, 2, 3, 4]})

    Args:
        df: pandas DataFrame; it is copied, not modified.

    Returns:
        tuple: ``(encoded, mappings)`` where ``encoded`` is the copy with
        each object column replaced by its category codes, and ``mappings``
        maps each encoded column name to a ``{code: original value}`` dict.
    """
    encoded = df.copy()
    mappings = {}

    # Boolean mask marking which columns have dtype 'object'.
    is_object = (df.dtypes == 'object').to_numpy()
    for name in df.columns[is_object]:
        as_category = encoded[name].astype('category')
        mappings[name] = dict(enumerate(as_category.cat.categories))
        encoded[name] = as_category.cat.codes

    return (encoded, mappings)
#LabelEncoding: converts a dataframe containing numbers to categorical values using a mapping dictionary and returns a new dataframe
def label_encoding_decode(df_encoded, dicts):
    """Map label-encoded columns back to their original values.

    Args:
        df_encoded: pandas DataFrame holding encoded columns; it is copied,
            not modified.
        dicts: Mapping of column name to a ``{code: original value}`` dict,
            as produced by ``label_encoding_encode``.

    Returns:
        A new DataFrame in which every column named in ``dicts`` has been
        passed through ``Series.map`` with its dict; other columns are
        copied as they are.
    """
    decoded = df_encoded.copy()
    for col_name, code_to_label in dicts.items():
        decoded[col_name] = df_encoded[col_name].map(code_to_label)
    return decoded
def one_hot_encoding_encode(df):
    """One-hot encode ``df`` with ``pandas.get_dummies``.

    Generated indicator columns are named ``<column>|<value>``; the ``'|'``
    separator is what ``one_hot_encoding_decode`` splits on.

    Args:
        df: pandas DataFrame to encode.

    Returns:
        The DataFrame returned by ``pd.get_dummies(df, prefix_sep='|')``.
    """
    return pd.get_dummies(df, prefix_sep='|')
def one_hot_encoding_decode(df_dummies):
    """Collapse ``'|'``-separated indicator columns back into categoricals.

    Columns named ``<prefix>|<value>`` are grouped by ``<prefix>``; for each
    row the value whose indicator is largest becomes the category. Columns
    without ``'|'`` in their name are carried over unchanged.

    Args:
        df_dummies: pandas DataFrame as produced by
            ``one_hot_encoding_encode``, with string column names.

    Returns:
        A DataFrame on the same index as ``df_dummies``, holding the decoded
        categorical columns first and the carried-over columns after them.

    Note:
        ``np.argmax`` returns the first maximum, so a row whose indicators
        in a group are all zero decodes to that group's first category.
    """
    prefix_sep = '|'
    group_positions = defaultdict(list)   # prefix -> positions of its indicators
    group_labels = defaultdict(list)      # prefix -> category labels, same order
    passthrough_positions = []            # positions of non-indicator columns

    for i, c in enumerate(df_dummies.columns):
        if prefix_sep in c:
            k, v = c.split(prefix_sep, 1)
            group_positions[k].append(i)
            group_labels[k].append(v)
        else:
            passthrough_positions.append(i)

    passthrough = df_dummies.iloc[:, passthrough_positions]
    if not group_labels:
        # Nothing to decode: hand back the plain columns only.
        return passthrough.copy()

    # Build on the input's index. With a default RangeIndex here, attaching
    # the passthrough columns would align on mismatched labels and yield NaN.
    decoded = pd.DataFrame(
        {
            k: pd.Categorical.from_codes(
                np.argmax(df_dummies.iloc[:, group_positions[k]].values, axis=1),
                categories=group_labels[k],
            )
            for k in group_labels
        },
        index=df_dummies.index,
    )

    if not passthrough_positions:
        return decoded
    return pd.concat([decoded, passthrough], axis=1)
# This function takes a dataframe with categorical and ordinal columns and encodes those columns numerically
# It performs one-hot encoding for the categorical variables and label encoding for the ordinal variables
def encode_df(df, categorical_columns, ordinal_columns):
    """Encode the categorical and ordinal columns of ``df``.

    Categorical columns go through ``one_hot_encoding_encode`` and ordinal
    columns through ``label_encoding_encode``. Columns named in neither list
    are carried over unchanged.

    Args:
        df: pandas DataFrame to encode.
        categorical_columns: Names of the columns to one-hot encode.
        ordinal_columns: Names of the columns to label encode.

    Returns:
        tuple: ``(encoded_df, ord_dict)`` where ``encoded_df`` joins the
        untouched columns, the one-hot columns and the label-encoded columns
        (in that order), and ``ord_dict`` is the mapping returned by
        ``label_encoding_encode`` for use with ``decode_df``.
    """
    categorical_columns = list(categorical_columns)
    ordinal_columns = list(ordinal_columns)

    # Everything that is neither categorical nor ordinal, in original column
    # order. (The previous set arithmetic referenced an undefined name and
    # raised NameError on every call.)
    encoded_names = set(categorical_columns) | set(ordinal_columns)
    other_columns = [c for c in df.columns if c not in encoded_names]

    cat_df_encoded = one_hot_encoding_encode(df[categorical_columns])
    ord_df_encoded, ord_dict = label_encoding_encode(df[ordinal_columns])

    encoded_df = df[other_columns].join(cat_df_encoded)
    encoded_df = encoded_df.join(ord_df_encoded)
    return (encoded_df, ord_dict)
# This function converts back an encoded dataframe to the original categorical and ordinal columns
def decode_df(df, ord_dict):
    """Undo ``encode_df``: restore ordinal labels, then collapse one-hot columns.

    Args:
        df: Encoded pandas DataFrame.
        ord_dict: Label mapping passed on to ``label_encoding_decode``.

    Returns:
        The DataFrame produced by applying ``label_encoding_decode`` and
        then ``one_hot_encoding_decode``.
    """
    labels_restored = label_encoding_decode(df, ord_dict)
    return one_hot_encoding_decode(labels_restored)
#fill the missing data with new values not existing in the column
#this is used to determine the column correlations
def impute_label_encoded_df(df):
    """Fill missing values in each column with a value not already present.

    The fill value for a column is chosen from its distinct non-missing
    values:
      * two or more values: the largest plus the gap between the two largest;
      * exactly one value ``v``: ``2 * v``, or ``1`` when ``v`` is 0;
      * no values at all: ``0``.

    Args:
        df: pandas DataFrame of numeric columns. It is modified in place.

    Returns:
        The same ``df`` object, with missing values filled.
    """
    for column in df.columns:
        # Drop missing values BEFORE sorting: NaN compares False against
        # everything, so sorting a list that contains it leaves the other
        # values in an unreliable order and can produce a fill value that
        # already exists in the column.
        observed = sorted(df[column].dropna().unique())

        if len(observed) >= 2:
            fill_val = observed[-1] + (observed[-1] - observed[-2])
        elif len(observed) == 1:
            fill_val = 2 * observed[0] if observed[0] != 0 else 1
        else:
            fill_val = 0

        df[column] = df[column].fillna(fill_val)
    return df
#impute the missing values of boolean columns with the most frequent value and
#impute the missing values of the non-boolean columns with the median
def impute_one_hot_encoded_df(df):
    """Impute missing values in a one-hot encoded DataFrame.

    Columns classified as boolean by ``get_boolean_and_nonboolean_columns``
    are filled with their most frequent value (via ``SimpleImputer``); all
    other columns are filled with their median.

    Args:
        df: pandas DataFrame to impute; it is not modified.

    Returns:
        A new DataFrame with the same column order and index as ``df``.
        A frame without columns is returned as a copy.
    """
    boolean_columns, nonboolean_columns = get_boolean_and_nonboolean_columns(df)

    # Each imputed part is re-indexed 0..n-1 so the parts line up by position
    # when concatenated; the original index is restored at the end.
    parts = []

    if nonboolean_columns:
        nonboolean_df = df[nonboolean_columns]
        imputed_nonboolean_df = nonboolean_df.fillna(nonboolean_df.median())
        parts.append(imputed_nonboolean_df.reset_index(drop=True))

    if boolean_columns:
        boolean_df = df[boolean_columns]
        imp_most_freq = SimpleImputer(strategy='most_frequent')
        imp_most_freq.fit(boolean_df)
        imputed_boolean_df = pd.DataFrame(imp_most_freq.transform(boolean_df))
        imputed_boolean_df.columns = boolean_df.columns
        parts.append(imputed_boolean_df.reset_index(drop=True))

    if not parts:
        # No columns at all: nothing to impute. Previously this path hit an
        # unbound local variable.
        return df.copy()

    imputed_df = parts[0] if len(parts) == 1 else pd.concat(parts, axis=1)

    # Restore the caller's column order and row index.
    imputed_df = imputed_df.reindex(columns=df.columns)
    imputed_df = imputed_df.set_index(df.index)
    return imputed_df
def iterative_impute(df):
    """Impute missing values with scikit-learn's ``IterativeImputer``.

    Args:
        df: pandas DataFrame to impute; it is not modified.

    Returns:
        A new DataFrame built from the imputer's output, carrying the
        column names and index of ``df``.
    """
    logging.info("Iterative imputing the data")

    model = IterativeImputer()
    model.fit(df)

    # transform() returns a bare array, so re-attach the labels.
    filled = pd.DataFrame(model.transform(df), columns=df.columns)
    filled = filled.reindex(columns=df.columns)
    return filled.set_index(df.index)