-
Notifications
You must be signed in to change notification settings - Fork 7
/
Outlier Treatment
41 lines (32 loc) · 1.49 KB
/
Outlier Treatment
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from collections import Counter
# Outlier detection
def detect_outliers(train_data, n, features):
    """
    Return the index labels of rows that are outliers in more than n features.

    A value is an outlier for a feature when it lies outside Tukey's fences,
    i.e. below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR, where Q1 and Q3 are the
    25th and 75th percentiles of that feature and IQR = Q3 - Q1.

    Missing values (NaN) are ignored when the quartiles are computed and are
    never flagged themselves, so one missing entry no longer switches off
    detection for the whole column.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Data to scan; must contain every column named in ``features``.
    n : int
        Threshold: a row is reported only when it is an outlier in strictly
        more than ``n`` of the given features.
    features : iterable of column labels
        Numeric columns to examine.

    Returns
    -------
    list
        Index labels of the qualifying rows, in order of first detection.
    """
    outlier_indices = []

    # iterate over features (columns)
    for col in features:
        column = train_data[col]

        # Series.quantile skips NaN, unlike np.percentile which would return
        # NaN for the whole column and make both fences unusable.
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)

        # Tukey's step: 1.5 times the interquartile range
        outlier_step = 1.5 * (q3 - q1)

        # Comparisons against NaN are False, so missing cells are not flagged.
        is_outlier = (column < q1 - outlier_step) | (column > q3 + outlier_step)

        # collect the index labels of this feature's outliers
        outlier_indices.extend(column[is_outlier].index)

    # keep the rows flagged in more than n features
    hits_per_row = Counter(outlier_indices)
    return [label for label, hits in hits_per_row.items() if hits > n]
# Flag rows that are Tukey outliers in more than 2 of the listed columns
Outliers_to_drop = detect_outliers(
    df,
    2,
    ["Views", "Comments", "Likes", "Popularity", "Followers", "Age"],
)
# Look up the flagged rows (bare expression: only visible when evaluated interactively)
df.loc[Outliers_to_drop]
# Remove the flagged rows and renumber the index from zero
df = df.drop(index=Outliers_to_drop).reset_index(drop=True)