-
Notifications
You must be signed in to change notification settings - Fork 2
/
data_cleaning.py
65 lines (49 loc) · 1.65 KB
/
data_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import pandas as pd
import numpy as np
"""
This is a generalized data-cleaning script that can work with any input dataset.
"""
# Method to Drop Multiple columns
def drop_columns(col_names, df):
    """
    Remove several columns from the dataset in place.

    Input -> (List of column names, dataset)
    Output -> the same dataset object, without the listed columns
    """
    # `columns=` is the keyword spelling of dropping labels along axis 1.
    df.drop(columns=col_names, inplace=True)
    return df
# Method to change datatypes to save memory
def change_dtypes(col_int, col_float, df):
    """
    Cast columns of the dataset to 32-bit dtypes, modifying the dataset.

    Input -> (column with integer, column with float, dataset)

    The integer column is cast to 'int32' and the float column to 'float32'.
    """
    # Integer column first, then the float column.
    for column, target_dtype in ((col_int, 'int32'), (col_float, 'float32')):
        df[column] = df[column].astype(target_dtype)
# Change categorical to numerical
def change_cat_to_num(df, num_encode=None):
    """
    Replace categorical labels with numeric codes, modifying the dataset.

    Input -> dataset, optional nested mapping {column: {label: code}}

    When `num_encode` is omitted, the original hard-coded mapping is used:
    'POSITIVE' -> 1 and 'NEGATIVE' -> 0 in column 'col_1'.
    """
    if num_encode is None:
        # Default kept so existing callers that pass only the dataset
        # get exactly the mapping they had before.
        num_encode = {'col_1': {'POSITIVE': 1, 'NEGATIVE': 0}}
    df.replace(num_encode, inplace=True)
# Checking for missing data
def check_missing_data(df):
    """
    Count the missing values in each column of the dataset.

    Input -> dataset
    Output -> per-column count of null entries, largest count first
    """
    null_counts = df.isna().sum()
    return null_counts.sort_values(ascending=False)
# Removing strings in columns to avoid errors
def remove_col_str(df, column='col_1'):
    """
    Strip unwanted text from a string column of the dataset.

    Input -> dataset, column name (defaults to 'col_1')

    Two clean-ups are applied, in order:
      1. every newline character is removed;
      2. everything from ' &#' (space, ampersand, hash) onwards is removed.
    The cleaned column is written back to the dataset.
    """
    # Assign the result back instead of calling replace(..., inplace=True) on
    # df[column]: that chained in-place call acts on a temporary Series and
    # leaves the dataset untouched when pandas Copy-on-Write is active.
    cleaned = df[column].replace('\n', '', regex=True)
    # remove all the characters after &# (including &#)
    cleaned = cleaned.replace(' &#.*', '', regex=True)
    df[column] = cleaned
# Remove whitespaces in columns
def remove_spaces(df, column):
    """
    Strip leading whitespace from a string column, modifying the dataset.

    Input -> dataset, column
    """
    # Strip the spaces from the beginning of each string. Series.str.lstrip()
    # returns a new Series, so it must be stored back; otherwise the stripped
    # values are discarded and the dataset stays unchanged.
    df[column] = df[column].str.lstrip()
# Manipulating timestamps - convert string to datetime format
def convert_str_datetime(df):
    """
    Parse the 'transdate' strings and add them as a datetime column.

    Input -> dataset

    The parsed values are stored in a new column named 'timestamp', inserted
    at position 2 (i.e. as the third column) of the dataset.
    """
    parsed = pd.to_datetime(df.transdate, format='%Y-%m-%d %H:%M:%S.%f')
    df.insert(loc=2, column='timestamp', value=parsed)