-
Notifications
You must be signed in to change notification settings - Fork 70
/
ds_common_functions.py
90 lines (62 loc) · 2.11 KB
/
ds_common_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 15 12:52:51 2020
@author: Ken
"""
# import exploration files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
file_path = 'path_to_data.csv'
# read in data
data = pd.read_csv(file_path)
##############################################################################
#Data Exploration
##############################################################################
#rows and columns returns (rows, columns)
data.shape
#returns the first x number of rows when head(num). Without a number it returns 5
data.head()
#returns the last x number of rows when tail(num). Without a number it returns 5
data.tail()
#returns an object with all of the column headers
data.columns
#basic information on all columns
data.info()
#gives basic statistics on numeric columns
data.describe()
#shows what type the data was read in as (float, int, string, bool, etc.)
data.dtypes
#shows which values are null
data.isnull()
#shows which columns have null values
data.isnull().any()
#shows for each column the percentage of null values
data.isnull().sum() / data.shape[0]
#plot histograms for all numeric columns
data.hist()
##############################################################################
#Data Manipulation
##############################################################################
# rename columns
data.rename(index=str columns={'col_oldname':'col_newname'})
# view all rows for one column
data.col_name
data['col_name']
# multiple columns by name
data[['col1','col2']]
data.loc[:['col1','col2']]
#columns by index
data.iloc[:,[0:2]]
# drop columns
data.drop('colname', axis =1) #add inplace = True to do save over current dataframe
#drop multiple
data.drop(['col1','col2'], axis =1)
#lambda function
data.apply(lambda x: x.colname**2, axis =1)
# pivot table
pd.pivot_table(data, index = 'col_name', values = 'col2', columns = 'col3')
# merge == JOIN in SQL
pd.merge(data1, data2, how = 'inner' , on = 'col1')
# write to csv
data.to_csv('data_out.csv')