import pandas as pd
import numpy as np
# Load a CSV into a DataFrame (placeholder path), inspect the column dtypes,
# then write it back out without the index column.
df = pd.read_csv("...file_name.csv")
df.dtypes
df.to_csv("...file_name.csv", index=False)
Navigate to the directory containing the .tar.gz file from your command prompt and enter this command:
pip install my-tarball-file-name.tar.gz
# Cast a column to string dtype.
df["Notes"] = df["Notes"].astype(str)
Can also use "int" and "float"
# Lower-case every value in the column (fails on non-string/NaN values).
df["Notes"] = df["Notes"].map(str.lower)
Can also use .upper
# Strip punctuation (anything that is not a word character or whitespace).
# Raw strings (r'...') avoid invalid-escape warnings in Python 3; regex=True
# is explicit because pandas 2.0 changed str.replace's default to regex=False.
geoCodeAdd["STREET_NAME"] = geoCodeAdd["STREET_NAME"].str.replace(r'[^\w\s]', '', regex=True)
# Remove all digits from the country names.
df['Country'] = df['Country'].str.replace(r'\d+', '', regex=True)
# Pull the first run of digits out of the license number.
df['LICENSE_NO'] = df['LICENSE_NO'].str.extract(r'(\d+)')
# Everything from the first space to the end = the street part of the address.
df['street'] = df['address'].str.extract(r'(\s.+$)')
# Flag rows whose department mentions "Enforcement" (case-insensitive, NaN-safe).
d['Type'] = np.where(d['Inspector Dept'].str.contains("Enforcement", case=False, na=False), 'Enforcement', 'Development')
# First whitespace-separated word of the department string.
d["Inspector Department"] = d['Inspector Dept'].str.split().str[0]
from difflib import SequenceMatcher as SM

# Fuzzy similarity of two address strings. ratio() returns 2*M/T in [0, 1],
# where M is the number of matched characters and T the combined length.
s1 = "3243 IRWIN AVENUE BRONX"
s2 = "3253 IRWIN AVENUE BRONX"
matcher = SM(None, s1, s2)
ratio = matcher.ratio()
# Order-insensitive name comparison: two names with the same word tokens are
# treated as the same person. (Py2 print statements converted to Py3 print(),
# and the branch bodies indented — the original had them at column 0.)
s1 = 'Alphonso Marshall'
s2 = 'Marshall Alphonso'
set1 = set(s1.split(' '))
set2 = set(s2.split(' '))
if set1 == set2:
    print("True, same person")
else:
    print("False, not same person")
import re
# Split a CamelCase string at capital letters: 'JohnHamilton' -> ['John', 'Hamilton'].
# The original discarded findall's return value; capture it so it can be used.
a = 'JohnHamilton'
parts = re.findall('[A-Z][^A-Z]*', a)
# First four characters of the raw date string = year
# (assumes the raw value starts with YYYY — TODO confirm).
df['Initial File Year'] = df['Initial File Date Raw'].apply(lambda x: x[:4])
# Trim leading/trailing digit characters from last names.
df['Last Name'] = df['Last Name'].map(lambda x: x.strip('0123456789'))
# Trim leading +/- signs and trailing a/A/b/B/c/C characters.
df['result'] = df['result'].map(lambda x: x.lstrip('+-').rstrip('aAbBcC'))
# Trim surrounding whitespace (fails on non-string/NaN values).
df["Address"] = df["Address"].map(str.strip)
# Cast every column to string and strip surrounding whitespace in one pass.
# (The original iterated the same column list twice with range(len) loops.)
col_names = sorted(list(df.columns.values))
for col in col_names:
    df[col] = df[col].astype(str).str.strip()
# Strip whitespace from the column names themselves.
dfn = dfn.rename(columns=lambda x: x.strip())
line = "59-35"
import re
# str.replace is simpler than a regex for one literal character;
# re.sub('[-]', '', line) is the regex equivalent shown in the original note.
line = line.replace('-', '')
Output: '5935' (a string, not the integer 5935)
# Pull out the whitespace-separated integer tokens ('h3110' and '444.4' are
# skipped because isdigit() is False for them). The source variable is renamed
# from 'str', which shadowed the builtin str type.
text = "h3110 23 cat 444.4 rabbit 11 2 dog"
str2 = [int(s) for s in text.split() if s.isdigit()]
output: [23, 11, 2]
# Keep only the digit characters of the string: '132ND' -> '132'.
st = '132ND'
st = ''.join(filter(str.isdigit, st))
output: '132' (still a string)
# Drop any non-ASCII characters from the names. regex=True is required for the
# character class to be treated as a pattern (pandas 2.0 defaults to regex=False).
dfstaff["Name"] = dfstaff["Name"].str.replace(r'[^\x00-\x7F]', '', regex=True)
# Drop the trailing token (the ZIP code) by splitting once from the right.
# The original discarded the result; capture it so it can be used.
text = '272-276 Broadway New York NY 10007'
address_no_zip = text.rsplit(' ', 1)[0]
import re
# Keep the items of text8 that contain an @handle-like token
# (text8 is defined elsewhere; presumably a list of words or tweets — TODO confirm).
[w for w in text8 if re.search('@[A-Za-z0-9_]+', w)]
[]: matches a set of characters, [a-z]: matches a character in the range a to z, [^abc]: matches a character not a, b, or c, (a|b): matches a or b where a and b are strings (note: inside square brackets the '|' is just a literal character, not alternation)
# Distinct values of the column. Note: '.unique' without parentheses only
# returns the bound method object — it must be called.
df["Variable"].unique()
# Null counts: whole frame, a single column, and the frame again.
df.isnull().sum()
df["VEHICLE_NUMBER"].isnull().sum()
df.isnull().sum()
# Drop a leftover merge column. Keyword form: the positional axis argument
# (df.drop('First Name_x', 1)) was removed in modern pandas.
df = df.drop(columns='First Name_x')
df = df.drop_duplicates(['Group2']).reset_index(drop=True)
# Note: Always reset the index after dropping rows.
df = df.drop_duplicates(subset=['Address', 'Boro']).reset_index(drop=True)
# The line above drops rows where address and borough are both duplicated.
# Drop duplicates based on another column: keep only the rows where the value
# in INSPECTION_ID equals the value in INSPECTION_ID.1.
dn = dn[dn['INSPECTION_ID'] == dn["INSPECTION_ID.1"]]
df = df.reset_index(drop=True)
# Rename columns to snake_case-friendly names.
df = df.rename(columns={'Org.Level.4': 'org_level_4', 'Org.Level.4.Number': 'org_level_4_number'})
# NOTE(review): this replaces the substring 'BRO' anywhere in the value, so a
# value already equal to 'BROOKLYN' would become 'BROOKLYNOKLYN' — confirm the
# column only ever holds the abbreviation before running this.
dfn['Boro'] = dfn['Boro'].str.replace('BRO', 'BROOKLYN')
# Map borough names to their numeric codes.
df['Boro'] = df.borough.replace({'Manhattan':1,'Bronx':2,'Brooklyn':3,'Queens':4,'Staten Island':5})
# Replace NaN with 0 everywhere.
df = df.fillna(0)
# Replace the literal string "None" with an empty string.
d = d.replace(["None"], [""])
# Encode an ordinal rating scale as 1..5.
df = df.replace(['very bad', 'bad', 'poor', 'good', 'very good'], [1, 2, 3, 4, 5])
# Keep only the rows where BORO is not 0.
df = df[df["BORO"] != 0]
keeps only the rows where BORO is not 0 (i.e. drops all rows with 0)
# Idiomatic negation: use ~ instead of comparing to False. na=True preserves
# the original behavior of also dropping rows whose Address is NaN
# (NaN == False was falsy, so those rows were excluded too).
df2 = df2[~df2.Address.str.contains("WEST END AVE", na=True)]
drops every row whose Address contains "WEST END AVE"
# Keep rows where 'Distressed' is not null.
df = df[pd.notnull(df['Distressed'])]
df = df.dropna()
this drops all rows with NaN
# Drop rows only when BIN is missing.
df = df.dropna(subset = ['BIN'])
# Keep rows with a finite (non-NaN, non-inf) house number; requires a numeric column.
df = df[np.isfinite(df['House Number'])]
# Move the last column to the front.
cols = nodesGephi.columns.tolist()
cols = cols[-1:] + cols[:-1]
nodesGephi = nodesGephi[cols]
# Inner join on all shared column names, then join a third frame on the
# differently-named full-name key columns.
merged = pd.merge(w1, w2)
merged_all = pd.merge(merged, w3, left_on = "FULL NAME1", right_on = "FULL NAME2" )
# Stack two frames vertically.
frames = [df1, df2]
result = pd.concat(frames)
# Skeleton geocoding frame pre-filled with two placeholder rows of 'NA'.
placeholder_cols = ('BIN', 'Boro Code', 'Boro', 'House Number', 'Street Name', 'Address', 'Latitude', 'Longitude')
qn = pd.DataFrame(columns=placeholder_cols)
qn.loc[0] = ['NA'] * 8
qn.loc[1] = ['NA'] * 8
Note: Number of columns must match
# Transpose rows and columns.
df = df.T
# Silence SettingWithCopy warnings. Use the full option path — the short
# 'chained_assignment' only works via pandas' fragile substring matching.
pd.set_option('mode.chained_assignment', None)
# NOTE(review): the boolean mask is built from `geoCode` but applied to
# `geoCodeCheck` — if those are different frames the rows will misalign;
# confirm this is not a typo.
geoCodeCheck = geoCodeCheck[geoCode['Street Name'] == 'knickerbocker avenue']
# Bounding-box style mask on coordinate columns.
mask = (dfList['XCoord'] >= xy2[0]) & (dfList['YCoord'] <= xy1[0])
dfList_subset = dfList.loc[mask]
# Drop rows whose Address contains the substring.
df2 = df2[df2.Address.str.contains("WEST END AVE") == False]
# Closing price between 99 and 101 inclusive.
df = df[(df['closing_price'] >= 99) & (df['closing_price'] <= 101)]
# Keep names in the allow-list; drop BINs in the block-list.
df = df[df['Name'].isin(nameList)]
dpsub = dp[~dp['BIN'].isin(binList)]
# Keep rows where the two inspection-id columns agree.
df2 = df[df['INSPECTION_ID'] == df["INSPECTION_ID.1"]]
# Keep only countries with at least one gold medal.
only_gold = df[df['Gold'] > 0]
Keeps only the rows with Gold > 0; to instead replace the non-gold rows with NaN (same shape, masked values), use df.where(df['Gold'] > 0)
# Sort by risk ranking, highest first (drop=False keeps the old index as a column).
export = export.sort_values(by = 'High Risk Ranking', ascending=False).reset_index(drop=False)
sorted(ratio_list, reverse=True)
# Rows whose NODEID occurs more than once, sorted so duplicates sit together.
ids = dfInt["NODEID"]
dfInt2 = dfInt[ids.isin(ids[ids.duplicated()])].sort_values(by="NODEID")
# Sort a list of tuples by their second element, descending.
from operator import itemgetter
ratio_list = sorted(ratio_list, key=itemgetter(1), reverse=True)
# Alphabetize the columns. reindex_axis was removed in pandas 1.0 — use reindex.
df = df.reindex(sorted(df.columns), axis=1)
# Build a frame from a list of records with explicit column names.
df = pd.DataFrame(dfList, columns=['BIN', 'Boro', 'Address'])
# Column -> list without an index loop (replaces the original
# range(len(df2)) append loop, which is both slower and noisier).
list_block = df2["Block"].tolist()
dateList = df2['Date'].tolist()
# Unique block values.
set_block = set(list_block)
# Drop any string that contains a digit.
text = [x for x in text if not any(c.isdigit() for c in x)]
# sorted() on a set returns an ordered list.
set_block = sorted(set_block)
# Flag each row of column 'a' that mentions any of the keywords by joining
# them into a single regex alternation ('dog|cat|fish').
frame = pd.DataFrame({'a' : ['the cat is blue', 'the sky is green', 'the dog is black']})
mylist = ['dog', 'cat', 'fish']
pattern = '|'.join(mylist)
frame["TF"] = frame.a.str.contains(pattern)
a=[1,"",3,1,3,2,1,1]
# Remove falsy entries. In Py3, filter returns a lazy iterator, so materialize
# it with list(). (In the original notes this line appeared BEFORE `a` was
# defined, which would raise NameError — reordered here.)
a = list(filter(None, a))
# Replace empty strings with "NA" (a no-op after the filter above).
a = ["NA" if x=="" else x for x in a]
# Elements common to both lists (order not preserved).
x = list(set(list1).intersection(list2))
# Translate each word through nameDic, leaving unknown words unchanged.
words = flist
final_string = ', '.join(str(nameDic.get(word, word)) for word in words)
# Longest strings first.
matchList.sort(key = lambda s: len(s), reverse=True )
# Drop single-space placeholder entries.
name_list = ['ALEX PERA', ' ', 'VAL TOL']
name_list = [x for x in name_list if x != ' ']
# Flatten a list of lists.
oList = sum(oList, [])
people = ['Dr. Christopher Brooks', 'Dr. Kevyn Collins-Thompson', 'Dr. VG Vinod Vydiswaran', 'Dr. Daniel Romero']

def split_title_and_name(person):
    """Return '<title> <last-name>' from a full name, e.g. 'Dr. Brooks'."""
    parts = person.split()  # split once instead of twice
    return parts[0] + " " + parts[-1]

# Capture the mapped result (the original built the list and discarded it).
shortened = list(map(split_title_and_name, people))
# Binary target: 1 if waited at least 30 minutes (1800 seconds), else 0.
df3['Target'] = df.WAITING_TIME.map( lambda x: 1 if x >= 1800 else 0)
# The multiplication signs were evidently eaten by markdown italics in the
# original ('a3 + b2 + c' are undefined names); restore a*3 + b*2 + c and use
# a Py3 print() call.
my_function = lambda a, b, c: a * 3 + b * 2 + c
print(my_function(2, 3, 4))
First create dummy variables using the categorical variable in the dataframe, and then concatenate them to the original dataframe
# Keep just the key column and the categorical column.
df = df[["Permit Number", "PermitType"]]
# One-hot encode PermitType; columns come out as 'PermitType_<value>'.
jobtype_dummies = pd.get_dummies(df.PermitType, prefix='PermitType')
df = pd.concat([df, jobtype_dummies], axis=1)
# All products x*y for x, y in 0..9: nested-loop version first, then the
# equivalent one-line list comprehension (which overwrites lst).
lst = []
for x in range(10):
    for y in range(10):
        lst.append(x * y)
lst = [x * y for x in range(10) for y in range(10)]
all_data = pd.DataFrame()
# DataFrame.append was removed in pandas 2.0 — pd.concat is the replacement.
all_data = pd.concat([all_data, df], ignore_index=False)
# Split a comma-separated column into multiple columns (one per piece).
location_df = df['h_no'].apply(lambda x: pd.Series(x.split(',')))
Create a list of matching addresses in the dataframes:
# Addresses present in both frames.
common_cols = list(set(df14.Address) & set(dfmn.Address))
# Vectorized 0/1 flag: 1 where the address is shared. This replaces the
# original's duplicated zero-init line and its chained-assignment loop
# (dfmn["Corner"][i] = 1), which triggers SettingWithCopyWarning and is slow.
dfmn["Corner"] = dfmn["Address"].isin(common_cols).astype(int)
# Per-ID sum broadcast back onto every row of that ID's group.
lv['Correct Duration Sum'] = lv['Correct Duration'].groupby(lv['CityTime ID']).transform('sum')
# Sum per (BIN, Year), then flatten the index and tag the aggregated columns.
qnG = qn.groupby(['BIN', 'Year']).sum()
qnG = qnG.add_suffix('_Count').reset_index()
Groupby using a lambda; weight and quantity are columns in the dataframe — this sums up the products of weight x quantity per group
# Per-category sum of Weight x Quantity. The original line had an extra
# closing parenthesis (a syntax error). The trailing arguments after the
# lambda are forwarded into it by GroupBy.apply; the lambda's frame parameter
# is renamed to g so it doesn't shadow the outer df.
dfG = df.groupby('Category').apply(lambda g, a, b: sum(g[a] * g[b]), 'Weight (oz.)', 'Quantity')
# Several named aggregations per BIN, with the count column renamed for clarity.
dfG = (df.groupby('BIN_Number').agg({'Job_Number':'count', 'AHV_Grants': 'sum', 'Initial_Da':'sum', 'Additional':'sum'}).reset_index().rename(columns={'Job_Number':'Job_Number_count'}) )
# Distinct visit count per date.
df3 = df.groupby("Date").agg({"VISIT_KEY": pd.Series.nunique})
Groupby and sum strings in row, this first creates a series and then needs to be converted to dataframe:
# Join all titles per name into one '{...}' string (a Series), then back to a frame.
f = rank_title.groupby('Name')['Title'].apply(lambda x: "{%s}" % ', '.join(x))
X = pd.DataFrame(f)
X = X.reset_index(drop=False)
# Latest C-of-O issue date per BIN.
coG = coG.groupby(['BIN Number'], sort=False)['C of O Issue Date'].max()
# Parse the column into pandas datetimes.
df['DATE_SCHEDULED'] = pd.to_datetime(df['DATE_SCHEDULED'])
# Build the string '1-1-2016' and parse it
# (presumably month-day-year — TODO confirm the intended order).
start_date = str(1) + '-' + str(1) + '-' + str(2016)
from dateutil import parser
start_date = parser.parse(start_date)
start_date = start_date.date()
# Reduce full timestamps to just the date / just the time.
db["DATE_INSPECTION"] = db["DATE_INSPECTION"].apply(lambda x: x.date())
df['Inspection Time'] = df["Inspection Time"].apply( lambda d : d.time() )
# Whole-day difference between two datetime columns.
df['Difference'] = (df["hearing_date"] - df["ticket_issued_date"]).dt.days
OR
# Timedelta between two column values; .days is already an int, the extra
# int() cast is defensive.
d = df2["Variance End Date"][j] - df2["Variance Start Date"][j]
diff = d.days
diff = int(diff)
# Hours between login and crash. Converted from a Py2 print statement.
# NOTE(review): .seconds ignores whole days (use .total_seconds() if spans can
# exceed 24h), and `d` here is indexed like a DataFrame while above it was a
# Timedelta — these are separate snippets; confirm before combining.
print(pd.Timedelta(d["Next_Login"][0] - d["Crash_Time_EST"][0]).seconds / 3600.0)
# Bucket hour-of-day into four labelled sessions.
df2 = df2.assign(session=pd.cut(df2.DateTime.dt.hour,[0,6,12,18,24],labels=['Night','Morning','Afternoon','Evening']))
# ISO week number of the date.
df["Week Number"][i] = datetime.date(year, month, day).isocalendar()[1]
# Work-day window between a start and an end timestamp.
start_time = str(lv["Start Time"][i].month) + '-' + str(lv["Start Time"][i].day) + '-' + str(lv["Start Time"][i].year)
end_time = str(lv["End Time"][i].month) + '-' + str(lv["End Time"][i].day) + '-' + str(lv["End Time"][i].year)
mask = (wd['workdays'] >= start_time) & (wd['workdays'] <= end_time)
wd_subset = wd.loc[mask]
from datetime import datetime, timedelta
# Py3 print() (the str() is redundant inside print but harmless).
print(str(datetime.now()))
insp_date = df["DATE_INSPECTION"][0]
# +/- 2-day window around the inspection date. After `from datetime import
# datetime`, the name `datetime` is the class, so the original
# `datetime.timedelta(...)` would raise AttributeError — import timedelta
# directly instead.
d = timedelta(days = 2)
minus2 = insp_date - d
plus2 = insp_date + d
mask = (all_data2['Date'] >= minus2) & (all_data2['Date'] <= plus2)
dfpm2 = all_data2.loc[mask]
# Column total / minimum.
sum_trans = df3["Transaction Amount"].sum()
df2["EucDistance"].min()
# Summary statistics, transposed so each variable is a row.
df.describe().transpose()
dr.dtypes
# Scale every float column up by 100 in place, then round to 2 decimals.
dr[dr.select_dtypes(include=['float']).columns] *= 100
dr = dr.round(2)
# bin -> wall-area lookup dict built from two columns.
dwsDic = dwsG.set_index('bin')['WallArea SqFt'].to_dict()
Mapping dictionaries (insert value of dictionary) into dataframe, lat is dataframe where key is the address
# map() looks each address up in the `lat` dict; missing keys become NaN.
df["lat"] = df["address"].map(lat)
di = {1: "A", 2: "B"}
# replace() with a nested dict targets only column 'col1'.
df = df.replace({"col1": di})
Creating flag using dictionaries, this creates a Y/N flag where Y is in flag if job number in column, N if not:
dic_s = {'7398378':'Y', '7398310':'Y'}
# Unmatched job numbers map to NaN, which fillna then turns into 'N'.
df['Flag'] = df['Job Number'].map(dic_s).fillna('N')
# Look each address up in the d2 dict, reporting any missing keys.
# Converted from Py2: print statements -> print() calls, and
# `except KeyError, e` -> `except KeyError as e`; loop body indented
# (the original had it at column 0).
for i in range(0, len(dfdn)):
    print("terracot address", dfdn["AddressMatch"][i])
    try:
        print("dic list", d2[dfdn["AddressMatch"][i]])
    except KeyError as e:
        print(e)
https://www.analyticsvidhya.com/blog/2016/01/12-pandas-techniques-python-data-manipulation/
http://pandas.pydata.org/pandas-docs/stable/reshaping.html
Pivot with name, year, sum of variable (creates an index with name, columns are dates, values are duration in hours):
# Name x Month-Year matrix of duration hours (default aggfunc is mean).
p = resultG.pivot_table(index='Name', columns='Month-Year', values='Duration Hours_Count')
# Non-numeric values: join the names with spaces instead of aggregating numerically.
p = dfc3.pivot_table(index='RegistrationID', columns='Type', values='Name', aggfunc=lambda x: ' '.join(x))
# Sum permit values per BIN and permit sub-type.
st = st.pivot_table(index='BIN Number', columns='Permit Sub Type',values='Value', aggfunc=np.sum)
https://www.python-course.eu/list_comprehension.php
http://www.secnetix.de/olli/Python/list_comprehensions.hawk
https://www.analyticsvidhya.com/blog/2016/01/python-tutorial-list-comprehension-examples/