-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data_Cleaning-3-plyr&dplyr&reshape.R
433 lines (368 loc) · 17 KB
/
Data_Cleaning-3-plyr&dplyr&reshape.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
setwd("D:\\study\\DataScience")
#1. subsetting - quick review
set.seed(13435)
X<-data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15))
X<-X[sample(1:5),];
X$var2[c(1,3)]=NA
X
X[,1]
X[,"var1"]
X[1:2,"var2"]
X[(X$var1<=3 & X$var3>11),] #subset using logical statements
X[(X$var1<=3|X$var3>15),]
#dealing with NA. (NA's would not be omitted in above four expressions)
X[which(X$var2>8),] # which: Give the TRUE indices of a logical object, allowing for array indices. NAs are omitted
#sorting
sort(X$var1)
sort(X$var1,decreasing=TRUE)
sort(X$var2,na.last=TRUE)
sort(X$var2,na.last=FALSE)
#Ordering
X[order(X$var1),]
X[order(X$var1,X$var3),]
####ordering with plyr
library(plyr)
arrange(X,var1) #for fast ordering without using ordering() commands
arrange(X,desc(var1))
#add new column
X$var4<-rnorm(5)
X
Y<-cbind(X,rnorm(5)) #another way to add column
Y
Z<-rbind(Y,rnorm(5))
Z
#2.Summarizing the data
if(!file.exists("./GetAndCleanData")){dir.create("./GetAndCleanData")}
fileUrl<-"http://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl,destfile="./GetAndCleanData/restaurants.csv")
restData<-read.csv("./GetAndCleanData/restaurants.csv")
head(restData,n=3)
tail(restData,n=3)
summary(restData)
str(restData)
quantile(restData$councilDistrict,na.rm=TRUE)
quantile(restData$councilDistrict,probs=c(0.5,0.75,0.9))
table(restData$zipCode,useNA="ifany") #useNA="ifany" -->if there is any missing values, they'll be
#an added column to this table, which will be NA and NA's count.
#By default, NA's nubmer would not be reported.
table(restData$councilDistrict,restData$zipCode) #2-dim table
class(table(restData$councilDistrict,restData$zipCode)) #table
sum(is.na(restData$councilDistrict))
any(is.na(restData$councilDistrict))
all(restData$zipCode>0)
colSums(is.na(restData))
all(colSums(is.na(restData))==0)
#Values with specific characteristics
table(restData$zipCode %in% c("21212"))
table(restData$zipCode %in% c("21212","21213"))
restData[restData$zipCode %in% c("21212","21213"),] #pass to row subsetting.
#Cross tabs
data(UCBAdmissions)
DF=as.data.frame(UCBAdmissions) #data.frame() calls as.data.frame().therefore as.data.frame is faster:
xt<-xtabs(Freq~Gender+Admit,data=DF) #Create a contingency table (optionally a sparse matrix)
#from cross-classifying factors
#Flat Tables
warpbreaks$replicate<-rep(1:9,len=54)
xt<-xtabs(breaks~.,data=warpbreaks)
ftable(xt)
#size of a data set
fakeData<-rnorm(1e5)
object.size(fakeData)
class(fakeData)
print(object.size(fakeData),units="Mb")
#3. Create New Variables
#often the raw data won't have a value you are looking for
#Common Variables to create: (1)missingness indicators (2)"cutting up" quantitative variables (3)Applying transforms
if(!file.exists("./GetAndCleanData")){dir.create("./GetAndCleanData")}
fileUrl<-"http://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl,destfile="./GetAndCleanData/restaurants.csv")
restData<-read.csv("./GetAndCleanData/restaurants.csv")
#Creating Sequences
s1<-seq(1,10,by=2);s1
s2<-seq(1,10,length=3);s2
x<-c(1,3,8,25,100);
seq(along=x)
seq_along(x) #seq_along and seq_len are very fast primitives for two common cases.
seq_len(9)
#add one column by subsetting
names(restData)
restData$nearMe<-restData$neighborhood %in% c("Rolland Park","Homeland") #nearMe column is appended
table(restData$nearMe)
#create binary variable
restData$zipWrong=ifelse(restData$zipCode<0,TRUE,FALSE) #same as if in excel
table(restData$zipWrong,restData$zipCode<0)
#create catogorical/factor variable with cut-- break a quantitative variable up into a categorical variable
restData$zipGroup=cut(restData$zipCode,breaks=quantile(restData$zipCode))
#cut(x, breaks, labels = NULL,include.lowest = FALSE, right = TRUE, dig.lab = 3,ordered_result = FALSE, ...)
class(restData$zipGroup)
table(restData$zipGroup)
table(restData$zipGroup,restData$zipCode)
####Easier cutting with Hmisc(Harrell Miscellaneous)
if(!require("Hmisc")){install.packages("Hmisc")}
library("Hmisc")
restData$zipGroups<-cut2(restData$zipCode,g=4)
#restData$zipGroups<-cut2(restData$zipCode,cuts=quantile(restData$zipCode))
str(cut2) #function (x, cuts, m = 150, g, levels.mean = FALSE, digits, minmax = TRUE, oneval = TRUE, onlycuts = FALSE)
#cut2: Function like cut but left endpoints are inclusive and labels are of the form
#[lower, upper), except that last interval is [lower,upper]. If cuts are given, will
#by default make sure that cuts include entire range of x. Also, if cuts are not given,
#will cut x into quantile groups (g given) or groups with a given minimum number of
#observations (m)(the algorithm does not guarantee that all groups will have at least m observations..
class(restData$zipGroups)
table(restData$zipGroups)
head(restData)
x <- runif(1000, 0, 100)
z <- cut2(x, c(10,20,30))
table(z)
table(cut2(x, g=10)) # quantile groups
table(cut2(x, m=50))
#Creating factor variables
restData$zcf<-factor(restData$zipCode)
restData$zcf[1:10]
class(restData$zcf)
####levels of factor variables
yesno<-sample(c("yes","no"),size=10,replace=TRUE)
yesnofac<-factor(yesno,levels=c("no","yes"))
relevel(yesnofac,ref="yes") #The levels of a factor are re-ordered so that the level specified by ref is first and the others are moved down.
as.numeric(yesnofac)
#use the mutate function to create a new variable and add it to the data set
head(restData)
names(restData)
library(Hmisc);library(plyr)
restData2<-mutate(restData,zipGroups2=cut2(zipCode,g=4))
names(restData2)
table(restData2$zipGroups2)
restData2$zipGroups2==restData2$zipGroups2
#4.Transform variables
#abs(x),sqrt(x),ceiling(x),floor(x),cos(x),sin(x),log(x),log2(x),log10(x),exp(x)
#round(x,digits=n), round(3.475,digits=2)
#signif(x,digits=n),signif(3.475,digits=2) is 3.5 #signif rounds the values in its first argument to the specified number of significant digits.
#5.Reshape data
library(reshape2)
head(mtcars)
#dcasting data frames
mtcars$carname<-rownames(mtcars)
head(mtcars)
carMelt<-melt(mtcars,id=c("carname","gear","cyl"),measure.vars=c("mpg","hp")) #id variables, measure variables
?melt
head(carMelt,n=3) #in variable column, you will see mpg and hp(both show up for each car)
nrow(mtcars)*2== nrow(carMelt)
#recast after melting -- acast and dcast are the same
cylData<-dcast(carMelt,cyl~variable) #put cyl on the rows, and variable on the columns
cylData
?dcast
cylData<-dcast(carMelt,cyl~variable,mean)
cylData
carNameData<-acast(carMelt,carname~variable,mean)
carNameData
#averaging values
head(InsectSprays)
tapply(InsectSprays$count,InsectSprays$spray,mean) #take mean grouped by spray
#another way - split, apply, combine
#split
spIns = split(InsectSprays$count,InsectSprays$spray) #group by spray
spIns
#apply
sprCount=lapply(spIns, sum)
sprCount
#combine
unlist(sprCount)
#or take apply and combine together
sapply(spIns, sum)
#using plyr package - - the first argument is always the data frame
#summarise(mtcars, mean=mean(disp))
#summarise(baseball,duration = max(year) - min(year),nteams = length(unique(team)))
ddply(InsectSprays,.(spray),summarise,sum=sum(count))#we use sum as the way to summarise variable spray
ddply(InsectSprays,"spray",summarise,sum=sum(count)) #same with above
library(dplyr)
summarise(group_by(mtcars, cyl), mean(disp)) #group_by(data frame, variable) generate a data frame
summarise(group_by(mtcars, cyl), m = mean(disp), sd = sd(disp))
ddply(InsectSprays,"spray",summarize,sum=sum(count)) #the only difference is summarize instead of summarise
detach(package:dplyr,unload=TRUE)
#use plyr to create a new variable
spraySums<-ddply(InsectSprays,.(spray),summarise,sum=ave(count,FUN=sum))
#surprisingly, we could use sum function in ave function
#with ave, we generate a column with the same length as the summarized variable
dim(spraySums)
head(spraySums)
#different ways to summarize the sum by spray - the first argument is always the data frame
(count_by_spray <- with(InsectSprays, split(count, spray)))
(mean_by_spray <- lapply(count_by_spray, mean))
unlist(mean_by_spray)
sapply(count_by_spray, mean) #same with the above two lines but with higher efficiency
vapply(count_by_spray, mean, numeric(1)) #same with the above line
with(InsectSprays, tapply(count, spray, mean))
tapply(InsectSprays$count, InsectSprays$spray, mean)
with(InsectSprays, by(count, spray, mean))
by(InsectSprays$count, InsectSprays$spray, mean)
aggregate(count ~ spray, InsectSprays, mean)
ddply(InsectSprays, .(spray), summarise, mean.count = mean(count))
dlply(InsectSprays, .(spray), summarise, mean.count = mean(count)) #list, dlply is similar to by except that the results are returned in a different format.
with(InsectSprays, ave(count, spray)) #used when when you want the output statistic vector to have the same length as the original input vectors
#6.tutorial of reshape
#http://www.slideshare.net/jeffreybreen/reshaping-data-in-r
pop<-read.csv('http://www.census.gov/2010census/csv/pop_density.csv', skip=3)
pop<-pop[,1:12]
colnames(pop)
colnames(pop)=c('state',seq(1910,2010,10))
colnames(pop)
head(pop,n=40)
#library doBy - split data into groups: summaryBy, splitBy, orderBy, sampleBy, transformBy etc.
if(!require("doBy")){install.packages("doBy")}
library(doBy)
top=orderBy(~-2010,pop)
top1=orderBy(-2010~,pop)
dim(top)
dim(top1)
?orderBy #orderBy(formula, data)
#This function is essentially a wrapper for the order() function - the important difference
#being that variables to order by can be given by a model formula.
#The sign of the terms in the formula determines whether sorting should be ascending or decreasing
top=subset(top, state!='United States') #subet is a function in {base}
top1=top[top$state!='United States',] #equivalent
top=head(top,10)
top$state=factor(top$state)
head(top)
#how would you plot population versus year?
names(top)
mtop=melt(top,id.vars='state',variable.name='year',value.name='population') #measure.vars is omitted
####### melt's introduction:
#put measured variables into one column, and give the column as name "variable.name".
#melt.data.frame: melt(data, id.vars, measure.vars, variable.name = "variable", ..., na.rm = FALSE, value.name = "value",factorsAsStrings = TRUE)
#id.vars: vector of id variables. Can be integer (variable position) or string (variable name). If blank, will use all non-measured variables.
#variable.name: name of variable used to store measured variable names
#measure.vars: vector of measured variables. Can be integer (variable position) or string (variable name). If blank, will use all non id.vars
#value.name: name of variable used to store values
names(airquality) <- tolower(names(airquality))
melt(airquality, id=c("month", "day"))
names(ChickWeight) <- tolower(names(ChickWeight))
melt(ChickWeight, id=2:4) #id = c("chick","diet","variable"), measure =
melt(ChickWeight, id = c(" time","chick","diet"))
#melt.array: melt(data, varnames = names(dimnames(data)), ...,na.rm = FALSE, as.is = FALSE, value.name = "value")
a <- array(c(1:23, NA), c(2,3,4))
melt(a)
melt(a, na.rm = TRUE)
melt(a, varnames=c("X","Y","Z"))
#melt.list: melt(data, ..., level = 1)
a <- as.list(c(1:4, NA))
melt(a)
names(a) <- letters[1:4]
melt(a)
a <- list(matrix(1:4, ncol=2), matrix(1:6, ncol=2))
melt(a)
a <- list(matrix(1:4, ncol=2), array(1:27, c(3,3,3)))
melt(a)
###############
head(mtop)
library(ggplot2)
ggplot(data=mtop,aes(group=state))+geom_line(aes(x=year,y=population,color=state))
?ggplot
####ggplot & geom_line
mry <- do.call(rbind, by(movies, round(movies$rating), function(df) {
nums <- tapply(df$length, df$year, length)
data.frame(rating=round(df$rating[1]), year = as.numeric(names(nums)), number=as.vector(nums))
}))
p <- ggplot(mry, aes(x=year, y=number, group=rating))
p + geom_line()
# Add aesthetic mappings
p + geom_line(aes(size = rating))
p + geom_line(aes(colour = rating))
#######
dcast(mtop,state~year,value.var='population') #value.var: name of column which stores values
#The cast formula has the following format: x_variable + x_2 ~ y_variable + y_2 ~ z_variable ~ ...
#6.Managing Data Frame with dplyr - an optimized and distilled version of plyr package. in C++
#arrange, filter(extracct a subset of rows based on logical conditions), select(return a subset of the columns),
#mutate(add new variables), rename, summarize/summarise(summary statistics of different variables)
#the first argument should always be a data frame, and the result is also a data frame
#the subsequent arguments describe what to do with the first argument and you can refer to columns directly without using $
library(dplyr)
chicago<-readRDS("./GetAndCleanData/chicago.rds")
head(chicago)
dim(chicago)
str(chicago)
#select
head(select(chicago,city:dptp)) #such expression as city:dptp are normally not be able to use in other functions, but you can use it in the select function
head(select(chicago,-(city:dptp)))
i<-match("city",names(chicago))
j<-match("dptp",names(chicago))
head(chicago[,-(i:j)]) #without select, we could only use integer indecxes
#filter #same as select, we could directly refer to the variable names
chic.f<-filter(chicago,pm25tmean2>30) #directly refer to the variable names
head(chic.f,10)
dim(chic.f)
chic.f<-filter(chicago,pm25tmean2>30 & tmpd>80)
dim(chic.f)
#arrange
chicago<-arrange(chicago,date) #without arrange, this ordering would be hard to realize
head(chicago)
tail(chicago)
chicago<-arrange(chicago,desc(date))
head(chicago)
tail(chicago)
#rename
chicago<-rename(chicago,pm25=pm25tmean2,dewpoint=dptp)
head(chicago)
#mutate --> transform exsiting variables or to create new variables
#create a new variable
chicago<-mutate(chicago,pm25detrend=pm25-mean(pm25,na.rm=TRUE))
head(select(chicago, pm25, pm25detrend))
#group_by - split a data frame according to certain categorical variables
#first use mutate to create a categorical variable
chicago<-mutate(chicago, tempcat=factor(1*(tmpd>80),labels=c("cold","hot")))
head(chicago)
hotcold<-group_by(chicago,tempcat)
#summarize
summarize(hotcold, pm25=mean(pm25),o3=max(o3tmean2),no2=median(no2tmean2))
hotcold[is.na(hotcold$tempcat),] #one NA
#summarize each year
#first use mutate to create a categorical variable
chicago<-mutate(chicago,year=as.POSIXlt(date)$year+1900)
head(chicago)
years<-group_by(chicago,year)
head(years)
summarize(years,pm25=mean(pm25,na.rm=TRUE),o3=mean(o3tmean2),no2=median(no2tmean2))
#pipeline operator %>% -- chain operators together
#the idea is that you that a date set and you feed through a pipeline of operations to create a new data set
months<-chicago %>% mutate(month=as.POSIXlt(date)$mon+1) %>% group_by(month) %>% summarize(pm25=mean(pm25,na.rm=TRUE),o3=max(o3tmean2),no2=median(no2tmean2))
####dplyr could be used for data.table, SQL interface for relational databases via the DBI package
#7. Merging Data
if(!file.exists("./GetAndCleanData")){dir.create("./GetAndCleanData")}
fileUrl1<-"http://dl.dropboxusercontent.com/u/7710864/data/reviews-apr29.csv"
fileUrl2<-"http://dl.dropboxusercontent.com/u/7710864/data/solutions-apr29.csv"
download.file(fileUrl1,destfile="./GetAndCleanData/reviews.csv")
download.file(fileUrl2,destfile="./GetAndCleanData/solutions.csv")
reviews<-read.csv("./GetAndCleanData/reviews.csv")
solutions<-read.csv("./GetAndCleanData/solutions.csv")
head(reviews,2)
head(solutions,2)
names(reviews)
names(solutions)
#be default, data sets would merge by all common attributes
str(merge) #important parameters: x,y,by.x,by.y,all(default)
mergedData<-merge(reviews,solutions,by.x="solution_id",by.y="id",all=TRUE)
#all=TRUE is full join, all = FALSE is natural join, all.x = TRU is left join, all.y = TRUE is right join
head(mergedData) #could find out that solution_id goes to the first column
#without telling merge by which variables
intersect(names(solutions),names(reviews))
?intersect #in {base}
mergedData2<-merge(reviews,solutions,all=TRUE) #full join
head(mergedData2)
#using join in the plyr package -- faster but less full featured -- default to left join
#could only merge based on common column names. couldn't merge based on different names,
#as we didn't before joining based on solution_id and id, two different column names.
library(plyr)
df1<-data.frame(id=sample(1:10),x=rnorm(10))
df2<-data.frame(id=sample(1:10),y=rnorm(10))
#use join:
str(join) #function (x, y, by = NULL, type = "left", match = "all")
#by: character vector of variable names to join by. If omitted, will match on all common variables.
#type of join: left (default), right, inner or full
#match: how should duplicate ids be matched? Either match just the "first" matching row, or match "all" matching rows.
arrange(join(df1,df2),id) #arrange is just used for reordering
join(df1,df2)
df1<-data.frame(id=sample(1:10),x=rnorm(10))
df2<-data.frame(id=sample(1:10),y=rnorm(10))
df3<-data.frame(id=sample(1:10),z=rnorm(10))
dfList<-list(df1,df2,df3)
join_all(dfList)
?join_all