-
Notifications
You must be signed in to change notification settings - Fork 0
/
EDA-1.R
605 lines (548 loc) · 28.7 KB
/
EDA-1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
# NOTE(review): machine-specific absolute path. The relative "./EDAdata/..." path
# read later in this script is resolved against this working directory, so the
# script only runs as-is on a machine that has this folder.
setwd("D:\\study\\DataScience")
#EDA is used to get a sense of what's happening and what are the kinds of plots that you want to make
#Plotting and analytic graphics.
#1.Before beginning
# - Principles of Analytic Graphics - Rules from Edward Tufte
#(1) show comparisons - always ask "compared to what?"
#- Evidence for a hypothesis is always relative to another competing hypothesis
#so always have a control set.
#(2) show causality, mechanism, explanation, systematic structure
#think about what your causal framework for the question is
#(3) show multivariate data(more than 2 variables) (the real world is multivariate)
#need to "escape flatland"
#(4) Integration of Evidence
#Completely integrate words, numbers, images, diagrams
#Don't let the tool drive the analysis
#(5) Describe and document the evidence with appropriate labels, scales, sources, etc.
#a data graphic should tell a complete story that is credible
#(6) Content is king
# - why do we use graphs in data analysis?
#(1) To understand data properties
#(2) To find pattern in data
#(3) To suggest modeling strategies
#(4) To "debug" analyses
#(5) To communicate results
#2. a case on characteristics of exploratory graphs
#Read the PM2.5 table; colClasses fixes the type of each of its five columns up front.
pollution<-read.csv("./EDAdata/avgpm25.csv",colClasses=c("numeric","character","factor","numeric","numeric"))
head(pollution)
#question: we want to see which counties exceed the national ambient air quality standard
#####one-dimensional summaries of data:
#Methods: six-number summary, Boxplots, Histograms, Density Plot, Barplot
str(pollution)
summary(pollution$pm25)
boxplot(pollution$pm25,col="blue",range=2.0) #default range = 1.5
?boxplot
abline(h=12) #h: the y-value(s) for horizontal line(s).
#abline(a = NULL, b = NULL, h = NULL, v = NULL, reg = NULL,coef = NULL, untf = FALSE, ...)
?abline
#histogram
#Histograms can be a poor method for determining the shape of a distribution because they are
#strongly affected by the number of bins used.
hist(pollution$pm25,col="green",breaks=100)
?hist
rug(pollution$pm25) #draw a tick for every data point underneath the histogram
abline(v=12,lwd=2)
abline(v=median(pollution$pm25),col="magenta",lwd=4) #Unlike a boxplot, a histogram doesn't mark the median
#so we add a vertical bar at the median ourselves.
#Barplot is for categorical data
barplot(table(pollution$region),col="wheat",main="number of Counties in Each Region")
#A density plot overlays a smooth curve on top of the histogram
hist(mtcars$mpg)
hist(mtcars$mpg, breaks=12, col="red")
# Add a Normal Curve (Thanks to Peter Dalgaard)
x <- mtcars$mpg
h<-hist(x, breaks=10, col="red", xlab="Miles Per Gallon", main="Histogram with Normal Curve")
# Evaluate a normal density with the sample mean and sd on a 40-point grid over the data range.
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
# Rescale from density to counts: multiply by the bin width and the number of observations.
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)
#Kernel Density Plots
#Kernel density plots are usually a much more effective way to view the distribution of a variable.
d <- density(mtcars$mpg) # returns the density data
plot(d) # plots the results
# Filled Density Plot
plot(d, main="Kernel Density of Miles Per Gallon")
polygon(d, col="red", border="blue")
#Comparing Groups VIA Kernel Density
#The sm.density.compare( ) function in the sm package allows you to superimpose the kernel
#density plots of two or more groups.
# Compare MPG distributions for cars with 4, 6, or 8 cylinders
# Install sm only when it is missing; library() below then fails loudly if it
# still cannot be loaded (require() would only return FALSE).
if (!requireNamespace("sm", quietly = TRUE)) {
  install.packages("sm")
}
library(sm)
# create value labels
cyl.f <- factor(mtcars$cyl, levels = c(4, 6, 8),
                labels = c("4 cylinder", "6 cylinder", "8 cylinder"))
# plot densities
sm.density.compare(mtcars$mpg, mtcars$cyl, xlab = "Miles Per Gallon")
title(main = "MPG Distribution by Car Cylinders")
# add legend via mouse click
# One fill colour per group, starting at colour 2 as before. The previous
# 2:(2 + n) produced n + 1 colours for n legend labels.
# NOTE(review): assumes sm.density.compare colours the groups 2, 3, 4, ... - confirm.
colfill <- seq_along(levels(cyl.f)) + 1L
legend(locator(1), levels(cyl.f), fill = colfill)
?locator #Reads the position of the graphics cursor when the (first) mouse button is pressed.
#####two-dimensional summaries
#Methods: Multiple/overlayed 1-D plots(Lattice/ggplot2), Scatterplots, Smooth scatterplot
#Modifications: Overlayed/multiple 2-D plots(coplots), use color/size/shape to add dimensions
#Spinning plots(move data around in three dimensions), Actual 3-D plots(not that useful)
#Multiple Boxplots
boxplot(pm25~region, data=pollution, col='red') # we could find that all the extreme values are in the west region
#Multiple histograms
par(mfrow=c(2,1),mar=c(4,4,2,1)) #mar: A numerical vector of the form c(bottom, left, top, right). The default is c(5, 4, 4, 2) + 0.1.
hist(subset(pollution,region=="west")$pm25,col="green")
#using subset is very convenient
hist(subset(pollution,region=="east")$pm25,col="green")
#scatterplots:
par(mfrow=c(1,1))
with(pollution,plot(latitude,pm25)) #pm2.5 v.s. north-south trend
#plot(pollution$latitude,pollution$pm25) #the only difference from above is the axes labels
abline(h=12,lwd=2,lty=2)
#using color
palette() #order of color use
palette(rainbow(6))
with(pollution,plot(latitude,pm25,col=region)) #region is a factor, so each level picks the palette colour at its level index
#red circles are eastern counties, yellow circles are western counties
#how to change color?
palette("default") #restore the default palette so later plots are not affected
#Multiple Scatterplots
# Draw the west and east regions side by side.
# The row filter must use `==`. Written as `region="west"`, the argument is
# swallowed by subset()'s `...`, no rows are dropped, and both panels would
# show the complete data set.
par(mfrow = c(1, 2), mar = c(5, 4, 2, 1))
with(subset(pollution, region == "west"), plot(latitude, pm25, main = "west"))
with(subset(pollution, region == "east"), plot(latitude, pm25, main = "east"))
# back to a single plot per device
par(mfrow = c(1, 1))
#3. Three core plotting systems in R
#(1)The Base Plotting System: start with blank canvas and build up from there
#first plot(x,y,...) function (or similar), then use annotation functions(text, lines, points, axis) to add/modify
#advantages: convenient
#drawback: cannot go back once plot has started(e.g. to adjust margins)
#difficult to "translate" to others once a new plot has been created
with(cars,plot(speed,dist,col=dist))
#(2)The Lattice System
library(lattice)
#idea is quite different from base plotting system
#plots are created with a single function call(xyplot,bwplot,etc.)
#Most useful for conditioning types of plots(Coplot): looking at how y changes with x across levels of z
#good for putting many many plots on a screen
#construct an entire plot all at once, so you have to specify a lot of information in the function call.
#drawbacks: sometimes awkward to specify an entire plot in a single function call
#annotation in plot is not especially intuitive, cannot "add" to the plot once it is created
#use of panel functions and subscripts is difficult to wield and requires intense preparation
#Combine the state.x77 columns with the state.region factor in one data frame.
state<-data.frame(state.x77,region=state.region)
xyplot(Life.Exp~Income|region,data=state,layout=c(4,1)) #in a data frame, a dot is substituted for the space in a column name
#in data table, we couldn't do the same
#(3)The ggplot2 system
#it creates a kind of language or grammar, mixing the ideas from both base and lattice plotting systems
library(ggplot2)
#automatically deals with spacings, text, titles but also allows you to annotate
head(mpg)
qplot(displ,hwy,data=mpg)
str(qplot)
#4.The Base Plotting System
#First step: initializing a new plot with plot(x,y) or hist(x) or boxplot(x). A graphic device would be launched
#parameters of plot function are documented in ?par
library(datasets)
hist(airquality$Ozone)
#scatterplot
with(airquality, plot(Wind, Ozone))
#Boxplot
# Convert Month to a factor in place. The previous spelling `MOnth` only added
# an unused extra column and left Month as an integer.
airquality <- transform(airquality, Month = factor(Month))
boxplot(Ozone ~ Month, airquality, xlab = "Month", ylab = "Ozone(ppb)")
#key parameters for plotting function:
#pch: the plotting symbol(default is open circle) (plot character)
#lty: default is solid, can be dashed, dotted, etc.
#lwd: line width, an integer
#col: palette() and colors()
#xlab, ylab
#key parameters for par() function
#las: the orientation of the axis labels on the plot
#bg: the background color
#mar: the margin size
#oma: the outer margin size(default is 0 for all sides)
#mfrow: # of plots per row, column. (plots are filled row-wise)
#mfcol: # of plots per row, column. (plots are filled column-wise)
#Query the current value of individual graphical parameters:
par('lty')
par("col")
par('pch') #1 is open circle
par("bg")
par("mar") #[1] 5.1 4.1 4.1 2.1
par("mfrow")
#Basic plot functions:
#plot
#lines
#points
#text
#title
#mtext: add arbitrary text to the margins (inner or outer) of the plot
#axis: adding axis ticks/labels
#Draw all points first, then overdraw May in blue and every other month in red.
with(airquality,plot(Wind,Ozone,main='Ozone and Wind in NYC'))
with(subset(airquality,Month==5),points(Wind, Ozone, col="blue"))
with(subset(airquality,Month!=5),points(Wind, Ozone, col="red"))
legend("topright",pch=1,col=c("blue","red"),legend=c("May","Other Months"))
#Add a regression line
with(airquality,plot(Wind,Ozone,main='Ozone and Wind in NYC',pch=20))
model<-lm(Ozone~Wind,airquality)
abline(model,lwd=2)
#Multiple base plot
#One row of three panels; oma reserves two lines of outer margin at the top for the shared title.
par(mfrow=c(1,3),mar=c(4,4,2,1),oma=c(0,0,2,0))
with(airquality,{
plot(Wind,Ozone,main="Ozone and Wind")
plot(Solar.R,Ozone,main="Ozone and Solar Radiation")
plot(Temp,Ozone, main="Ozone and Temperature")
mtext("Ozone and Weather in NYC",outer=TRUE)
})
par(mfrow=c(1,1))
#mtext: write text into the margins of a plot
#a demonstration:
#NOTE(review): no seed is set here, so the random data differ on every run.
x<-rnorm(100)
hist(x)
y<-rnorm(100)
plot(x,y)
z<-rnorm(100)
plot(x,z)
par(mar=c(2,2,2,2)) #the axis labels are lost with margins this small
plot(x,y)
par(mar=c(5, 4, 4, 2) + 0.1) #default
plot(x,y,pch=20) #solid circle
plot(x,y,pch=18) #solid diamond
plot(x,y,pch=2) #triangles
plot(x,y,pch=4) #X
plot(x,y,pch="s") #s
example(points) #we could see plot symbols after some example plots
par(mar=c(1,1,1,1))
#NOTE(review): pchShow and TestChars are not defined in this script; presumably they are created by example(points) above - confirm.
pchShow(c("o","O","0"), cex = 3)
try(TestChars(sign=-1))
par(mar=c(5, 4, 4, 2) + 0.1)
plot(x,y,pch=20) #solid circle
title("Scatterplot")
text(-2,-2,"Label")
legend("topright",legend="Data")
legend("topright",legend="Data",pch=20) #pch could also be used here
str(legend)
fit<-lm(y~x)
abline(fit)
abline(fit,lwd=3)
abline(fit,lwd=3,col="blue")
abline(fit,lwd=1,col="red") #drawn on top of the lines above
plot(x,y,xlab="Weight",ylab="Height",main="ScatterPlot",pch=20)
legend("topright",legend="Data",pch=20) #
fit<-lm(y~x)
abline(fit,lwd=3,col="red")
z<-rpois(100,2)
par(mfrow=c(1,1))
plot(x,y,pch=20)
par("mar")
par(mar=c(2,2,1,1))
plot(x,y,pch=20)
x<-rnorm(100)
y<-x+rnorm(100)
g<-gl(2,50) #gl(n, k, length = n*k, labels = seq_len(n), ordered = FALSE) n is # of levels, k is # of replications, result is of length 50*2=100
?gl
g<-gl(2,50,labels=c("Male","Female"))
str(g)
g<-sample(g,100,replace=TRUE) #resample the labels with replacement, so the groups are no longer exactly 50/50
plot(x,y)
#now let's try to add each category of data into the canvas
plot(x,y,type="n") #first give a blank canvas
points(x[g=="Male"],y[g=="Male"],col="blue")
points(x[g=="Female"],y[g=="Female"],col="red",pch=19)
###grDevices package -- contains all the code implementing the various graphics devices,
#including X11, PDF, PostScript, PNG, etc.
#A graphics device is something or some place where you can make a plot appear:
#a window on your computer(screen device), a PDF file(file device), A PNG/JPEG(file device), A scalable vector graphics(SVG) file(file device)
#when you make a plot in R, it has to be sent to a specific graphics device
#Most common place is the screen device. On Mac, the screen device is launched by quartz(), on Windows by windows(), on Unix/Linux by x11()
?Devices
library(grDevices)
#Two common ways to create a plot:
#the most common way to create a plot:
with(faithful,plot(eruptions,waiting))
title(main="Old Faithful Geyser Data")
#Another way to create a plot:
pdf(file="myplot.pdf") #open PDF device, will not plot on screen
with(faithful,plot(eruptions,waiting))
title(main="Old Faithful Geyser Data")
dev.off() #close the PDF device
#Two Categories of file devices: vector and bitmap devices
#Vector Formats - most useful for line-graphics(different from natural scenes like photographs):
#pdf(used for line-type graphics, resizes well, usually portable; not efficient if a plot has many objects/points)
#svg(XML-based scalable vector graphics; support animation and interactivity, potentially used for web-based plots)
#win.metafile(only on Windows), postscript(older format, resizes well, Windows systems often don't have a postscript viewer)
#Bitmap devices - generally don't resize well:
#PNG(Portable Network Graphics)-a series of pixels, good for line drawings or images with solid colors, use lossless compression.
#Good for plotting many many points, doesn't resize well
#jpeg
#tiff: Create bitmap files in TIFF format; support lossless compression
#bmp: a native Windows bitmapped format
#Multiple Open Graphics Devices:
#you can only plot to one device at a time; the graphics device you plot to is the active device
#use dev.cur() to see the active graphics device. Every open graphics device is assigned an integer >= 2.
#you change the active graphics device with dev.set(<integer>).
#Copy plots -- not an exact operation, result may not be identical to the original:
with(faithful,plot(eruptions,waiting,main='Old Faithful Geyser Data'))
dev.copy(png,file="geyserplot.png")
dev.off() #close the PNG device opened by dev.copy
dev.copy2pdf(file="MY.pdf", width = 7, height = 5) #don't have to close device for this one
dev.cur()
#5. lattice plotting system -- xyplot
#contains code for producing Trellis graphs, which are independent of the "base" graphics system
#including functions like xyplot, bwplot, levelplot
#another package of lattice system -- grid
#library(grid)
#we seldom call functions from the grid package. the lattice package builds on top of grid
#Lattice Functions:
#xyplot: main function for creating scatterplots: xyplot(y~x|f*g,data) # * means interaction
#bwplot: box and whiskers plot("boxplots")
#histogram
#stripplot
#dotplot: plot dots on "violin strings"
#splom: scatterplot matrix; like 'pairs' in base plotting system
#levelplot, contourplot: for plotting 'image' data
library(lattice)
xyplot(Ozone~Wind,data=airquality)
airquality<-transform(airquality, Month=factor(Month)) #make Month a factor for conditioning. This reassigns the workspace copy of airquality.
xyplot(Ozone~Wind|Month,data=airquality,layout=c(5,1)) #5,6,7,8,9 five months
class(airquality$Month)
class(xyplot(Ozone~Wind|Month,data=airquality,layout=c(5,1))) #trellis
#airquality$Month=factor(airquality$Month) #next session of R would change it back to integer
#class(airquality$Month)
p<-xyplot(Ozone~Wind|Month,data=airquality,layout=c(5,1)) #could be saved as an object
dev.cur()
dev.off()
print(p) #a stored trellis object is only drawn when it is printed
#lattice has a "panel function" which controls what happens inside each panel of the plot
#each panel's going to represent a subset of the data, which is defined by the conditioning variable
#Simulated data: the first 50 points (f=0) follow y = x + noise, the last 50 (f=1) follow y = 1 + noise.
set.seed(10)
x<-rnorm(100)
f<-rep(0:1,each=50)
y<-x+f-f*x+rnorm(100,sd=0.5)
f<-factor(f,labels=c("Group 1","Group 2"))
xyplot(y~x|f,layout=c(2,1))
#custom panel function
#functions in panel functions:
#panel.abline(a = NULL, b = 0,h = NULL, v = NULL,reg = NULL, coef = NULL,col, col.line, lty, lwd, alpha, type,...,reference = FALSE,identifier = "abline")
#panel.refline: panel.refline is similar to abline, but uses the "reference.line" settings for the defaults.
#panel.curve(expr, from, to, n = 101,curve.type = "l",col, lty, lwd, type,...,identifier = "curve")
#panel.rug:adds a rug(data points) representation of the (marginal) data to the panel, much like rug.
#panel.average(x, y, fun = mean, horizontal = TRUE,lwd, lty, col, col.line, type,...,identifier = "linejoin")
#panel.linejoin(x, y, fun = mean, horizontal = TRUE,lwd, lty, col, col.line, type,...,identifier = "linejoin")
#panel.fill(col, border, ..., identifier = "fill")
#panel.grid(h=3, v=3, col, col.line, lty, lwd, x, y, ..., identifier = "grid")
#panel.lmline(x, y, ..., identifier = "lmline") #panel.lmline(x, y) is equivalent to panel.abline(lm(y ~ x)).
#panel.mathdensity(dmath = dnorm, args = list(mean=0, sd=1),n = 50, col, col.line, lwd, lty, type,..., identifier = "mathdensity")
#panel.superpose(x, y = NULL, subscripts, groups,panel.groups = "panel.xyplot",...,col, col.line, col.symbol,pch, cex, fill, font,fontface, fontfamily,lty, lwd, alpha,type = "p", grid = FALSE,distribute.type = FALSE)
#subscripts: An integer vector of subscripts giving indices of the x and y values in the original data source.
#panel.stripplot(x, y, jitter.data = FALSE,factor = 0.5, amount = NULL,horizontal = TRUE, groups = NULL,...,identifier = "stripplot")
#Creates stripplot (one dimensional scatterplot) of x for each level of y (or vice versa, depending on the value of horizontal)
#Much of the power of Trellis Graphics comes from the ability to define customized panel functions.
# Panel 1: draw the points, then a dashed horizontal line at the panel's median y.
xyplot(y ~ x | f, panel = function(x, y, ...) { #The actual plotting is done by the function specified by the panel argument.
  panel.xyplot(x, y, ...) #First call the default panel function for "xyplot"
  panel.abline(h = median(y), lty = 2) #add a horizontal line at the median
})
# Panel 2: draw the points, then a dashed least-squares line fitted per panel.
# panel.lmline() needs the panel's x and y. The previous call passed only
# `h = median(y)` (an argument copied from panel.abline), so x was missing.
xyplot(y ~ x | f, panel = function(x, y, ...) {
  panel.xyplot(x, y, ...)
  panel.lmline(x, y, lty = 2)
})
#another example
#Density histogram of height per voice part, with a normal curve fitted to each panel's data.
histogram( ~ height | voice.part, data = singer, layout = c(2, 4),
type = "density", border = "transparent", col.line = "grey60", #type = c("percent", "count", "density") --> we need to set type = "density" here so that the panel.mathdensity curve is on the same scale as the bars.
#border: the border of bins in histogram. Either a color for the border, or a logical flag. In the latter case, the border color is black if border is TRUE, and no border is drawn if it is FALSE (the default).
xlab = "Height (inches)",
ylab = "Density Histogram\n with Normal Fit", #\n: Line feed
panel = function(x, ...) {
panel.histogram(x, ...)
panel.mathdensity(dmath = dnorm,
args = list(mean=mean(x),sd=sd(x)), ...)
})
#another example
#Same idea with the default layout and a black normal curve.
histogram( ~ height | voice.part, data = singer,
xlab = "Height (inches)", type = "density", #type = c("percent", "count", "density")
panel = function(x, ...) {
panel.histogram(x, ...)
panel.mathdensity(dmath = dnorm, col = "black", #panel.mathdensity plots a (usually theoretical) probability density function.
args = list(mean=mean(x),sd=sd(x)))
#dmath: A vectorized function that produces density values given a numeric vector named x, e.g. dnorm
#panel.mathdensity (dmath = dnorm, args = list(mean = 0, sd = 1), n = 50, col, col.line = reference.line$col, lwd = reference.line$lwd, lty = reference.line$lty, type, ..., identifier = "mathdensity")
})
#another example
#Yield by site, grouped by year: jittered points plus one panel.average line per group.
bwplot(yield ~ site, barley, groups = year,
panel = function(x, y, groups, subscripts, ...) {
panel.grid(h = -1, v = 0) #For panel.grid, these usually specify
#the number of horizontal and vertical reference lines
#to be added to the plot. Alternatively, they can be
#negative numbers. h=-1 and v=-1 are intended to make
#the grids aligned with the axis labels.
panel.stripplot(x, y, ..., jitter.data = TRUE, #jitter.data=TRUE: jitter data to avoid overplotting
groups = groups, subscripts = subscripts)
panel.superpose(x, y, ..., panel.groups = panel.average,
groups = groups, subscripts = subscripts)
#panel.superpose divides up the x (and optionally y) variable(s) by the
#unique values of groups[subscripts], and plots each subset with different
#graphical parameters.
},
auto.key = list(points = FALSE, lines = TRUE, columns = 2)) #col attribute is used for text, not for lines
#key: A list that defines a legend to be drawn on the plot. This list is used as an argument to the draw.key function,
#which produces a "grob" (grid object) eventually plotted by the print method for "trellis" objects.
#another example:
#The key's line styles are taken from the current "superpose.line" settings, rows 1:7 and then row 1 again.
bwplot(decrease ~ treatment, OrchardSprays, groups = rowpos,
panel = "panel.superpose",
panel.groups = "panel.linejoin",
xlab = "treatment",
key = list(lines = Rows(trellis.par.get("superpose.line"),
c(1:7, 1)),
text = list(lab = as.character(unique(OrchardSprays$rowpos))),
columns = 4, title = "Row position"))
#Functions used to query, display and modify graphical parameters for fine control of Trellis displays:
#trellis.par.set(name, value, ..., theme, warn = TRUE, strict = FALSE)
#trellis.par.get(name = NULL)
#show.settings(x = NULL)
#another example
dotplot(variety ~ yield | site, data = barley, groups = year,
key = simpleKey(levels(barley$year), space = "right"),
xlab = "Barley Yield (bushels/acre) ",
aspect=0.5, layout = c(3,2), ylab=NULL)
stripplot(voice.part ~ jitter(height), data = singer, aspect = 1,
jitter.data = TRUE, xlab = "Height (inches)")
#6. ggplot2
#aesthetic attributes: color, shape, size of geometric objects(points,lines,bars,shapes)
#the plot may also contain statistical transformations of the data.
#stats: statistical transformations like binning, quantiles, smoothing
#scales: what scale an aesthetic map uses (example, male=red, female=blue)
#Scales define how the different variables are coded in the plot (e.g. make male red, make female blue).
library(ggplot2)
#the basic: qplot() - quick plot, hides what goes on underneath, which is okay for most operations
#ggplot() is the core function and very flexible for doing things qplot() cannot do
#####data has to be organized as data frame
#factors should be labeled to be informative
#(1) qplot
#Hello World for GGplot2
str(mpg)
args(qplot)
# function (x, y = NULL, ..., data, facets = NULL, margins = FALSE,
# geom = "auto", stat = list(NULL), position = list(NULL),
# xlim = c(NA, NA), ylim = c(NA, NA), log = "", main = NULL,
# xlab = deparse(substitute(x)), ylab = deparse(substitute(y)),
# asp = NA)
#NOTE(review): the signature above was recorded from the ggplot2 version used when these notes were written; it may differ in the installed version.
qplot(displ, hwy, data=mpg) #solid circles as points.
plot(mpg$displ, mpg$hwy) #the same data in base graphics, less polished. Open circles as points
#modifying aesthetics
qplot(displ, hwy, data=mpg,color=drv) #three kinds of drives, drv are factors #legend is added automatically
class(mpg$drv) #factor
qplot(displ, hwy, data=mpg,geom=c("point","smooth"),method='lm')
qplot(displ, hwy, data=mpg,color=drv,geom=c("point","smooth")) #one smoother per drv group
qplot(hwy, data=mpg, geom="density")
qplot(hwy, data=mpg, geom="density",color=drv)
qplot(displ, hwy, data=mpg,shape=drv)
qplot(displ, hwy, data=mpg,color=drv)
qplot(displ, hwy, data=mpg,color=drv,geom=c("point","smooth"),method='lm') #smooth method is "lm"
#you could make a histogram with the qplot function (compared with hist in base plotting system)
qplot(hwy, data=mpg, fill=drv) #the plot is a histogram when we only specify one variable
#Facets - like panels in lattice -- split by groups
qplot(displ,hwy,data=mpg,facets=.~drv) #variable on the right-hand side of ~(tilde) determines the columns of the panels
qplot(hwy,data=mpg,facets=drv~.,binwidth=2) #variable on the left-hand side of ~(tilde) determines the rows of the panels
qplot(displ, hwy, data=mpg,color=drv,geom=c("point","smooth"),method='lm',facets=.~drv)
#(2)ggplot
#Start from a base object holding the data and the x/y aesthetic mapping; layers are added with +.
g<-ggplot(mpg,aes(displ,hwy))
summary(g)
print(g) #it will say: No layers in plot. It means R doesn't know how to draw the data yet.
#it doesn't know if you want points or if you want lines and tiles...
p<-g+geom_point()
print(p)
g+geom_point() #auto print
g+geom_point()+geom_smooth()
g+geom_point()+geom_smooth(method="lm")
g+geom_point()+facet_grid(.~drv)+geom_smooth(method="lm") #order doesn't matter
#notice that the panel labels come from the facet_grid variable, so make sure that variable is labelled appropriately.
?facet_grid
qplot(mpg, wt, data=mtcars, facets = . ~ vs + am)
qplot(mpg, wt, data=mtcars, facets = vs + am ~ . )
mt <- ggplot(mtcars, aes(mpg, wt, colour = factor(cyl))) + geom_point()
#Are scales shared across all facets (the default, "fixed"), or do they vary across
#rows ("free_x"), columns ("free_y"), or both rows and columns ("free")
mt + facet_grid(. ~ cyl, scales = "free")
mt + facet_grid(vs ~ am, scales = "free")
mt + facet_grid(vs ~ am, scales = "free_x")
mt + facet_grid(vs ~ am, scales = "free_y")
mt + facet_grid(vs ~ am, scales = "free", space="free")
mt + facet_grid(vs ~ am, scales = "free", space="free_x")
mt + facet_grid(vs ~ am, scales = "free", space="free_y")
#Annotation
#labels: xlab, ylab, labs, ggtitle
#each of the "geom" functions has options to modify
#for things that only make sense globally, use theme(): e.g.: theme(legend.position="none")
#two standard appearance themes are included:
#theme_gray(): The default theme(gray background)
#theme_bw(): More stark/plain
g+geom_point(color="steelblue",size=4,alpha=1/2) #alpha=1/2: use transparent points; color is a fixed value here
str(geom_point)
#function (mapping = NULL, data = NULL, stat = "identity", position = "identity", na.rm = FALSE, ...)
g+geom_point(aes(color=drv),size=4,alpha=1/2) #color is mapped to drv through aes(); alpha=1/2: use transparent points
?aes
aes(x = mpg, y = wt) #aes creates a list of unevaluated expressions.
aes(color=drv)
#expression("displ"[2]) renders the x label as "displ" with a subscript 2.
g+geom_point(aes(color=drv),size=3)+labs(title="MPG DRV")+labs(x=expression("displ"[2]),y="hwy")
g+geom_point(aes(color=drv),size=3,alpha=1/2)+geom_smooth(size=4,linetype=3,method="lm",se=FALSE)+labs(title="MPG DRV")+labs(x=expression("displ"[2]),y="hwy")
g+geom_point(aes(color=drv),size=3,alpha=1/2)+geom_smooth(size=2,linetype=4,method="lm",se=TRUE)+labs(title="MPG DRV")+labs(x=expression("displ"[2]),y="hwy")
g+geom_point(aes(color=drv),size=3,alpha=1/2)+theme_bw(base_family="Times")
?theme_bw #theme_bw(base_size = 12, base_family = "")
#examples of geom_point
#Base object for the examples below: mtcars with wt on x and mpg on y.
p <- ggplot(mtcars, aes(wt, mpg))
p + geom_point()
# Add aesthetic mappings
p + geom_point(aes(colour = qsec))
p + geom_point(aes(alpha = qsec))
p + geom_point(aes(colour = factor(cyl)))
p + geom_point(aes(shape = factor(cyl)))
p + geom_point(aes(size = qsec))
# Change scales
p + geom_point(aes(colour = cyl)) + scale_colour_gradient(low = "blue")
p + geom_point(aes(size = qsec)) + scale_size_area()
?scale_size_area() #scale the area of points to be proportional to the value.
p + geom_point(aes(shape = factor(cyl))) + scale_shape(solid = FALSE)
# Set aesthetics to fixed value
p + geom_point(colour = "red", size = 3)
qplot(wt, mpg, data = mtcars, colour = I("red"), size = I(3)) #I() marks the value as a fixed setting instead of a mapped variable
# Varying alpha is useful for large datasets: alpha = 1 means no transparency,
# and the smaller alpha is, the more transparent the points are.
# Build the diamonds base plot before it is used. Earlier in this script `d`
# holds a density() result, so `d + geom_point()` fails until `d` is reassigned.
d <- ggplot(diamonds, aes(carat, price))
d + geom_point(alpha = 1)
d + geom_point(alpha = 1/10)
d + geom_point(alpha = 1/20)
d + geom_point(alpha = 1/100)
# You can create interesting shapes by layering multiple points of
# different sizes
p <- ggplot(mtcars, aes(mpg, wt))
p + geom_point(colour = "grey50", size = 4) + geom_point(aes(colour = cyl))
p + aes(shape = factor(cyl)) +
  geom_point(aes(colour = factor(cyl)), size = 4) +
  geom_point(colour = "grey90", size = 1.5)
p + geom_point(colour = "black", size = 4.5) +
  geom_point(colour = "pink", size = 4) +
  geom_point(aes(shape = factor(cyl)))
# These extra layers don't usually appear in the legend, but we can
# force their inclusion.
# show.legend replaces the show_guide argument, which has been deprecated
# since ggplot2 2.0.0.
p + geom_point(colour = "black", size = 4.5, show.legend = TRUE) +
  geom_point(colour = "pink", size = 4, show.legend = TRUE) +
  geom_point(aes(shape = factor(cyl)))
# Transparent points:
qplot(mpg, wt, data = mtcars, size = I(5), alpha = I(0.2))
# geom_point warns when missing values have been dropped from the data set
# and not plotted, you can turn this off by setting na.rm = TRUE
# Replace mpg with NA wherever a uniform draw falls below 0.2 (no seed is set, so the rows vary per run).
mtcars2 <- transform(mtcars, mpg = ifelse(runif(32) < 0.2, NA, mpg))
qplot(wt, mpg, data = mtcars2)
qplot(wt, mpg, data = mtcars2, na.rm = TRUE)
#A note about Axis Limits - Coordinate
testdata<-data.frame(x=1:100,y=rnorm(100))
testdata[50,2]<-100 #outlier
plot(testdata$x,testdata$y,type='l',ylim=c(-3,3)) #base plot
g<-ggplot(testdata,aes(x=x,y=y))
g+geom_line()
g+geom_line()+coord_cartesian(ylim=c(-3,3)) #restrict the visible y range through the coordinate system
#if we want to condition on a continuous variable, we first need to use the cut() function to
#make the continuous variable categorical
set.seed(6809)
#NOTE(review): this assigns a 1000-row sample to a workspace object named diamonds, which is what later code in this session will see.
diamonds <- diamonds[sample(nrow(diamonds), 1000), ]
#Re-create the cut factor with an explicit level order; facet panels follow this order.
diamonds$cut <- factor(diamonds$cut,
levels = c("Ideal", "Very Good", "Fair", "Good", "Premium"))
# Repeat first example with new order
#NOTE(review): the ..density.. notation is flagged as deprecated by recent ggplot2 releases - confirm against the installed version.
p <- ggplot(diamonds, aes(carat, ..density..)) +
geom_histogram(binwidth = 1)
p + facet_grid(. ~ cut)
mt <- ggplot(mtcars, aes(mpg, wt, colour = factor(cyl))) + geom_point()
mt + facet_grid(. ~ cyl, scales = "free_x", space="free") +
scale_x_continuous(breaks = seq(10, 36, by = 2))
last_plot() + xlim(10, 15) #last_plot() returns the most recently created plot