# Set the working directory for the session.
# NOTE(review): hard-coded absolute Windows path; the relative "./EDAdata/..." path
# read later in this script resolves against it, so the script only runs on a
# machine that has this directory.
setwd("D:\\study\\DataScience")

#EDA is used to get a sense of what's happening and what are the kinds of plots that you want to make
#Plotting and analytic graphics.

#1.Before beginning
# - Principles of Analytic Graphics - rules from Edward Tufte
#(1) show comparisons - always ask "compared to what?"
                     #- Evidence for a hypothesis is always relative to another competing hypothesis
        #so always have a control set.
#(2) show causality, mechanism, explanation, systematic structure
        #think about what is the causal work
#(3) show multivariate data(more than 2 variables) (the real world is multivariate)
        #need to "escape flatland"
#(4) Integration of Evidence
        #Completely integrate words, numbers, images, diagrams
        #Don't let the tool drive the analysis
#(5) Describe and document the evidence with appropriate labels, scales, sources, etc.
        #a data graphic should tell a complete story that is credible
#(6) Content is king

# - why do we use graphs in data analysis?
#(1) To understand data properties
#(2) To find pattern in data
#(3) To suggest modeling strategies
#(4) To "debug" analyses
#(5) To communicate results

#2. a case on characteristics of exploratory graphs
# Read the PM2.5 data set, declaring the class of each of its five columns,
# then show the first rows.
pollution <- read.csv(
  "./EDAdata/avgpm25.csv",
  colClasses = c("numeric", "character", "factor", "numeric", "numeric")
)
head(pollution)

#question: we want to see counties exceed the national ambient air quality standard

#####one-dimensional summaries of data:
#Methods: six-number summary, Boxplots, Histograms, Density Plot, Barplot
# Inspect the structure of the data frame and summarise the pm25 column.
str(pollution)
summary(pollution$pm25)

# Boxplot of pm25; range = 2.0 replaces the default of 1.5.
boxplot(pollution$pm25, col = "blue", range = 2.0)
help("boxplot")
# h: the y-value(s) for horizontal line(s).
# abline(a = NULL, b = NULL, h = NULL, v = NULL, reg = NULL, coef = NULL, untf = FALSE, ...)
abline(h = 12)
help("abline")

# Histogram.
# A histogram can be a poor way to judge the shape of a distribution because
# it is strongly affected by the number of bins used.
hist(pollution$pm25, col = "green", breaks = 100)
help("hist")
# rug() marks every data point along the axis underneath the histogram.
rug(pollution$pm25)
abline(v = 12, lwd = 2)
# Unlike a boxplot, a histogram does not mark the median, so draw it explicitly.
abline(v = median(pollution$pm25), col = "magenta", lwd = 4)

# A barplot is for categorical data.
barplot(table(pollution$region), col = "wheat", main = "number of Counties in Each Region")
#Density Plot is just plot a line on the barplot
# Two plain histograms of mpg: default bins, then 12 suggested breaks in red.
hist(mtcars$mpg)
hist(mtcars$mpg, breaks = 12, col = "red")

# Add a Normal Curve (Thanks to Peter Dalgaard)
x <- mtcars$mpg
h <- hist(
  x,
  breaks = 10, col = "red",
  xlab = "Miles Per Gallon", main = "Histogram with Normal Curve"
)
# 40 evenly spaced positions spanning the observed range of x.
xfit <- seq(from = min(x), to = max(x), length.out = 40)
# Normal density at those positions, multiplied by the spacing of the first two
# bin midpoints and by the sample size so the curve is on the histogram's count scale.
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x)) * diff(h$mids[1:2]) * length(x)
lines(xfit, yfit, col = "blue", lwd = 2)

# Kernel Density Plots
# Kernel density plots are usually a much more effective way to view the
# distribution of a variable.
d <- density(mtcars$mpg)  # returns the density data
plot(d)                   # plots the results

# Filled Density Plot
plot(d, main = "Kernel Density of Miles Per Gallon")
polygon(d, col = "red", border = "blue")
#Comparing Groups via Kernel Density
#The sm.density.compare( ) function in the sm package allows you to superimpose the kernel
#density plots of two or more groups.
# Compare MPG distributions for cars with # 4,6, or 8 cylinders
# Install sm only when it is missing. requireNamespace() checks availability
# without attaching; library() then fails loudly if the install did not work.
if (!requireNamespace("sm", quietly = TRUE)) {
  install.packages("sm")
}
library(sm)
# create value labels
cyl.f <- factor(
  mtcars$cyl,
  levels = c(4, 6, 8),
  labels = c("4 cylinder", "6 cylinder", "8 cylinder")
)
# plot densities, grouping by the labelled factor so the groups and the legend
# labels come from the same definition
sm.density.compare(mtcars$mpg, cyl.f, xlab = "Miles Per Gallon")
title(main = "MPG Distribution by Car Cylinders")
# add legend via mouse click
# One fill colour per factor level, starting at colour 2. The previous
# 2:(2 + n) produced n + 1 colours for n labels.
colfill <- seq_along(levels(cyl.f)) + 1L
legend(locator(1), levels(cyl.f), fill = colfill)
help("locator")  # Reads the position of the graphics cursor when the (first) mouse button is pressed.

#####two dimensions summary
#Methods: Multiple/overlayed 1-D plots(Lattice/ggplot2), Scatterplots, Smooth scatterplot
#Modifications: Overlayed/multiple 2-D plots(coplots), use color/size/shape to add dimensions
               #Spinning plots(move data around in three dimensions), Actual 3-D plots(not that useful)
#Multiple Boxplots
# One box per region; per the original note, all the extreme values are in the west region.
boxplot(pm25 ~ region, data = pollution, col = "red")

# Multiple histograms: stack two plots in one column.
# mar is a numeric vector c(bottom, left, top, right); the default is c(5, 4, 4, 2) + 0.1.
par(mfrow = c(2, 1), mar = c(4, 4, 2, 1))
# subset() is a convenient way to pick the rows of one region.
hist(subset(pollution, region == "west")$pm25, col = "green")
hist(subset(pollution, region == "east")$pm25, col = "green")
#scatterplots:
# Back to a single plot per page.
par(mfrow = c(1, 1))
# pm2.5 against latitude (north-south trend).
with(pollution, plot(latitude, pm25))
# plot(pollution$latitude, pollution$pm25)  # the only difference from above is the axes labels
abline(h = 12, lwd = 2, lty = 2)

# Using color.
palette()            # order in which colors are used
palette(rainbow(6))  # switch to a six-color rainbow palette
# Color each point by its region.
# Original note: red circles are eastern counties, yellow circles are western counties.
with(pollution, plot(latitude, pm25, col = region))
# Restore the default palette.
palette("default")
#Multiple Scatterplots
# Two scatterplots side by side, one per region.
par(mfrow = c(1, 2), mar = c(5, 4, 2, 1))
# The row filter must use `==`. With a single `=`, `region` is passed to subset()
# as a named argument instead of a condition, so no rows are filtered and both
# panels would show the whole data set.
with(subset(pollution, region == "west"), plot(latitude, pm25, main = "west"))
with(subset(pollution, region == "east"), plot(latitude, pm25, main = "east"))
# Back to a single plot per page.
par(mfrow = c(1, 1))

#3. Three core plotting systems in R
#(1)The Base Plotting System: start with blank canvas and build up from there
#first plot(x,y,...) function (or similar), then use annotation functions(text, line, points, axis) to add/modify
#advantages: convenient
#drawback: cannot go back once plot has started(to adjust margin)
          #difficult to "translate" to others once a new plot has been created
# Base system example: stopping distance against speed, colored by distance.
with(cars, plot(speed, dist, col = dist))

# (2) The Lattice System
library(lattice)
# The idea is quite different from the base plotting system:
#  - plots are created with a single function call (xyplot, bwplot, etc.)
#  - most useful for conditioning plots (coplots): how y changes with x across levels of z
#  - good for putting many, many plots on one screen
#  - the entire plot is constructed at once, so a lot of information goes into the call
# Drawbacks:
#  - sometimes awkward to specify an entire plot in a single function call
#  - annotation is not especially intuitive; you cannot "add" to the plot once it is created
#  - panel functions and subscripts are difficult to wield and require intense preparation
state <- data.frame(state.x77, region = state.region)
# Original note: in a data frame a dot can stand in for a space in a column name
# (hence Life.Exp); in a data table we could not do the same.
xyplot(Life.Exp ~ Income | region, data = state, layout = c(4, 1))

# (3) The ggplot2 system
# It creates a kind of language or grammar, mixing ideas from both the base and
# lattice plotting systems.
library(ggplot2)
# It automatically deals with spacing, text and titles but also allows you to annotate.
head(mpg)
qplot(displ, hwy, data = mpg)
str(qplot)



#4.The Basic Plotting System
#First step: initializing a new plot with plot(x,y) or hist(x) or boxplot(x). A graphic device would be launched
#parameters of plot function are documented in ?par
library(datasets)
# Histogram of the Ozone column.
hist(airquality$Ozone)
# Scatterplot of Ozone against Wind.
with(airquality, plot(Wind, Ozone))
# Boxplot of Ozone by month.
# Convert the existing Month column to a factor in place. The earlier spelling
# "MOnth" added a stray extra column and left Month as it was.
airquality <- transform(airquality, Month = factor(Month))
boxplot(Ozone ~ Month, airquality, xlab = "Month", ylab = "Ozone(ppb)")
#key parameters for plotting function:
        #pch: the plotting symble(default is open circle) (plot character)
        #lty: defualt is solid, can bed dashed, dotted, etc.
        #lwd: integer
        #col: palette() and colors()
        #xlab, ylab
#key parameters for par() function
        #las: the orientation of the axis labels on the plot
        #bg: the background color
        #mar: the margin size
        #oma: the outer margin size(default is 0 for all sides)
        #mfrow: # of plots per row, column. (plots are filled row-wise)
        #mfcol: # of plots per row, column. (plots are filled column-wise)
# Query the current value of individual graphical parameters.
par("lty")    # line type
par("col")    # plotting color
par("pch")    # plotting symbol; 1 is open circle
par("bg")     # background color
par("mar")    # margin sizes; [1] 5.1 4.1 4.1 2.1
par("mfrow")  # plot layout (rows, columns)

#Basic plot functions:
        #plot
        #lines
        #points
        #text
        #title
        #mtext: add arbitrary text to the margins (inner or outer) of the plot
        #axis: adding axis ticks/labels
# Draw all observations first, then overplot May in blue and every other month in red.
with(airquality, plot(Wind, Ozone, main = "Ozone and Wind in NYC"))
with(subset(airquality, Month == 5), points(Wind, Ozone, col = "blue"))
with(subset(airquality, Month != 5), points(Wind, Ozone, col = "red"))
# Legend in the top-right corner, using the open-circle symbol for both entries.
legend(
  "topright",
  pch = 1,
  col = c("blue", "red"),
  legend = c("May", "Other Months")
)

#Add a regression line
# Scatterplot with solid points (pch = 20).
with(airquality, plot(Wind, Ozone, main = "Ozone and Wind in NYC", pch = 20))
# Fit Ozone on Wind with a linear model and draw the fitted line over the points.
model <- lm(Ozone ~ Wind, airquality)
abline(model, lwd = 2)

#Multiple base plot
# Three panels in one row; oma reserves two lines of outer margin at the top
# for a shared title.
par(
  mfrow = c(1, 3),
  mar = c(4, 4, 2, 1),
  oma = c(0, 0, 2, 0)
)
with(airquality, {
  plot(Wind, Ozone, main = "Ozone and Wind")
  plot(Solar.R, Ozone, main = "Ozone and Solar Radiation")
  plot(Temp, Ozone, main = "Ozone and Temperature")
  # outer = TRUE writes the text into the outer margin rather than a panel margin.
  mtext("Ozone and Weather in NYC", outer = TRUE)
})
# Back to a single plot per page.
par(mfrow = c(1, 1))
#mtext: write text into the margins of a plot

#a demonstration:
# Three independent samples of 100 standard-normal values, plotted as they are drawn.
x <- rnorm(100)
hist(x)
y <- rnorm(100)
plot(x, y)
z <- rnorm(100)
plot(x, z)

# With margins this small the axis labels are lost.
par(mar = c(2, 2, 2, 2))
plot(x, y)
# Restore the default margins.
par(mar = c(5, 4, 4, 2) + 0.1)

# The same scatterplot drawn with different plotting symbols.
plot(x, y, pch = 20)   # solid circle
plot(x, y, pch = 18)   # solid diamond
plot(x, y, pch = 2)    # triangles
plot(x, y, pch = 4)    # X
plot(x, y, pch = "s")  # the letter s

# After some example plots we can see the available plot symbols.
example(points)
par(mar = c(1, 1, 1, 1))
# NOTE(review): pchShow and TestChars are not defined in this script; presumably
# example(points) above creates them - confirm.
pchShow(c("o", "O", "0"), cex = 3)
try(TestChars(sign = -1))
# Restore the default margins.
par(mar = c(5, 4, 4, 2) + 0.1)

# Start from a scatterplot with solid circles and annotate it step by step.
plot(x, y, pch = 20)
title("Scatterplot")
text(-2, -2, "Label")
legend("topright", legend = "Data")
legend("topright", legend = "Data", pch = 20)  # pch can also be used here
str(legend)

# Fit y on x and draw the fitted line several times with different settings;
# each later line is drawn over the earlier ones.
fit <- lm(y ~ x)
abline(fit)
abline(fit, lwd = 3)
abline(fit, lwd = 3, col = "blue")
abline(fit, lwd = 1, col = "red")

# Scatterplot with explicit axis labels, title, legend and fitted line.
plot(
  x, y,
  xlab = "Weight", ylab = "Height",
  main = "ScatterPlot", pch = 20
)
legend("topright", legend = "Data", pch = 20)
fit <- lm(y ~ x)
abline(fit, lwd = 3, col = "red")

# 100 Poisson draws with mean 2.
z <- rpois(100, 2)

# Single-panel layout, then compare the current margins with tighter ones.
par(mfrow = c(1, 1))
plot(x, y, pch = 20)
par("mar")
par(mar = c(2, 2, 1, 1))
plot(x, y, pch = 20)

# Simulate x, and y as x plus noise.
x <- rnorm(100)
y <- x + rnorm(100)

# gl(n, k, length = n*k, labels = seq_len(n), ordered = FALSE):
# n is the number of levels, k the number of replications; here 2 * 50 = 100 values.
g <- gl(2, 50)
help("gl")
g <- gl(2, 50, labels = c("Male", "Female"))
str(g)
# Resample the labels with replacement so group membership is random.
g <- sample(g, 100, replace = TRUE)

plot(x, y)
# Now add each category of data to the canvas separately.
plot(x, y, type = "n")  # first draw a blank canvas
points(x[g == "Male"], y[g == "Male"], col = "blue")
points(x[g == "Female"], y[g == "Female"], col = "red", pch = 19)

###grDevices package -- contains all the code implementing the various graphics devices,
    #including X11, PDF, PostScript, PNG, etc.
#Graphics Devices is something or some place where you can make a plot appear:
        #a window on you computer(screen device), a PDF file(file device), A PNG/JPEG(file device), A scalable vector graphics(SVG) file(file device)
#when you make a plot in R, it has to be sent to a specific graphics device
        #The most common place is the screen device. On Mac the screen device is launched by quartz(), on Windows by windows(), on Unix/Linux by x11()
help("Devices")
library(grDevices)
# Two common ways to create a plot.
# The most common way: draw straight to the current device.
with(faithful, plot(eruptions, waiting))
title(main = "Old Faithful Geyser Data")

# Another way: open a file device, draw, then close it.
pdf(file = "myplot.pdf")  # open the PDF device; nothing is drawn on screen
with(faithful, plot(eruptions, waiting))
title(main = "Old Faithful Geyser Data")
dev.off()                 # close the device

#Two Categories of file devices: vector and bitmap devices
#Vector Formats - most useful for line-graphics(different from natual scenes like photgraphs): 
        #pdf(used for line-type graphics,reize well, not portable; not efficient if a plot has many objects/points) 
        #svg(XML-based scalable vector graphics; support animation and interactivity, potentially used for web-based plots) 
        #win.metafile(only on Windows), postscript(older format, resize well, windows system doesn't have a postscript viewer)
#Bitmap devices - generally don't resize weill:        
        #PNG(Portable Network Graphics)-a series of pixels, good for line drawings or images with solid colors, use lossless compression. 
                #Good for plotting many many points, doesn't resize well
        #jpeg
        #tiff: Create bitmap files in TIFF format; support lossless compression
        #bmp: a native Windows bitmapped format

#Multiple Open Graphics Devices:
#You can only plot to one device at a time; the graphics device you plot to is the active device.
#Use dev.cur() to see the active graphics device. Every open graphics device is assigned an integer >= 2.
#you change the active graphics device with dev.set(<integer>).

#Copy plots -- not an exact operation, result may not be identical to the original: 
# Draw the plot on the current device.
with(faithful, plot(eruptions, waiting, main = "Old Faithful Geyser Data"))
# Copy it to a PNG file device, then close that device.
dev.copy(png, file = "geyserplot.png")
dev.off()
# Copy to a PDF; no separate device has to be closed for this one.
dev.copy2pdf(file = "MY.pdf", width = 7, height = 5)
dev.cur()


#5. lattice plotting system -- xyplot
#contains code for producing Trellis graphs, which are indep of the "base" graphics system
#including functions like xyplot, bwplot, levelplot
#another package of lattice system -- grid
#library(grid) 
#we seldom call functions from the grid package. the lattice package builds on top of grid
#Lattice Functions:
#xyplot: main function for creating scatterplots: xyplot(y~x|f*g,data)  # * means interaction
#bwplot: box and whiskers plot("boxplots)
#histogram
#stripplot
#dotplot: plot dots on "violin strings"
#splom: scatterplot matrix; like 'pairs' in base plotting system
#levelplot, contourplot: for plotting 'image' data
library(lattice)
xyplot(Ozone ~ Wind, data = airquality)
# Make Month a factor; it is the conditioning variable in the calls below.
airquality <- transform(airquality, Month = factor(Month))
# One panel per month (5, 6, 7, 8, 9), laid out in a single row.
xyplot(Ozone ~ Wind | Month, data = airquality, layout = c(5, 1))
class(airquality$Month)
class(xyplot(Ozone ~ Wind | Month, data = airquality, layout = c(5, 1)))  # trellis
# airquality$Month = factor(airquality$Month)  # next session of R would change it back to integer
# class(airquality$Month)

# The result of xyplot() can be saved as an object and printed later.
p <- xyplot(Ozone ~ Wind | Month, data = airquality, layout = c(5, 1))
dev.cur()
dev.off()
print(p)

#lattice have a "panel function' which controls what happens inside each panel of the plot
#each panel's going to represent a subset of the data, which is defined by the conditioning variable
# Reproducible simulation: two groups of 50 observations each.
set.seed(10)
x <- rnorm(100)
f <- rep(0:1, each = 50)
# y depends on x in the first group (f = 0) and is shifted by 1 without the
# x term in the second group (f = 1); noise has sd 0.5.
y <- x + f - f * x + rnorm(100, sd = 0.5)
f <- factor(f, labels = c("Group 1", "Group 2"))
# One panel per group, side by side.
xyplot(y ~ x | f, layout = c(2, 1))

#custom panel function
#functions in panel functions:
  #panel.abline(a = NULL, b = 0,h = NULL, v = NULL,reg = NULL, coef = NULL,col, col.line, lty, lwd, alpha, type,...,reference = FALSE,identifier = "abline")
  #panel.refline: panel.refline is similar to abline, but uses the "reference.line" settings for the defaults.
  #panel.curve(expr, from, to, n = 101,curve.type = "l",col, lty, lwd, type,...,identifier = "curve")
  #panel.rug:adds a rug(data points) representation of the (marginal) data to the panel, much like rug.
  #panel.average(x, y, fun = mean, horizontal = TRUE,lwd, lty, col, col.line, type,...,identifier = "linejoin")
  #panel.linejoin(x, y, fun = mean, horizontal = TRUE,lwd, lty, col, col.line, type,...,identifier = "linejoin")
  #panel.fill(col, border, ..., identifier = "fill")
  #panel.grid(h=3, v=3, col, col.line, lty, lwd, x, y, ..., identifier = "grid")
  #panel.lmline(x, y, ..., identifier = "lmline")  #panel.lmline(x, y) is equivalent to panel.abline(lm(y ~ x)).
  #panel.mathdensity(dmath = dnorm, args = list(mean=0, sd=1),n = 50, col, col.line, lwd, lty, type,..., identifier = "mathdensity")  
  #panel.superpose(x, y = NULL, subscripts, groups,panel.groups = "panel.xyplot",...,col, col.line, col.symbol,pch, cex, fill, font,fontface, fontfamily,lty, lwd, alpha,type = "p", grid = FALSE,distribute.type = FALSE)
       #subscripts: An integer vector of subscripts giving indices of the x and y values in the original data source.
  #panel.stripplot(x, y, jitter.data = FALSE,factor = 0.5, amount = NULL,horizontal = TRUE, groups = NULL,...,identifier = "stripplot")
        #Creates stripplot (one dimensional scatterplot) of x for each level of y (or vice versa, depending on the value of horizontal)

#Much of the power of Trellis Graphics comes from the ability to define customized panel functions.
# The actual plotting is done by the function given as the panel argument.
xyplot(
  y ~ x | f,
  panel = function(x, y, ...) {
    # First call the default panel function for "xyplot".
    panel.xyplot(x, y, ...)
    # Then add a dashed horizontal line at the median of this panel's y values.
    panel.abline(h = median(y), lty = 2)
  }
)

# Scatterplot per group with a dashed regression line in each panel.
xyplot(y ~ x | f, panel = function(x, y, ...) {
        panel.xyplot(x, y, ...)  # default points for this panel
        # panel.lmline(x, y) is equivalent to panel.abline(lm(y ~ x)); it needs the
        # panel's x and y. The previous call passed only h = median(y), leaving x and y missing.
        panel.lmline(x, y, lty = 2)
})

#another example
# Density histograms of singer height by voice part, each with a fitted normal curve.
# type = c("percent", "count", "density"): "density" is requested here so that
#   panel.mathdensity is active in the panel function.
# border: the border of the histogram bins. Either a color for the border, or a
#   logical flag. In the latter case, the border color is black if border is TRUE,
#   and no border is drawn if it is FALSE (the default).
# "\n" in ylab is a line feed.
histogram(
  ~ height | voice.part,
  data = singer,
  layout = c(2, 4),
  type = "density",
  border = "transparent",
  col.line = "grey60",
  xlab = "Height (inches)",
  ylab = "Density Histogram\n with Normal Fit",
  panel = function(x, ...) {
    panel.histogram(x, ...)
    # Normal density with this panel's own mean and standard deviation.
    panel.mathdensity(
      dmath = dnorm,
      args = list(mean = mean(x), sd = sd(x)),
      ...
    )
  }
)
#another example
# Same idea with default layout and a black curve.
# type = c("percent", "count", "density")
# panel.mathdensity plots a (usually theoretical) probability density function.
#   dmath: a vectorized function that produces density values given a numeric
#   vector named x, e.g. dnorm.
# panel.mathdensity(dmath = dnorm, args = list(mean = 0, sd = 1), n = 50, col,
#   col.line = reference.line$col, lwd = reference.line$lwd,
#   lty = reference.line$lty, type, ..., identifier = "mathdensity")
histogram(
  ~ height | voice.part,
  data = singer,
  xlab = "Height (inches)",
  type = "density",
  panel = function(x, ...) {
    panel.histogram(x, ...)
    panel.mathdensity(
      dmath = dnorm,
      col = "black",
      args = list(mean = mean(x), sd = sd(x))
    )
  }
)
#another example
# Barley yield by site, grouped by year, drawn with a custom panel.
bwplot(
  yield ~ site, barley,
  groups = year,
  panel = function(x, y, groups, subscripts, ...) {
    # For panel.grid, h and v usually specify the number of horizontal and
    # vertical reference lines to be added to the plot. Alternatively, they can
    # be negative numbers: h = -1 and v = -1 are intended to make the grids
    # aligned with the axis labels.
    panel.grid(h = -1, v = 0)
    # jitter.data = TRUE: jitter data to avoid overplotting.
    panel.stripplot(
      x, y, ...,
      jitter.data = TRUE,
      groups = groups, subscripts = subscripts
    )
    # panel.superpose divides up the x (and optionally y) variable(s) by the
    # unique values of groups[subscripts], and plots each subset with different
    # graphical parameters.
    panel.superpose(
      x, y, ...,
      panel.groups = panel.average,
      groups = groups, subscripts = subscripts
    )
  },
  # Original note: the col attribute is used for text, not for lines.
  auto.key = list(points = FALSE, lines = TRUE, columns = 2)
)
        #key: A list that defines a legend to be drawn on the plot. This list is used as an argument to the draw.key function,
              #which produces a "grob" (grid object) eventually plotted by the print method for "trellis" objects. 

#another example:
# Decrease by treatment, grouped by row position; the panel and per-group
# functions are given by name, and the legend is built by hand.
bwplot(
  decrease ~ treatment, OrchardSprays,
  groups = rowpos,
  panel = "panel.superpose",
  panel.groups = "panel.linejoin",
  xlab = "treatment",
  key = list(
    # Line settings taken from the current "superpose.line" parameters, rows 1:7 then 1 again.
    lines = Rows(trellis.par.get("superpose.line"), c(1:7, 1)),
    text = list(lab = as.character(unique(OrchardSprays$rowpos))),
    columns = 4,
    title = "Row position"
  )
)
        #Functions used to query, display and modify graphical parameters for fine control of Trellis displays. :
                #trellis.par.set(name, value, ..., theme, warn = TRUE, strict = FALSE)
                #trellis.par.get(name = NULL)
                #show.settings(x = NULL)
#another example
# Dot plot of variety against yield, one panel per site, grouped by year,
# with a simple key placed to the right.
dotplot(
  variety ~ yield | site,
  data = barley,
  groups = year,
  key = simpleKey(levels(barley$year), space = "right"),
  xlab = "Barley Yield (bushels/acre) ",
  aspect = 0.5,
  layout = c(3, 2),
  ylab = NULL
)

# Strip plot of voice part against jittered height, with a square aspect ratio.
stripplot(
  voice.part ~ jitter(height),
  data = singer,
  aspect = 1,
  jitter.data = TRUE,
  xlab = "Height (inches)"
)



#6. ggplot2
#aesthetic attributes: color, shape, size of geometric objects (points, lines, bars, shapes)
#the plot may also contain statistical transformations of the data.
#stats: statistical transformations like binning, quantiles, smoothing
#scales: what scale an aesthetic map uses (example, male=red, female=blue)
        #Scales are to find how to different variables are coded, in terms of plot (make man red, make female blue).

library(ggplot2)
#the basic: qplot()  - quick plot, hides what goes on underneath, which is okay for most operations
#ggplot() is the core function and very flexible for doing things qplot() cannot do
#####data has to be organized as data frame
#factors should be labeled to be informative


#(1) qplot
#Hello World for GGplot2
str(mpg)
args(qplot)
# function (x, y = NULL, ..., data, facets = NULL, margins = FALSE,
#           geom = "auto", stat = list(NULL), position = list(NULL),
#           xlim = c(NA, NA), ylim = c(NA, NA), log = "", main = NULL,
#           xlab = deparse(substitute(x)), ylab = deparse(substitute(y)),
#           asp = NA)

# qplot draws solid circles as points.
qplot(displ, hwy, data = mpg)
# The base equivalent; per the original note it looks uglier, with open circles as points.
plot(mpg$displ, mpg$hwy)

# Modifying aesthetics.
# Color by drv (three kinds of drive); the legend is added automatically.
qplot(displ, hwy, data = mpg, color = drv)
# The original note records "factor" here.
# NOTE(review): the class may differ with the installed ggplot2 version - verify.
class(mpg$drv)
qplot(displ, hwy, data = mpg, geom = c("point", "smooth"), method = "lm")
qplot(displ, hwy, data = mpg, color = drv, geom = c("point", "smooth"))
qplot(hwy, data = mpg, geom = "density")
qplot(hwy, data = mpg, geom = "density", color = drv)
qplot(displ, hwy, data = mpg, shape = drv)
qplot(displ, hwy, data = mpg, color = drv)
# Here the smooth method is "lm".
qplot(displ, hwy, data = mpg, color = drv, geom = c("point", "smooth"), method = "lm")

# A histogram can also be made with qplot (compare hist in the base system):
# specifying a single variable gives a histogram.
qplot(hwy, data = mpg, fill = drv)

# Facets - like panels in lattice - split the plot by groups.
# The variable on the right-hand side of ~ (tilde) determines the columns of the panels.
qplot(displ, hwy, data = mpg, facets = . ~ drv)
# The variable on the left-hand side of ~ (tilde) determines the rows of the panel matrix.
qplot(hwy, data = mpg, facets = drv ~ ., binwidth = 2)
qplot(
  displ, hwy,
  data = mpg, color = drv,
  geom = c("point", "smooth"), method = "lm",
  facets = . ~ drv
)

#(2)ggplot
# Base object: data plus aesthetic mapping, with no layers yet.
g <- ggplot(mpg, aes(displ, hwy))
summary(g)
# Original note: this reports "No layers in plot", i.e. R does not yet know
# whether you want points, lines, tiles, ...
# NOTE(review): the behavior may differ with the installed ggplot2 version - verify.
print(g)

# Adding a geom gives something drawable; it can be stored and printed ...
p <- g + geom_point()
print(p)
# ... or auto-printed.
g + geom_point()
p + geom_smooth()
p + geom_smooth(method = "lm")
# The order of the added components does not matter.
# Note that panel labels are determined by facet_grid, so make sure the
# metadata is specified appropriately.
p + facet_grid(. ~ drv) + geom_smooth(method = "lm")
help("facet_grid")

# Faceting on a combination of two variables, as columns and then as rows.
qplot(mpg, wt, data = mtcars, facets = . ~ vs + am)
qplot(mpg, wt, data = mtcars, facets = vs + am ~ .)

# Base scatterplot colored by number of cylinders.
mt <- ggplot(mtcars, aes(mpg, wt, colour = factor(cyl))) +
  geom_point()

# Are scales shared across all facets (the default, "fixed"), or do they vary across
# rows ("free_x"), columns ("free_y"), or both rows and columns ("free")?
mt + facet_grid(. ~ cyl, scales = "free")
mt + facet_grid(vs ~ am, scales = "free")
mt + facet_grid(vs ~ am, scales = "free_x")
mt + facet_grid(vs ~ am, scales = "free_y")

# The same free scales combined with each setting of the space argument.
mt + facet_grid(vs ~ am, scales = "free", space = "free")
mt + facet_grid(vs ~ am, scales = "free", space = "free_x")
mt + facet_grid(vs ~ am, scales = "free", space = "free_y")


#Annotation
#labels: xlab, ylab, labs, ggtitle
#each of the "geom" functions has options to modify
#for things that only make sense globally, use theme(): e.g.: theme(legend.position="none")
#two standard appearance themes are included:
        #theme_gray(): The default theme(gray background)
        #theme_bw(): More stark/plain
# Constant color for every point; alpha = 1/2 makes the points transparent.
g + geom_point(color = "steelblue", size = 4, alpha = 1 / 2)
str(geom_point)
# function (mapping = NULL, data = NULL, stat = "identity", position = "identity", na.rm = FALSE, ...)

# Color mapped to a variable through aes(), again with transparent points.
g + geom_point(aes(color = drv), size = 4, alpha = 1 / 2)
help("aes")
aes(x = mpg, y = wt)  # aes creates a list of unevaluated expressions.
aes(color = drv)

# Titles and axis labels via labs(); expression() gives a subscripted x label.
g +
  geom_point(aes(color = drv), size = 3) +
  labs(title = "MPG DRV") +
  labs(x = expression("displ"[2]), y = "hwy")
# Add an lm smoother, without and then with the confidence band.
g +
  geom_point(aes(color = drv), size = 3, alpha = 1 / 2) +
  geom_smooth(size = 4, linetype = 3, method = "lm", se = FALSE) +
  labs(title = "MPG DRV") +
  labs(x = expression("displ"[2]), y = "hwy")
g +
  geom_point(aes(color = drv), size = 3, alpha = 1 / 2) +
  geom_smooth(size = 2, linetype = 4, method = "lm", se = TRUE) +
  labs(title = "MPG DRV") +
  labs(x = expression("displ"[2]), y = "hwy")

# Black-and-white theme with a different base font family.
g +
  geom_point(aes(color = drv), size = 3, alpha = 1 / 2) +
  theme_bw(base_family = "Times")
help("theme_bw")  # theme_bw(base_size = 12, base_family = "")

#examples of geom_point
# Base object for the geom_point examples: mpg against wt.
p <- ggplot(mtcars, aes(wt, mpg))
p + geom_point()

# Add aesthetic mappings
p + geom_point(aes(colour = qsec))
p + geom_point(aes(alpha = qsec))
p + geom_point(aes(colour = factor(cyl)))
p + geom_point(aes(shape = factor(cyl)))
p + geom_point(aes(size = qsec))

# Change scales
p +
  geom_point(aes(colour = cyl)) +
  scale_colour_gradient(low = "blue")
p +
  geom_point(aes(size = qsec)) +
  scale_size_area()
?scale_size_area()  # scale the area of points to be proportional to the value.
p +
  geom_point(aes(shape = factor(cyl))) +
  scale_shape(solid = FALSE)

# Set aesthetics to fixed value
p + geom_point(colour = "red", size = 3)
# The qplot form: I() marks the values as constants.
qplot(wt, mpg, data = mtcars, colour = I("red"), size = I(3))
# Varying alpha is useful for large datasets #no transparency when alpha=1, the smaller of alpha, the more transparent of data points
# Create the base plot before its first use. In the previous order,
# `d + geom_point(alpha = 1)` ran before `d` had been assigned the ggplot object.
d <- ggplot(diamonds, aes(carat, price))
# alpha = 1 means no transparency; the smaller alpha is, the more transparent the points.
d + geom_point(alpha = 1)
d + geom_point(alpha = 1/10)
d + geom_point(alpha = 1/20)
d + geom_point(alpha = 1/100)
# You can create interesting shapes by layering multiple points of
# different sizes
# Base object for the layering examples: wt against mpg.
p <- ggplot(mtcars, aes(mpg, wt))

# A larger grey point underneath a point colored by cyl.
p +
  geom_point(colour = "grey50", size = 4) +
  geom_point(aes(colour = cyl))

# Shape mapped at the plot level, with a small light-grey point on top.
p +
  aes(shape = factor(cyl)) +
  geom_point(aes(colour = factor(cyl)), size = 4) +
  geom_point(colour = "grey90", size = 1.5)

# Black slightly larger than pink gives an outlined point; shapes go on top.
p +
  geom_point(colour = "black", size = 4.5) +
  geom_point(colour = "pink", size = 4) +
  geom_point(aes(shape = factor(cyl)))
# These extra layers don't usually appear in the legend, but we can
# force their inclusion
# Force the two constant-colour layers into the legend.
# `show.legend` replaces the deprecated `show_guide` argument (renamed in ggplot2 2.0.0).
p + geom_point(colour = "black", size = 4.5, show.legend = TRUE) +
        geom_point(colour = "pink", size = 4, show.legend = TRUE) +
        geom_point(aes(shape = factor(cyl)))
# Transparent points:
# Large, mostly transparent points; I() marks the values as constants.
qplot(mpg, wt, data = mtcars, size = I(5), alpha = I(0.2))

# geom_point warns when missing values have been dropped from the data set
# and not plotted; you can turn this off by setting na.rm = TRUE.
# Blank out mpg wherever a uniform draw falls below 0.2.
mtcars2 <- transform(
  mtcars,
  mpg = ifelse(runif(32) < 0.2, NA, mpg)
)
qplot(wt, mpg, data = mtcars2)
qplot(wt, mpg, data = mtcars2, na.rm = TRUE)

#A note about Axis Limits - Coordinate
# 100 standard-normal values with one planted outlier at row 50.
testdata <- data.frame(x = 1:100, y = rnorm(100))
testdata[50, 2] <- 100  # outlier

# Base plot with the y axis limited to [-3, 3].
plot(testdata$x, testdata$y, type = "l", ylim = c(-3, 3))

# ggplot2: first the default limits, then the same limits set through the
# coordinate system.
g <- ggplot(testdata, aes(x = x, y = y))
g + geom_line()
g + geom_line() + coord_cartesian(ylim = c(-3, 3))

#if we want to condition on cts variable, we first need to use cut() function to 
 #make the cts variable categorical
# Work on a reproducible random sample of 1000 rows of diamonds.
set.seed(6809)
diamonds <- diamonds[sample(nrow(diamonds), 1000), ]
# Re-level cut so the facets appear in this order.
diamonds$cut <- factor(
  diamonds$cut,
  levels = c("Ideal", "Very Good", "Fair", "Good", "Premium")
)

# Repeat first example with new order
p <- ggplot(diamonds, aes(carat, ..density..)) +
  geom_histogram(binwidth = 1)
p + facet_grid(. ~ cut)

# Scatterplot colored by cylinder count, faceted by cyl with free x scales and
# free panel space, and x-axis breaks every 2 units from 10 to 36.
mt <- ggplot(mtcars, aes(mpg, wt, colour = factor(cyl))) +
  geom_point()
mt +
  facet_grid(. ~ cyl, scales = "free_x", space = "free") +
  scale_x_continuous(breaks = seq(10, 36, by = 2))
# Re-draw the most recent plot with the x axis limited to 10-15.
last_plot() + xlim(10, 15)