#############################################################################
# Modified script, originally from CEH for peatland condition classification, then
# job logs 88131 and 88296, and currently for bare peat 92279. Original is in A2567145
#
# The script reads an ESRI Shapefile (defined by the "shapefile" variable) with
# training polygons and then selects a
# user-determined number of samples (defined by the "numsamps" variable) from
# each land cover type. A multilayer image that contains spectral, other
# continuous data or categorical data is also input (defined by the inImage
# variable). For each randomly selected sample the data values for that pixel
# are determined and these data are used to run the Random Forest model.
#
# After building the model the multilayer image is read, and up to three output
# layers (classImage, probImage, threshImage) can be selected for the output image.
# "classImage" classifies all of the pixels.
#
# "probImage" outputs the class probability of the class that got the most votes
# (i.e., the class that was selected for the classImage layer).
#
# "threshImage" is the same as "classImage" except all pixels with a class probability
# of the class that got the most votes below the "probThreshold" parameter are set to 0.
# This is useful to identify pixels with inter-class confusion.
#
# The image is written out (name and location are defined by the "outImageName" variable)
# using the GeoTIFF format. A variable importance plot is displayed to provide information
# about the influence of each variable. An error rate estimate and confusion matrix are also
# printed to provide information about classification accuracy.
#
# There is an option to assess the quality of the training data. The metric for this
# is the "margin". The margin of a training point is the proportion of votes for the correct
# class minus the maximum proportion of votes for the other classes for that segment (see the worked
# example below). Positive margin values indicate correct classification and negative values indicate
# misclassification. The margin data are written to a point ESRI Shapefile so they can be overlaid on
# the image and training polygons to assess which points need to be removed and relabeled in the
# training data, and to help determine which classes need additional training segments. If this output
# is not needed you can enter two
# double or single-quotes ("" or '') for the variable outMarginFile.
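# As a worked example of the margin metric: a training point receiving 70% of votes for its true class
# and at most 20% for any other class has a margin of 0.7 - 0.2 = 0.5; a negative margin means another
# class received more votes than the true class.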
#
# There is also an option to output a feature space plot using two bands of your choice.
# If a feature space plot is not needed then enter "0" for the variables xBand and/or yBand.
#
# Set the variables below in the "SET VARIABLES HERE" section of the script.
#
# This script was written by Ned Horning [horning@amnh.org]
# Support for writing and maintaining this script comes from The John D. and
# Catherine T. MacArthur Foundation and Google.org.
#
# This script is free software; you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software Foundation;
# either version 2 of the License, or (at your option) any later version.
#
#############################################################################
#Load libraries
library(rgdal)
library(sp)
library(randomForest)
library(raster)
library(caret)
library(dplyr)
# set working directory using forward slashes
setwd("B:/92279_Bare_peat_Duncan/R_classification")
cat("Set variables and start processing\n")
# set temporary file directory with enough space for rasters, otherwise it uses C:\Users etc
rasterOptions(tmpdir = "B:/92279_Bare_peat_Duncan/R_classification")
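# memory.limit() is Windows-only; size is in Mb, so this requests roughly a 500 GB ceiling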
memory.limit(size = 500000 )
############################# SET VARIABLES HERE ###################################
# Name and path for the Shapefile (don't need the .shp extension)
shapefile <- 'Training_ML5.shp'
# Approximate number of training pixels to be randomly selected for each land cover class
numsamps <- 300
# Name of the attributes that hold the integer land cover type code and the polygon id
attName <- 'ID'
polyId <- 'Poly_ID'
# No-data value for the input image
nd <- -3.39999995214e+038
# Iteration name for filenames and log file
it <- "National_Mlarea5_comparison"
# Name and path for the input satellite images
inImage<-"S2_MLarea5_indices_only.tif"
# Name and path of the output GeoTiff image
outImageName <- paste('RF_',it,'.tif',sep="")
# Name and location of the output Shapefile point file that will be created. If this output
# is not needed you can enter two double or single-quotes ("" or '')
# Note that if this file exists the write will fail with the message "Creation of output file failed"
outMarginFile <- paste('Margin_training_data',it,'.shp',sep="")
# DB - Name of output sample points used
outSamplePoints <- paste('Sample_points_',it,'.shp',sep="")
#Name of log file for outputs
logFile <- file(paste('R_log_',it,'.txt',sep=""), open ="a")
# Output classification layer without applying threshold (enter TRUE or FALSE)
classImage <- TRUE
# Output probability image layer (enter TRUE or FALSE)
probImage <- TRUE
# Output classification layer and set pixels with probability less than "probThreshold" to 0 (enter TRUE or FALSE)
threshImage <- TRUE
# Enter threshold probability in percent (values must be between 0 and 100) only used if threshImage=TRUE
probThreshold <- 70
# Layer number (band number) for the X and Y axis of the feature space plot.
# If you do not want to calculate a feature plot enter 0 as the layer number
xBand <- 0
yBand <- 0
#######################################################################################
# Start processing
cat("Pixel based classification for",it,"\n","\n", file = logFile)
startTime <- Sys.time()
cat("Start time", format(startTime),"\n")
cat("Start time", format(startTime),"\n", file = logFile)
# Read the Shapefile
vec <- readOGR(shapefile)
# Load the image then flag all no-data values (nd) so they are not processed
#DB - to speed up the process, just load in 1 band for now.
satImageBand1 <- raster(inImage, band=1)
satImageBand1@file@nodatavalue <- nd
# Create vector of unique land cover attribute values
# DB - returns the data component, as a dataframe, of the SpatialPolygonsDataFrame
allAtt <- slot(vec, "data")
#DB - create a table with the number of occurrences per habitat class
tabAtt <- table(allAtt[[attName]])
# Check if there are data for each class - na means not available
uniqueAtt <- as.character(names(tabAtt))
if (is.na(uniqueAtt[1])) {
cat("\n*************************No attributes were found**************************** \n")
stop("Check the attName variable in the variable settings\n", call.=FALSE)
}
# Create input data from a Shapefile by sampling training data
cat("Create training data by sampling", numsamps, "pixels for each class\n", file = logFile)
# DB - first speed up process by clipping input raster for training data to extent and geometry of shapefile
# cropping and then masking is 5x quicker than vice versa
cropRaster <- crop (x=satImageBand1, y=extent(vec))
maskRaster <- mask(cropRaster, vec, inverse = FALSE)
# DB - Generate points for raster cell centres into a SpatialPointsDataFrame
cat("Creating centroids from raster pixels","\n")
rasterPttrain <- rasterToPoints(maskRaster[[1]], spatial = TRUE)
rasterPttrain@proj4string <- vec@proj4string
# DB - retrieve polygon ids and tidy training dataset
# use over again to return a data frame containing polygon id only
polyid.df <- over(rasterPttrain, vec)[[polyId]]
polyid.spdf <- cbind(rasterPttrain, polyid.df)
# rename second column to polyId as it has a long meaningless name!
colnames(polyid.spdf@data)[2] = polyId
# remove the first column that contains the value of the first raster in the stack
rasterPttrain <- polyid.spdf[,-1]
# Create dataset of training sample points
for (x in 1:length(uniqueAtt)) {
cat("Processing class", x ," of", length(uniqueAtt), "\n")
# Get the metadata for all polygons for a particular class (based on the uniqueAtt variable)
class_data <- vec[vec[[attName]]==uniqueAtt[x],]
# Get the area of each polygon for a particular class
areas <- sapply(slot(class_data, "polygons"), slot, "area")
# Calculate the number of samples for each polygon based on the area in proportion to total area for a class
nsamps <- ceiling(numsamps*(areas/sum(areas)))
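# e.g. with numsamps = 300, a polygon covering 10.5% of a class's area gets ceiling(300 * 0.105) = 32 samples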
# Use random sampling to select training points (proportional to area) from each polygon for a given class
for (i in 1:dim(class_data)[1]) {
# DB - select points that are within the polygon
cat("Selecting points from habitat ", x," polygon ", i, "\r")
polyPoints <- rasterPttrain[!is.na(over(rasterPttrain, class_data[i,])[[attName]]),]
# DB - then select an appropriate number of those points - don't duplicate points unless sample size is greater than the number of potential sample points
if (nsamps[i] > length(polyPoints)) {
# get all the points
xy_class_all <- polyPoints
# get extra set of duplicate points chosen at random to make up the numbers - replace is TRUE in case number of samples > 2x number of points
xy_class_extra <- polyPoints[sample(1:length(polyPoints), replace = TRUE, size = (nsamps[i] - length(polyPoints))),]
xy_class <- rbind(xy_class_all, xy_class_extra)
} else {
xy_class <- polyPoints[sample(1:length(polyPoints), replace = FALSE, size = nsamps[i]),]
}
# Add coordinates to create a list of random points for all polygons
if (i == 1) cpts <- xy_class
else cpts <- rbind(cpts, xy_class)
}
# The number of points might not match numsamps exactly.
# DB - it is often more because ceiling rounds up!
classpts <- cpts
if (x == 1) {
xy_allClasses<- classpts
} else {
xy_allClasses<- rbind(xy_allClasses, classpts)
}
}
#DB - EXPORT SAMPLED COORDINATES TO A SHAPEFILE
# It's only possible to export a SpatialPointsDataFrame whereas xy_allClasses is a
# spatial point object. The data frame is created from the coordinates and the data which contains polyId
xyDataFrame <- cbind(as.data.frame(xy_allClasses@coords), as.data.frame(xy_allClasses@data))
xySpatialDataFrame <- SpatialPointsDataFrame(xy_allClasses,xyDataFrame)
writeOGR(xySpatialDataFrame,
outSamplePoints,
"layer",
driver="ESRI Shapefile",
overwrite=TRUE)
# Get class code for each sample point from the polygon it falls in for response variable
response <- over(xy_allClasses, vec)[[attName]]
# DB - Load in full image now and get pixel DNs for each sample point
# reduce processing time by creating brick rather than stack
satImage <- brick(inImage)
satImage@file@nodatavalue <- nd
cat("Extracting values from the raster","\n")
trainvals <- cbind(response,xy_allClasses@data$Poly_ID, extract(satImage, xy_allClasses))
colnames(trainvals)[2] = polyId
cat("Number of training points = ",nrow(trainvals),"\n", file = logFile)
# Remove NA values
trainvals <- na.omit(trainvals)
cat("Number of training points after no data values removed =", nrow(trainvals),"\n", file = logFile)
# Check to make sure Shapefile and input image are in the same projection
if (nrow(trainvals) == 0) {
cat("\n*************************No training data found**************************** \n")
stop("It is possible the projection of the Shapefile with training data and input image are different\nCheck projections and run again", call.=FALSE)
}
# Run Random Forest in Caret package for cross validation, testing against 20% of the training data and optionally tuning the mtry parameter
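# fix the random seed so the data partition and model fit are reproducible between runs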
set.seed(123)
dataset <- as.data.frame(trainvals)
# Hold back 20% for testing - creates a stratified random sample (as long as response is a factor) of the data into training and test datasets
train_index <- createDataPartition(as.factor(dataset$response), p=0.8, list = FALSE)
training_data <-dataset[train_index,]
test_data <- dataset[-train_index,]
# Do cross fold validation on the 80% training dataset
startTime <- Sys.time()
cat("Starting cross-fold validation at",format(startTime), "\n", file = logFile)
# use groupKFold to ensure that pixels from any training polygon are retained within the same fold of the data.
# This will make accuracy predictions a bit more realistic - i.e. can we predict the habitat in a geographically separate
# polygon rather than another pixel within the same polygon.
group_folds <- groupKFold(training_data$Poly_ID, k = 10)
train_control <- trainControl(index = group_folds, method="cv", search = "grid")
# reduce the data to only the predictor variables
training_data_predictors <- select(training_data, - Poly_ID, - response)
# Optionally create a grid of mtry parameters to tune, but the default, the square root of the number of
# predictors, is usually ok and is what the randomForest package would use. It needs to be set explicitly
# here though, as 'train' would by default try 3 mtry values which are unlikely to be near the default value.
# e.g. rf_grid <- expand.grid(mtry= seq(1,ncol(training_data_predictors), by=10))
# set mtry to the sqrt of the number of predictors
mtry <- sqrt(ncol(training_data_predictors))
rf_grid <- expand.grid(.mtry=mtry)
# Train the model - as.factor ensures classification rather than regression rf
rf_model <- train(as.factor(response)~., data = select(training_data, - polyId), method="rf", trControl=train_control, tuneGrid = rf_grid, na.action=na.omit)
capture.output(rf_model, file = logFile, append = TRUE)
capture.output(rf_model$finalModel, file = logFile, append = TRUE)
endTime <- Sys.time()
cat("Finished cross-fold validation at",format(endTime), "\n", file = logFile)
# test on the 20% left out of training and print confusion matrix
cat("Test data prediction", "\n", file = logFile)
test_pred <- predict(rf_model, test_data)
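# confusionMatrix() expects factors, so convert the numeric response before comparing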
responseFactor <- as.factor(test_data$response)
capture.output(confusionMatrix(test_pred, responseFactor), file = logFile, append = TRUE)
# Plotting variable importance plot
importance<-varImp(rf_model, scale=FALSE, n.var=10)
plot(importance)
# rebuild the random forest model using all the training data.
# NB the caret implementation cannot be used here as it fails to handle the no data parts of the raster.
randfor <- randomForest(as.factor(response) ~., data= select(dataset, - polyId), importance=TRUE, na.action=na.omit)
# Start predictions
cat("Predicting unknown data","\n")
# Calculate how many bands the output image should have
numBands <- classImage + probImage + threshImage
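# the TRUE/FALSE flags are coerced to 1/0, so this counts the selected output layers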
# Calculate the image block size for processing
bs <- blockSize(satImage)
# Create the output raster brick. DB - a raster brick is a multilayer raster which processes faster than a RasterStack
outImage <- brick(satImage, values=FALSE, nl=numBands)
outImage <- writeStart(outImage, filename=outImageName, progress='text', format='GTiff', datatype='INT1U', overwrite=TRUE)
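# INT1U stores unsigned 8-bit integers (0-255), enough for the class codes and percentage probabilities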
# Loop through each of the image blocks to calculate the output layers selected in the variables section
for (i in 1:bs$n) {
cat("processing block", i, "of", bs$n, "\n")
imageBlock <- getValuesBlock(satImage, row=bs$row[i], nrows=bs$nrows[i])
predValues <- predict(randfor, imageBlock, type='response')
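# predict() returns a factor; map the factor levels back to the original numeric class codes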
classValues <- as.numeric(levels(predValues))[predValues]
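# start with an empty matrix (one row per pixel) and append each selected output layer as a column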
outMatrix <- matrix(nrow=nrow(imageBlock), ncol=0)
if (classImage) {
outMatrix <- cbind(outMatrix, classValues)
}
if (probImage || threshImage) {
predProbs <- as.data.frame(predict(randfor, imageBlock, type='prob'))
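# probability of the winning class for each pixel, expressed as an integer percentage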
maxProb <- round(apply(predProbs, 1, max) * 100)
if (probImage) {
outMatrix <- cbind(outMatrix, maxProb)
}
if (threshImage) {
threshValues <- classValues
threshValues[which(maxProb <= probThreshold)] <- 0
outMatrix <- cbind(outMatrix,threshValues)
}
}
writeValues(outImage, outMatrix, bs$row[i])
}
# Stop writing and close the file
outImage <- writeStop(outImage)
# DB - Create 3 separate rasters from output image
writeRaster(outImage, outImageName, format='GTiff', datatype='INT1U', bylayer=TRUE, suffix='numbers', overwrite= TRUE)
# Calculate margin (proportion of votes for correct class minus maximum proportion of votes for other classes)
marginData <- randomForest::margin(randfor)
trainingAccuracy <- cbind(marginData[order(marginData)], trainvals[order(marginData),1])
#Add column names to attributes table
colnames(trainingAccuracy) <- c("margin", "classNum")
#Calculate X and Y coordinates for training data points
xyCoords <- xy_allClasses@coords
xyCoords <- xyCoords[order(marginData),]
# Create and write point Shapefile with margin information to help improve training data
pointVector <- SpatialPointsDataFrame(xyCoords, as.data.frame(trainingAccuracy), coords.nrs = numeric(0), proj4string = satImage@crs)
writeOGR(pointVector, outMarginFile, "layer", driver="ESRI Shapefile", check_exists=TRUE)
endTime <- Sys.time()
cat("End time", format(endTime),"\n", file = logFile)
close(logFile)