The R package 'DeepGS' can be used to perform genomic selection (GS), which is a promising
breeding strategy in plants and animals. DeepGS predicts phenotypes using genomewide
genotypic markers with an advanced machine learning technique (deep learning). The effectiveness
of DeepGS has been demonstrated in predicting eight phenotypic traits on a population
of 2000 Iranian bread wheat (Triticum aestivum) lines from the wheat gene bank of the International
Maize and Wheat Improvement Center (CIMMYT).
- Version 1.0 -First version released on Feb, 15th, 2017
- Version 1.1 -Second version released on Oct, 12th, 2017
- Version 1.2 -Third version released on Jun, 6th, 2018
1.'ELBPSO' funtion was added for ensemble learning based on particle swarm optimization (ELBPSO)
2.Update package document
3.Function optimization for building deep learning Genomic selection prediction model
- Download Docker for windows
- Double click the EXE file to open it;
- Follow the wizard instruction and complete installation;
- Search docker, select Docker for Windows in the search results and clickit.
- Download Docker for Mac os
- Double click the DMG file to open it;
- Drag the docker into Applications and complete installation;
- Start docker from Launchpad by click it.
- Go to Docker, choose your Ubuntuversion, browse to pool/stable and choose amd64, armhf, ppc64el or s390x. Download the DEB file for the Docker version you want to install;
- Install Docker, supposing that the DEB file is download into following path:"/home/docker-ce~ubuntu_amd64.deb"
$ sudo dpkg -i /home/docker-ce<version-XXX>~ubuntu_amd64.deb
$ sudo apt-get install -f
Once Docker installation is completed, we can run hello-world image to verify if Docker is installed correctly. Open terminal in Mac OS X and Linux operating system and open CMD for Windows operating system, then type the following command:
$ docker run hello-world
Note: root permission is required for Linux operating system. Note: considering that differences between different computers may exist, please refer to official installation manual if instructions above don’t work.
$ docker pull malab/deepgs_cpu
$ docker run -it -v /host directory of dataset:/home/data malab/deepgs_cpu R
Note: Supposing that users’ private dataset is located in directory /home/test
, then change the words above (/host directory of dataset
) to host directory (/home/test
)
library(DeepGS)
setwd("/home/data/")
Important: the directory (/home/data/
) is a virtual directory in DeepGS Docker image. In order to use private dataset more easily, the parameter “-v” is strongly recommended to mount host directory of dataset to DeepGS image.
The details of DeepGS installation are available at: https://github.com/cma2015/DeepGS/blob/master/DeepGS_GPU_installation.md
data(wheat_example)
Markers <- wheat_example$Markers
y <- wheat_example$y
cvSampleList <- cvSampleIndex(length(y),10,1)
# cross validation set
cvIdx <- 1
trainIdx <- cvSampleList[[cvIdx]]$trainIdx
testIdx <- cvSampleList[[cvIdx]]$testIdx
trainMat <- Markers[trainIdx,]
trainPheno <- y[trainIdx]
validIdx <- sample(1:length(trainIdx),floor(length(trainIdx)*0.1))
validMat <- trainMat[validIdx,]
validPheno <- trainPheno[validIdx]
trainMat <- trainMat[-validIdx,]
trainPheno <- trainPheno[-validIdx]
conv_kernel <- c("1*18") ## convolution kernels (fileter shape)
conv_stride <- c("1*1")
conv_num_filter <- c(8) ## number of filters
pool_act_type <- c("relu") ## active function for next pool
pool_type <- c("max") ## max pooling shape
pool_kernel <- c("1*4") ## pooling shape
pool_stride <- c("1*4") ## number of pool kernerls
fullayer_num_hidden <- c(32,1)
fullayer_act_type <- c("sigmoid")
drop_float <- c(0.2,0.1,0.05)
cnnFrame <- list(conv_kernel =conv_kernel,conv_num_filter = conv_num_filter,
conv_stride = conv_stride,pool_act_type = pool_act_type,
pool_type = pool_type,pool_kernel =pool_kernel,
pool_stride = pool_stride,fullayer_num_hidden= fullayer_num_hidden,
fullayer_act_type = fullayer_act_type,drop_float = drop_float)
markerImage = paste0("1*",ncol(trainMat))
trainGSmodel <- train_deepGSModel(trainMat = trainMat,trainPheno = trainPheno,
validMat = validMat,validPheno = validPheno, markerImage = markerImage,
cnnFrame = cnnFrame,device_type = "cpu",gpuNum = 1, eval_metric = "mae",
num_round = 6000,array_batch_size= 30,learning_rate = 0.01,
momentum = 0.5,wd = 0.00001, randomseeds = 0,initializer_idx = 0.01,
verbose = TRUE)
predscores <- predict_GSModel(GSModel = trainGSmodel,testMat = Markers[testIdx,],
markerImage = markerImage )
refer_value <- runif(100)
pred_value <- sin(refer_value) + cos(refer_value)
meanNDCG(realScores = refer_value,predScores = pred_value, topAlpha = 10)
# Not run
# library(DeepGS)
# library(rrBLUP)
# data("wheat_example")
# Markers <- wheat_example$Markers
# y <- wheat_example$y
# cvSampleList <- cvSampleIndex(length(y),10,1)
# # select one fold
# cvIdx <- 1
# trainIdx <- cvSampleList[[cvIdx]]$trainIdx
# testIdx <- cvSampleList[[cvIdx]]$testIdx
# trainMat = Markers[trainIdx,]
# trainPheno = y[trainIdx]
# validIdx <- sample(1:length(trainIdx),floor(length(trainIdx)*0.1))
# validMat <- trainMat[validIdx,]
# validPheno <- trainPheno[validIdx]
# testMat = Markers[testIdx,]
# testPheno = y[testIdx]
# # design DeepGS architecture
# conv_kernel <- c("1*18") # convolution kernels (fileter shape)
# conv_stride <- c("1*1")
# conv_num_filter <- c(8) # number of filters
# pool_act_type <- c("relu") # active function for next pool
# pool_type <- c("max") # max pooling shape
# pool_kernel <- c("1*4") # pooling shape
# pool_stride <- c("1*4") # number of pool kernerls
# fullayer_num_hidden <- c(32,1)
# fullayer_act_type <- c("sigmoid")
# drop_float <- c(0.2,0.1,0.05)
# cnnFrame <- list(conv_kernel =conv_kernel,conv_num_filter = conv_num_filter,
# conv_stride = conv_stride,pool_act_type = pool_act_type,
# pool_type = pool_type,pool_kernel =pool_kernel,
# pool_stride = pool_stride,fullayer_num_hidden= fullayer_num_hidden,
# fullayer_act_type = fullayer_act_type,drop_float = drop_float)
#
# markerImage = paste0("1*",ncol(trainMat))
# # train DeepGS model
# DeepGS_obj <- train_deepGSModel(trainMat = trainMat,trainPheno = trainPheno,
# validMat = validMat,validPheno = validPheno, markerImage = markerImage,
# cnnFrame = cnnFrame,device_type = "cpu",gpuNum = 1, eval_metric = "mae",
# num_round = 6000,array_batch_size= 30,learning_rate = 0.01,
# momentum = 0.5,wd = 0.00001, randomseeds = 0,initializer_idx = 0.01,
# verbose =TRUE)
# # make predictions based on the trained model
# DeepGS_pred <- predict_GSModel(GSModel = DeepGS_obj,testMat = Markers[testIdx,],
# markerImage = markerImage )
# # train RR-BLUP model
# rrBLUP_obj <-mixed.solve(trainPheno, Z=trainMat, K=NULL, SE = FALSE, return.Hinv=FALSE)
# # make predictions based on the trained model
# rrBLUP_pred <- testMat %*% rrBLUP_obj$u + as.numeric(rrBLUP_obj$beta )
# # prepare the prediction matrix
# test_predMat <- cbind(t(DeepGS_pred), rrBLUP_pred)
# train_predMat <- cbind(testPheno, t(DeepGS_pred), rrBLUP_pred)
# colnames(train_predMat) <- c("real", "DeepGS", "RR-BLUP")
## End not run
# calculating the weight of different training model by using their predict socres
test_datapath <- system.file("exdata", "test_ELBPSO.RData",
package = "DeepGS")
load(test_datapath)
weight <- ELBPSO(rep_times = 100,interation_times = 25,weight_dimension = 2,
weight_min = 0,weight_max=1,rate_min = -0.01,rate_max = 0.01,
paticle_number = 10, pred_matrix = train_predMat,IW = 1,
AF1 = 2, AF2 = 2)
ensemble_pred <- (test_predMat %*% weight)/sum(weight)
predMat <- cbind(testPheno, t(DeepGS_pred), rrBLUP_pred, ensemble_pred)
colnames(predMat) <- c("real", "DeepGS", "RR-BLUP", "ensemble")
cor(predMat)
Please use DeepGS/issues for how to use DeepGS and reporting bugs.
Ma, W., Qiu, Z., Song, J., Li, J., Cheng, Q., Zhai, J., & Ma, C. (2018). A deep convolutional neural network approach for predicting phenotypes from genotypes. Planta, 248(5): 1307-1318.