snp/linear_regression.R

#Input_file2 for this script generated by starting with a list of sample ids from the SQL db (one sample id per line) 
#and following lines 101-109 in https://github.com/harrisonlab/popgen/blob/master/snp/crown_rot_qtl_markers_gwas_plate.sh
#No missing data allowed for any marker.
#Input_file2 - change to the file containing the sample genotypes which you want crown rot scores for.
input_file2 <- "all_cultivars_ids_outliers.raw2.txt"
#Input_file contains the  genotypes for cultivars with known crown rot scores 
#and used for generating the linear model - do not change unless you have better data.
input_file <- "istraw_35_outliers.raw.txt"

stats <- read.table(input_file, sep="\t", quote='', stringsAsFactors=TRUE,header=TRUE)
rownames(stats) <- stats$FID
# make sure R knows genotype is categorical
stats$Affx.88880888_G <- factor(stats$Affx.88880888_G)
stats$Affx.88900057_A <- factor(stats$Affx.88900057_A)
stats$Affx.88900641_T <- factor(stats$Affx.88900641_T)
stats$Affx.88900655_T <- factor(stats$Affx.88900655_T)
stats$Affx.88901304_C <- factor(stats$Affx.88901304_C)
#Add region to the model and create the linear model
model <- lm(PHENOTYPE ~ Affx.88880888_G + Affx.88900057_A + Affx.88900641_T + Affx.88900655_T + Affx.88901304_C, data=stats)
library(effects)
plot(allEffects(model))
plot(model, pch=16, which=1)

#Plot crown rot scores - predicted vs actual crown rot scores for samples used in the linear model.
summary(model)$r.squared
no_missing_data <- na.omit(stats)
plot(predict(model),no_missing_data$PHENOTYPE,xlab="Predicted score", ylab="Actual score") 
abline(a=0, b=1)

#Actual prediction of crown rot scores for samples with unknown resistance
sbc <- read.table(input_file2, sep="\t", quote='', stringsAsFactors=TRUE,header=TRUE)
rownames(sbc) <- sbc$FID
keeps <- c("Affx.88880888_G", "Affx.88900057_A", "Affx.88900641_T", "Affx.88900655_T", "Affx.88901304_C")
just_markers <- sbc[keeps]
just_markers$Affx.88880888_G <- factor(just_markers$Affx.88880888_G)
just_markers$Affx.88900057_A <- factor(just_markers$Affx.88900057_A)
just_markers$Affx.88900641_T <- factor(just_markers$Affx.88900641_T)
just_markers$Affx.88900655_T <- factor(just_markers$Affx.88900655_T)
just_markers$Affx.88901304_C <- factor(just_markers$Affx.88901304_C)
#Prediction with 95% CI
my_prediction <- predict(model, just_markers, interval="predict")
write.table(my_prediction, file="all_cultivars_predictions.txt", sep="\t", quote=FALSE)
######Optional: join with cultivar name - the "cultivars" table needs changing to custom one.
cultivars = read.table("all_cultivars_ids.txt", sep="\t", quote='', stringsAsFactors=TRUE,header=TRUE)
rownames(cultivars) <- cultivars$Id
pred_cultivars <- merge(my_prediction, cultivars, by.x = 0, by.y = 0)
write.table(pred_cultivars, file="all_cultivars_predictions.txt", sep="\t", quote=FALSE)