From 8add798841662d7a628074527666d4930e016bbc Mon Sep 17 00:00:00 2001 From: hongzhonglu Date: Wed, 12 Jun 2019 13:54:10 +0200 Subject: [PATCH] feat: add the general function to conduct the clumps and hotspot analysis --- 2.2 Mutation enrichment analysis in batch.R | 1 + 2.2.1 Mutation enrichment analysis in batch.R | 53 +++++++++++++++ 2.3 Mutation hot spot analysis in batch.R | 3 - 2.3.1 Mutation hot spot analysis in batch.R | 64 +++++++++++++++++++ 4 files changed, 118 insertions(+), 3 deletions(-) create mode 100644 2.2.1 Mutation enrichment analysis in batch.R create mode 100644 2.3.1 Mutation hot spot analysis in batch.R diff --git a/2.2 Mutation enrichment analysis in batch.R b/2.2 Mutation enrichment analysis in batch.R index 8005c95..d872135 100644 --- a/2.2 Mutation enrichment analysis in batch.R +++ b/2.2 Mutation enrichment analysis in batch.R @@ -42,6 +42,7 @@ for (i in 1:1047) { # step 1 # preprocess the SNP information print(i) + i <-1 ss <- pdb_Ex$locus[i] mutated_gene0 <- preprocessSNP(ss, gene_feature = gene_feature0) mutated_gene1 <- mutated_gene0[which(mutated_gene0$strain %in% strain_select1$Standardized_name), ] diff --git a/2.2.1 Mutation enrichment analysis in batch.R b/2.2.1 Mutation enrichment analysis in batch.R new file mode 100644 index 0000000..630a3d7 --- /dev/null +++ b/2.2.1 Mutation enrichment analysis in batch.R @@ -0,0 +1,53 @@ +#----------------note +#this main script is used to handle with the gene mutation only from SNP information +#in this process, the gene with SNP will be translated into protein, based on which +#the SNP could be classified into nsSNP and sSNP +#Only nsSNP is used to mapping onto protein 3D structure + +strain_classification <- read.table("data/strain_PDETOH_classification.txt", header = TRUE,stringsAsFactors = FALSE) +strain_classification <- strain_classification[, c('strain_name','type')] +strain_type <-"PDETOH_high" +strain_select1 <- chooseStrain(type = "PDETOH_high") + + + + +##batch process for the above whole process +##batch process +# input the gene information +pdb_Ex <- read_excel("data/pdb_Ex refine for final residue distance calculation_manual check.xlsx") +pdb_Ex <- filter(pdb_Ex, is.na(pdb_Ex$With_distance)) +pdb_Ex$pdbid <- paste(pdb_Ex$template, pdb_Ex$chain_new, sep = "@") +pdb_Ex <- select(pdb_Ex, locus, pdbid, qstart2, qend2, sstart2, send2) +geneWithSNP <- getGeneNameWithSNP() +pdb_Ex <- pdb_Ex[which(pdb_Ex$locus %in% geneWithSNP ==TRUE),] + +#add two more clumns +pdb_Ex$strain_type <- strain_type +pdb_Ex$p_value <- NA + +#creat new file to store the results +outfile0 <- paste('result/CLUMPS from pdb_ex for ', strain_type, sep = "") +dir.create(outfile0) +print(outfile0) + + +#main loop +for (i in 1:1047) { + print(i) + i <- 1 + ss0 <- pdb_Ex$locus[i] + mutated_gene0 <- preprocessSNP(ss0,gene_feature = gene_feature0) + mutated_gene1 <- mutated_gene0[which(mutated_gene0$strain %in% strain_select1$Standardized_name), ] + pdbID <- pdb_Ex$pdbid[i] + p1 <- pdb_Ex$sstart2[i] + p2 <- pdb_Ex$send2[i] + distance_dir <- paste("residue_distance/pdb_ex/", pdbID, ".txt", sep = "") + #in some function, gene_snp was not put as the input, should be careful + #gene_snp <- getGeneCoordinate(gene_name = ss0, genesum = gene_feature_GEM) + result0 <- clumpsAnalysis(gene0 = ss0, SNPlist0 = mutated_gene1, gene_annotation0 = gene_feature0, pdb_dir = distance_dir, sstart0 = p1, send0 = p2) + pdb_Ex$p_value[i] <- result0 +} + +# save the result +write.table(pdb_Ex, paste(outfile0,'/','pdb_EX.txt', sep = ""), row.names = FALSE, sep = "\t") diff --git a/2.3 Mutation hot spot analysis in batch.R b/2.3 Mutation hot spot analysis in batch.R index 990bbe3..595dd23 100644 --- a/2.3 Mutation hot spot analysis in batch.R +++ b/2.3 Mutation hot spot analysis in batch.R @@ -12,9 +12,6 @@ strain_select1 <- chooseStrain(type = "PDETOH_high") #------------batch process-------------------------------------------------------- -#------------new version---------------------------------------------------------- -#------------this version is used to preprocess data from 1011 project - # step 0 # input the gene information pdb_Ex <- read_excel("data/pdb_Ex refine for final residue distance calculation_manual check.xlsx") diff --git a/2.3.1 Mutation hot spot analysis in batch.R b/2.3.1 Mutation hot spot analysis in batch.R new file mode 100644 index 0000000..ca6a11c --- /dev/null +++ b/2.3.1 Mutation hot spot analysis in batch.R @@ -0,0 +1,64 @@ +#----------------note +# this main script is used to handle with the gene mutation only from SNP information +# in this process, the gene with SNP will be translated into protein, based on which +# the SNP could be classified into nsSNP and sSNP +# Only nsSNP is used to mapping onto protein 3D structure + +strain_classification <- read.table("data/strain_PDETOH_classification.txt", header = TRUE, stringsAsFactors = FALSE) +strain_classification <- strain_classification[, c('strain_name','type')] +strain_type <-"PDETOH_high" +strain_select1 <- chooseStrain(type = "PDETOH_high") + +#------------batch process-------------------------------------------------------- +# input the gene information +pdb_Ex <- read_excel("data/pdb_Ex refine for final residue distance calculation_manual check.xlsx") +pdb_Ex <- filter(pdb_Ex, is.na(pdb_Ex$With_distance)) +pdb_Ex$pdbid <- paste(pdb_Ex$template, pdb_Ex$chain_new, sep = "@") +pdb_Ex <- select(pdb_Ex, locus, pdbid, qstart2, qend2, sstart2, send2) +geneWithSNP <- getGeneNameWithSNP() +pdb_Ex <- pdb_Ex[which(pdb_Ex$locus %in% geneWithSNP ==TRUE),] + +#creat new file to store the results +outfile0 <- paste('result/hotspot from pdb_ex for ', strain_type, sep = "") +dir.create(outfile0) +print(outfile0) + +# start the batch process +for (i in 1:1047) { + print(i) + ss0 <- pdb_Ex$locus[i] + mutated_gene0 <- preprocessSNP(ss0, gene_feature = gene_feature0) + mutated_gene1 <- mutated_gene0[which(mutated_gene0$strain %in% strain_select1$Standardized_name), ] + pdbID <- pdb_Ex$pdbid[i] + p1 <- pdb_Ex$sstart2[i] + p2 <- pdb_Ex$send2[i] + distance_dir <- paste("residue_distance/pdb_ex/", pdbID, ".txt", sep = "") + # run the function + hotSpotAnalysis( + gene0 = ss0, + SNPlist0 = mutated_gene1, + gene_annotation0 = gene_feature0, + pdb_dir = distance_dir, + sstart0 = p1, + send0 = p2, + result_dir = outfile0 + ) +} + + + + + + + + + + + + + + + + + +