From 8add798841662d7a628074527666d4930e016bbc Mon Sep 17 00:00:00 2001
From: hongzhonglu <luho@chalmers.se>
Date: Wed, 12 Jun 2019 13:54:10 +0200
Subject: [PATCH] feat: add the general function to conduct the clumps and
 hotspot analysis

---
 2.2 Mutation enrichment analysis in batch.R   |  1 +
 2.2.1 Mutation enrichment analysis in batch.R | 53 +++++++++++++++
 2.3 Mutation hot spot analysis in batch.R     |  3 -
 2.3.1 Mutation hot spot analysis in batch.R   | 64 +++++++++++++++++++
 4 files changed, 118 insertions(+), 3 deletions(-)
 create mode 100644 2.2.1 Mutation enrichment analysis in batch.R
 create mode 100644 2.3.1 Mutation hot spot analysis in batch.R

diff --git a/2.2 Mutation enrichment analysis in batch.R b/2.2 Mutation enrichment analysis in batch.R
index 8005c95..d872135 100644
--- a/2.2 Mutation enrichment analysis in batch.R	
+++ b/2.2 Mutation enrichment analysis in batch.R	
@@ -42,6 +42,7 @@ for (i in 1:1047) {
   # step 1
   # preprocess the SNP information
   print(i)
+  i <-1
   ss <- pdb_Ex$locus[i]
   mutated_gene0 <- preprocessSNP(ss, gene_feature = gene_feature0)
   mutated_gene1 <- mutated_gene0[which(mutated_gene0$strain %in% strain_select1$Standardized_name), ]
diff --git a/2.2.1 Mutation enrichment analysis in batch.R b/2.2.1 Mutation enrichment analysis in batch.R
new file mode 100644
index 0000000..630a3d7
--- /dev/null
+++ b/2.2.1 Mutation enrichment analysis in batch.R	
@@ -0,0 +1,53 @@
+#----------------note
+#this main script is used to handle with the gene mutation only from SNP information
+#in this process, the gene with SNP will be translated into protein, based on which
+#the SNP could be classified into nsSNP and sSNP
+#Only nsSNP is used to mapping onto protein 3D structure
+
+strain_classification <- read.table("data/strain_PDETOH_classification.txt", header = TRUE,stringsAsFactors = FALSE)
+strain_classification <- strain_classification[, c('strain_name','type')]
+strain_type <-"PDETOH_high"
+strain_select1 <- chooseStrain(type = "PDETOH_high")
+
+
+
+
+##batch process for the above whole process
+##batch process
+# input the gene information
+pdb_Ex <- read_excel("data/pdb_Ex refine for final residue distance calculation_manual check.xlsx")
+pdb_Ex <- filter(pdb_Ex, is.na(pdb_Ex$With_distance))
+pdb_Ex$pdbid <- paste(pdb_Ex$template, pdb_Ex$chain_new, sep = "@")
+pdb_Ex <- select(pdb_Ex, locus, pdbid, qstart2, qend2, sstart2, send2)
+geneWithSNP <- getGeneNameWithSNP()
+pdb_Ex <- pdb_Ex[which(pdb_Ex$locus %in% geneWithSNP ==TRUE),]
+
+#add two more clumns
+pdb_Ex$strain_type <- strain_type
+pdb_Ex$p_value <- NA
+
+#creat new file to store the results
+outfile0 <- paste('result/CLUMPS from pdb_ex for ', strain_type, sep = "")
+dir.create(outfile0)
+print(outfile0)
+
+
+#main loop
+for (i in 1:1047) {
+  print(i)
+  i <- 1
+  ss0 <- pdb_Ex$locus[i]
+  mutated_gene0 <- preprocessSNP(ss0,gene_feature = gene_feature0)
+  mutated_gene1 <- mutated_gene0[which(mutated_gene0$strain %in% strain_select1$Standardized_name), ]
+  pdbID <- pdb_Ex$pdbid[i]
+  p1 <- pdb_Ex$sstart2[i]
+  p2 <- pdb_Ex$send2[i]
+  distance_dir <- paste("residue_distance/pdb_ex/", pdbID, ".txt", sep = "")
+  #in some function, gene_snp was not put as the input, should be careful
+  #gene_snp <- getGeneCoordinate(gene_name = ss0, genesum = gene_feature_GEM)
+  result0 <- clumpsAnalysis(gene0 = ss0, SNPlist0 = mutated_gene1, gene_annotation0 = gene_feature0, pdb_dir = distance_dir, sstart0 = p1, send0 = p2)
+  pdb_Ex$p_value[i] <- result0
+}
+
+# save the result
+write.table(pdb_Ex, paste(outfile0,'/','pdb_EX.txt', sep = ""), row.names = FALSE, sep = "\t")
diff --git a/2.3 Mutation hot spot analysis in batch.R b/2.3 Mutation hot spot analysis in batch.R
index 990bbe3..595dd23 100644
--- a/2.3 Mutation hot spot analysis in batch.R	
+++ b/2.3 Mutation hot spot analysis in batch.R	
@@ -12,9 +12,6 @@ strain_select1 <- chooseStrain(type = "PDETOH_high")
 
 
 #------------batch process--------------------------------------------------------
-#------------new version----------------------------------------------------------
-#------------this version is used to preprocess data from 1011 project
-
 # step 0
 # input the gene information
 pdb_Ex <- read_excel("data/pdb_Ex refine for final residue distance calculation_manual check.xlsx")
diff --git a/2.3.1 Mutation hot spot analysis in batch.R b/2.3.1 Mutation hot spot analysis in batch.R
new file mode 100644
index 0000000..ca6a11c
--- /dev/null
+++ b/2.3.1 Mutation hot spot analysis in batch.R	
@@ -0,0 +1,64 @@
+#----------------note
+# this main script is used to handle with the gene mutation only from SNP information
+# in this process, the gene with SNP will be translated into protein, based on which
+# the SNP could be classified into nsSNP and sSNP
+# Only nsSNP is used to mapping onto protein 3D structure
+
+strain_classification <- read.table("data/strain_PDETOH_classification.txt", header = TRUE, stringsAsFactors = FALSE)
+strain_classification <- strain_classification[, c('strain_name','type')]
+strain_type <-"PDETOH_high"
+strain_select1 <- chooseStrain(type = "PDETOH_high")
+
+#------------batch process--------------------------------------------------------
+# input the gene information
+pdb_Ex <- read_excel("data/pdb_Ex refine for final residue distance calculation_manual check.xlsx")
+pdb_Ex <- filter(pdb_Ex, is.na(pdb_Ex$With_distance))
+pdb_Ex$pdbid <- paste(pdb_Ex$template, pdb_Ex$chain_new, sep = "@")
+pdb_Ex <- select(pdb_Ex, locus, pdbid, qstart2, qend2, sstart2, send2)
+geneWithSNP <- getGeneNameWithSNP()
+pdb_Ex <- pdb_Ex[which(pdb_Ex$locus %in% geneWithSNP ==TRUE),]
+
+#creat new file to store the results
+outfile0 <- paste('result/hotspot from pdb_ex for ', strain_type, sep = "")
+dir.create(outfile0)
+print(outfile0)
+
+# start the batch process
+for (i in 1:1047) {
+  print(i)
+  ss0 <- pdb_Ex$locus[i]
+  mutated_gene0 <- preprocessSNP(ss0, gene_feature = gene_feature0)
+  mutated_gene1 <- mutated_gene0[which(mutated_gene0$strain %in% strain_select1$Standardized_name), ]
+  pdbID <- pdb_Ex$pdbid[i]
+  p1 <- pdb_Ex$sstart2[i]
+  p2 <- pdb_Ex$send2[i]
+  distance_dir <- paste("residue_distance/pdb_ex/", pdbID, ".txt", sep = "")
+  # run the function
+  hotSpotAnalysis(
+    gene0 = ss0,
+    SNPlist0 = mutated_gene1,
+    gene_annotation0 = gene_feature0,
+    pdb_dir = distance_dir,
+    sstart0 = p1,
+    send0 = p2,
+    result_dir = outfile0
+  )
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+