Skip to content

Commit

Permalink
feat: add the general function to conduct the clumps and hotspot anal…
Browse files Browse the repository at this point in the history
…ysis
  • Loading branch information
hongzhonglu committed Jun 12, 2019
1 parent c2fd946 commit 8add798
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 3 deletions.
1 change: 1 addition & 0 deletions 2.2 Mutation enrichment analysis in batch.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ for (i in 1:1047) {
# step 1
# preprocess the SNP information
print(i)
i <-1
ss <- pdb_Ex$locus[i]
mutated_gene0 <- preprocessSNP(ss, gene_feature = gene_feature0)
mutated_gene1 <- mutated_gene0[which(mutated_gene0$strain %in% strain_select1$Standardized_name), ]
Expand Down
53 changes: 53 additions & 0 deletions 2.2.1 Mutation enrichment analysis in batch.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#----------------note
#this main script is used to handle with the gene mutation only from SNP information
#in this process, the gene with SNP will be translated into protein, based on which
#the SNP could be classified into nsSNP and sSNP
#Only nsSNP is used to mapping onto protein 3D structure

strain_classification <- read.table("data/strain_PDETOH_classification.txt", header = TRUE,stringsAsFactors = FALSE)
strain_classification <- strain_classification[, c('strain_name','type')]
strain_type <-"PDETOH_high"
strain_select1 <- chooseStrain(type = "PDETOH_high")




##batch process for the above whole process
##batch process
# input the gene information
pdb_Ex <- read_excel("data/pdb_Ex refine for final residue distance calculation_manual check.xlsx")
pdb_Ex <- filter(pdb_Ex, is.na(pdb_Ex$With_distance))
pdb_Ex$pdbid <- paste(pdb_Ex$template, pdb_Ex$chain_new, sep = "@")
pdb_Ex <- select(pdb_Ex, locus, pdbid, qstart2, qend2, sstart2, send2)
geneWithSNP <- getGeneNameWithSNP()
pdb_Ex <- pdb_Ex[which(pdb_Ex$locus %in% geneWithSNP ==TRUE),]

#add two more clumns
pdb_Ex$strain_type <- strain_type
pdb_Ex$p_value <- NA

#creat new file to store the results
outfile0 <- paste('result/CLUMPS from pdb_ex for ', strain_type, sep = "")
dir.create(outfile0)
print(outfile0)


#main loop
for (i in 1:1047) {
print(i)
i <- 1
ss0 <- pdb_Ex$locus[i]
mutated_gene0 <- preprocessSNP(ss0,gene_feature = gene_feature0)
mutated_gene1 <- mutated_gene0[which(mutated_gene0$strain %in% strain_select1$Standardized_name), ]
pdbID <- pdb_Ex$pdbid[i]
p1 <- pdb_Ex$sstart2[i]
p2 <- pdb_Ex$send2[i]
distance_dir <- paste("residue_distance/pdb_ex/", pdbID, ".txt", sep = "")
#in some function, gene_snp was not put as the input, should be careful
#gene_snp <- getGeneCoordinate(gene_name = ss0, genesum = gene_feature_GEM)
result0 <- clumpsAnalysis(gene0 = ss0, SNPlist0 = mutated_gene1, gene_annotation0 = gene_feature0, pdb_dir = distance_dir, sstart0 = p1, send0 = p2)
pdb_Ex$p_value[i] <- result0
}

# save the result
write.table(pdb_Ex, paste(outfile0,'/','pdb_EX.txt', sep = ""), row.names = FALSE, sep = "\t")
3 changes: 0 additions & 3 deletions 2.3 Mutation hot spot analysis in batch.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@ strain_select1 <- chooseStrain(type = "PDETOH_high")


#------------batch process--------------------------------------------------------
#------------new version----------------------------------------------------------
#------------this version is used to preprocess data from 1011 project

# step 0
# input the gene information
pdb_Ex <- read_excel("data/pdb_Ex refine for final residue distance calculation_manual check.xlsx")
Expand Down
64 changes: 64 additions & 0 deletions 2.3.1 Mutation hot spot analysis in batch.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#----------------note
# this main script is used to handle with the gene mutation only from SNP information
# in this process, the gene with SNP will be translated into protein, based on which
# the SNP could be classified into nsSNP and sSNP
# Only nsSNP is used to mapping onto protein 3D structure

strain_classification <- read.table("data/strain_PDETOH_classification.txt", header = TRUE, stringsAsFactors = FALSE)
strain_classification <- strain_classification[, c('strain_name','type')]
strain_type <-"PDETOH_high"
strain_select1 <- chooseStrain(type = "PDETOH_high")

#------------batch process--------------------------------------------------------
# input the gene information
pdb_Ex <- read_excel("data/pdb_Ex refine for final residue distance calculation_manual check.xlsx")
pdb_Ex <- filter(pdb_Ex, is.na(pdb_Ex$With_distance))
pdb_Ex$pdbid <- paste(pdb_Ex$template, pdb_Ex$chain_new, sep = "@")
pdb_Ex <- select(pdb_Ex, locus, pdbid, qstart2, qend2, sstart2, send2)
geneWithSNP <- getGeneNameWithSNP()
pdb_Ex <- pdb_Ex[which(pdb_Ex$locus %in% geneWithSNP ==TRUE),]

#creat new file to store the results
outfile0 <- paste('result/hotspot from pdb_ex for ', strain_type, sep = "")
dir.create(outfile0)
print(outfile0)

# start the batch process
for (i in 1:1047) {
print(i)
ss0 <- pdb_Ex$locus[i]
mutated_gene0 <- preprocessSNP(ss0, gene_feature = gene_feature0)
mutated_gene1 <- mutated_gene0[which(mutated_gene0$strain %in% strain_select1$Standardized_name), ]
pdbID <- pdb_Ex$pdbid[i]
p1 <- pdb_Ex$sstart2[i]
p2 <- pdb_Ex$send2[i]
distance_dir <- paste("residue_distance/pdb_ex/", pdbID, ".txt", sep = "")
# run the function
hotSpotAnalysis(
gene0 = ss0,
SNPlist0 = mutated_gene1,
gene_annotation0 = gene_feature0,
pdb_dir = distance_dir,
sstart0 = p1,
send0 = p2,
result_dir = outfile0
)
}


















0 comments on commit 8add798

Please sign in to comment.