-
Notifications
You must be signed in to change notification settings - Fork 7
/
hmmEra
executable file
·112 lines (97 loc) · 4.41 KB
/
hmmEra
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env Rscript
### hmmEra v1.0.0 (C) Max Planck Society for the Advancement of Science
###
### Code developed by Josue Barrera-Redondo <josue.barrera@tuebingen.mpg.de>
###
### This program is free software: you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
### GNU General Public License for more details.
library(optparse)
option_list = list(
make_option(c("-f", "--fasta"), type="character", default=NULL,
help="Name of input FASTA file (sequences.fasta)", metavar="file"),
make_option(c("-a", "--ages"), type="character", default=NULL,
help="Gene age table previously generated by GenEra ([taxid]_gene_ages.tsv)", metavar="file"),
make_option(c("-e", "--eval"), type = "numeric", default="1e-5",
help="E-value threshold to retain homologs against the Uniprot database [default= %default]", metavar="NUM"),
make_option(c("-s", "--strata"), type="integer", default="2",
help="Genes equal and younger to this phylostratum will be assessed with jackhmmer [default= %default]", metavar="NUM"),
make_option(c("-o", "--out"), type="character", default="HMMER_results.bout",
help="Output file name [default= %default]", metavar="OUT")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$fasta)){
message ("")
message ("hmmEra: a HMMER wrapper for GenEra")
message ("Generate an input table for GenEra using jackhmmer")
message ("This script is meant to be used after running GenEra using DIAMOND or Foldseek")
message ("")
print_help(opt_parser)
stop("The FASTA input file must be supplied (sequences.fasta)", call.=FALSE)
}
if (is.null(opt$ages)){
message ("")
message ("hmmEra: a HMMER wrapper for GenEra")
message ("Generate an input table for GenEra using jackhmmer")
message ("This script is meant to be used after running GenEra using DIAMOND or Foldseek")
message ("")
print_help(opt_parser)
stop("The table with gene ages must be supplied ([taxid]_gene_ages.tsv)", call.=FALSE)
}
library(bio3d)
library(tidyverse)
library(seqinr)
QUERYFASTA <- seqinr::read.fasta(file = opt$fasta, seqtype = "AA")
QUERY_AGES <- read.table(opt$ages, header = FALSE, sep = "\t")
GENE_SUBSET <- dplyr::filter(QUERY_AGES, QUERY_AGES$V3 >= opt$strata)
SUBSET_IDS <- GENE_SUBSET$V1
# Perform HMMER analysis on every gene in the list
for (GENE_ID in SUBSET_IDS) {
# Generate temporary file with each individual sequence
GENE_SEQUENCE <- QUERYFASTA[names(QUERYFASTA) %in% GENE_ID]
seqinr::write.fasta(sequences = GENE_SEQUENCE, names = names(GENE_SEQUENCE), file.out = "tmp.fasta", open = "w", nbchar = 60)
QUERYSEQ <- bio3d::read.fasta(file = "tmp.fasta")
# Run JACKHMMER for each gene in the list. If the server prints an error, try again, and if it fails again, skip to the next gene
res1 <- try(outdb <- bio3d::hmmer(seq = QUERYSEQ, type="jackhmmer", db = "uniprotkb"))
if(inherits(res1, "try-error"))
{
message ("")
message ("Sequence submission ended in an error, let's try again")
message("")
Sys.sleep(10)
res2 <- try(outdb <- bio3d::hmmer(seq = QUERYSEQ, type="jackhmmer", db = "uniprotkb"))
if(inherits(res2, "try-error"))
{
message("")
message(" WARNING: Skipping the following sequence due to jackhmmer error:")
print(QUERYSEQ)
message("")
next
}
}
# Generate and print output table, or go to the next sequence if no hits were found after the e-value filter
OUT_TABLE <- outdb$hit.tbl[c(1,9,20,23)]
OUT_TABLE <- dplyr::filter(OUT_TABLE, OUT_TABLE$evalue < opt$eval)
if (nrow(OUT_TABLE) == 0) {
next
}
OUT_TABLE$queryID <- GENE_ID
OUT_TABLE <- OUT_TABLE[c(5,1,2,3,4)]
write.table(OUT_TABLE, opt$out, col.names = !file.exists(opt$out), append = T, row.names=FALSE, sep="\t", quote = FALSE)
# Remove the temporary file
unlink("tmp.fasta")
}
# Print goodbye message
message ("")
message ("ANALYSIS COMPLETE!!!")
message ("Feed the output file to GenEra (-p) for a second assessment of gene ages")
message ("")
message ("Thanks for using hmmEra!")
message ("")