diff --git a/annotation_pipeline.nf b/annotation_pipeline.nf index 8e7069d80..e71e787ec 100644 --- a/annotation_pipeline.nf +++ b/annotation_pipeline.nf @@ -509,6 +509,24 @@ process execute_amrscan { """ } +process blast_vfdb { + container "$params.blast_container" + conda "$baseDir/conda/blast.yaml" + + input: + tuple (path(vf_db), path(seq)) + + output: + path '*tab' + + script: + + n = seq.name + """ + blastp -db $vf_db/vfdb.fasta -query ${n} -outfmt 6 -evalue 10.0 > ${n}.tab + """ +} + process setup_db { container "$params.annotation_container" conda "$baseDir/conda/annotation.yaml" @@ -797,6 +815,32 @@ process load_amr_into_db { """ } +process load_vfdb_hits_into_db { + container "$params.annotation_container" + conda "$baseDir/conda/annotation.yaml" + + input: + path db + path blast_results + path vf_db_fasta + path vf_db_defs + + output: + path db + + script: + """ + #!/usr/bin/env python + import setup_chlamdb + + kwargs = ${gen_python_args()} + blast_results = "$blast_results".split() + + setup_chlamdb.load_vfdb_hits(kwargs, blast_results, "$db", "$vf_db_fasta", "$vf_db_defs") + """ +} + + process create_chlamdb_search_index { container "$params.annotation_container" conda "$baseDir/conda/annotation.yaml" @@ -959,6 +1003,13 @@ workflow { db = load_amr_into_db(amr_table.collect(), db, amr_version) } + if(params.vfdb) { + vf_ref_db = Channel.fromPath("$params.vf_db", type: "dir") + db_seq_combined = vf_ref_db.combine(split_nr_seqs) + vfdb_blast = blast_vfdb(db_seq_combined) + db = load_vfdb_hits_into_db(db, vfdb_blast.collect(), Channel.fromPath("$params.vf_db/vfdb.fasta"), Channel.fromPath("$params.vf_db/VFs.xls")) + } + (to_index_cleanup, to_db_cleanup) = create_chlamdb_search_index(db) cleanup(to_index_cleanup, to_db_cleanup, mafft_alignments.collect(), Channel.fromPath("${params.results_dir}", type: "dir"), to_cleanup_gbks) } diff --git a/bin/setup_chlamdb.py b/bin/setup_chlamdb.py index c07f6f45f..51a731d41 100644 --- a/bin/setup_chlamdb.py +++ 
b/bin/setup_chlamdb.py @@ -1,9 +1,12 @@ import os +import re from collections import defaultdict, namedtuple +from datetime import datetime import KO_module import pandas as pd import search_bar +import xlrd from Bio import SeqIO, SeqUtils from db_utils import DB @@ -518,7 +521,7 @@ def load_pfam(params, pfam_files, db, pfam_def_file, ref_db_dir): db.commit() -class SwissProtIdCount(defaultdict): +class ProtIdCounter(defaultdict): def __init__(self): super().__init__() self.id_count = 0 @@ -562,9 +565,29 @@ def parse_swissprot_id(to_parse): return db, ident, name +vfdb_id_expr = re.compile(r"(.*)\(gb\|(.*)\)") + + +def parse_vfdb_id(to_parse): + """IDs in vfdb.fasta are either of the form VFID(gb_accession) or VFID + """ + if "(" in to_parse: + return vfdb_id_expr.match(to_parse).groups() + return to_parse, None + + +vfdb_descr_expr = re.compile(r"\(.*?\) (.*?) \[.*?\((VF.*?)\).*?\] \[.*?\]") + + +def parse_vfdb_entry(description): + description = description.split(" ", 1)[1] + prot_name, vfid = vfdb_descr_expr.match(description).groups() + return prot_name, vfid + + def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db_dir): db = DB.load_db(db_name, params) - hsh_swissprot_id = SwissProtIdCount() + hsh_swissprot_id = ProtIdCounter() db.create_swissprot_tables() # Note: this is not really scalable to x genomes, as @@ -607,6 +630,61 @@ def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db db.commit() +def load_vfdb_hits(params, blast_results, db_name, vfdb_fasta, vfdb_defs): + db = DB.load_db(db_name, params) + hsh_prot_id = ProtIdCounter() + db.create_vf_tables() + + # Note: this is not really scalable to x genomes, as + # it necessitates to keep the prot id in memory + # may need to process the blast results in batch instead (slower but + # spares memory). 
+ for blast_file in blast_results: + data = [] + with open(blast_file, "r") as blast_fh: + for line in blast_fh: + crc, gene_id, perid, leng, n_mis, n_gap, qs, qe, ss, se, e, score = line.split() + hsh = simplify_hash(crc) + prot_id, _ = parse_vfdb_id(gene_id) + db_prot_id = hsh_prot_id[prot_id] + data.append((hsh, db_prot_id, float(e), int(float(score)), + int(float(perid)), int(n_gap), int(leng))) + db.load_vf_hits(data) + + # Definitions are constructed from two data sources. + # vfdb_fasta contains matching entries for every hit, with a + # VFID relating to more information in VFs.xls + vf_defs = pd.read_excel(vfdb_defs, header=1) + vf_defs = vf_defs.set_index("VFID") + + vfdb_prot_defs = [] + for record in SeqIO.parse(vfdb_fasta, "fasta"): + prot_id, gb_accession = parse_vfdb_id(record.name) + if prot_id not in hsh_prot_id: + continue + db_prot_id = hsh_prot_id[prot_id] + prot_name, vfid = parse_vfdb_entry(record.description) + # Get info from definitions + vf_data = vf_defs.loc[vfid] + vfdb_prot_defs.append( + (db_prot_id, prot_id, gb_accession, prot_name, vfid, + vf_data.VFcategory, vf_data.Characteristics, vf_data.Structure, + vf_data.Function, vf_data.Mechanism)) + db.load_vf_defs(vfdb_prot_defs) + db.set_status_in_config_table("BLAST_vfdb", 1) + db.commit() + + # There is no version of the VFdb, only a download date + # As there is a new version of the DB every week, the download date + # will have to do. 
+ book = xlrd.open_workbook(vfdb_defs) + sheet = book.sheet_by_index(0) + download_date = re.match(r".*\[(.*)\]", sheet.row(0)[3].value).groups()[0] + download_date = datetime.strptime(download_date, "%a %b %d %H:%M:%S %Y") + db.load_data_into_table("versions", [("VFDB", download_date.date().isoformat())]) + db.commit() + + # NOTE: # Several KO marked as significant can be assigned to the same locus # only take the hit with the lowest evalue (the first in the list) diff --git a/bin/zdb b/bin/zdb index e359d8911..5f1de5e66 100755 --- a/bin/zdb +++ b/bin/zdb @@ -479,6 +479,10 @@ elif [[ "$1" == "run" ]]; then args="${args} --pfam" shift ;; + --vfdb) + args="${args} --vfdb" + shift + ;; --singularity_dir=*) singularity_dir=$(realpath ${i#*=}) shift @@ -503,6 +507,7 @@ elif [[ "$1" == "run" ]]; then echo " --ko: perform ko (metabolism) annotation" echo " --pfam: peform PFAM domain annotation" echo " --swissprot: search for homologs in the swissprot database" + echo " --vfdb: search for homologs in the virulence factor database" echo " --ref_dir: directory where the reference databases were setup up (default zdb_ref)" echo " --cpu: number of parallel processes allowed (default 8)" echo " --mem: max memory usage allowed (default 8GB)" diff --git a/conda/annotation.yaml b/conda/annotation.yaml index d9b8d5462..b925be8b6 100644 --- a/conda/annotation.yaml +++ b/conda/annotation.yaml @@ -11,3 +11,4 @@ dependencies: - numpy - pandas - metagenlab_libs + - xlrd diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 658d1b088..1493c44c6 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -17,6 +17,7 @@ to [Common Changelog](https://common-changelog.org) ### Added +- Add virulence factor annotations. ([#56](https://github.com/metagenlab/zDB/pull/56)) (Niklaus Johner) - Store reference DB and software versions in DB. ([#54](https://github.com/metagenlab/zDB/pull/54)) (Niklaus Johner) - Add script to compute rendering times of various views. 
([#51](https://github.com/metagenlab/zDB/pull/51)) (Niklaus Johner) - Add possibility to dump HTML responses from webapp tests. ([#50](https://github.com/metagenlab/zDB/pull/50)) (Niklaus Johner) diff --git a/testing/pipelines/test_annotation_pipeline.py b/testing/pipelines/test_annotation_pipeline.py index c239fe447..187a8d38a 100644 --- a/testing/pipelines/test_annotation_pipeline.py +++ b/testing/pipelines/test_annotation_pipeline.py @@ -174,6 +174,20 @@ def test_amr_hits(self): self.assert_db_base_table_row_counts() self.assertEqual(2, self.query("amr_hits").count()) + def test_vfdb_hits(self): + self.nf_params["vfdb"] = "true" + execution = self.execute_pipeline() + self.assert_success(execution) + self.load_db(execution) + + # Let's check that tables were correctly created and filled + self.assertCountEqual( + base_tables + ['vf_hits', 'vf_defs'], + self.metadata_obj.tables.keys()) + self.assert_db_base_table_row_counts() + self.assertEqual(0, self.query("vf_hits").count()) + self.assertEqual(0, self.query("vf_defs").count()) + def test_full_pipeline(self): self.nf_params["pfam"] = "true" self.nf_params["ko"] = "true" diff --git a/utils/setup_base_db.py b/utils/setup_base_db.py index 94e96c8a1..37c953255 100755 --- a/utils/setup_base_db.py +++ b/utils/setup_base_db.py @@ -68,6 +68,7 @@ def create_data_table(db): ("phylogenetic_profile", "optional", False), ("gbk_files", "optional", False), ("AMR", "optional", False), + ("BLAST_vfdb", "optional", False), ] db.load_chlamdb_config_tables(entry_list) db.commit() diff --git a/webapp/lib/db_utils.py b/webapp/lib/db_utils.py index 370784bee..1c7123fe9 100644 --- a/webapp/lib/db_utils.py +++ b/webapp/lib/db_utils.py @@ -299,6 +299,38 @@ def load_swissprot_hits(self, data): def load_swissprot_defs(self, data): self.load_data_into_table("swissprot_defs", data) + def create_vf_tables(self): + query = ( + "CREATE TABLE vf_hits (" + " hsh INT, prot_id INT, evalue INT, score INT," + " perc_id INT, gaps INT, leng INT" + 
");" + ) + self.server.adaptor.execute(query,) + query = ( + "CREATE INDEX vfhi ON vf_hits(hsh);" + ) + self.server.adaptor.execute(query,) + query = ( + "CREATE TABLE vf_defs (" + " prot_id INT, vfdb_id varchar(15), gb_accession varchar(15)," + " prot_name tinytext, vfid varchar(10), category tinytext," + " characteristics TEXT, structure TEXT, function TEXT," + " mechanism TEXT" + ");" + ) + self.server.adaptor.execute(query,) + query = ( + "CREATE INDEX vfdi ON vf_defs(prot_id);" + ) + self.server.adaptor.execute(query,) + + def load_vf_hits(self, data): + self.load_data_into_table("vf_hits", data) + + def load_vf_defs(self, data): + self.load_data_into_table("vf_defs", data) + def create_diamond_refseq_match_id(self): query = ( "CREATE TABLE diamond_refseq_match_id ( "