diff --git a/annotation_pipeline.nf b/annotation_pipeline.nf index 8e7069d80..e71e787ec 100644 --- a/annotation_pipeline.nf +++ b/annotation_pipeline.nf @@ -509,6 +509,24 @@ process execute_amrscan { """ } +process blast_vfdb { + container "$params.blast_container" + conda "$baseDir/conda/blast.yaml" + + input: + tuple (path(vf_db), path(seq)) + + output: + path '*tab' + + script: + + n = seq.name + """ + blastp -db $vf_db/vfdb.fasta -query ${n} -outfmt 6 -evalue 10.0 > ${n}.tab + """ +} + process setup_db { container "$params.annotation_container" conda "$baseDir/conda/annotation.yaml" @@ -797,6 +815,32 @@ process load_amr_into_db { """ } +process load_vfdb_hits_into_db { + container "$params.annotation_container" + conda "$baseDir/conda/annotation.yaml" + + input: + path db + path blast_results + path vf_db_fasta + path vf_db_defs + + output: + path db + + script: + """ + #!/usr/bin/env python + import setup_chlamdb + + kwargs = ${gen_python_args()} + blast_results = "$blast_results".split() + + setup_chlamdb.load_vfdb_hits(kwargs, blast_results, "$db", "$vf_db_fasta", "$vf_db_defs") + """ +} + + process create_chlamdb_search_index { container "$params.annotation_container" conda "$baseDir/conda/annotation.yaml" @@ -959,6 +1003,13 @@ workflow { db = load_amr_into_db(amr_table.collect(), db, amr_version) } + if(params.vfdb) { + vf_ref_db = Channel.fromPath("$params.vf_db", type: "dir") + db_seq_combined = vf_ref_db.combine(split_nr_seqs) + vfdb_blast = blast_vfdb(db_seq_combined) + db = load_vfdb_hits_into_db(db, vfdb_blast.collect(), Channel.fromPath("$params.vf_db/vfdb.fasta"), Channel.fromPath("$params.vf_db/VFs.xls")) + } + (to_index_cleanup, to_db_cleanup) = create_chlamdb_search_index(db) cleanup(to_index_cleanup, to_db_cleanup, mafft_alignments.collect(), Channel.fromPath("${params.results_dir}", type: "dir"), to_cleanup_gbks) } diff --git a/bin/setup_chlamdb.py b/bin/setup_chlamdb.py index c07f6f45f..51a731d41 100644 --- a/bin/setup_chlamdb.py +++ 
b/bin/setup_chlamdb.py @@ -1,9 +1,12 @@ import os +import re from collections import defaultdict, namedtuple +from datetime import datetime import KO_module import pandas as pd import search_bar +import xlrd from Bio import SeqIO, SeqUtils from db_utils import DB @@ -518,7 +521,7 @@ def load_pfam(params, pfam_files, db, pfam_def_file, ref_db_dir): db.commit() -class SwissProtIdCount(defaultdict): +class ProtIdCounter(defaultdict): def __init__(self): super().__init__() self.id_count = 0 @@ -562,9 +565,29 @@ def parse_swissprot_id(to_parse): return db, ident, name +vfdb_id_expr = re.compile(r"(.*)\(gb\|(.*)\)") + + +def parse_vfdb_id(to_parse): + """IDs in vfdb.fasta are either of the form VFID(gb_accession) or VFID + """ + if "(" in to_parse: + return vfdb_id_expr.match(to_parse).groups() + return to_parse, None + + +vfdb_descr_expr = re.compile(r"\(.*?\) (.*?) \[.*?\((VF.*?)\).*?\] \[.*?\]") + + +def parse_vfdb_entry(description): + description = description.split(" ", 1)[1] + prot_name, vfid = vfdb_descr_expr.match(description).groups() + return prot_name, vfid + + def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db_dir): db = DB.load_db(db_name, params) - hsh_swissprot_id = SwissProtIdCount() + hsh_swissprot_id = ProtIdCounter() db.create_swissprot_tables() # Note: this is not really scalable to x genomes, as @@ -607,6 +630,61 @@ def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db db.commit() +def load_vfdb_hits(params, blast_results, db_name, vfdb_fasta, vfdb_defs): + db = DB.load_db(db_name, params) + hsh_prot_id = ProtIdCounter() + db.create_vf_tables() + + # Note: this is not really scalable to x genomes, as + # it necessitates to keep the prot id in memory + # may need to process the blast results in batch instead (slower but + # spares memory). 
+ for blast_file in blast_results: + data = [] + with open(blast_file, "r") as blast_fh: + for line in blast_fh: + crc, gene_id, perid, leng, n_mis, n_gap, qs, qe, ss, se, e, score = line.split() + hsh = simplify_hash(crc) + prot_id, _ = parse_vfdb_id(gene_id) + db_prot_id = hsh_prot_id[prot_id] + data.append((hsh, db_prot_id, float(e), int(float(score)), + int(float(perid)), int(n_gap), int(leng))) + db.load_vf_hits(data) + + # Definitions are constructed from two data sources. + # vfdb_fasta contains matching entries for every hit, with a + # VFID relating to more information in VFs.xls + vf_defs = pd.read_excel(vfdb_defs, header=1) + vf_defs = vf_defs.set_index("VFID") + + vfdb_prot_defs = [] + for record in SeqIO.parse(vfdb_fasta, "fasta"): + prot_id, gb_accession = parse_vfdb_id(record.name) + if prot_id not in hsh_prot_id: + continue + db_prot_id = hsh_prot_id[prot_id] + prot_name, vfid = parse_vfdb_entry(record.description) + # Get info from definitions + vf_data = vf_defs.loc[vfid] + vfdb_prot_defs.append( + (db_prot_id, prot_id, gb_accession, prot_name, vfid, + vf_data.VFcategory, vf_data.Characteristics, vf_data.Structure, + vf_data.Function, vf_data.Mechanism)) + db.load_vf_defs(vfdb_prot_defs) + db.set_status_in_config_table("BLAST_vfdb", 1) + db.commit() + + # There is no version of the VFdb, only a download date + # As there is a new version of the DB every week, the download date + # will have to do. 
+ book = xlrd.open_workbook(vfdb_defs) + sheet = book.sheet_by_index(0) + download_date = re.match(r".*\[(.*)\]", sheet.row(0)[3].value).groups()[0] + download_date = datetime.strptime(download_date, "%a %b %d %H:%M:%S %Y") + db.load_data_into_table("versions", [("VFDB", download_date.date().isoformat())]) + db.commit() + + # NOTE: # Several KO marked as significant can be assigned to the same locus # only take the hit with the lowest evalue (the first in the list) diff --git a/bin/zdb b/bin/zdb index d5c20e3c7..5f1de5e66 100755 --- a/bin/zdb +++ b/bin/zdb @@ -119,6 +119,10 @@ elif [[ "$1" == "setup" ]]; then db_setup_args="${db_setup_args} --pfam" shift ;; + --vfdb) + db_setup_args="${db_setup_args} --vfdb" + shift + ;; --singularity_dir=*) singularity_dir=${i#*=} shift @@ -135,6 +139,7 @@ elif [[ "$1" == "setup" ]]; then echo " --ko: downloads and setups the hmm profiles of the ko database" echo " --pfam: downloads and setups up the hmm profiles of the PFAM protein domains" echo " --swissprot: downloads and indexes the swissprot database " + echo " --vfdb: downloads the virulence factor database " echo "" echo "Other parameters:" echo " --dir: directory where to store the reference databases (defaults zdb_ref in the current directory)" @@ -474,6 +479,10 @@ elif [[ "$1" == "run" ]]; then args="${args} --pfam" shift ;; + --vfdb) + args="${args} --vfdb" + shift + ;; --singularity_dir=*) singularity_dir=$(realpath ${i#*=}) shift @@ -498,6 +507,7 @@ elif [[ "$1" == "run" ]]; then echo " --ko: perform ko (metabolism) annotation" echo " --pfam: peform PFAM domain annotation" echo " --swissprot: search for homologs in the swissprot database" + echo " --vfdb: search for homologs in the virulence factor database" echo " --ref_dir: directory where the reference databases were setup up (default zdb_ref)" echo " --cpu: number of parallel processes allowed (default 8)" echo " --mem: max memory usage allowed (default 8GB)" diff --git a/conda/annotation.yaml 
b/conda/annotation.yaml index d9b8d5462..9c47f4e50 100644 --- a/conda/annotation.yaml +++ b/conda/annotation.yaml @@ -2,7 +2,6 @@ name: annotation-base channels: - anaconda - conda-forge - - metagenlab dependencies: - python=3.9 - whoosh @@ -10,4 +9,4 @@ dependencies: - biopython>=1.8 - numpy - pandas - - metagenlab_libs + - xlrd diff --git a/db_setup.nf b/db_setup.nf index aad33247e..37d667f3e 100644 --- a/db_setup.nf +++ b/db_setup.nf @@ -149,6 +149,40 @@ process prepare_swissprot { """ } +process download_vfdb { + + output: + tuple path("vfdb.fasta"), path("VFs.xls") + + script: + """ + wget http://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz + wget http://www.mgc.ac.cn/VFs/Down/VFs.xls.gz + gunzip < VFDB_setB_pro.fas.gz > vfdb.fasta + gunzip < VFs.xls.gz > VFs.xls + rm VFDB_setB_pro.fas.gz + rm VFs.xls.gz + """ +} + +process prepare_vfdb { + container "$params.blast_container" + conda "$baseDir/conda/blast.yaml" + + publishDir "$params.vf_db", mode: "move" + + input: + tuple path(vfdb_fasta), path(vf_descr) + + output: + path("*", includeInputs: true) + + script: + """ + makeblastdb -dbtype prot -in $vfdb_fasta + """ +} + workflow setup_cogg_db { download_cog_cdd() | setup_cog_cdd } @@ -174,6 +208,10 @@ workflow setup_swissprot_db { download_swissprot() | prepare_swissprot } +workflow setup_vfdb { + download_vfdb() | prepare_vfdb +} + workflow { if( params.cog ) setup_cogg_db() @@ -189,4 +227,7 @@ workflow { if ( params.blast_swissprot ) setup_swissprot_db() + + if ( params.vfdb ) + setup_vfdb() } diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 658d1b088..1493c44c6 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -17,6 +17,7 @@ to [Common Changelog](https://common-changelog.org) ### Added +- Add virulence factor annotations. ([#56](https://github.com/metagenlab/zDB/pull/56)) (Niklaus Johner) - Store reference DB and software versions in DB.
([#54](https://github.com/metagenlab/zDB/pull/54)) (Niklaus Johner) - Add script to compute rendering times of various views. ([#51](https://github.com/metagenlab/zDB/pull/51)) (Niklaus Johner) - Add possibility to dump HTML responses from webapp tests. ([#50](https://github.com/metagenlab/zDB/pull/50)) (Niklaus Johner) diff --git a/nextflow.config b/nextflow.config index 2c5fbcd72..63d4f6f00 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,6 +11,7 @@ params.pfam_db = "${params.base_db}/pfam/" params.cog_db = "${params.base_db}/cog/" params.ko_db = "${params.base_db}/kegg/" params.refseq_db = "${params.base_db}/refseq/" +params.vf_db = "${params.base_db}/vfdb/" params.results_dir = "zdb/results" @@ -22,6 +23,7 @@ params.cog = false params.ko = false params.pfam = false params.amr = false +params.vfdb = false params.checkm_args = "domain Bacteria" @@ -59,7 +61,7 @@ docker.enabled = "$params.docker" == "true"?true:false conda.enabled = "$params.conda" == "true"?true:false conda.useMamba = true -params.annotation_container = "registry.hub.docker.com/metagenlab/annotation-pipeline:1.3.1" +params.annotation_container = "registry.hub.docker.com/metagenlab/annotation-pipeline:1.4.0" params.zdb_container = "docker-daemon://metagenlab/zdb:latest" params.checkm_container = "quay.io/biocontainers/checkm-genome:1.2.1--pyhdfd78af_0" diff --git a/testing/pipelines/test_annotation_pipeline.py b/testing/pipelines/test_annotation_pipeline.py index c239fe447..187a8d38a 100644 --- a/testing/pipelines/test_annotation_pipeline.py +++ b/testing/pipelines/test_annotation_pipeline.py @@ -174,6 +174,20 @@ def test_amr_hits(self): self.assert_db_base_table_row_counts() self.assertEqual(2, self.query("amr_hits").count()) + def test_vfdb_hits(self): + self.nf_params["vfdb"] = "true" + execution = self.execute_pipeline() + self.assert_success(execution) + self.load_db(execution) + + # Let's check that tables were correctly created and filled + self.assertItemsEqual( + base_tables + 
['vf_hits', 'vf_defs'], + self.metadata_obj.tables.keys()) + self.assert_db_base_table_row_counts() + self.assertEqual(0, self.query("vf_hits").count()) + self.assertEqual(0, self.query("vf_defs").count()) + def test_full_pipeline(self): self.nf_params["pfam"] = "true" self.nf_params["ko"] = "true" diff --git a/testing/pipelines/test_db_setup.py b/testing/pipelines/test_db_setup.py index 7b56a9110..b23b06504 100644 --- a/testing/pipelines/test_db_setup.py +++ b/testing/pipelines/test_db_setup.py @@ -108,3 +108,20 @@ def test_creating_swissprot_db(self): self.assertItemsEqual( expected_files, os.listdir(os.path.join(self.ref_db_dir, "uniprot", "swissprot"))) + + def test_creating_vf_db(self): + self.nf_params["vfdb"] = "true" + execution = self.execute_pipeline() + self.assert_success(execution) + + self.assertEqual([proc.name for proc in execution.process_executions], + ["setup_vfdb:download_vfdb"]) + + download_process = execution.process_executions[0] + # Files are moved to zdb_ref/vfdb + expected_files = ['vfdb.fasta', + 'VFs.xls'] + self.assert_created_files(download_process, []) + self.assertItemsEqual( + expected_files, + os.listdir(os.path.join(self.ref_db_dir, "vfdb"))) diff --git a/utils/setup_base_db.py b/utils/setup_base_db.py index 94e96c8a1..37c953255 100755 --- a/utils/setup_base_db.py +++ b/utils/setup_base_db.py @@ -68,6 +68,7 @@ def create_data_table(db): ("phylogenetic_profile", "optional", False), ("gbk_files", "optional", False), ("AMR", "optional", False), + ("BLAST_vfdb", "optional", False), ] db.load_chlamdb_config_tables(entry_list) db.commit() diff --git a/webapp/lib/db_utils.py b/webapp/lib/db_utils.py index 370784bee..1c7123fe9 100644 --- a/webapp/lib/db_utils.py +++ b/webapp/lib/db_utils.py @@ -299,6 +299,38 @@ def load_swissprot_hits(self, data): def load_swissprot_defs(self, data): self.load_data_into_table("swissprot_defs", data) + def create_vf_tables(self): + query = ( + "CREATE TABLE vf_hits (" + " hsh INT, prot_id INT, evalue INT, 
score INT," + " perc_id INT, gaps INT, leng INT" + ");" + ) + self.server.adaptor.execute(query,) + query = ( + "CREATE INDEX vfhi ON vf_hits(hsh);" + ) + self.server.adaptor.execute(query,) + query = ( + "CREATE TABLE vf_defs (" + " prot_id INT, vfdb_id varchar(15), gb_accession varchar(15)," + " prot_name tinytext, vfid varchar(10), category tinytext," + " characteristics TEXT, structure TEXT, function TEXT," + " mechanism TEXT" + ");" + ) + self.server.adaptor.execute(query,) + query = ( + "CREATE INDEX vfdi ON vf_defs(prot_id);" + ) + self.server.adaptor.execute(query,) + + def load_vf_hits(self, data): + self.load_data_into_table("vf_hits", data) + + def load_vf_defs(self, data): + self.load_data_into_table("vf_defs", data) + def create_diamond_refseq_match_id(self): query = ( "CREATE TABLE diamond_refseq_match_id ( "