Merge pull request #56 from metagenlab/nj/vfdb

Add virulence factor annotations.
metagenlab · Feb 14, 2024 · 820a2fe · 820a2fe
2 parents 9f25f49 + f3b0ef7
commit 820a2fe
Show file tree

Hide file tree

Showing 11 changed files with 251 additions and 5 deletions.
diff --git a/annotation_pipeline.nf b/annotation_pipeline.nf
@@ -509,6 +509,24 @@ process execute_amrscan {
   """
 }
 
+process blast_vfdb {
+  container "$params.blast_container"
+  conda "$baseDir/conda/blast.yaml"
+
+  input:
+      tuple (path(vf_db), path(seq))
+
+  output:
+      path '*tab'
+
+  script:
+
+  n = seq.name
+  """
+  blastp -db $vf_db/vfdb.fasta -query ${n} -outfmt 6 -evalue 10.0 > ${n}.tab
+  """
+}
+
 process setup_db {
     container "$params.annotation_container"
     conda "$baseDir/conda/annotation.yaml"
@@ -797,6 +815,32 @@ process load_amr_into_db {
         """
 }
 
+process load_vfdb_hits_into_db {
+    container "$params.annotation_container"
+    conda "$baseDir/conda/annotation.yaml"
+
+    input:
+        path db
+        path blast_results
+        path vf_db_fasta
+        path vf_db_defs
+
+    output:
+        path db
+
+    script:
+    """
+        #!/usr/bin/env python
+        import setup_chlamdb
+
+        kwargs = ${gen_python_args()}
+        blast_results = "$blast_results".split()
+
+        setup_chlamdb.load_vfdb_hits(kwargs, blast_results, "$db", "$vf_db_fasta", "$vf_db_defs")
+    """
+}
+
+
 process create_chlamdb_search_index {
     container "$params.annotation_container"
     conda "$baseDir/conda/annotation.yaml"
@@ -959,6 +1003,13 @@ workflow {
         db = load_amr_into_db(amr_table.collect(), db, amr_version)
     }
 
+    if(params.vfdb) {
+        vf_ref_db = Channel.fromPath("$params.vf_db", type: "dir")
+        db_seq_combined = vf_ref_db.combine(split_nr_seqs)
+        vfdb_blast = blast_vfdb(db_seq_combined)
+        db = load_vfdb_hits_into_db(db, vfdb_blast.collect(), Channel.fromPath("$params.vf_db/vfdb.fasta"), Channel.fromPath("$params.vf_db/VFs.xls"))
+    }
+
     (to_index_cleanup, to_db_cleanup) = create_chlamdb_search_index(db)
     cleanup(to_index_cleanup, to_db_cleanup, mafft_alignments.collect(), Channel.fromPath("${params.results_dir}", type: "dir"), to_cleanup_gbks)
 }

diff --git a/bin/setup_chlamdb.py b/bin/setup_chlamdb.py
@@ -1,9 +1,12 @@
 import os
+import re
 from collections import defaultdict, namedtuple
+from datetime import datetime
 
 import KO_module
 import pandas as pd
 import search_bar
+import xlrd
 from Bio import SeqIO, SeqUtils
 from db_utils import DB
 
@@ -518,7 +521,7 @@ def load_pfam(params, pfam_files, db, pfam_def_file, ref_db_dir):
     db.commit()
 
 
-class SwissProtIdCount(defaultdict):
+class ProtIdCounter(defaultdict):
     def __init__(self):
         super().__init__()
         self.id_count = 0
@@ -562,9 +565,29 @@ def parse_swissprot_id(to_parse):
     return db, ident, name
 
 
+vfdb_id_expr = re.compile(r"(.*)\(gb\|(.*)\)")
+
+
+def parse_vfdb_id(to_parse):
+    """IDs in vfdb.fasta are either of the form VFID(gb_accession) or VFID
+    """
+    if "(" in to_parse:
+        return vfdb_id_expr.match(to_parse).groups()
+    return to_parse, None
+
+
+vfdb_descr_expr = re.compile(r"\(.*?\) (.*?) \[.*?\((VF.*?)\).*?\] \[.*?\]")
+
+
+def parse_vfdb_entry(description):
+    description = description.split(" ", 1)[1]
+    prot_name, vfid = vfdb_descr_expr.match(description).groups()
+    return prot_name, vfid
+
+
 def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db_dir):
     db = DB.load_db(db_name, params)
-    hsh_swissprot_id = SwissProtIdCount()
+    hsh_swissprot_id = ProtIdCounter()
     db.create_swissprot_tables()
 
     # Note: this is not really scalable to x genomes, as
@@ -607,6 +630,61 @@ def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db
     db.commit()
 
 
+def load_vfdb_hits(params, blast_results, db_name, vfdb_fasta, vfdb_defs):
+    db = DB.load_db(db_name, params)
+    hsh_prot_id = ProtIdCounter()
+    db.create_vf_tables()
+
+    # Note: this is not really scalable to x genomes, as
+    # it necessitates to keep the prot id in memory
+    # may need to process the blast results in batch instead (slower but
+    # spares memory).
+    for blast_file in blast_results:
+        data = []
+        with open(blast_file, "r") as blast_fh:
+            for line in blast_fh:
+                crc, gene_id, perid, leng, n_mis, n_gap, qs, qe, ss, se, e, score = line.split()
+                hsh = simplify_hash(crc)
+                prot_id, _ = parse_vfdb_id(gene_id)
+                db_prot_id = hsh_prot_id[prot_id]
+                data.append((hsh, db_prot_id, float(e), int(float(score)),
+                             int(float(perid)), int(n_gap), int(leng)))
+        db.load_vf_hits(data)
+
+    # Definitions are constructed from two data sources.
+    # vfdb_fasta contains matching entries for every hit, with a
+    # VFID relating to more information in VFs.xls
+    vf_defs = pd.read_excel(vfdb_defs, header=1)
+    vf_defs = vf_defs.set_index("VFID")
+
+    vfdb_prot_defs = []
+    for record in SeqIO.parse(vfdb_fasta, "fasta"):
+        prot_id, gb_accession = parse_vfdb_id(record.name)
+        if prot_id not in hsh_prot_id:
+            continue
+        db_prot_id = hsh_prot_id[prot_id]
+        prot_name, vfid = parse_vfdb_entry(record.description)
+        # Get info from definitions
+        vf_data = vf_defs.loc[vfid]
+        vfdb_prot_defs.append(
+            (db_prot_id, prot_id, gb_accession, prot_name, vfid,
+             vf_data.VFcategory, vf_data.Characteristics, vf_data.Structure,
+             vf_data.Function, vf_data.Mechanism))
+    db.load_vf_defs(vfdb_prot_defs)
+    db.set_status_in_config_table("BLAST_vfdb", 1)
+    db.commit()
+
+    # There is no version of the VFdb, only a download date
+    # As there is a new version of the DB every week, the download date
+    # will have to do.
+    book = xlrd.open_workbook(vfdb_defs)
+    sheet = book.sheet_by_index(0)
+    download_date = re.match(r".*\[(.*)\]", sheet.row(0)[3].value).groups()[0]
+    download_date = datetime.strptime(download_date, "%a %b %d %H:%M:%S %Y")
+    db.load_data_into_table("versions", [("VFDB", download_date.date().isoformat())])
+    db.commit()
+
+
 # NOTE:
 # Several KO marked as significant can be assigned to the same locus
 # only take the hit with the lowest evalue (the first in the list)

diff --git a/bin/zdb b/bin/zdb
@@ -119,6 +119,10 @@ elif [[ "$1" == "setup" ]]; then
 				db_setup_args="${db_setup_args} --pfam"
 				shift
 				;;
+			--vfdb)
+				db_setup_args="${db_setup_args} --vfdb"
+				shift
+				;;
 			--singularity_dir=*)
 				singularity_dir=${i#*=}
 				shift
@@ -135,6 +139,7 @@ elif [[ "$1" == "setup" ]]; then
 				echo " --ko: downloads and setups the hmm profiles of the ko database"
 				echo " --pfam: downloads and setups up the hmm profiles of the PFAM protein domains"
 				echo " --swissprot: downloads and indexes the swissprot database "
+				echo " --vfdb: downloads the virulence factor database "
 				echo ""
 				echo "Other parameters:"
 				echo " --dir: directory where to store the reference databases (defaults zdb_ref in the current directory)"
@@ -474,6 +479,10 @@ elif [[ "$1" == "run" ]]; then
 				args="${args} --pfam"
 				shift
 				;;
+			--vfdb)
+				args="${args} --vfdb"
+				shift
+				;;
 			--singularity_dir=*)
 				singularity_dir=$(realpath ${i#*=})
 				shift
@@ -498,6 +507,7 @@ elif [[ "$1" == "run" ]]; then
 				echo " --ko: perform ko (metabolism) annotation"
 				echo " --pfam: peform PFAM domain annotation"
 				echo " --swissprot: search for homologs in the swissprot database"
+				echo " --vfdb: search for homologs in the virulence factor database"
 				echo " --ref_dir: directory where the reference databases were setup up (default zdb_ref)"
 				echo " --cpu: number of parallel processes allowed (default 8)"
 				echo " --mem: max memory usage allowed (default 8GB)"

diff --git a/conda/annotation.yaml b/conda/annotation.yaml
@@ -2,12 +2,11 @@ name: annotation-base
 channels:
     - anaconda
     - conda-forge
-    - metagenlab
 dependencies:
     - python=3.9
     - whoosh
     - sqlite
     - biopython>=1.8
     - numpy
     - pandas
-    - metagenlab_libs
+    - xlrd
diff --git a/db_setup.nf b/db_setup.nf
@@ -149,6 +149,40 @@ process prepare_swissprot {
     """
 }
 
+process download_vfdb {
+
+    output:
+        tuple path("vfdb.fasta"), path("VFs.xls")
+
+    script:
+    """
+    wget http://www.mgc.ac.cn/VFs/Down/VFDB_setB_nt.fas.gz
+    wget http://www.mgc.ac.cn/VFs/Down/VFs.xls.gz
+    gunzip < VFDB_setB_nt.fas.gz > vfdb.fasta
+    gunzip < VFs.xls.gz > VFs.xls
+    rm VFDB_setB_nt.fas.gz
+    rm VFs.xls.gz
+    """
+}
+
+process prepare_vfdb {
+    container "$params.blast_container"
+    conda "$baseDir/conda/blast.yaml"
+
+    publishDir "$params.vf_db", mode: "move"
+
+    input:
+        tuple path(vfdb_fasta), path(vf_descr)
+
+    output:
+        path("*", includeInputs: true)
+
+    script:
+    """
+    makeblastdb -dbtype prot -in $vfdb_fasta
+    """
+}
+
 workflow setup_cogg_db {
     download_cog_cdd() | setup_cog_cdd
 }
@@ -174,6 +208,10 @@ workflow setup_swissprot_db {
     download_swissprot() | prepare_swissprot
 }
 
+workflow setup_vfdb {
+    download_vfdb() | prepare_vfdb
+}
+
 workflow {
     if( params.cog )
         setup_cogg_db()
@@ -189,4 +227,7 @@ workflow {
 
     if ( params.blast_swissprot )
         setup_swissprot_db()
+
+    if ( params.vfdb )
+        setup_vfdb()
 }
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -17,6 +17,7 @@ to [Common Changelog](https://common-changelog.org)
 
 ### Added
 
+- Add virulence factor annotations. ([#56](https://github.com/metagenlab/zDB/pull/56)) (Niklaus Johner)
 - Store reference DB and software versions in DB. ([#54](https://github.com/metagenlab/zDB/pull/54)) (Niklaus Johner)
 - Add script to compute rendering times of various views. ([#51](https://github.com/metagenlab/zDB/pull/51)) (Niklaus Johner)
 - Add possibility to dump HTML responses from webapp tests. ([#50](https://github.com/metagenlab/zDB/pull/50)) (Niklaus Johner)

diff --git a/nextflow.config b/nextflow.config
@@ -11,6 +11,7 @@ params.pfam_db = "${params.base_db}/pfam/"
 params.cog_db = "${params.base_db}/cog/"
 params.ko_db = "${params.base_db}/kegg/"
 params.refseq_db = "${params.base_db}/refseq/"
+params.vf_db = "${params.base_db}/vfdb/"
 
 params.results_dir = "zdb/results"
 
@@ -22,6 +23,7 @@ params.cog = false
 params.ko = false
 params.pfam = false
 params.amr = false
+params.vfdb = false
 
 params.checkm_args = "domain Bacteria"
 
@@ -59,7 +61,7 @@ docker.enabled = "$params.docker" == "true"?true:false
 conda.enabled = "$params.conda" == "true"?true:false
 conda.useMamba = true
 
-params.annotation_container = "registry.hub.docker.com/metagenlab/annotation-pipeline:1.3.1"
+params.annotation_container = "registry.hub.docker.com/metagenlab/annotation-pipeline:1.4.0"
 
 params.zdb_container = "docker-daemon://metagenlab/zdb:latest"
 params.checkm_container = "quay.io/biocontainers/checkm-genome:1.2.1--pyhdfd78af_0"

diff --git a/testing/pipelines/test_annotation_pipeline.py b/testing/pipelines/test_annotation_pipeline.py
@@ -174,6 +174,20 @@ def test_amr_hits(self):
         self.assert_db_base_table_row_counts()
         self.assertEqual(2, self.query("amr_hits").count())
 
+    def test_vfdb_hits(self):
+        self.nf_params["vfdb"] = "true"
+        execution = self.execute_pipeline()
+        self.assert_success(execution)
+        self.load_db(execution)
+
+        # Let's check that tables were correctly created and filled
+        self.assertItemsEqual(
+            base_tables + ['vf_hits', 'vf_defs'],
+            self.metadata_obj.tables.keys())
+        self.assert_db_base_table_row_counts()
+        self.assertEqual(0, self.query("vf_hits").count())
+        self.assertEqual(0, self.query("vf_defs").count())
+
     def test_full_pipeline(self):
         self.nf_params["pfam"] = "true"
         self.nf_params["ko"] = "true"

diff --git a/testing/pipelines/test_db_setup.py b/testing/pipelines/test_db_setup.py
@@ -108,3 +108,20 @@ def test_creating_swissprot_db(self):
         self.assertItemsEqual(
             expected_files,
             os.listdir(os.path.join(self.ref_db_dir, "uniprot", "swissprot")))
+
+    def test_creating_vf_db(self):
+        self.nf_params["vfdb"] = "true"
+        execution = self.execute_pipeline()
+        self.assert_success(execution)
+
+        self.assertEqual([proc.name for proc in execution.process_executions],
+                         ["setup_vfdb:download_vfdb"])
+
+        download_process = execution.process_executions[0]
+        # Files are moved to zdb_ref/vfdb
+        expected_files = ['vfdb.fasta',
+                          'VFs.xls']
+        self.assert_created_files(download_process, [])
+        self.assertItemsEqual(
+            expected_files,
+            os.listdir(os.path.join(self.ref_db_dir, "vfdb")))
diff --git a/utils/setup_base_db.py b/utils/setup_base_db.py
@@ -68,6 +68,7 @@ def create_data_table(db):
         ("phylogenetic_profile", "optional", False),
         ("gbk_files", "optional", False),
         ("AMR", "optional", False),
+        ("BLAST_vfdb", "optional", False),
     ]
     db.load_chlamdb_config_tables(entry_list)
     db.commit()