Skip to content

Commit

Permalink
Add option to blast against VFDB and store the hits in the DB.
Browse files Browse the repository at this point in the history
  • Loading branch information
njohner committed Feb 13, 2024
1 parent 96a84fe commit 13da27d
Show file tree
Hide file tree
Showing 8 changed files with 185 additions and 2 deletions.
51 changes: 51 additions & 0 deletions annotation_pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,24 @@ process execute_amrscan {
"""
}

// Blast each chunk of protein sequences against the VFDB (virulence
// factor database) reference fasta. Emits one tabular blast result
// file (-outfmt 6) per query chunk.
process blast_vfdb {
container "$params.blast_container"
conda "$baseDir/conda/blast.yaml"

input:
// vf_db: directory holding the pre-formatted vfdb.fasta blast database
// seq: fasta file with the query protein sequences
tuple (path(vf_db), path(seq))

output:
path '*tab'

script:

// Name the result file after the query file.
n = seq.name
"""
blastp -db $vf_db/vfdb.fasta -query ${n} -outfmt 6 -evalue 10.0 > ${n}.tab
"""
}

process setup_db {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"
Expand Down Expand Up @@ -797,6 +815,32 @@ process load_amr_into_db {
"""
}

// Parse the collected VFDB blast result tables and load the hits plus
// the matching virulence factor definitions (from vfdb.fasta headers
// and the VFs.xls spreadsheet) into the annotation database.
process load_vfdb_hits_into_db {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"

input:
// db: the annotation database to extend (also emitted as output)
path db
// blast_results: all per-chunk tabular blast result files
path blast_results
// vf_db_fasta / vf_db_defs: VFDB reference fasta and VFs.xls metadata
path vf_db_fasta
path vf_db_defs

output:
path db

script:
"""
#!/usr/bin/env python
import setup_chlamdb
kwargs = ${gen_python_args()}
blast_results = "$blast_results".split()
setup_chlamdb.load_vfdb_hits(kwargs, blast_results, "$db", "$vf_db_fasta", "$vf_db_defs")
"""
}


process create_chlamdb_search_index {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"
Expand Down Expand Up @@ -959,6 +1003,13 @@ workflow {
db = load_amr_into_db(amr_table.collect(), db, amr_version)
}

// Optional VFDB annotation: blast every sequence chunk against the
// virulence factor reference DB, then load the collected hits and the
// corresponding definitions into the annotation database.
if(params.vfdb) {
vf_ref_db = Channel.fromPath("$params.vf_db", type: "dir")
// Pair the reference DB directory with each chunk of sequences.
db_seq_combined = vf_ref_db.combine(split_nr_seqs)
vfdb_blast = blast_vfdb(db_seq_combined)
db = load_vfdb_hits_into_db(db, vfdb_blast.collect(), Channel.fromPath("$params.vf_db/vfdb.fasta"), Channel.fromPath("$params.vf_db/VFs.xls"))
}

(to_index_cleanup, to_db_cleanup) = create_chlamdb_search_index(db)
cleanup(to_index_cleanup, to_db_cleanup, mafft_alignments.collect(), Channel.fromPath("${params.results_dir}", type: "dir"), to_cleanup_gbks)
}
Expand Down
82 changes: 80 additions & 2 deletions bin/setup_chlamdb.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import os
import re
from collections import defaultdict, namedtuple
from datetime import datetime

import KO_module
import pandas as pd
import search_bar
import xlrd
from Bio import SeqIO, SeqUtils
from db_utils import DB

Expand Down Expand Up @@ -518,7 +521,7 @@ def load_pfam(params, pfam_files, db, pfam_def_file, ref_db_dir):
db.commit()


class SwissProtIdCount(defaultdict):
class ProtIdCounter(defaultdict):
def __init__(self):
super().__init__()
self.id_count = 0
Expand Down Expand Up @@ -562,9 +565,29 @@ def parse_swissprot_id(to_parse):
return db, ident, name


# Pattern for VFDB sequence IDs that embed a GenBank accession, i.e.
# "<vfdb_id>(gb|<accession>)".
vfdb_id_expr = re.compile(r"(.*)\(gb\|(.*)\)")


def parse_vfdb_id(to_parse):
    """Split a vfdb.fasta sequence ID into ``(vfdb_id, gb_accession)``.

    IDs either look like ``VFID(gb|ACCESSION)`` or are a bare ``VFID``;
    for bare IDs the accession is ``None``.
    """
    if "(" not in to_parse:
        return to_parse, None
    match = vfdb_id_expr.match(to_parse)
    return match.groups()


# Matches the free-text part of a vfdb.fasta header, which looks like:
# "(<gene>) <protein name> [<VF name> (<VFID>) - <category>] [<organism>]"
vfdb_descr_expr = re.compile(r"\(.*?\) (.*?) \[.*?\((VF.*?)\).*?\] \[.*?\]")


def parse_vfdb_entry(description):
    """Extract ``(protein name, VFID)`` from a vfdb.fasta record description.

    The leading whitespace-delimited token (the sequence ID) is dropped
    before matching the remainder against ``vfdb_descr_expr``.
    """
    remainder = description.split(" ", 1)[1]
    return vfdb_descr_expr.match(remainder).groups()


def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db_dir):
db = DB.load_db(db_name, params)
hsh_swissprot_id = SwissProtIdCount()
hsh_swissprot_id = ProtIdCounter()
db.create_swissprot_tables()

# Note: this is not really scalable to x genomes, as
Expand Down Expand Up @@ -607,6 +630,61 @@ def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db
db.commit()


def load_vfdb_hits(params, blast_results, db_name, vfdb_fasta, vfdb_defs):
    """Load VFDB blast hits and virulence factor definitions into the DB.

    Args:
        params: pipeline parameters forwarded to ``DB.load_db``.
        blast_results: list of tabular (-outfmt 6) blast result files.
        db_name: path/name of the annotation database.
        vfdb_fasta: the vfdb.fasta reference used for the blast searches.
        vfdb_defs: the VFs.xls spreadsheet with per-VF metadata.
    """
    db = DB.load_db(db_name, params)
    hsh_prot_id = ProtIdCounter()
    db.create_vf_tables()

    # Note: this is not really scalable to x genomes, as
    # it necessitates to keep the prot id in memory
    # may need to process the blast results in batch instead (slower but
    # spares memory).
    for blast_file in blast_results:
        data = []
        with open(blast_file, "r") as blast_fh:
            for line in blast_fh:
                # Standard blast -outfmt 6 columns.
                crc, gene_id, perid, leng, n_mis, n_gap, qs, qe, ss, se, e, score = line.split()
                hsh = simplify_hash(crc)
                # Map the VFDB sequence ID to a compact integer prot id.
                prot_id, _ = parse_vfdb_id(gene_id)
                db_prot_id = hsh_prot_id[prot_id]
                data.append((hsh, db_prot_id, float(e), int(float(score)),
                             int(float(perid)), int(n_gap), int(leng)))
        db.load_vf_hits(data)

    # Definitions are constructed from two data sources.
    # vfdb_fasta contains matching entries for every hit, with a
    # VFID relating to more information in VFs.xls
    vf_defs = pd.read_excel(vfdb_defs, header=1)
    vf_defs = vf_defs.set_index("VFID")

    vfdb_prot_defs = []
    for record in SeqIO.parse(vfdb_fasta, "fasta"):
        prot_id, gb_accession = parse_vfdb_id(record.name)
        # Membership test does not insert into the defaultdict: only
        # reference entries that were actually hit are kept.
        if prot_id not in hsh_prot_id:
            continue
        db_prot_id = hsh_prot_id[prot_id]
        prot_name, vfid = parse_vfdb_entry(record.description)
        # Get info from definitions
        vf_data = vf_defs.loc[vfid]
        vfdb_prot_defs.append(
            (db_prot_id, prot_id, gb_accession, prot_name, vfid,
             vf_data.VFcategory, vf_data.Characteristics, vf_data.Structure,
             vf_data.Function, vf_data.Mechanism))
    db.load_vf_defs(vfdb_prot_defs)
    db.set_status_in_config_table("BLAST_vfdb", 1)
    db.commit()

    # There is no version of the VFdb, only a download date
    # As there is a new version of the DB every week, the download date
    # will have to do.
    book = xlrd.open_workbook(vfdb_defs)
    sheet = book.sheet_by_index(0)
    # The download date sits in brackets in the header row, e.g.
    # "[Fri Feb 9 10:00:00 2024]".
    download_date = re.match(r".*\[(.*)\]", sheet.row(0)[3].value).groups()[0]
    download_date = datetime.strptime(download_date, "%a %b %d %H:%M:%S %Y")
    db.load_data_into_table("versions", [("VFDB", download_date.date().isoformat())])
    db.commit()


# NOTE:
# Several KO marked as significant can be assigned to the same locus
# only take the hit with the lowest evalue (the first in the list)
Expand Down
5 changes: 5 additions & 0 deletions bin/zdb
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,10 @@ elif [[ "$1" == "run" ]]; then
args="${args} --pfam"
shift
;;
--vfdb)
args="${args} --vfdb"
shift
;;
--singularity_dir=*)
singularity_dir=$(realpath ${i#*=})
shift
Expand All @@ -503,6 +507,7 @@ elif [[ "$1" == "run" ]]; then
echo " --ko: perform ko (metabolism) annotation"
echo " --pfam: peform PFAM domain annotation"
echo " --swissprot: search for homologs in the swissprot database"
echo " --vfdb: search for homologs in the virulence factor database"
echo " --ref_dir: directory where the reference databases were setup up (default zdb_ref)"
echo " --cpu: number of parallel processes allowed (default 8)"
echo " --mem: max memory usage allowed (default 8GB)"
Expand Down
1 change: 1 addition & 0 deletions conda/annotation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ dependencies:
- numpy
- pandas
- metagenlab_libs
- xlrd
1 change: 1 addition & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ to [Common Changelog](https://common-changelog.org)

### Added

- Add virulence factor annotations. ([#56](https://github.com/metagenlab/zDB/pull/56)) (Niklaus Johner)
- Store reference DB and software versions in DB. ([#54](https://github.com/metagenlab/zDB/pull/54)) (Niklaus Johner)
- Add script to compute rendering times of various views. ([#51](https://github.com/metagenlab/zDB/pull/51)) (Niklaus Johner)
- Add possibility to dump HTML responses from webapp tests. ([#50](https://github.com/metagenlab/zDB/pull/50)) (Niklaus Johner)
Expand Down
14 changes: 14 additions & 0 deletions testing/pipelines/test_annotation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,20 @@ def test_amr_hits(self):
self.assert_db_base_table_row_counts()
self.assertEqual(2, self.query("amr_hits").count())

    def test_vfdb_hits(self):
        """Run the pipeline with VFDB annotation and check the resulting DB."""
        self.nf_params["vfdb"] = "true"
        execution = self.execute_pipeline()
        self.assert_success(execution)
        self.load_db(execution)

        # Let's check that tables were correctly created and filled
        # NOTE(review): assertItemsEqual is the Python 2 unittest API
        # (assertCountEqual in Python 3) — presumably provided by the
        # test base class; confirm.
        self.assertItemsEqual(
            base_tables + ['vf_hits', 'vf_defs'],
            self.metadata_obj.tables.keys())
        self.assert_db_base_table_row_counts()
        # No hits are expected for the test data: the tables must exist
        # but stay empty.
        self.assertEqual(0, self.query("vf_hits").count())
        self.assertEqual(0, self.query("vf_defs").count())

def test_full_pipeline(self):
self.nf_params["pfam"] = "true"
self.nf_params["ko"] = "true"
Expand Down
1 change: 1 addition & 0 deletions utils/setup_base_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def create_data_table(db):
("phylogenetic_profile", "optional", False),
("gbk_files", "optional", False),
("AMR", "optional", False),
("BLAST_vfdb", "optional", False),
]
db.load_chlamdb_config_tables(entry_list)
db.commit()
Expand Down
32 changes: 32 additions & 0 deletions webapp/lib/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,38 @@ def load_swissprot_hits(self, data):
def load_swissprot_defs(self, data):
self.load_data_into_table("swissprot_defs", data)

def create_vf_tables(self):
query = (
"CREATE TABLE vf_hits ("
" hsh INT, prot_id INT, evalue INT, score INT,"
" perc_id INT, gaps INT, leng INT"
");"
)
self.server.adaptor.execute(query,)
query = (
"CREATE INDEX vfhi ON vf_hits(hsh);"
)
self.server.adaptor.execute(query,)
query = (
"CREATE TABLE vf_defs ("
" prot_id INT, vfdb_id varchar(15), gb_accession varchar(15),"
" prot_name tinytext, vfid varchar(10), category tinytext,"
" characteristics TEXT, structure TEXT, function TEXT,"
" mechanism TEXT"
");"
)
self.server.adaptor.execute(query,)
query = (
"CREATE INDEX vfdi ON vf_defs(prot_id);"
)
self.server.adaptor.execute(query,)

    def load_vf_hits(self, data):
        # Bulk-insert blast hit tuples into the vf_hits table.
        self.load_data_into_table("vf_hits", data)

    def load_vf_defs(self, data):
        # Bulk-insert virulence factor definition tuples into vf_defs.
        self.load_data_into_table("vf_defs", data)

def create_diamond_refseq_match_id(self):
query = (
"CREATE TABLE diamond_refseq_match_id ( "
Expand Down

0 comments on commit 13da27d

Please sign in to comment.