Skip to content

Commit

Permalink
Add option to blast against VFDB and store the hits in the DB.
Browse files Browse the repository at this point in the history
  • Loading branch information
njohner committed Feb 13, 2024
1 parent 96a84fe commit 13da27d
Show file tree
Hide file tree
Showing 8 changed files with 185 additions and 2 deletions.
51 changes: 51 additions & 0 deletions annotation_pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,24 @@ process execute_amrscan {
"""
}

// Blast each chunk of protein sequences against the VFDB (virulence
// factor database) reference fasta. Emits one tabular blast result
// file (-outfmt 6) per query chunk.
process blast_vfdb {
container "$params.blast_container"
conda "$baseDir/conda/blast.yaml"

input:
// vf_db: directory holding the pre-formatted vfdb.fasta blast database
// seq: fasta file with the query protein sequences
tuple (path(vf_db), path(seq))

output:
path '*tab'

script:

// Name the result file after the query file.
n = seq.name
"""
blastp -db $vf_db/vfdb.fasta -query ${n} -outfmt 6 -evalue 10.0 > ${n}.tab
"""
}

process setup_db {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"
Expand Down Expand Up @@ -797,6 +815,32 @@ process load_amr_into_db {
"""
}

// Parse the collected VFDB blast result tables and load the hits plus
// the matching virulence factor definitions (from vfdb.fasta headers
// and the VFs.xls spreadsheet) into the annotation database.
process load_vfdb_hits_into_db {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"

input:
// db: the annotation database to extend (also emitted as output)
path db
// blast_results: all per-chunk tabular blast result files
path blast_results
// vf_db_fasta / vf_db_defs: VFDB reference fasta and VFs.xls metadata
path vf_db_fasta
path vf_db_defs

output:
path db

script:
"""
#!/usr/bin/env python
import setup_chlamdb
kwargs = ${gen_python_args()}
blast_results = "$blast_results".split()
setup_chlamdb.load_vfdb_hits(kwargs, blast_results, "$db", "$vf_db_fasta", "$vf_db_defs")
"""
}


process create_chlamdb_search_index {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"
Expand Down Expand Up @@ -959,6 +1003,13 @@ workflow {
db = load_amr_into_db(amr_table.collect(), db, amr_version)
}

// Optional VFDB annotation: blast every sequence chunk against the
// virulence factor reference DB, then load the collected hits and the
// corresponding definitions into the annotation database.
if(params.vfdb) {
vf_ref_db = Channel.fromPath("$params.vf_db", type: "dir")
// Pair the reference DB directory with each chunk of sequences.
db_seq_combined = vf_ref_db.combine(split_nr_seqs)
vfdb_blast = blast_vfdb(db_seq_combined)
db = load_vfdb_hits_into_db(db, vfdb_blast.collect(), Channel.fromPath("$params.vf_db/vfdb.fasta"), Channel.fromPath("$params.vf_db/VFs.xls"))
}

(to_index_cleanup, to_db_cleanup) = create_chlamdb_search_index(db)
cleanup(to_index_cleanup, to_db_cleanup, mafft_alignments.collect(), Channel.fromPath("${params.results_dir}", type: "dir"), to_cleanup_gbks)
}
Expand Down
82 changes: 80 additions & 2 deletions bin/setup_chlamdb.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import os
import re
from collections import defaultdict, namedtuple
from datetime import datetime

import KO_module
import pandas as pd
import search_bar
import xlrd
from Bio import SeqIO, SeqUtils
from db_utils import DB

Expand Down Expand Up @@ -518,7 +521,7 @@ def load_pfam(params, pfam_files, db, pfam_def_file, ref_db_dir):
db.commit()


class SwissProtIdCount(defaultdict):
class ProtIdCounter(defaultdict):
def __init__(self):
super().__init__()
self.id_count = 0
Expand Down Expand Up @@ -562,9 +565,29 @@ def parse_swissprot_id(to_parse):
return db, ident, name


# Pattern for VFDB sequence IDs that embed a GenBank accession, i.e.
# "<vfdb_id>(gb|<accession>)".
vfdb_id_expr = re.compile(r"(.*)\(gb\|(.*)\)")


def parse_vfdb_id(to_parse):
    """Split a vfdb.fasta sequence ID into ``(vfdb_id, gb_accession)``.

    IDs either look like ``VFID(gb|ACCESSION)`` or are a bare ``VFID``;
    for bare IDs the accession is ``None``.
    """
    if "(" not in to_parse:
        return to_parse, None
    match = vfdb_id_expr.match(to_parse)
    return match.groups()


# Matches the free-text part of a vfdb.fasta header, which looks like:
# "(<gene>) <protein name> [<VF name> (<VFID>) - <category>] [<organism>]"
vfdb_descr_expr = re.compile(r"\(.*?\) (.*?) \[.*?\((VF.*?)\).*?\] \[.*?\]")


def parse_vfdb_entry(description):
    """Extract ``(protein name, VFID)`` from a vfdb.fasta record description.

    The leading whitespace-delimited token (the sequence ID) is dropped
    before matching the remainder against ``vfdb_descr_expr``.
    """
    remainder = description.split(" ", 1)[1]
    return vfdb_descr_expr.match(remainder).groups()


def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db_dir):
db = DB.load_db(db_name, params)
hsh_swissprot_id = SwissProtIdCount()
hsh_swissprot_id = ProtIdCounter()
db.create_swissprot_tables()

# Note: this is not really scalable to x genomes, as
Expand Down Expand Up @@ -607,6 +630,61 @@ def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db
db.commit()


def load_vfdb_hits(params, blast_results, db_name, vfdb_fasta, vfdb_defs):
    """Load VFDB blast hits and virulence factor definitions into the DB.

    Args:
        params: pipeline parameters forwarded to ``DB.load_db``.
        blast_results: list of tabular (-outfmt 6) blast result files.
        db_name: path/name of the annotation database.
        vfdb_fasta: the vfdb.fasta reference used for the blast searches.
        vfdb_defs: the VFs.xls spreadsheet with per-VF metadata.
    """
    db = DB.load_db(db_name, params)
    hsh_prot_id = ProtIdCounter()
    db.create_vf_tables()

    # Note: this is not really scalable to x genomes, as
    # it necessitates to keep the prot id in memory
    # may need to process the blast results in batch instead (slower but
    # spares memory).
    for blast_file in blast_results:
        data = []
        with open(blast_file, "r") as blast_fh:
            for line in blast_fh:
                # Standard blast -outfmt 6 columns.
                crc, gene_id, perid, leng, n_mis, n_gap, qs, qe, ss, se, e, score = line.split()
                hsh = simplify_hash(crc)
                # Map the VFDB sequence ID to a compact integer prot id.
                prot_id, _ = parse_vfdb_id(gene_id)
                db_prot_id = hsh_prot_id[prot_id]
                data.append((hsh, db_prot_id, float(e), int(float(score)),
                             int(float(perid)), int(n_gap), int(leng)))
        db.load_vf_hits(data)

    # Definitions are constructed from two data sources.
    # vfdb_fasta contains matching entries for every hit, with a
    # VFID relating to more information in VFs.xls
    vf_defs = pd.read_excel(vfdb_defs, header=1)
    vf_defs = vf_defs.set_index("VFID")

    vfdb_prot_defs = []
    for record in SeqIO.parse(vfdb_fasta, "fasta"):
        prot_id, gb_accession = parse_vfdb_id(record.name)
        # Membership test does not insert into the defaultdict: only
        # reference entries that were actually hit are kept.
        if prot_id not in hsh_prot_id:
            continue
        db_prot_id = hsh_prot_id[prot_id]
        prot_name, vfid = parse_vfdb_entry(record.description)
        # Get info from definitions
        vf_data = vf_defs.loc[vfid]
        vfdb_prot_defs.append(
            (db_prot_id, prot_id, gb_accession, prot_name, vfid,
             vf_data.VFcategory, vf_data.Characteristics, vf_data.Structure,
             vf_data.Function, vf_data.Mechanism))
    db.load_vf_defs(vfdb_prot_defs)
    db.set_status_in_config_table("BLAST_vfdb", 1)
    db.commit()

    # There is no version of the VFdb, only a download date
    # As there is a new version of the DB every week, the download date
    # will have to do.
    book = xlrd.open_workbook(vfdb_defs)
    sheet = book.sheet_by_index(0)
    # The download date sits in brackets in the header row, e.g.
    # "[Fri Feb 9 10:00:00 2024]".
    download_date = re.match(r".*\[(.*)\]", sheet.row(0)[3].value).groups()[0]
    download_date = datetime.strptime(download_date, "%a %b %d %H:%M:%S %Y")
    db.load_data_into_table("versions", [("VFDB", download_date.date().isoformat())])
    db.commit()


# NOTE:
# Several KO marked as significant can be assigned to the same locus
# only take the hit with the lowest evalue (the first in the list)
Expand Down
5 changes: 5 additions & 0 deletions bin/zdb
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,10 @@ elif [[ "$1" == "run" ]]; then
args="${args} --pfam"
shift
;;
--vfdb)
args="${args} --vfdb"
shift
;;
--singularity_dir=*)
singularity_dir=$(realpath ${i#*=})
shift
Expand All @@ -503,6 +507,7 @@ elif [[ "$1" == "run" ]]; then
echo " --ko: perform ko (metabolism) annotation"
echo " --pfam: peform PFAM domain annotation"
echo " --swissprot: search for homologs in the swissprot database"
echo " --vfdb: search for homologs in the virulence factor database"
echo " --ref_dir: directory where the reference databases were setup up (default zdb_ref)"
echo " --cpu: number of parallel processes allowed (default 8)"
echo " --mem: max memory usage allowed (default 8GB)"
Expand Down
1 change: 1 addition & 0 deletions conda/annotation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ dependencies:
- numpy
- pandas
- metagenlab_libs
- xlrd
1 change: 1 addition & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ to [Common Changelog](https://common-changelog.org)

### Added

- Add virulence factor annotations. ([#56](https://github.com/metagenlab/zDB/pull/56)) (Niklaus Johner)
- Store reference DB and software versions in DB. ([#54](https://github.com/metagenlab/zDB/pull/54)) (Niklaus Johner)
- Add script to compute rendering times of various views. ([#51](https://github.com/metagenlab/zDB/pull/51)) (Niklaus Johner)
- Add possibility to dump HTML responses from webapp tests. ([#50](https://github.com/metagenlab/zDB/pull/50)) (Niklaus Johner)
Expand Down
14 changes: 14 additions & 0 deletions testing/pipelines/test_annotation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,20 @@ def test_amr_hits(self):
self.assert_db_base_table_row_counts()
self.assertEqual(2, self.query("amr_hits").count())

    def test_vfdb_hits(self):
        """Run the pipeline with VFDB annotation and check the resulting DB."""
        self.nf_params["vfdb"] = "true"
        execution = self.execute_pipeline()
        self.assert_success(execution)
        self.load_db(execution)

        # Let's check that tables were correctly created and filled
        # NOTE(review): assertItemsEqual is the Python 2 unittest API
        # (assertCountEqual in Python 3) — presumably provided by the
        # test base class; confirm.
        self.assertItemsEqual(
            base_tables + ['vf_hits', 'vf_defs'],
            self.metadata_obj.tables.keys())
        self.assert_db_base_table_row_counts()
        # No hits are expected for the test data: the tables must exist
        # but stay empty.
        self.assertEqual(0, self.query("vf_hits").count())
        self.assertEqual(0, self.query("vf_defs").count())

def test_full_pipeline(self):
self.nf_params["pfam"] = "true"
self.nf_params["ko"] = "true"
Expand Down
1 change: 1 addition & 0 deletions utils/setup_base_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def create_data_table(db):
("phylogenetic_profile", "optional", False),
("gbk_files", "optional", False),
("AMR", "optional", False),
("BLAST_vfdb", "optional", False),
]
db.load_chlamdb_config_tables(entry_list)
db.commit()
Expand Down
32 changes: 32 additions & 0 deletions webapp/lib/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,38 @@ def load_swissprot_hits(self, data):
def load_swissprot_defs(self, data):
self.load_data_into_table("swissprot_defs", data)

def create_vf_tables(self):
query = (
"CREATE TABLE vf_hits ("
" hsh INT, prot_id INT, evalue INT, score INT,"
" perc_id INT, gaps INT, leng INT"
");"
)
self.server.adaptor.execute(query,)
query = (
"CREATE INDEX vfhi ON vf_hits(hsh);"
)
self.server.adaptor.execute(query,)
query = (
"CREATE TABLE vf_defs ("
" prot_id INT, vfdb_id varchar(15), gb_accession varchar(15),"
" prot_name tinytext, vfid varchar(10), category tinytext,"
" characteristics TEXT, structure TEXT, function TEXT,"
" mechanism TEXT"
");"
)
self.server.adaptor.execute(query,)
query = (
"CREATE INDEX vfdi ON vf_defs(prot_id);"
)
self.server.adaptor.execute(query,)

    def load_vf_hits(self, data):
        # Bulk-insert blast hit tuples into the vf_hits table.
        self.load_data_into_table("vf_hits", data)

    def load_vf_defs(self, data):
        # Bulk-insert virulence factor definition tuples into vf_defs.
        self.load_data_into_table("vf_defs", data)

def create_diamond_refseq_match_id(self):
query = (
"CREATE TABLE diamond_refseq_match_id ( "
Expand Down

0 comments on commit 13da27d

Please sign in to comment.