Skip to content

Commit

Permalink
Merge pull request #56 from metagenlab/nj/vfdb
Browse files Browse the repository at this point in the history
Add virulence factor annotations.
  • Loading branch information
njohner committed Feb 14, 2024
2 parents 9f25f49 + f3b0ef7 commit 820a2fe
Show file tree
Hide file tree
Showing 11 changed files with 251 additions and 5 deletions.
51 changes: 51 additions & 0 deletions annotation_pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,24 @@ process execute_amrscan {
"""
}

process blast_vfdb {
container "$params.blast_container"
conda "$baseDir/conda/blast.yaml"

input:
tuple (path(vf_db), path(seq))

output:
path '*tab'

script:

n = seq.name
"""
blastp -db $vf_db/vfdb.fasta -query ${n} -outfmt 6 -evalue 10.0 > ${n}.tab
"""
}

process setup_db {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"
Expand Down Expand Up @@ -797,6 +815,32 @@ process load_amr_into_db {
"""
}

process load_vfdb_hits_into_db {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"

input:
path db
path blast_results
path vf_db_fasta
path vf_db_defs

output:
path db

script:
"""
#!/usr/bin/env python
import setup_chlamdb
kwargs = ${gen_python_args()}
blast_results = "$blast_results".split()
setup_chlamdb.load_vfdb_hits(kwargs, blast_results, "$db", "$vf_db_fasta", "$vf_db_defs")
"""
}


process create_chlamdb_search_index {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"
Expand Down Expand Up @@ -959,6 +1003,13 @@ workflow {
db = load_amr_into_db(amr_table.collect(), db, amr_version)
}

if(params.vfdb) {
vf_ref_db = Channel.fromPath("$params.vf_db", type: "dir")
db_seq_combined = vf_ref_db.combine(split_nr_seqs)
vfdb_blast = blast_vfdb(db_seq_combined)
db = load_vfdb_hits_into_db(db, vfdb_blast.collect(), Channel.fromPath("$params.vf_db/vfdb.fasta"), Channel.fromPath("$params.vf_db/VFs.xls"))
}

(to_index_cleanup, to_db_cleanup) = create_chlamdb_search_index(db)
cleanup(to_index_cleanup, to_db_cleanup, mafft_alignments.collect(), Channel.fromPath("${params.results_dir}", type: "dir"), to_cleanup_gbks)
}
Expand Down
82 changes: 80 additions & 2 deletions bin/setup_chlamdb.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import os
import re
from collections import defaultdict, namedtuple
from datetime import datetime

import KO_module
import pandas as pd
import search_bar
import xlrd
from Bio import SeqIO, SeqUtils
from db_utils import DB

Expand Down Expand Up @@ -518,7 +521,7 @@ def load_pfam(params, pfam_files, db, pfam_def_file, ref_db_dir):
db.commit()


class SwissProtIdCount(defaultdict):
class ProtIdCounter(defaultdict):
def __init__(self):
super().__init__()
self.id_count = 0
Expand Down Expand Up @@ -562,9 +565,29 @@ def parse_swissprot_id(to_parse):
return db, ident, name


vfdb_id_expr = re.compile(r"(.*)\(gb\|(.*)\)")


def parse_vfdb_id(to_parse):
"""IDs in vfdb.fasta are either of the form VFID(gb_accession) or VFID
"""
if "(" in to_parse:
return vfdb_id_expr.match(to_parse).groups()
return to_parse, None


vfdb_descr_expr = re.compile(r"\(.*?\) (.*?) \[.*?\((VF.*?)\).*?\] \[.*?\]")


def parse_vfdb_entry(description):
description = description.split(" ", 1)[1]
prot_name, vfid = vfdb_descr_expr.match(description).groups()
return prot_name, vfid


def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db_dir):
db = DB.load_db(db_name, params)
hsh_swissprot_id = SwissProtIdCount()
hsh_swissprot_id = ProtIdCounter()
db.create_swissprot_tables()

# Note: this is not really scalable to x genomes, as
Expand Down Expand Up @@ -607,6 +630,61 @@ def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db
db.commit()


def load_vfdb_hits(params, blast_results, db_name, vfdb_fasta, vfdb_defs):
db = DB.load_db(db_name, params)
hsh_prot_id = ProtIdCounter()
db.create_vf_tables()

# Note: this is not really scalable to x genomes, as
# it necessitates to keep the prot id in memory
# may need to process the blast results in batch instead (slower but
# spares memory).
for blast_file in blast_results:
data = []
with open(blast_file, "r") as blast_fh:
for line in blast_fh:
crc, gene_id, perid, leng, n_mis, n_gap, qs, qe, ss, se, e, score = line.split()
hsh = simplify_hash(crc)
prot_id, _ = parse_vfdb_id(gene_id)
db_prot_id = hsh_prot_id[prot_id]
data.append((hsh, db_prot_id, float(e), int(float(score)),
int(float(perid)), int(n_gap), int(leng)))
db.load_vf_hits(data)

# Definitions are constructed from two data sources.
# vfdb_fasta contains matching entries for every hit, with a
# VFID relating to more information in VFs.xls
vf_defs = pd.read_excel(vfdb_defs, header=1)
vf_defs = vf_defs.set_index("VFID")

vfdb_prot_defs = []
for record in SeqIO.parse(vfdb_fasta, "fasta"):
prot_id, gb_accession = parse_vfdb_id(record.name)
if prot_id not in hsh_prot_id:
continue
db_prot_id = hsh_prot_id[prot_id]
prot_name, vfid = parse_vfdb_entry(record.description)
# Get info from definitions
vf_data = vf_defs.loc[vfid]
vfdb_prot_defs.append(
(db_prot_id, prot_id, gb_accession, prot_name, vfid,
vf_data.VFcategory, vf_data.Characteristics, vf_data.Structure,
vf_data.Function, vf_data.Mechanism))
db.load_vf_defs(vfdb_prot_defs)
db.set_status_in_config_table("BLAST_vfdb", 1)
db.commit()

# There is no version of the VFdb, only a download date
# As there is a new version of the DB every week, the download date
# will have to do.
book = xlrd.open_workbook(vfdb_defs)
sheet = book.sheet_by_index(0)
download_date = re.match(r".*\[(.*)\]", sheet.row(0)[3].value).groups()[0]
download_date = datetime.strptime(download_date, "%a %b %d %H:%M:%S %Y")
db.load_data_into_table("versions", [("VFDB", download_date.date().isoformat())])
db.commit()


# NOTE:
# Several KO marked as significant can be assigned to the same locus
# only take the hit with the lowest evalue (the first in the list)
Expand Down
10 changes: 10 additions & 0 deletions bin/zdb
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ elif [[ "$1" == "setup" ]]; then
db_setup_args="${db_setup_args} --pfam"
shift
;;
--vfdb)
db_setup_args="${db_setup_args} --vfdb"
shift
;;
--singularity_dir=*)
singularity_dir=${i#*=}
shift
Expand All @@ -135,6 +139,7 @@ elif [[ "$1" == "setup" ]]; then
echo " --ko: downloads and setups the hmm profiles of the ko database"
echo " --pfam: downloads and setups up the hmm profiles of the PFAM protein domains"
echo " --swissprot: downloads and indexes the swissprot database "
echo " --vfdb: downloads the virulence factor database "
echo ""
echo "Other parameters:"
echo " --dir: directory where to store the reference databases (defaults zdb_ref in the current directory)"
Expand Down Expand Up @@ -474,6 +479,10 @@ elif [[ "$1" == "run" ]]; then
args="${args} --pfam"
shift
;;
--vfdb)
args="${args} --vfdb"
shift
;;
--singularity_dir=*)
singularity_dir=$(realpath ${i#*=})
shift
Expand All @@ -498,6 +507,7 @@ elif [[ "$1" == "run" ]]; then
echo " --ko: perform ko (metabolism) annotation"
echo " --pfam: peform PFAM domain annotation"
echo " --swissprot: search for homologs in the swissprot database"
echo " --vfdb: search for homologs in the virulence factor database"
echo " --ref_dir: directory where the reference databases were setup up (default zdb_ref)"
echo " --cpu: number of parallel processes allowed (default 8)"
echo " --mem: max memory usage allowed (default 8GB)"
Expand Down
3 changes: 1 addition & 2 deletions conda/annotation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@ name: annotation-base
channels:
- anaconda
- conda-forge
- metagenlab
dependencies:
- python=3.9
- whoosh
- sqlite
- biopython>=1.8
- numpy
- pandas
- metagenlab_libs
- xlrd
41 changes: 41 additions & 0 deletions db_setup.nf
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,40 @@ process prepare_swissprot {
"""
}

process download_vfdb {

output:
tuple path("vfdb.fasta"), path("VFs.xls")

script:
"""
wget http://www.mgc.ac.cn/VFs/Down/VFDB_setB_nt.fas.gz
wget http://www.mgc.ac.cn/VFs/Down/VFs.xls.gz
gunzip < VFDB_setB_nt.fas.gz > vfdb.fasta
gunzip < VFs.xls.gz > VFs.xls
rm VFDB_setB_nt.fas.gz
rm VFs.xls.gz
"""
}

process prepare_vfdb {
container "$params.blast_container"
conda "$baseDir/conda/blast.yaml"

publishDir "$params.vf_db", mode: "move"

input:
tuple path(vfdb_fasta), path(vf_descr)

output:
path("*", includeInputs: true)

script:
"""
makeblastdb -dbtype prot -in $vfdb_fasta
"""
}

workflow setup_cogg_db {
download_cog_cdd() | setup_cog_cdd
}
Expand All @@ -174,6 +208,10 @@ workflow setup_swissprot_db {
download_swissprot() | prepare_swissprot
}

workflow setup_vfdb {
download_vfdb() | prepare_vfdb
}

workflow {
if( params.cog )
setup_cogg_db()
Expand All @@ -189,4 +227,7 @@ workflow {

if ( params.blast_swissprot )
setup_swissprot_db()

if ( params.vfdb )
setup_vfdb()
}
1 change: 1 addition & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ to [Common Changelog](https://common-changelog.org)

### Added

- Add virulence factor annotations. ([#56](https://github.com/metagenlab/zDB/pull/56)) (Niklaus Johner)
- Store reference DB and software versions in DB. ([#54](https://github.com/metagenlab/zDB/pull/54)) (Niklaus Johner)
- Add script to compute rendering times of various views. ([#51](https://github.com/metagenlab/zDB/pull/51)) (Niklaus Johner)
- Add possibility to dump HTML responses from webapp tests. ([#50](https://github.com/metagenlab/zDB/pull/50)) (Niklaus Johner)
Expand Down
4 changes: 3 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ params.pfam_db = "${params.base_db}/pfam/"
params.cog_db = "${params.base_db}/cog/"
params.ko_db = "${params.base_db}/kegg/"
params.refseq_db = "${params.base_db}/refseq/"
params.vf_db = "${params.base_db}/vfdb/"

params.results_dir = "zdb/results"

Expand All @@ -22,6 +23,7 @@ params.cog = false
params.ko = false
params.pfam = false
params.amr = false
params.vfdb = false

params.checkm_args = "domain Bacteria"

Expand Down Expand Up @@ -59,7 +61,7 @@ docker.enabled = "$params.docker" == "true"?true:false
conda.enabled = "$params.conda" == "true"?true:false
conda.useMamba = true

params.annotation_container = "registry.hub.docker.com/metagenlab/annotation-pipeline:1.3.1"
params.annotation_container = "registry.hub.docker.com/metagenlab/annotation-pipeline:1.4.0"

params.zdb_container = "docker-daemon://metagenlab/zdb:latest"
params.checkm_container = "quay.io/biocontainers/checkm-genome:1.2.1--pyhdfd78af_0"
Expand Down
14 changes: 14 additions & 0 deletions testing/pipelines/test_annotation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,20 @@ def test_amr_hits(self):
self.assert_db_base_table_row_counts()
self.assertEqual(2, self.query("amr_hits").count())

def test_vfdb_hits(self):
self.nf_params["vfdb"] = "true"
execution = self.execute_pipeline()
self.assert_success(execution)
self.load_db(execution)

# Let's check that tables were correctly created and filled
self.assertItemsEqual(
base_tables + ['vf_hits', 'vf_defs'],
self.metadata_obj.tables.keys())
self.assert_db_base_table_row_counts()
self.assertEqual(0, self.query("vf_hits").count())
self.assertEqual(0, self.query("vf_defs").count())

def test_full_pipeline(self):
self.nf_params["pfam"] = "true"
self.nf_params["ko"] = "true"
Expand Down
17 changes: 17 additions & 0 deletions testing/pipelines/test_db_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,20 @@ def test_creating_swissprot_db(self):
self.assertItemsEqual(
expected_files,
os.listdir(os.path.join(self.ref_db_dir, "uniprot", "swissprot")))

def test_creating_vf_db(self):
self.nf_params["vfdb"] = "true"
execution = self.execute_pipeline()
self.assert_success(execution)

self.assertEqual([proc.name for proc in execution.process_executions],
["setup_vfdb:download_vfdb"])

download_process = execution.process_executions[0]
# Files are moved to zdb_ref/vfdb
expected_files = ['vfdb.fasta',
'VFs.xls']
self.assert_created_files(download_process, [])
self.assertItemsEqual(
expected_files,
os.listdir(os.path.join(self.ref_db_dir, "vfdb")))
1 change: 1 addition & 0 deletions utils/setup_base_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def create_data_table(db):
("phylogenetic_profile", "optional", False),
("gbk_files", "optional", False),
("AMR", "optional", False),
("BLAST_vfdb", "optional", False),
]
db.load_chlamdb_config_tables(entry_list)
db.commit()
Expand Down
Loading

0 comments on commit 820a2fe

Please sign in to comment.