Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add virulence factor annotations. #56

Merged
merged 5 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions annotation_pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,24 @@ process execute_amrscan {
"""
}

process blast_vfdb {
container "$params.blast_container"
conda "$baseDir/conda/blast.yaml"

input:
tuple (path(vf_db), path(seq))

output:
path '*tab'

script:

n = seq.name
"""
blastp -db $vf_db/vfdb.fasta -query ${n} -outfmt 6 -evalue 10.0 > ${n}.tab
"""
}

process setup_db {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"
Expand Down Expand Up @@ -797,6 +815,32 @@ process load_amr_into_db {
"""
}

process load_vfdb_hits_into_db {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"

input:
path db
path blast_results
path vf_db_fasta
path vf_db_defs

output:
path db

script:
"""
#!/usr/bin/env python
import setup_chlamdb

kwargs = ${gen_python_args()}
blast_results = "$blast_results".split()

setup_chlamdb.load_vfdb_hits(kwargs, blast_results, "$db", "$vf_db_fasta", "$vf_db_defs")
"""
}


process create_chlamdb_search_index {
container "$params.annotation_container"
conda "$baseDir/conda/annotation.yaml"
Expand Down Expand Up @@ -959,6 +1003,13 @@ workflow {
db = load_amr_into_db(amr_table.collect(), db, amr_version)
}

if(params.vfdb) {
vf_ref_db = Channel.fromPath("$params.vf_db", type: "dir")
db_seq_combined = vf_ref_db.combine(split_nr_seqs)
vfdb_blast = blast_vfdb(db_seq_combined)
db = load_vfdb_hits_into_db(db, vfdb_blast.collect(), Channel.fromPath("$params.vf_db/vfdb.fasta"), Channel.fromPath("$params.vf_db/VFs.xls"))
}

(to_index_cleanup, to_db_cleanup) = create_chlamdb_search_index(db)
cleanup(to_index_cleanup, to_db_cleanup, mafft_alignments.collect(), Channel.fromPath("${params.results_dir}", type: "dir"), to_cleanup_gbks)
}
Expand Down
82 changes: 80 additions & 2 deletions bin/setup_chlamdb.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import os
import re
from collections import defaultdict, namedtuple
from datetime import datetime

import KO_module
import pandas as pd
import search_bar
import xlrd
from Bio import SeqIO, SeqUtils
from db_utils import DB

Expand Down Expand Up @@ -518,7 +521,7 @@ def load_pfam(params, pfam_files, db, pfam_def_file, ref_db_dir):
db.commit()


class SwissProtIdCount(defaultdict):
class ProtIdCounter(defaultdict):
def __init__(self):
super().__init__()
self.id_count = 0
Expand Down Expand Up @@ -562,9 +565,29 @@ def parse_swissprot_id(to_parse):
return db, ident, name


vfdb_id_expr = re.compile(r"(.*)\(gb\|(.*)\)")


def parse_vfdb_id(to_parse):
"""IDs in vfdb.fasta are either of the form VFID(gb_accession) or VFID
"""
if "(" in to_parse:
return vfdb_id_expr.match(to_parse).groups()
return to_parse, None


vfdb_descr_expr = re.compile(r"\(.*?\) (.*?) \[.*?\((VF.*?)\).*?\] \[.*?\]")


def parse_vfdb_entry(description):
description = description.split(" ", 1)[1]
prot_name, vfid = vfdb_descr_expr.match(description).groups()
return prot_name, vfid


def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db_dir):
db = DB.load_db(db_name, params)
hsh_swissprot_id = SwissProtIdCount()
hsh_swissprot_id = ProtIdCounter()
db.create_swissprot_tables()

# Note: this is not really scalable to x genomes, as
Expand Down Expand Up @@ -607,6 +630,61 @@ def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db
db.commit()


def load_vfdb_hits(params, blast_results, db_name, vfdb_fasta, vfdb_defs):
db = DB.load_db(db_name, params)
hsh_prot_id = ProtIdCounter()
db.create_vf_tables()

# Note: this is not really scalable to x genomes, as
# it necessitates to keep the prot id in memory
# may need to process the blast results in batch instead (slower but
# spares memory).
for blast_file in blast_results:
data = []
with open(blast_file, "r") as blast_fh:
for line in blast_fh:
crc, gene_id, perid, leng, n_mis, n_gap, qs, qe, ss, se, e, score = line.split()
hsh = simplify_hash(crc)
prot_id, _ = parse_vfdb_id(gene_id)
db_prot_id = hsh_prot_id[prot_id]
data.append((hsh, db_prot_id, float(e), int(float(score)),
int(float(perid)), int(n_gap), int(leng)))
db.load_vf_hits(data)

# Definitions are constructed from two data sources.
# vfdb_fasta contains matching entries for every hit, with a
# VFID relating to more information in VFs.xls
vf_defs = pd.read_excel(vfdb_defs, header=1)
vf_defs = vf_defs.set_index("VFID")

vfdb_prot_defs = []
for record in SeqIO.parse(vfdb_fasta, "fasta"):
prot_id, gb_accession = parse_vfdb_id(record.name)
if prot_id not in hsh_prot_id:
continue
db_prot_id = hsh_prot_id[prot_id]
prot_name, vfid = parse_vfdb_entry(record.description)
# Get info from definitions
vf_data = vf_defs.loc[vfid]
vfdb_prot_defs.append(
(db_prot_id, prot_id, gb_accession, prot_name, vfid,
vf_data.VFcategory, vf_data.Characteristics, vf_data.Structure,
vf_data.Function, vf_data.Mechanism))
db.load_vf_defs(vfdb_prot_defs)
db.set_status_in_config_table("BLAST_vfdb", 1)
db.commit()

# There is no version of the VFdb, only a download date
# As there is a new version of the DB every week, the download date
# will have to do.
book = xlrd.open_workbook(vfdb_defs)
sheet = book.sheet_by_index(0)
download_date = re.match(r".*\[(.*)\]", sheet.row(0)[3].value).groups()[0]
download_date = datetime.strptime(download_date, "%a %b %d %H:%M:%S %Y")
db.load_data_into_table("versions", [("VFDB", download_date.date().isoformat())])
db.commit()


# NOTE:
# Several KO marked as significant can be assigned to the same locus
# only take the hit with the lowest evalue (the first in the list)
Expand Down
10 changes: 10 additions & 0 deletions bin/zdb
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ elif [[ "$1" == "setup" ]]; then
db_setup_args="${db_setup_args} --pfam"
shift
;;
--vfdb)
db_setup_args="${db_setup_args} --vfdb"
shift
;;
--singularity_dir=*)
singularity_dir=${i#*=}
shift
Expand All @@ -135,6 +139,7 @@ elif [[ "$1" == "setup" ]]; then
echo " --ko: downloads and setups the hmm profiles of the ko database"
echo " --pfam: downloads and setups up the hmm profiles of the PFAM protein domains"
echo " --swissprot: downloads and indexes the swissprot database "
echo " --vfdb: downloads the virulence factor database "
echo ""
echo "Other parameters:"
echo " --dir: directory where to store the reference databases (defaults zdb_ref in the current directory)"
Expand Down Expand Up @@ -474,6 +479,10 @@ elif [[ "$1" == "run" ]]; then
args="${args} --pfam"
shift
;;
--vfdb)
args="${args} --vfdb"
shift
;;
--singularity_dir=*)
singularity_dir=$(realpath ${i#*=})
shift
Expand All @@ -498,6 +507,7 @@ elif [[ "$1" == "run" ]]; then
echo " --ko: perform ko (metabolism) annotation"
echo " --pfam: peform PFAM domain annotation"
echo " --swissprot: search for homologs in the swissprot database"
echo " --vfdb: search for homologs in the virulence factor database"
echo " --ref_dir: directory where the reference databases were setup up (default zdb_ref)"
echo " --cpu: number of parallel processes allowed (default 8)"
echo " --mem: max memory usage allowed (default 8GB)"
Expand Down
3 changes: 1 addition & 2 deletions conda/annotation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@ name: annotation-base
channels:
- anaconda
- conda-forge
- metagenlab
dependencies:
- python=3.9
- whoosh
- sqlite
- biopython>=1.8
- numpy
- pandas
- metagenlab_libs
- xlrd
41 changes: 41 additions & 0 deletions db_setup.nf
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,40 @@ process prepare_swissprot {
"""
}

process download_vfdb {

output:
tuple path("vfdb.fasta"), path("VFs.xls")

script:
"""
wget http://www.mgc.ac.cn/VFs/Down/VFDB_setB_nt.fas.gz
wget http://www.mgc.ac.cn/VFs/Down/VFs.xls.gz
gunzip < VFDB_setB_nt.fas.gz > vfdb.fasta
gunzip < VFs.xls.gz > VFs.xls
rm VFDB_setB_nt.fas.gz
rm VFs.xls.gz
"""
}

process prepare_vfdb {
container "$params.blast_container"
conda "$baseDir/conda/blast.yaml"

publishDir "$params.vf_db", mode: "move"

input:
tuple path(vfdb_fasta), path(vf_descr)

output:
path("*", includeInputs: true)

script:
"""
makeblastdb -dbtype prot -in $vfdb_fasta
"""
}

workflow setup_cogg_db {
download_cog_cdd() | setup_cog_cdd
}
Expand All @@ -174,6 +208,10 @@ workflow setup_swissprot_db {
download_swissprot() | prepare_swissprot
}

workflow setup_vfdb {
download_vfdb() | prepare_vfdb
}

workflow {
if( params.cog )
setup_cogg_db()
Expand All @@ -189,4 +227,7 @@ workflow {

if ( params.blast_swissprot )
setup_swissprot_db()

if ( params.vfdb )
setup_vfdb()
}
1 change: 1 addition & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ to [Common Changelog](https://common-changelog.org)

### Added

- Add virulence factor annotations. ([#56](https://github.com/metagenlab/zDB/pull/56)) (Niklaus Johner)
- Store reference DB and software versions in DB. ([#54](https://github.com/metagenlab/zDB/pull/54)) (Niklaus Johner)
- Add script to compute rendering times of various views. ([#51](https://github.com/metagenlab/zDB/pull/51)) (Niklaus Johner)
- Add possibility to dump HTML responses from webapp tests. ([#50](https://github.com/metagenlab/zDB/pull/50)) (Niklaus Johner)
Expand Down
4 changes: 3 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ params.pfam_db = "${params.base_db}/pfam/"
params.cog_db = "${params.base_db}/cog/"
params.ko_db = "${params.base_db}/kegg/"
params.refseq_db = "${params.base_db}/refseq/"
params.vf_db = "${params.base_db}/vfdb/"

params.results_dir = "zdb/results"

Expand All @@ -22,6 +23,7 @@ params.cog = false
params.ko = false
params.pfam = false
params.amr = false
params.vfdb = false

params.checkm_args = "domain Bacteria"

Expand Down Expand Up @@ -59,7 +61,7 @@ docker.enabled = "$params.docker" == "true"?true:false
conda.enabled = "$params.conda" == "true"?true:false
conda.useMamba = true

params.annotation_container = "registry.hub.docker.com/metagenlab/annotation-pipeline:1.3.1"
params.annotation_container = "registry.hub.docker.com/metagenlab/annotation-pipeline:1.4.0"

params.zdb_container = "docker-daemon://metagenlab/zdb:latest"
params.checkm_container = "quay.io/biocontainers/checkm-genome:1.2.1--pyhdfd78af_0"
Expand Down
14 changes: 14 additions & 0 deletions testing/pipelines/test_annotation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,20 @@ def test_amr_hits(self):
self.assert_db_base_table_row_counts()
self.assertEqual(2, self.query("amr_hits").count())

def test_vfdb_hits(self):
self.nf_params["vfdb"] = "true"
execution = self.execute_pipeline()
self.assert_success(execution)
self.load_db(execution)

# Let's check that tables were correctly created and filled
self.assertItemsEqual(
base_tables + ['vf_hits', 'vf_defs'],
self.metadata_obj.tables.keys())
self.assert_db_base_table_row_counts()
self.assertEqual(0, self.query("vf_hits").count())
self.assertEqual(0, self.query("vf_defs").count())

def test_full_pipeline(self):
self.nf_params["pfam"] = "true"
self.nf_params["ko"] = "true"
Expand Down
17 changes: 17 additions & 0 deletions testing/pipelines/test_db_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,20 @@ def test_creating_swissprot_db(self):
self.assertItemsEqual(
expected_files,
os.listdir(os.path.join(self.ref_db_dir, "uniprot", "swissprot")))

def test_creating_vf_db(self):
self.nf_params["vfdb"] = "true"
execution = self.execute_pipeline()
self.assert_success(execution)

self.assertEqual([proc.name for proc in execution.process_executions],
["setup_vfdb:download_vfdb"])

download_process = execution.process_executions[0]
# Files are moved to zdb_ref/vfdb
expected_files = ['vfdb.fasta',
'VFs.xls']
self.assert_created_files(download_process, [])
self.assertItemsEqual(
expected_files,
os.listdir(os.path.join(self.ref_db_dir, "vfdb")))
1 change: 1 addition & 0 deletions utils/setup_base_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def create_data_table(db):
("phylogenetic_profile", "optional", False),
("gbk_files", "optional", False),
("AMR", "optional", False),
("BLAST_vfdb", "optional", False),
]
db.load_chlamdb_config_tables(entry_list)
db.commit()
Expand Down
Loading