Skip to content

Commit

Permalink
Merge pull request #54 from metagenlab/nj/db_versions2
Browse files Browse the repository at this point in the history
Add reference DB and software versions to database.
  • Loading branch information
njohner authored Feb 9, 2024
2 parents a3a2628 + b2dff95 commit 9f25f49
Show file tree
Hide file tree
Showing 14 changed files with 184 additions and 55 deletions.
45 changes: 34 additions & 11 deletions annotation_pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -475,9 +475,26 @@ process execute_kofamscan {
"""
}

process prepare_amrscan {
container "$params.ncbi_amr_container"
conda "$baseDir/conda/amrfinderplus.yaml"

output:
path 'versions.txt'

script:
conda = params.conda
"""
if $conda; then
amrfinder -u
fi
amrfinder -V > versions.txt
"""
}

process execute_amrscan {
container "$params.ncbi_amr_container"
conda "$baseDir/conda/ncbi_amr.yaml"
conda "$baseDir/conda/amrfinderplus.yaml"

input:
file(seq)
Expand Down Expand Up @@ -662,6 +679,7 @@ process load_COG_into_db {
path db
path cog_file
path cdd_to_cog
path cog_db_dir

output:
path db
Expand All @@ -674,7 +692,7 @@ process load_COG_into_db {
kwargs = ${gen_python_args()}
cog_files = "${cog_file}".split()
setup_chlamdb.load_cog(kwargs, cog_files, "$db", "$cdd_to_cog")
setup_chlamdb.load_cog(kwargs, cog_files, "$db", "$cdd_to_cog", "$cog_db_dir")
"""
}

Expand All @@ -685,6 +703,7 @@ process load_KO_into_db {
input:
path KO_results
path db
path ko_db_dir

output:
path db
Expand All @@ -699,7 +718,7 @@ process load_KO_into_db {
# this last function should be exported in a separate script to generate
# the scaffold of a database
setup_chlamdb.load_KO(kwargs, ko_files, "$db")
setup_chlamdb.load_KO(kwargs, ko_files, "$db", "$ko_db_dir")
setup_chlamdb.load_module_completeness(kwargs, "$db")
"""
}
Expand All @@ -712,6 +731,7 @@ process load_PFAM_info_db {
path db
path pfam_annot
path pfam_dat
path pfam_db

output:
path db
Expand All @@ -724,7 +744,7 @@ process load_PFAM_info_db {
kwargs = ${gen_python_args()}
pfam_files = "$pfam_annot".split()
setup_chlamdb.load_pfam(kwargs, pfam_files, "$db", "$pfam_dat")
setup_chlamdb.load_pfam(kwargs, pfam_files, "$db", "$pfam_dat", "$pfam_db")
"""
}

Expand All @@ -736,6 +756,7 @@ process load_swissprot_hits_into_db {
path db
path blast_results
path swissprot_db
path swissprot_db_dir

output:
path db
Expand All @@ -748,7 +769,7 @@ process load_swissprot_hits_into_db {
kwargs = ${gen_python_args()}
blast_results = "$blast_results".split()
setup_chlamdb.load_swissprot(kwargs, blast_results, "$db", "swissprot.fasta")
setup_chlamdb.load_swissprot(kwargs, blast_results, "$db", "swissprot.fasta", "$swissprot_db_dir")
"""
}

Expand All @@ -759,6 +780,7 @@ process load_amr_into_db {
input:
file collected_amr_files
file db
file version

output:
file db
Expand All @@ -771,7 +793,7 @@ process load_amr_into_db {
kwargs = ${gen_python_args()}
amr_files = "${collected_amr_files}".split()
setup_chlamdb.load_amr(kwargs, amr_files, "$db")
setup_chlamdb.load_amr(kwargs, amr_files, "$db", "$version")
"""
}

Expand Down Expand Up @@ -898,21 +920,21 @@ workflow {
Channel.fromPath("${params.pfam_db}", type: "dir").set { pfam_db }
pfam_db.combine(split_nr_seqs).set { to_pfam_scan_combined }
pfam_results = pfam_scan(to_pfam_scan_combined)
db = load_PFAM_info_db(db, pfam_results.collect(), Channel.fromPath("$params.pfam_db/Pfam-A.hmm.dat"))
db = load_PFAM_info_db(db, pfam_results.collect(), Channel.fromPath("$params.pfam_db/Pfam-A.hmm.dat"), pfam_db)
}

if(params.cog) {
Channel.fromPath("$params.cog_db", type: "dir").set { to_cog_multi }
to_cog_multi.combine(split_nr_seqs).set { to_rpsblast_COG_multi }
COG_to_load_db = rpsblast_COG(to_rpsblast_COG_multi)
db = load_COG_into_db(db, COG_to_load_db.collect(), Channel.fromPath("$params.cog_db/cdd_to_cog"))
db = load_COG_into_db(db, COG_to_load_db.collect(), Channel.fromPath("$params.cog_db/cdd_to_cog"), Channel.fromPath("$params.cog_db"))
}

if (params.blast_swissprot) {
Channel.fromPath("$params.swissprot_db", type: "dir").set { to_swissprot_multi }
to_swissprot_multi.combine(split_nr_seqs).set { to_blast_swissprot_multi }
swissprot_blast = blast_swissprot(to_blast_swissprot_multi)
db = load_swissprot_hits_into_db(db, swissprot_blast.collect(), Channel.fromPath("$params.swissprot_db/swissprot.fasta"))
db = load_swissprot_hits_into_db(db, swissprot_blast.collect(), Channel.fromPath("$params.swissprot_db/swissprot.fasta"), Channel.fromPath("$params.swissprot_db"))
}

if(params.diamond_refseq) {
Expand All @@ -928,12 +950,13 @@ workflow {
Channel.fromPath("$params.ko_db", type: "dir").set { to_ko_multi }
to_ko_multi.combine(split_nr_seqs).set { to_kofamscan_multi }
to_load_KO = execute_kofamscan(to_kofamscan_multi)
db = load_KO_into_db(to_load_KO.collect(), db)
db = load_KO_into_db(to_load_KO.collect(), db, Channel.fromPath("$params.ko_db"))
}

if(params.amr) {
amr_version = prepare_amrscan()
amr_table = execute_amrscan(split_nr_seqs)
db = load_amr_into_db(amr_table.collect(), db)
db = load_amr_into_db(amr_table.collect(), db, amr_version)
}

(to_index_cleanup, to_db_cleanup) = create_chlamdb_search_index(db)
Expand Down
60 changes: 50 additions & 10 deletions bin/setup_chlamdb.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import os
from collections import defaultdict, namedtuple

import KO_module
import pandas as pd

from Bio import AlignIO, SeqIO, SeqUtils

import search_bar
from Bio import SeqIO, SeqUtils
from db_utils import DB

import KO_module
import search_bar

# assumes orthofinder named: OG000N
# returns the N as int
Expand Down Expand Up @@ -259,7 +257,7 @@ def load_alignments_results(args, identity_csvs, db_file):
db.commit()


def load_cog(params, filelist, db_file, cdd_to_cog):
def load_cog(params, filelist, db_file, cdd_to_cog, cog_db_dir):
db = DB.load_db(db_file, params)
hsh_cdd_to_cog = {}
with open(cdd_to_cog, "r") as cdd_to_cog_file:
Expand All @@ -286,6 +284,14 @@ def load_cog(params, filelist, db_file, cdd_to_cog):
db.set_status_in_config_table("COG", 1)
db.commit()

# Determine reference DB version
with open(os.path.join(cog_db_dir, "cdd.info")) as fh:
dbname, _, version = fh.readline().split()
if dbname != "cdd":
raise ValueError("Could not determine Cog DB version")
db.load_data_into_table("versions", [("CDD", version.strip())])
db.commit()


def amr_hit_to_db_entry(hit):
columns = ["Gene symbol",
Expand All @@ -305,7 +311,7 @@ def amr_hit_to_db_entry(hit):
return entry


def load_amr(params, filelist, db_file):
def load_amr(params, filelist, db_file, version_file):
db = DB.load_db(db_file, params)

data = []
Expand All @@ -316,6 +322,17 @@ def load_amr(params, filelist, db_file):
db.set_status_in_config_table("AMR", 1)
db.commit()

# Determine software and reference DB version
with open(version_file) as fh:
for line in fh:
if line.startswith("Software version"):
soft_version = line.rsplit(":", 1)[1].strip()
elif line.startswith("Database version"):
db_version = line.rsplit(":", 1)[1].strip()
db.load_data_into_table("versions", [("AMRFinderSoftware", soft_version),
("AMRFinderDB", db_version)])
db.commit()


# Note: the trees are stored in files with name formatted as:
# OGN_nr_hits_mafft.nwk. To retrieve the orthogroup, parse the filename
Expand Down Expand Up @@ -456,7 +473,7 @@ def parse_pfam_entry(file_iter):
description = line[len("#=GF DE "):-1]


def load_pfam(params, pfam_files, db, pfam_def_file):
def load_pfam(params, pfam_files, db, pfam_def_file, ref_db_dir):
db = DB.load_db(db, params)

db.create_pfam_hits_table()
Expand All @@ -478,6 +495,7 @@ def load_pfam(params, pfam_files, db, pfam_def_file):
pfam_ids.add(pfam_i)

db.load_data_into_table("pfam_hits", entries)

db.commit()

pfam_entries = []
Expand All @@ -491,6 +509,14 @@ def load_pfam(params, pfam_files, db, pfam_def_file):
db.set_status_in_config_table("pfam", 1)
db.commit()

# Determine reference DB version
with open(os.path.join(ref_db_dir, "Pfam.version")) as fh:
title, version = fh.readline().split(":")
if "Pfam release" not in title:
raise ValueError("Could not determine Pfam DB version")
db.load_data_into_table("versions", [("Pfam", version.strip())])
db.commit()


class SwissProtIdCount(defaultdict):
def __init__(self):
Expand Down Expand Up @@ -536,7 +562,7 @@ def parse_swissprot_id(to_parse):
return db, ident, name


def load_swissprot(params, blast_results, db_name, swissprot_fasta):
def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db_dir):
db = DB.load_db(db_name, params)
hsh_swissprot_id = SwissProtIdCount()
db.create_swissprot_tables()
Expand Down Expand Up @@ -572,11 +598,19 @@ def load_swissprot(params, blast_results, db_name, swissprot_fasta):
db.set_status_in_config_table("BLAST_swissprot", 1)
db.commit()

# Determine reference DB version
with open(os.path.join(swissprot_db_dir, "relnotes.txt")) as fh:
dbname, _, version = fh.readline().split()
if dbname != "UniProt":
raise ValueError("Could not determine SwissProt DB version")
db.load_data_into_table("versions", [("SwissProt", version.strip())])
db.commit()


# NOTE:
# Several KO marked as significant can be assigned to the same locus
# only take the hit with the lowest evalue (the first in the list)
def load_KO(params, ko_files, db_name):
def load_KO(params, ko_files, db_name, ko_db_dir):
db = DB.load_db(db_name, params)
data = []
for ko_file in ko_files:
Expand Down Expand Up @@ -606,6 +640,12 @@ def load_KO(params, ko_files, db_name):
db.set_status_in_config_table("KEGG", 1)
db.commit()

# Determine reference DB version
with open(os.path.join(ko_db_dir, "version.txt")) as fh:
version = fh.readline()
db.load_data_into_table("versions", [("Ko", version.strip())])
db.commit()


def load_module_completeness(params, db_name):
db = DB.load_db(db_name, params)
Expand Down
6 changes: 6 additions & 0 deletions conda/amrfinderplus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
name: amrfinderplus
channels:
- conda-forge
- bioconda
dependencies:
- ncbi-amrfinderplus >= 3.12
Loading

0 comments on commit 9f25f49

Please sign in to comment.