Merge pull request #54 from metagenlab/nj/db_versions2

Add reference DB and software versions to database.
metagenlab · Feb 9, 2024 · 9f25f49 · 9f25f49
2 parents a3a2628 + b2dff95
commit 9f25f49
Show file tree

Hide file tree

Showing 14 changed files with 184 additions and 55 deletions.
diff --git a/annotation_pipeline.nf b/annotation_pipeline.nf
@@ -475,9 +475,26 @@ process execute_kofamscan {
   """
 }
 
+process prepare_amrscan {
+  container "$params.ncbi_amr_container"
+  conda "$baseDir/conda/amrfinderplus.yaml"
+
+  output:
+      path 'versions.txt'
+
+  script:
+  conda = params.conda
+  """
+  if $conda; then
+        amrfinder -u
+  fi
+  amrfinder -V > versions.txt
+  """
+}
+
 process execute_amrscan {
   container "$params.ncbi_amr_container"
-  conda "$baseDir/conda/ncbi_amr.yaml"
+  conda "$baseDir/conda/amrfinderplus.yaml"
 
   input:
   file(seq)
@@ -662,6 +679,7 @@ process load_COG_into_db {
         path db
         path cog_file
         path cdd_to_cog
+        path cog_db_dir
 
     output:
         path db
@@ -674,7 +692,7 @@ process load_COG_into_db {
 
         kwargs = ${gen_python_args()}
         cog_files = "${cog_file}".split()
-        setup_chlamdb.load_cog(kwargs, cog_files, "$db", "$cdd_to_cog")
+        setup_chlamdb.load_cog(kwargs, cog_files, "$db", "$cdd_to_cog", "$cog_db_dir")
         """
 }
 
@@ -685,6 +703,7 @@ process load_KO_into_db {
     input:
         path KO_results
         path db
+        path ko_db_dir
 
     output:
         path db
@@ -699,7 +718,7 @@ process load_KO_into_db {
 
         # this last function should be exported in a separate script to generate
         # the scaffold of a database
-        setup_chlamdb.load_KO(kwargs, ko_files, "$db")
+        setup_chlamdb.load_KO(kwargs, ko_files, "$db", "$ko_db_dir")
         setup_chlamdb.load_module_completeness(kwargs, "$db")
         """
 }
@@ -712,6 +731,7 @@ process load_PFAM_info_db {
         path db
         path pfam_annot
         path pfam_dat
+        path pfam_db
 
     output:
         path db
@@ -724,7 +744,7 @@ process load_PFAM_info_db {
         kwargs = ${gen_python_args()}
         pfam_files = "$pfam_annot".split()
 
-        setup_chlamdb.load_pfam(kwargs, pfam_files, "$db", "$pfam_dat")
+        setup_chlamdb.load_pfam(kwargs, pfam_files, "$db", "$pfam_dat", "$pfam_db")
         """
 }
 
@@ -736,6 +756,7 @@ process load_swissprot_hits_into_db {
         path db
         path blast_results
         path swissprot_db
+        path swissprot_db_dir
 
     output:
         path db
@@ -748,7 +769,7 @@ process load_swissprot_hits_into_db {
         kwargs = ${gen_python_args()}
         blast_results = "$blast_results".split()
 
-        setup_chlamdb.load_swissprot(kwargs, blast_results, "$db", "swissprot.fasta")
+        setup_chlamdb.load_swissprot(kwargs, blast_results, "$db", "swissprot.fasta", "$swissprot_db_dir")
     """
 }
 
@@ -759,6 +780,7 @@ process load_amr_into_db {
     input:
         file collected_amr_files
         file db
+        file version
 
     output:
         file db
@@ -771,7 +793,7 @@ process load_amr_into_db {
 
         kwargs = ${gen_python_args()}
         amr_files = "${collected_amr_files}".split()
-        setup_chlamdb.load_amr(kwargs, amr_files, "$db")
+        setup_chlamdb.load_amr(kwargs, amr_files, "$db", "$version")
         """
 }
 
@@ -898,21 +920,21 @@ workflow {
         Channel.fromPath("${params.pfam_db}", type: "dir").set { pfam_db }
         pfam_db.combine(split_nr_seqs).set { to_pfam_scan_combined }
         pfam_results = pfam_scan(to_pfam_scan_combined)
-        db = load_PFAM_info_db(db, pfam_results.collect(), Channel.fromPath("$params.pfam_db/Pfam-A.hmm.dat"))
+        db = load_PFAM_info_db(db, pfam_results.collect(), Channel.fromPath("$params.pfam_db/Pfam-A.hmm.dat"), pfam_db)
     }
 
     if(params.cog) {
         Channel.fromPath("$params.cog_db", type: "dir").set { to_cog_multi }
         to_cog_multi.combine(split_nr_seqs).set { to_rpsblast_COG_multi }
         COG_to_load_db = rpsblast_COG(to_rpsblast_COG_multi)
-        db = load_COG_into_db(db, COG_to_load_db.collect(), Channel.fromPath("$params.cog_db/cdd_to_cog"))
+        db = load_COG_into_db(db, COG_to_load_db.collect(), Channel.fromPath("$params.cog_db/cdd_to_cog"),  Channel.fromPath("$params.cog_db"))
     }
 
     if (params.blast_swissprot) {
         Channel.fromPath("$params.swissprot_db", type: "dir").set { to_swissprot_multi }
         to_swissprot_multi.combine(split_nr_seqs).set { to_blast_swissprot_multi }
         swissprot_blast = blast_swissprot(to_blast_swissprot_multi)
-        db = load_swissprot_hits_into_db(db, swissprot_blast.collect(), Channel.fromPath("$params.swissprot_db/swissprot.fasta"))
+        db = load_swissprot_hits_into_db(db, swissprot_blast.collect(), Channel.fromPath("$params.swissprot_db/swissprot.fasta"), Channel.fromPath("$params.swissprot_db"))
     }
 
     if(params.diamond_refseq) {
@@ -928,12 +950,13 @@ workflow {
         Channel.fromPath("$params.ko_db", type: "dir").set { to_ko_multi }
         to_ko_multi.combine(split_nr_seqs).set { to_kofamscan_multi }
         to_load_KO = execute_kofamscan(to_kofamscan_multi)
-        db = load_KO_into_db(to_load_KO.collect(), db)
+        db = load_KO_into_db(to_load_KO.collect(), db, Channel.fromPath("$params.ko_db"))
     }
 
     if(params.amr) {
+        amr_version = prepare_amrscan()
         amr_table = execute_amrscan(split_nr_seqs)
-        db = load_amr_into_db(amr_table.collect(), db)
+        db = load_amr_into_db(amr_table.collect(), db, amr_version)
     }
 
     (to_index_cleanup, to_db_cleanup) = create_chlamdb_search_index(db)

diff --git a/bin/setup_chlamdb.py b/bin/setup_chlamdb.py
@@ -1,14 +1,12 @@
 import os
 from collections import defaultdict, namedtuple
 
+import KO_module
 import pandas as pd
-
-from Bio import AlignIO, SeqIO, SeqUtils
-
+import search_bar
+from Bio import SeqIO, SeqUtils
 from db_utils import DB
 
-import KO_module
-import search_bar
 
 # assumes orthofinder named: OG000N
 # returns the N as int
@@ -259,7 +257,7 @@ def load_alignments_results(args, identity_csvs, db_file):
     db.commit()
 
 
-def load_cog(params, filelist, db_file, cdd_to_cog):
+def load_cog(params, filelist, db_file, cdd_to_cog, cog_db_dir):
     db = DB.load_db(db_file, params)
     hsh_cdd_to_cog = {}
     with open(cdd_to_cog, "r") as cdd_to_cog_file:
@@ -286,6 +284,14 @@ def load_cog(params, filelist, db_file, cdd_to_cog):
     db.set_status_in_config_table("COG", 1)
     db.commit()
 
+    # Determine reference DB version
+    with open(os.path.join(cog_db_dir, "cdd.info")) as fh:
+        dbname, _, version = fh.readline().split()
+        if dbname != "cdd":
+            raise ValueError("Could not determine Cog DB version")
+    db.load_data_into_table("versions", [("CDD", version.strip())])
+    db.commit()
+
 
 def amr_hit_to_db_entry(hit):
     columns = ["Gene symbol",
@@ -305,7 +311,7 @@ def amr_hit_to_db_entry(hit):
     return entry
 
 
-def load_amr(params, filelist, db_file):
+def load_amr(params, filelist, db_file, version_file):
     db = DB.load_db(db_file, params)
 
     data = []
@@ -316,6 +322,17 @@ def load_amr(params, filelist, db_file):
     db.set_status_in_config_table("AMR", 1)
     db.commit()
 
+    # Determine software and reference DB version
+    with open(version_file) as fh:
+        for line in fh:
+            if line.startswith("Software version"):
+                soft_version = line.rsplit(":", 1)[1].strip()
+            elif line.startswith("Database version"):
+                db_version = line.rsplit(":", 1)[1].strip()
+    db.load_data_into_table("versions", [("AMRFinderSoftware", soft_version),
+                                         ("AMRFinderDB", db_version)])
+    db.commit()
+
 
 # Note: the trees are stored in files with name formatted as:
 # OGN_nr_hits_mafft.nwk. To retrieve the orthogroup, parse the filename
@@ -456,7 +473,7 @@ def parse_pfam_entry(file_iter):
             description = line[len("#=GF DE   "):-1]
 
 
-def load_pfam(params, pfam_files, db, pfam_def_file):
+def load_pfam(params, pfam_files, db, pfam_def_file, ref_db_dir):
     db = DB.load_db(db, params)
 
     db.create_pfam_hits_table()
@@ -478,6 +495,7 @@ def load_pfam(params, pfam_files, db, pfam_def_file):
                 pfam_ids.add(pfam_i)
 
         db.load_data_into_table("pfam_hits", entries)
+
     db.commit()
 
     pfam_entries = []
@@ -491,6 +509,14 @@ def load_pfam(params, pfam_files, db, pfam_def_file):
     db.set_status_in_config_table("pfam", 1)
     db.commit()
 
+    # Determine reference DB version
+    with open(os.path.join(ref_db_dir, "Pfam.version")) as fh:
+        title, version = fh.readline().split(":")
+        if "Pfam release" not in title:
+            raise ValueError("Could not determine Pfam DB version")
+    db.load_data_into_table("versions", [("Pfam", version.strip())])
+    db.commit()
+
 
 class SwissProtIdCount(defaultdict):
     def __init__(self):
@@ -536,7 +562,7 @@ def parse_swissprot_id(to_parse):
     return db, ident, name
 
 
-def load_swissprot(params, blast_results, db_name, swissprot_fasta):
+def load_swissprot(params, blast_results, db_name, swissprot_fasta, swissprot_db_dir):
     db = DB.load_db(db_name, params)
     hsh_swissprot_id = SwissProtIdCount()
     db.create_swissprot_tables()
@@ -572,11 +598,19 @@ def load_swissprot(params, blast_results, db_name, swissprot_fasta):
     db.set_status_in_config_table("BLAST_swissprot", 1)
     db.commit()
 
+    # Determine reference DB version
+    with open(os.path.join(swissprot_db_dir, "relnotes.txt")) as fh:
+        dbname, _,  version = fh.readline().split()
+        if dbname != "UniProt":
+            raise ValueError("Could not determine SwissProt DB version")
+    db.load_data_into_table("versions", [("SwissProt", version.strip())])
+    db.commit()
+
 
 # NOTE:
 # Several KO marked as significant can be assigned to the same locus
 # only take the hit with the lowest evalue (the first in the list)
-def load_KO(params, ko_files, db_name):
+def load_KO(params, ko_files, db_name, ko_db_dir):
     db = DB.load_db(db_name, params)
     data = []
     for ko_file in ko_files:
@@ -606,6 +640,12 @@ def load_KO(params, ko_files, db_name):
     db.set_status_in_config_table("KEGG", 1)
     db.commit()
 
+    # Determine reference DB version
+    with open(os.path.join(ko_db_dir, "version.txt")) as fh:
+        version = fh.readline()
+    db.load_data_into_table("versions", [("Ko", version.strip())])
+    db.commit()
+
 
 def load_module_completeness(params, db_name):
     db = DB.load_db(db_name, params)

diff --git a/conda/amrfinderplus.yaml b/conda/amrfinderplus.yaml
@@ -0,0 +1,6 @@
+name: amrfinderplus
+channels:
+    - conda-forge
+    - bioconda
+dependencies:
+    -  ncbi-amrfinderplus >= 3.12