From 36d263e8e7ac82ebbfc021341588487467e62df1 Mon Sep 17 00:00:00 2001 From: ttubb Date: Wed, 10 Apr 2024 12:54:46 +0000 Subject: [PATCH 01/12] more formatting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e9c8947..e1523a0 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ - submg Logo + submg Logo From a31d5bd6b105fc3aee8da7d39693c6640b34c1b5 Mon Sep 17 00:00:00 2001 From: ttubb Date: Fri, 3 May 2024 13:47:06 +0000 Subject: [PATCH 02/12] refactoring --- submg/main.py | 484 +++++++++++++++++++++++++++----------------------- 1 file changed, 261 insertions(+), 223 deletions(-) diff --git a/submg/main.py b/submg/main.py index 5aca45a..ecb73db 100644 --- a/submg/main.py +++ b/submg/main.py @@ -1,13 +1,17 @@ -#!/usr/bin/env python - import argparse import os import time import traceback -from submg import loggingC, utility, preflight, configGen, webinDownload, enaSearching, taxQuery -from submg.statConf import staticConfig +from submg import loggingC +from submg import preflight +from submg import utility +from submg import webinDownload +from submg import configGen +from submg import taxQuery +from submg import enaSearching +from submg.statConf import staticConfig from submg.utility import prepdir from submg.sampleSubmission import submit_samples from submg.readSubmission import submit_reads @@ -16,8 +20,14 @@ from submg.magSubmission import submit_mags -def main(): - # Parsing command line input +def init_argparse(): + """ + Use argparse to parse command line arguments and return the arguments + object. + + Returns: + argparse.ArgumentParser: The arguments object. + """ parser = argparse.ArgumentParser(description="""Tool for submitting metagenome bins to the European Nucleotide Archive. 
Environment variables ENA_USER and ENA_PASSWORD must be set for ENA upload.""") parser.add_argument("-v", "--version", action="version", version=f"%(prog)s {staticConfig.submg_version}") @@ -165,6 +175,12 @@ def main(): help="Use if you want to submit one assembly. " "To submit multiple assemblies, you need to " "use the tool multiple times.") + parser_makecfg.add_argument("-q", + "--quality_cutoffs", + action="store_true", + default=False, + help="Include fields for bin quality cutoff " + "(contamination & completeness) in config.") coverage_group = parser_makecfg.add_mutually_exclusive_group(required=True) coverage_group.add_argument("--coverage_from_bam", @@ -176,231 +192,253 @@ def main(): help="Coverages are already known and you " "provide them as a .bam file.") - args = parser.parse_args() + return parser - # Webin-cli download - if args.mode == 'download_webin': - toolVersion, webinCliVersion = webinDownload.versions() - print(f">Versions: tool={toolVersion}, webin-cli={webinCliVersion}") - print(">Checking Java installation...") - webinDownload.check_java() - webinDownload.download_webin_cli(webinCliVersion) - # Config generation - elif args.mode == 'makecfg': - configGen.make_config(outpath=args.outfile, - submit_samples=args.submit_samples, - submit_single_reads=args.submit_single_reads, - submit_paired_end_reads=args.submit_paired_end_reads, - coverage_from_bam=args.coverage_from_bam, - known_coverage=args.known_coverage, - submit_assembly=args.submit_assembly, - submit_bins=args.submit_bins, - submit_mags=args.submit_mags, - no_comments=args.no_comments) - - - # Submission - elif args.mode == 'submit': +def download_webin(): + """ + Download the webin-cli .jar file. 
+ """ + toolVersion, webinCliVersion = webinDownload.versions() + print(f">Versions: tool={toolVersion}, webin-cli={webinCliVersion}") + print(">Checking Java installation...") + webinDownload.check_java() + webinDownload.download_webin_cli(webinCliVersion) - loggingC.set_up_logging(args.logging_dir, args.verbosity) - if args.timestamps or (args.timestamps is None and args.development_service): - utility.set_up_timestamps(vars(args)) - - try: - sver = staticConfig.submg_version - wver = staticConfig.webin_cli_version - loggingC.message(f">Running submg {sver} with webin-cli {wver}", 0) - if args.development_service == 1: - loggingC.message((">Initializing a test submission to " \ - "the ENA dev server."), 0) - else: - loggingC.message((">Initializing a LIVE SUBMISSION to " \ - "the ENA production server."), 0) - time.sleep(5) - - if not args.skip_checks: - utility.validate_parameter_combination(args.submit_samples, - args.submit_reads, - args.submit_assembly, - args.submit_bins, - args.submit_mags) - - config = preflight.preflight_checks(vars(args)) - - # If we are submitting bins, get the quality scores and the - # taxonomic information. - # We do this early so we notice issues before we start staging files. - if args.submit_bins or args.submit_mags: - bin_quality = get_bin_quality(config, silent=True) - # Test if there are bins which are too contaminated - for name in bin_quality.keys(): - contamination = bin_quality[name]['contamination'] - if contamination > staticConfig.max_contamination: - err = ( - f"\nERROR: Bin {name} has a contamination score " - f"of {contamination} which is higher than " - f"{staticConfig.max_contamination}" - ) - err += ( - "\nENA will reject the submission of this " - "bin. Consult the 'Contamination above 100%' " - "of README.md for more information." 
- ) - loggingC.message(err, threshold=-1) - exit(1) - bin_taxonomy = taxQuery.get_bin_taxonomy(config) - - # Construct depth files if there are .bam files in the config - if 'BAM_FILES' in config.keys(): - bam_files = utility.from_config(config, 'BAM_FILES') - if not isinstance(bam_files, list): - bam_files = [bam_files] - depth_files = utility.construct_depth_files(args.staging_dir, - args.threads, - bam_files) - bin_coverage_file = None - else: - if args.submit_bins: - bin_coverage_file = utility.from_config(config, - 'BINS', - 'COVERAGE_FILE') - depth_files = None - - if args.submit_samples: - sample_accession_data = submit_samples(config, - args.staging_dir, - args.logging_dir, - test=args.development_service) - - - else: - sample_accessions = utility.from_config(config, - 'SAMPLE_ACCESSIONS') - if not isinstance(sample_accessions, list): - sample_accessions = [sample_accessions] - sample_accession_data = [] - for acc in sample_accessions: - sample_accession_data.append({ - 'accession': acc, - 'external_accession': 'unk', - 'alias': 'unk', - }) + +def makecfg(args): + """ + Use configGen to create a .yml file containing the fields a user needs + to fill in order to create a submission for their specific setup + """ + configGen.make_config(outpath=args.outfile, + submit_samples=args.submit_samples, + submit_single_reads=args.submit_single_reads, + submit_paired_end_reads=args.submit_paired_end_reads, + coverage_from_bam=args.coverage_from_bam, + known_coverage=args.known_coverage, + submit_assembly=args.submit_assembly, + submit_bins=args.submit_bins, + submit_mags=args.submit_mags, + no_comments=args.no_comments, + quality_cutoffs=args.quality_cutoffs) + + +def submit(args): + """ + Submit data to the ENA. + + Args: + args (argparse.Namespace): The arguments object. 
+ """ + loggingC.set_up_logging(args.logging_dir, args.verbosity) + if args.timestamps or (args.timestamps is None and args.development_service): + utility.set_up_timestamps(vars(args)) + + try: + sver = staticConfig.submg_version + wver = staticConfig.webin_cli_version + loggingC.message(f">Running submg {sver} with webin-cli {wver}", 0) + if args.development_service == 1: + loggingC.message((">Initializing a test submission to " \ + "the ENA dev server."), 0) + else: + loggingC.message((">Initializing a LIVE SUBMISSION to " \ + "the ENA production server."), 0) + time.sleep(5) + + if not args.skip_checks: + utility.validate_parameter_combination(args.submit_samples, + args.submit_reads, + args.submit_assembly, + args.submit_bins, + args.submit_mags) + + config = preflight.preflight_checks(vars(args)) + + # If we are submitting bins, get the quality scores and the + # taxonomic information. + # We do this early so we notice issues before we start staging files. + if args.submit_bins or args.submit_mags: + bin_quality = get_bin_quality(config, silent=True) + # Test if there are bins which are too contaminated + for name in bin_quality.keys(): + contamination = bin_quality[name]['contamination'] + if contamination > staticConfig.max_contamination: + err = ( + f"\nERROR: Bin {name} has a contamination score " + f"of {contamination} which is higher than " + f"{staticConfig.max_contamination}" + ) + err += ( + "\nENA will reject the submission of this " + "bin. Consult the 'Contamination above 100%' " + "of README.md for more information." 
+ ) + loggingC.message(err, threshold=-1) + exit(1) + bin_taxonomy = taxQuery.get_bin_taxonomy(config) + + # Construct depth files if there are .bam files in the config + if 'BAM_FILES' in config.keys(): + bam_files = utility.from_config(config, 'BAM_FILES') + if not isinstance(bam_files, list): + bam_files = [bam_files] + depth_files = utility.construct_depth_files(args.staging_dir, + args.threads, + bam_files) + bin_coverage_file = None + else: + if args.submit_bins: + bin_coverage_file = utility.from_config(config, + 'BINS', + 'COVERAGE_FILE') + depth_files = None + + if args.submit_samples: + sample_accession_data = submit_samples(config, + args.staging_dir, + args.logging_dir, + test=args.development_service) - if args.submit_reads: - run_accessions = submit_reads(config, - sample_accession_data, - prepdir(args.staging_dir, 'reads'), - prepdir(args.logging_dir, 'reads'), - test=args.development_service) - else: - run_accessions = utility.from_config(config, 'ASSEMBLY', 'RUN_ACCESSIONS') - if not isinstance(run_accessions, list): - run_accessions = [run_accessions] + + else: + sample_accessions = utility.from_config(config, + 'SAMPLE_ACCESSIONS') + if not isinstance(sample_accessions, list): + sample_accessions = [sample_accessions] + sample_accession_data = [] + for acc in sample_accessions: + sample_accession_data.append({ + 'accession': acc, + 'external_accession': 'unk', + 'alias': 'unk', + }) + + if args.submit_reads: + run_accessions = submit_reads(config, + sample_accession_data, + prepdir(args.staging_dir, 'reads'), + prepdir(args.logging_dir, 'reads'), + test=args.development_service) + else: + run_accessions = utility.from_config(config, 'ASSEMBLY', 'RUN_ACCESSIONS') + if not isinstance(run_accessions, list): + run_accessions = [run_accessions] + + if args.submit_assembly: + assembly_sample_accession, assembly_fasta_accession = submit_assembly(config, + args.staging_dir, + args.logging_dir, + depth_files, + sample_accession_data, + run_accessions, + 
threads=args.threads, + test=args.development_service) + # Assembly sample accession will be either the accession of the + # co-assembly virtual sample or the accession of the single sample + # which the assembly is based on + else: + if args.submit_bins or args.submit_mags: + # In this case we need the submission of the sample that the + # assembly is based on. We can derive it from the assembly + # accession by querying ENA + assembly_dict = utility.from_config(config, 'ASSEMBLY') + assembly_sample_accession = None + if 'EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION' in assembly_dict.keys(): + if not assembly_dict['EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION'] is None: + assembly_sample_accession = assembly_dict['EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION'] + if 'EXISTING_ASSEMBLY_ANALYSIS_ACCESSION' in assembly_dict.keys(): + if not assembly_dict['EXISTING_ASSEMBLY_ANALYSIS_ACCESSION'] is None: + assembly_analysis_accession = assembly_dict['EXISTING_ASSEMBLY_ANALYSIS_ACCESSION'] + assembly_sample_accession = enaSearching.search_samples_by_assembly_analysis(assembly_analysis_accession, + args.development_service) + + # Bin submision + if args.submit_bins: + submit_bins(config, + bin_taxonomy, + assembly_sample_accession, + run_accessions, + prepdir(args.staging_dir, 'bins'), + prepdir(args.logging_dir, 'bins'), + depth_files, + bin_coverage_file, + threads=args.threads, + test=args.development_service) + + # MAG submission + if args.submit_mags: if args.submit_assembly: - assembly_sample_accession, assembly_fasta_accession = submit_assembly(config, - args.staging_dir, - args.logging_dir, - depth_files, - sample_accession_data, - run_accessions, - threads=args.threads, - test=args.development_service) - # Assembly sample accession will be either the accession of the - # co-assembly virtual sample or the accession of the single sample - # which the assembly is based on + metagenome_scientific_name = utility.from_config(config, 'METAGENOME_SCIENTIFIC_NAME') else: - if args.submit_bins 
or args.submit_mags: - # In this case we need the submission of the sample that the - # assembly is based on. We can derive it from the assembly - # accession by querying ENA - assembly_dict = utility.from_config(config, 'ASSEMBLY') - assembly_sample_accession = None - if 'EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION' in assembly_dict.keys(): - if not assembly_dict['EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION'] is None: - assembly_sample_accession = assembly_dict['EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION'] - if 'EXISTING_ASSEMBLY_ANALYSIS_ACCESSION' in assembly_dict.keys(): - if not assembly_dict['EXISTING_ASSEMBLY_ANALYSIS_ACCESSION'] is None: - assembly_analysis_accession = assembly_dict['EXISTING_ASSEMBLY_ANALYSIS_ACCESSION'] - assembly_sample_accession = enaSearching.search_samples_by_assembly_analysis(assembly_analysis_accession, - args.development_service) - - # Bin submision - if args.submit_bins: - submit_bins(config, - bin_taxonomy, - assembly_sample_accession, - run_accessions, - prepdir(args.staging_dir, 'bins'), - prepdir(args.logging_dir, 'bins'), - depth_files, - bin_coverage_file, - threads=args.threads, - test=args.development_service) - - - # MAG submission - if args.submit_mags: - if args.submit_assembly: - metagenome_scientific_name = utility.from_config(config, 'METAGENOME_SCIENTIFIC_NAME') - else: - try: - metagenome_scientific_name = enaSearching.search_scientific_name_by_sample(assembly_sample_accession, - args.development_service) - except: - # This is a workaround because I keep getting - # false negatives on the development server - # for samples that exist on both servers. 
- metagenome_scientific_name = enaSearching.search_scientific_name_by_sample(assembly_sample_accession, - False) - submit_mags(config, - metagenome_scientific_name, - sample_accession_data, - run_accessions, - bin_taxonomy, - prepdir(args.staging_dir, 'mags'), - prepdir(args.logging_dir, 'mags'), - depth_files, - bin_coverage_file, - threads=args.threads, - test=args.development_service) - - msg = "\n>All submissions completed." - if args.development_service: - msg += ( - "\n>This was a TEST submission to the ENA development " - "server." - "\n>You can check the status of your submission by " - "logging into the development instance of the ENA " - "submission website." - "\n>Since this was a test submission, you will not " - "receive final accessions via mail." - "\n>Your data will be removed from the development " - "server during the next 24 hours." - ) - else: - msg += ( - "\n>You will receive final accessions once your " - "submission has been processed by ENA." - "\n>ENA will send those final accession by email to " - "the contact adress of your ENA account." - ) - loggingC.message(msg, threshold=0) - - # Cleanup - if not args.keep_depth_files and depth_files is not None: - loggingC.message(">Deleting depth files to free up disk space.", threshold=0) - for depth_file in depth_files: - os.remove(depth_file) - - except Exception: - err = "\n\nTERMINATING BECAUSE AN UNHANDLED EXCEPTION OCCURED:\n" - loggingC.message(err, threshold=-1) - exc_info = traceback.format_exc() - loggingC.message(exc_info, threshold=-1) - exit(1) + try: + metagenome_scientific_name = enaSearching.search_scientific_name_by_sample(assembly_sample_accession, + args.development_service) + except: + # This is a workaround because I keep getting + # false negatives on the development server + # for samples that exist on both servers. 
+ metagenome_scientific_name = enaSearching.search_scientific_name_by_sample(assembly_sample_accession, + False) + submit_mags(config, + metagenome_scientific_name, + sample_accession_data, + run_accessions, + bin_taxonomy, + prepdir(args.staging_dir, 'mags'), + prepdir(args.logging_dir, 'mags'), + depth_files, + bin_coverage_file, + threads=args.threads, + test=args.development_service) + + msg = "\n>All submissions completed." + if args.development_service: + msg += ( + "\n>This was a TEST submission to the ENA development " + "server." + "\n>You can check the status of your submission by " + "logging into the development instance of the ENA " + "submission website." + "\n>Since this was a test submission, you will not " + "receive final accessions via mail." + "\n>Your data will be removed from the development " + "server during the next 24 hours." + ) + else: + msg += ( + "\n>You will receive final accessions once your " + "submission has been processed by ENA." + "\n>ENA will send those final accession by email to " + "the contact adress of your ENA account." 
+ ) + loggingC.message(msg, threshold=0) + + # Cleanup + if not args.keep_depth_files and depth_files is not None: + loggingC.message(">Deleting depth files to free up disk space.", threshold=0) + for depth_file in depth_files: + os.remove(depth_file) + + except Exception: + err = "\n\nTERMINATING BECAUSE AN UNHANDLED EXCEPTION OCCURED:\n" + loggingC.message(err, threshold=-1) + exc_info = traceback.format_exc() + loggingC.message(exc_info, threshold=-1) + exit(1) + +def main(): + parser = init_argparse() + args = parser.parse_args() + if args.mode == 'download_webin': + download_webin() + elif args.mode == 'makecfg': + makecfg(args) + elif args.mode == 'submit': + submit(args) else: parser.print_help() From 94ce2d50a810a7972c0fb49e00e61f055d793546 Mon Sep 17 00:00:00 2001 From: ttubb Date: Mon, 6 May 2024 12:56:57 +0000 Subject: [PATCH 03/12] removed debug messages --- submg/enaSearching.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/submg/enaSearching.py b/submg/enaSearching.py index 7efe954..3d06414 100644 --- a/submg/enaSearching.py +++ b/submg/enaSearching.py @@ -189,8 +189,6 @@ def search_samples_by_assembly_analysis(assembly_analysis_accession: str, "fields": "sample_accession" } response = requests.get(url, params=params) - print("INPUT: ", assembly_analysis_accession) - print("RESPONSE: ", response.text) try: sample_accession = response.text.split('\n')[1:-1][0] @@ -242,6 +240,7 @@ def search_scientific_name_by_sample(sample_accession: str, if __name__ == "__main__": # For debugging + print("DEBUG: Checking API availability...\n") print("DEBUG: Running sample_accession_exists('SAMEA113417025',False)") print(sample_accession_exists('SAMEA113417025',False)) print("DEBUG: Running sample_accession_exists('ERS28162653', False)") From ffa5e93fe786107e0abc892ae9dfd1102d529e7c Mon Sep 17 00:00:00 2001 From: ttubb Date: Mon, 6 May 2024 13:43:44 +0000 Subject: [PATCH 04/12] added contam. and compl. 
--- submg/statConf.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/submg/statConf.py b/submg/statConf.py index 23be657..9f7f1c3 100644 --- a/submg/statConf.py +++ b/submg/statConf.py @@ -109,6 +109,8 @@ class staticConfig: 'COVERAGE_FILE': ".tsv file containing the coverage values of each bin. Columns must be 'Bin_id' and 'Coverage'.", 'INSERT_SIZE': "Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html)", 'MAG_METADATA_FILE': "A .tsv specifying 'Bin_id', 'Sample_id', 'Quality_category', 'Flatfile_path', 'Chromosomes_path' and 'Unlocalised_path' for all MAGs. See README for more details." + 'MIN_COMPLETENESS': "Bins with smaller completeness value will be discarded (in percent, 0-100). Remove this row to ignore bin completeness". + 'MAX_CONTAMINATION': "Bins with larger contamination value will be discarded (in percent, 0-100). Remove this row to ignore bin contamination", } YAMLEXAMPLES = { @@ -153,5 +155,7 @@ class staticConfig: 'taxonomic identity marker': '\"multi marker approach\"', 'MAG_METADATA_FILE': '\"/mnt/data/mag_data.tsv\"', 'INSERT_SIZE': '\"300\"', + 'MIN_COMPLETENESS': '\"90\"', + 'MAX_CONTAMINATION': '\"5\"', } From 0953cc50ff9d6d467f2329a8391d4c338575d461 Mon Sep 17 00:00:00 2001 From: ttubb Date: Mon, 6 May 2024 13:44:25 +0000 Subject: [PATCH 05/12] added contamination and completeness --- submg/configGen.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/submg/configGen.py b/submg/configGen.py index 24d9098..2af326a 100644 --- a/submg/configGen.py +++ b/submg/configGen.py @@ -25,8 +25,6 @@ def __write_yaml(data: dict, data dictionary and values corresponding to the examples that should be written for each key """ - - # Function to handle None values, representing them as empty fields def represent_none(self, _): return self.represent_scalar('tag:yaml.org,2002:null', '') @@ -75,7 +73,8 @@ def __check_parameters(outpath: str, known_coverage: bool, 
submit_assembly: bool, submit_bins: bool, - submit_mags: bool) -> None: + submit_mags: bool, + quality_cutoffs: bool) -> None: """ Check if the parameters in their combination are valid. If not, fail gracefully. @@ -108,6 +107,14 @@ def __check_parameters(outpath: str, print("\nERROR: You must specify exactly one of --coverage-from-bam or --known-coverage.") exit(1) + # Check if quality cuttoffs make sense + if quality_cutoffs: + if not submit_bins: + msg = "ERROR: You cannot specify --quality-cutoffs without also specifying --submit-bins." + print(msg) + exit(1) + + # Check if the specified items can be combined in one submission utility.validate_parameter_combination(submit_samples, submit_reads, submit_assembly, @@ -122,7 +129,8 @@ def __make_config_dict(submit_samples: int, known_coverage: bool, submit_assembly: bool, submit_bins: bool, - submit_mags: bool) -> None: + submit_mags: bool, + quality_cutoffs: bool) -> None: """ Create the config dictionary. @@ -261,6 +269,9 @@ def __make_config_dict(submit_samples: int, 'ADDITIONAL_SAMPLESHEET_FIELDS': None, 'ADDITIONAL_MANIFEST_FIELDS': None, } + if quality_cutoffs: + bins['MIN_COMPLETENESS'] = None + bins['MAX_CONTAMINATION'] = None if submit_mags: # Since we need this for MAGs, we might as well ask for it here bins['ADDITIONAL_SAMPLESHEET_FIELDS'] = { 'binning parameters': None, @@ -302,7 +313,8 @@ def make_config(outpath: str, submit_assembly: bool, submit_bins: bool, submit_mags: bool, - no_comments: bool) -> None: + no_comments: bool, + quality_cutoffs: bool,) -> None: """ Write an empty YAML config file which holds the keys (but not the values) which the user needs. 
@@ -327,7 +339,8 @@ def make_config(outpath: str, known_coverage, submit_assembly, submit_bins, - submit_mags) + submit_mags, + quality_cutoffs) # Assemble all the fields we need @@ -338,7 +351,8 @@ def make_config(outpath: str, known_coverage, submit_assembly, submit_bins, - submit_mags) + submit_mags, + quality_cutoffs,) # Write to file __write_yaml(config_fields, From 9e931d5b32133e7fe8e31eee7d1bfab095656a24 Mon Sep 17 00:00:00 2001 From: ttubb Date: Mon, 6 May 2024 13:44:36 +0000 Subject: [PATCH 06/12] small fixes --- submg/main.py | 3 ++- submg/utility.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/submg/main.py b/submg/main.py index ecb73db..c028c33 100644 --- a/submg/main.py +++ b/submg/main.py @@ -130,7 +130,8 @@ def init_argparse(): 'containing the fields you need to ' 'fill out prior to submission') parser_makecfg.add_argument("-o", - "--outfile",required=True, + "--outfile", + required=True, help="Path to the empty config that will be " "generated.") parser_makecfg.add_argument("-c", diff --git a/submg/utility.py b/submg/utility.py index f6ee3e5..25f0aac 100644 --- a/submg/utility.py +++ b/submg/utility.py @@ -589,6 +589,7 @@ def validate_parameter_combination(submit_samples: bool, is_valid = True if not is_valid: + # Dont use loggingC here because this might be called from configGen print(f"\nERROR: The combination of parameters you have specified is not valid.") print(staticConfig.submission_modes_message) exit(1) From 1ca693be3f619279cf11f5b04dbba560384d46e1 Mon Sep 17 00:00:00 2001 From: ttubb Date: Fri, 10 May 2024 12:17:28 +0000 Subject: [PATCH 07/12] quality cutoffs + taxonomy --- README.md | 8 +- docker/Dockerfile | 1 - examples/02_samples_reads_assembly_bins.yaml | 4 +- examples/11_bins.yaml | 3 +- examples/data/checkm_quality_3bins.tsv | 4 +- examples/localtest.yaml | 106 ++++++++-------- submg/binSubmission.py | 40 +++--- submg/main.py | 19 +-- submg/preflight.py | 65 +++++++++- submg/statConf.py | 6 +- submg/taxQuery.py 
| 124 ++++++++++++++----- submg/utility.py | 61 ++++++++- todo.txt | 3 +- 13 files changed, 320 insertions(+), 124 deletions(-) diff --git a/README.md b/README.md index e1523a0..bf3826a 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ -submg aids in the submission of metagenomic study data to the European Nucleotide Archive. It can be used to submit various combinations of samples, reads, (co-)assemblies, bins and MAGs. After you enter your (meta)data in a configuration form, submg derives additional information where required, creates samplesheets and manifests and uploads everything to your ENA account. You can use a combination of manual and submg steps to submit your data (e.g. submitting samples and reads through the ENA web interface, then using the tool to submit the assembly and bins). +subMG aids in the submission of metagenomic study data to the European Nucleotide Archive. It can be used to submit various combinations of samples, reads, (co-)assemblies, bins and MAGs. After you enter your (meta)data in a configuration form, subMG derives additional information where required, creates samplesheets and manifests and uploads everything to your ENA account. You can use a combination of manual and subMG steps to submit your data (e.g. submitting samples and reads through the ENA web interface, then using the tool to submit the assembly and bins). 
@@ -54,10 +54,9 @@ Please Note # Installation -## Container A container based on the main branch is available [through DockerHub](https://hub.docker.com/r/ttubb/submg): `docker pull ttubb/submg` -## Local Installation +If you want to install the tool locally, follow these steps: - Make sure Python 3.8 or higher is installed - Make sure Java 1.8 or higher is installed - Make sure [wheel](https://pypi.org/project/wheel/) is installed @@ -157,7 +156,8 @@ ENA provides a [guideline for choosing taxonomy](https://ena-docs.readthedocs.io If your bins are the result of dereplicating data from a single assembly you can use submg as described above. If your bins are the result of dereplicating data from multiple different assemblies, you need to split them based on which assembly they belong to. You then run submg seperately for each assembly (together with the corresponding set of bins). # Bin Contamination above 100 percent -When calculating completeness and contamination of a bin with tools like [CheckM](https://github.com/Ecogenomics/CheckM), contamination values above 100% can occur. [Usually, this is not an error](https://github.com/Ecogenomics/CheckM/issues/107). However, the ENA API will refuse to accept bins with contamination values above 100%. This issue is unrelated to submg, but to avoid partial submissions submg will refuse to work if such a bin is present in the dataset. If you have bins with contamination values above 100% you can either leave them out by removing them from your dataset or manually set the contamination value to 100% in the `BINS_QUALITY_FILE` file you provide to submg. +When calculating completeness and contamination of a bin with tools like [CheckM](https://github.com/Ecogenomics/CheckM), contamination values above 100% can occur. [Usually, this is not an error](https://github.com/Ecogenomics/CheckM/issues/107). However, the ENA API will refuse to accept bins with contamination values above 100%. 
submg will automatically exclude bins with contamination values above 100% from the submission. +If you _need_ to submit such (presumably low quality) bins, you need to manually set the contamination value to 100 in the 'QUALITY_FILE' you provide under the bins section. # Support submg is being actively developed. Please use the github [issue tracker](https://github.com/ttubb/submg/issues) to report problems. A [discussions page](https://github.com/ttubb/submg/discussions) is available for questions, comments and suggestions. diff --git a/docker/Dockerfile b/docker/Dockerfile index 85b3a9c..bdb8c4e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,7 +1,6 @@ # Base Image FROM openjdk:slim - # Set up environment RUN apt-get update && \ apt-get upgrade -y diff --git a/examples/02_samples_reads_assembly_bins.yaml b/examples/02_samples_reads_assembly_bins.yaml index 8142170..757b585 100644 --- a/examples/02_samples_reads_assembly_bins.yaml +++ b/examples/02_samples_reads_assembly_bins.yaml @@ -47,7 +47,9 @@ BINS: QUALITY_FILE: "data/checkm_quality_2bins.tsv" # tsv file containing quality values of each bin. Header must include 'Bin_id', 'Completeness', 'Contamination'. A CheckM output table will work here. >>EXAMPLE: "/mnt/data/checkm_quality.tsv" NCBI_TAXONOMY_FILES: ["data/taxonomy/archaea_taxonomy.tsv", "data/taxonomy/bacteria_taxonomy.tsv"] # A list of files with NCBI taxonomy information about the bins. Consult the README to see how they should be structured. >>EXAMPLE: ["/mnt/data/bacteria_tax.tsv","/mnt/data/archaea_tax.tsv"] MANUAL_TAXONOMY_FILE: # Scientific names and taxids for bins. See example file for the structure. Columns must be 'Bin_id', 'Tax_id' and 'Scientific_name'. Consult the README for more information. >>EXAMPLE: "/mnt/data/manual_tax.tsv" - BINNING_SOFTWARE: 'VAMB' # The program that was used for binning. 
>>EXAMPLE: "metabat2" + BINNING_SOFTWARE: 'VAMB' + MIN_COMPLETENESS: 50 # Bins with smaller completeness value will be discarded (values in percent, 0-100). Remove this row to ignore bin completeness. >>EXAMPLE: "90" + MAX_CONTAMINATION: 10 # Bins with larger contamination value will be discarded (values in percent, 0-100). Remove this row to ignore bin contamination (>100% contamination bins will still be discarded). >>EXAMPLE: "5" ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest BAM_FILES: diff --git a/examples/11_bins.yaml b/examples/11_bins.yaml index a8d1fa8..9b521be 100644 --- a/examples/11_bins.yaml +++ b/examples/11_bins.yaml @@ -11,7 +11,7 @@ SEQUENCING_PLATFORMS: ["ILLUMINA"] # PROJECT_NAME: "Project ex11 idx00" # Name of the project within which the sequencing was organized >>EXAMPLE: "AgRFex 2 Biogas Survey" SAMPLE_ACCESSIONS: ["SAMEA113417017"] # These samples exist in ENA. Your assembly is based on them. >>EXAMPLE: ["ERS15898933","ERS15898932"] ASSEMBLY: - ASSEMBLY_NAME: "idx00_ex11_asm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_NAME: "idx00_ex11_asm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "Northern Germany biogas digester metagenome" EXISTING_ASSEMBLY_ANALYSIS_ACCESSION: "ERZ21942150" # The accession of the assembly analysis that all bins/MAGs originate from >>EXAMPLE: "GCA_012552665" EXISTING_CO_ASSEMBLY_SAMPLE_ACCESSION: # The accession of the virtual sample of the co-assembly which all bins/MAGs originate from >>EXAMPLE: "ERZ21942150" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" @@ -25,6 +25,7 @@ BINS: NCBI_TAXONOMY_FILES: "data/taxonomy/eukaryota_taxonomy.tsv" # A list of files with NCBI taxonomy information about the bins. Consult the README to see how they should be structured. >>EXAMPLE: ["/mnt/data/bacteria_tax.tsv","/mnt/data/archaea_tax.tsv"] MANUAL_TAXONOMY_FILE: "data/taxonomy/manual_taxonomy_eukaryota.tsv" # Scientific names and taxids for bins. See example file for the structure. Columns must be 'Bin_id', 'Tax_id' and 'Scientific_name'. Consult the README for more information. >>EXAMPLE: "/mnt/data/manual_tax.tsv" BINNING_SOFTWARE: "metabat2" # The program that was used for binning. >>EXAMPLE: "metabat2" + MAX_CONTAMINATION: 5 # Bins with larger contamination value will be discarded (values in percent, 0-100). Remove this row to ignore bin contamination (>100% contamination bins will still be discarded). >>EXAMPLE: "5" ADDITIONAL_SAMPLESHEET_FIELDS: # You can add more fields from the ENA samplesheet that most closely matches your experiment ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest COVERAGE_FILE: "data/bin_coverage.tsv" # .tsv file containing the coverage values of each bin. Columns must be 'Bin_id' and 'Coverage'. 
diff --git a/examples/data/checkm_quality_3bins.tsv b/examples/data/checkm_quality_3bins.tsv index 119c26d..2fea526 100644 --- a/examples/data/checkm_quality_3bins.tsv +++ b/examples/data/checkm_quality_3bins.tsv @@ -1,4 +1,4 @@ Bin Id Marker lineage # genomes # markers # marker sets 0 1 2 3 4 5+ Completeness Contamination Strain heterogeneity bin1 k__Bacteria (UID2570) 433 273 183 101 172 0 0 0 0 62.22 0.10 0.10 -bin2 root (UID1) 5656 56 24 55 1 0 0 0 0 4.17 0.00 0.00 -bin3 k__Bacteria (UID203) 5449 104 58 84 20 0 0 0 0 23.90 0.00 0.00 +bin2 root (UID1) 5656 56 24 55 1 0 0 0 0 94.17 17.11 0.00 +bin3 k__Bacteria (UID203) 5449 104 58 84 20 0 0 0 0 23.90 1.20 0.00 diff --git a/examples/localtest.yaml b/examples/localtest.yaml index fc2d6b7..f50da8f 100644 --- a/examples/localtest.yaml +++ b/examples/localtest.yaml @@ -1,56 +1,56 @@ -# ABOUT: This is a config for submitting 2 set of paired end reads, an assembly and bins -# ABOUT: Coverage is known. -# ABOUT: Taxonomy is derived from `gtdb_to_ncbi_majority_vote.py` output and a MANUAL_TAXONOMY_FILE +# ABOUT: This is a config for submitting 1 sample, 2 sets of single-end reads, an assembly and bins. +# ABOUT: Coverage is derived from one unsorted .bam file. +# ABOUT: Taxonomy is derived from `gtdb_to_ncbi_majority_vote.py` output # USAGE: navigate to the directory -# USAGE: submg submit --config 05_reads_assembly_bins.yaml --staging_dir --logging_dir --submit_reads --submit_assembly --submit_bins - - -STUDY: "PRJEB71644" # The accession of your study (which has to already exist in ENA) >>EXAMPLE: "PRJEB71644" -METAGENOME_SCIENTIFIC_NAME: "biogas fermenter metagenome" # Taxonomic identifier of the metagenome. Check the ENA metagenome taxonomy tree to find a taxonomy ID and species name fitting your sample >>EXAMPLE: "biogas fermenter metagenome" -METAGENOME_TAXID: "718289" # Taxonomic identifier of the assembly. 
Must match SPECIES_SCIENTIFIC_NAME >>EXAMPLE: "718289" -SEQUENCING_PLATFORMS: ["ILLUMINA"] # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"] -SAMPLE_ACCESSIONS: ['SAMEA113417017', 'SAMEA113417018'] # These samples exist in ENA. Your assembly is based on them. >>EXAMPLE: ["ERS15898933","ERS15898932"] -PAIRED_END_READS: -- NAME: "3rIQA_ex05_rp1" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" - SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] - LIBRARY_SOURCE: "METAGENOMIC" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" - LIBRARY_SELECTION: "RANDOM" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" - LIBRARY_STRATEGY: "WGS" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" - INSERT_SIZE: "300" # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300" - FASTQ1_FILE: "data/reads/fwd1.fastq" # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz" - FASTQ2_FILE: "data/reads/rev1.fastq" # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz" - RELATED_SAMPLE_ACCESSION: 'SAMEA113417017' # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933" - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest -- NAME: "3rIQA_ex05_rp2" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" - SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500" # One of 
https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] - LIBRARY_SOURCE: "METAGENOMIC" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" - LIBRARY_SELECTION: "RANDOM" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" - LIBRARY_STRATEGY: "WGS" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" - INSERT_SIZE: "300" # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300" - FASTQ1_FILE: "data/reads/fwd2.fastq" # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz" - FASTQ2_FILE: "data/reads/rev2.fastq" # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz" - RELATED_SAMPLE_ACCESSION: 'SAMEA113417018' # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933" - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest +# USAGE: submg submit --config 02_samples_reads_assembly_bins.yaml --staging_dir --logging_dir --submit_samples --submit_reads --submit_assembly --submit_bins + +STUDY: "PRJEB71644" # The accession of your study (which has to already exist in ENA) >>EXAMPLE: "PRJEB71644" +METAGENOME_SCIENTIFIC_NAME: 'ant fungus garden metagenome' # Taxonomic identifier of the metagenome. Check the ENA metagenome taxonomy tree to find a taxonomy ID and species name fitting your sample >>EXAMPLE: "biogas fermenter metagenome" +METAGENOME_TAXID: '797283' # Taxonomic identifier of the assembly. 
Must match SPECIES_SCIENTIFIC_NAME >>EXAMPLE: "718289" +SEQUENCING_PLATFORMS: ['ILLUMINA'] # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"] +NEW_SAMPLES: # These samples will be created in ENA according to the data entered below. Your assembly MUST BE BASED ON ALL OF THESE. +- TITLE: 'dlhr2_example02_sample01' # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample" + collection date: '2012' # Any ISO compliant time. Can be truncated from the right (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" + geographic location (country and/or sea): 'missing: data agreement established pre-2023' # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" + ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment +SINGLE_READS: +- NAME: 'dlhr2_pe_reads_01_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" + SEQUENCING_INSTRUMENT: 'Illumina MiSeq' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] + LIBRARY_SOURCE: 'METAGENOMIC' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" + LIBRARY_SELECTION: 'RANDOM' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" + LIBRARY_STRATEGY: 'WGS' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" + FASTQ_FILE: "data/reads/fwd1.fastq" # Path to the fastq file >>EXAMPLE: "/mnt/data/reads.fastq.gz" + RELATED_SAMPLE_TITLE: 'dlhr2_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" + ADDITIONAL_MANIFEST_FIELDS: # You 
can add additional fields that will be written to the manifest +- NAME: 'dlhr2_pe_reads_02_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" + SEQUENCING_INSTRUMENT: 'Illumina MiSeq' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] + LIBRARY_SOURCE: 'METAGENOMIC' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" + LIBRARY_SELECTION: 'RANDOM' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" + LIBRARY_STRATEGY: 'WGS' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" + FASTQ_FILE: "data/reads/fwd2.fastq" # Path to the fastq file >>EXAMPLE: "/mnt/data/reads.fastq.gz" + RELATED_SAMPLE_TITLE: 'dlhr2_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" + ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest ASSEMBLY: - ASSEMBLY_NAME: "3rIQA_e05_coasm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg" - ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" - ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" - FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" - collection date: "2024-01-01" # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" - geographic location (country and/or sea): "Germany" # See ENA checklists (e.g. 
https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" - COVERAGE_VALUE: 128.27 # Read coverage of the assembly. - ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest + ASSEMBLY_NAME: 'dlhr2_e02_asm' # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_SOFTWARE: 'metaSPAdes' # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" + ISOLATION_SOURCE: 'ant fungus garden' # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" + FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" + collection date: '2012-02' # Any ISO compliant time. Can be truncated from the right (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" + geographic location (country and/or sea): 'missing: data agreement established pre-2023' # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" + ADDITIONAL_SAMPLESHEET_FIELDS: + geographic location (latitude): 52.51 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85" + geographic location (longitude): 8.77 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. 
https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65" + ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest BINS: - BINS_DIRECTORY: "data/3bins" # Directory containing the fasta files of all bins/MAGs >>EXAMPLE: "/mnt/data/bins" - COMPLETENESS_SOFTWARE: "CheckM" # Software used to calculate completeness >>EXAMPLE: "CheckM" - QUALITY_FILE: "data/checkm_quality_3bins.tsv" # tsv file containing quality values of each bin. Header must include 'Bin_id', 'Completeness', 'Contamination'. A CheckM output table will work here. >>EXAMPLE: "/mnt/data/checkm_quality.tsv" - NCBI_TAXONOMY_FILES: # A list of files with NCBI taxonomy information about the bins. Consult the README to see how they should be structured. >>EXAMPLE: ["/mnt/data/bacteria_tax.tsv","/mnt/data/archaea_tax.tsv"] - - "data/taxonomy/archaea_taxonomy.tsv" - - "data/taxonomy/bacteria_taxonomy.tsv" - MANUAL_TAXONOMY_FILE: "data/taxonomy/manual_taxonomy.tsv" # Scientific names and taxids for bins. See example file for the structure. Columns must be 'Bin_id', 'Tax_id' and 'Scientific_name'. Consult the README for more information. >>EXAMPLE: "/mnt/data/manual_tax.tsv" - BINNING_SOFTWARE: "metabat2" # The program that was used for binning. >>EXAMPLE: "metabat2" - ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest - COVERAGE_FILE: "data/bin_coverage.tsv" # .tsv file containing the coverage values of each bin. Columns must be 'Bin_id' and 'Coverage'. 
- \ No newline at end of file + BINS_DIRECTORY: "data/3bins" # Directory containing the fasta files of all bins/MAGs >>EXAMPLE: "/mnt/data/bins" + COMPLETENESS_SOFTWARE: "CheckM" # Software used to calculate completeness >>EXAMPLE: "CheckM" + NCBI_TAXONOMY_FILES: "taxotest/archaea_taxonomy.tsv" # A list of files with NCBI taxonomy information about the bins. Consult the README to see how they should be structured. >>EXAMPLE: ["/mnt/data/bacteria_tax.tsv","/mnt/data/archaea_tax.tsv"] + QUALITY_FILE: "taxotest/checkm_quality_3bins.tsv" # tsv file containing quality values of each bin. Header must include 'Bin_id', 'Completeness', 'Contamination'. A CheckM output table will work here. >>EXAMPLE: "/mnt/data/checkm_quality.tsv" + MANUAL_TAXONOMY_FILE: "taxotest/manual_taxonomy_3bins.tsv" # Scientific names and taxids for bins. See example file for the structure. Columns must be 'Bin_id', 'Tax_id' and 'Scientific_name'. Consult the README for more information. >>EXAMPLE: "/mnt/data/manual_tax.tsv" + BINNING_SOFTWARE: 'VAMB' + MIN_COMPLETENESS: 1 # Bins with smaller completeness value will be discarded (values in percent, 0-100). Remove this row to ignore bin completeness. >>EXAMPLE: "90" + MAX_CONTAMINATION: 100 # Bins with larger contamination value will be discarded (values in percent, 0-100). Remove this row to ignore bin contamination (>100% contamination bins will still be discarded). 
>>EXAMPLE: "5" + ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment + ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest +BAM_FILES: + - "data/mapping/1.unsorted.bam" \ No newline at end of file diff --git a/submg/binSubmission.py b/submg/binSubmission.py index 6311f19..31df6e6 100644 --- a/submg/binSubmission.py +++ b/submg/binSubmission.py @@ -108,7 +108,8 @@ def get_bin_quality(config, silent=False) -> dict: return result -def __prep_bins_samplesheet(config: dict, +def __prep_bins_samplesheet(filtered_bins: list, + config: dict, assembly_sample_accession: str, samples_submission_dir: str, upload_taxonomy_data: dict) -> str: @@ -116,6 +117,7 @@ def __prep_bins_samplesheet(config: dict, Prepares an XML samplesheet for all bin samples. Args: + filtered_bins (list): A list of bin names to submit. config (dict): The config dictionary. assembly_sample_accession (str): Either the accession of a co-assembly virtual sample or the accession of the single biological sample @@ -150,7 +152,7 @@ def __prep_bins_samplesheet(config: dict, # Define root element root = ET.Element("SAMPLE_SET") - for bin_id in upload_taxonomy_data.keys(): + for bin_id in filtered_bins: assembly_name = utility.stamped_from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') sample_alias = f"{assembly_name}_bin_{bin_id}_virtual_sample" @@ -461,11 +463,13 @@ def bin_coverage_from_depth(depth_files: str, return bin_coverages -def bin_coverage_from_tsv(bin_coverage_file: str, +def bin_coverage_from_tsv(filtered_bins: list, + bin_coverage_file: str, bin_names: dict) -> dict: """Reads coverage for each bin from a tsv file. Args: + filtered_bins (list): A list of bin names to submit. bin_coverage_file (str): The path to the tsv file containing the bin coverage data. 
bin_names (dict): A dictionary mapping bin names to their corresponding @@ -482,8 +486,8 @@ def bin_coverage_from_tsv(bin_coverage_file: str, bin_name = row['Bin_id'] coverage = float(row['Coverage']) bin_coverages[bin_name] = coverage - for known_name in bin_names: - if not known_name in bin_coverages: + for known_name in filtered_bins: + if known_name not in bin_coverages: err = f"\nERROR: Bin {known_name} was not found in the coverage file at {os.path.abspath(bin_coverage_file)}." loggingC.message(err, threshold=-1) exit(1) @@ -511,7 +515,8 @@ def get_bins_in_dir(bins_directory: str) -> list: return bin_name_to_fasta -def submit_bins(config: dict, +def submit_bins(filtered_bins: list, + config: dict, upload_taxonomy_data: dict, assembly_sample_accession: str, run_accessions, @@ -527,6 +532,7 @@ def submit_bins(config: dict, bin as an individual analysis object using webin-cli. Args: + filtered_bins: A list of bin names to submit. config (dict): The config dictionary. upload_taxonomy_data (dict): A dictionary with the taxid and scientific name for each bin. 
@@ -568,21 +574,23 @@ def submit_bins(config: dict, # Get the coverage for each bin file loggingC.message(">Deriving bin coverage", threshold=1) coverage_outfile = os.path.join(logging_dir, 'bin_coverages.tsv') - if not depth_files is None: + if depth_files is not None: bin_coverages = bin_coverage_from_depth(depth_files, bin_name_to_fasta, coverage_outfile, threads=threads) - elif not bin_coverage_file is None: - bin_coverages = bin_coverage_from_tsv(bin_coverage_file, + elif bin_coverage_file is not None: + bin_coverages = bin_coverage_from_tsv(filtered_bins, + bin_coverage_file, bin_name_to_fasta.keys()) - # Make a samplesheet for all bins + # Make a samplesheet for filtered bins loggingC.message(">Making bin samplesheet", threshold=1) samples_submission_dir = os.path.join(staging_dir, 'bin_samplesheet') os.makedirs(samples_submission_dir, exist_ok=False) - samplesheet = __prep_bins_samplesheet(config, + samplesheet = __prep_bins_samplesheet(filtered_bins, + config, assembly_sample_accession, samples_submission_dir, upload_taxonomy_data) @@ -592,9 +600,10 @@ def submit_bins(config: dict, samples_logging_dir = os.path.join(logging_dir, 'bin_samplesheet') os.makedirs(samples_logging_dir, exist_ok=False) prefixbin_to_accession = __submit_bins_samplesheet(samplesheet, - samples_submission_dir, - samples_logging_dir, - url) + samples_submission_dir, + samples_logging_dir, + url) + # Remove the prefixes assembly_name = utility.stamped_from_config(config, 'ASSEMBLY', 'ASSEMBLY_NAME').replace(' ', '_') prefix_len = len(f"{assembly_name}_bin_") @@ -608,7 +617,8 @@ def submit_bins(config: dict, staging_directories = {} loggingC.message(">Staging bin submission sequences and manifests...", threshold=0) bin_manifests = {} - for bin_name, bin_fasta in bin_name_to_fasta.items(): + for bin_name in filtered_bins: + bin_fasta = bin_name_to_fasta[bin_name] bin_sample_accession = bin_to_accession[bin_name] staging_directory = os.path.join(staging_dir, f"bin_{bin_name}_staging") 
staging_directories[bin_name] = staging_directory diff --git a/submg/main.py b/submg/main.py index c028c33..acfa5e1 100644 --- a/submg/main.py +++ b/submg/main.py @@ -177,7 +177,7 @@ def init_argparse(): "To submit multiple assemblies, you need to " "use the tool multiple times.") parser_makecfg.add_argument("-q", - "--quality_cutoffs", + "--bin_quality_cutoffs", action="store_true", default=False, help="Include fields for bin quality cutoff " @@ -262,8 +262,10 @@ def submit(args): # We do this early so we notice issues before we start staging files. if args.submit_bins or args.submit_mags: bin_quality = get_bin_quality(config, silent=True) + # If there are quality cutoffs, make a list of bins to submit + filtered_bins = utility.filter_bins(bin_quality, config) # Test if there are bins which are too contaminated - for name in bin_quality.keys(): + for name in filtered_bins: contamination = bin_quality[name]['contamination'] if contamination > staticConfig.max_contamination: err = ( @@ -278,7 +280,7 @@ def submit(args): ) loggingC.message(err, threshold=-1) exit(1) - bin_taxonomy = taxQuery.get_bin_taxonomy(config) + bin_taxonomy = taxQuery.get_bin_taxonomy(filtered_bins, config) # Construct depth files if there are .bam files in the config if 'BAM_FILES' in config.keys(): @@ -357,7 +359,8 @@ def submit(args): # Bin submision if args.submit_bins: - submit_bins(config, + submit_bins(filtered_bins, + config, bin_taxonomy, assembly_sample_accession, run_accessions, @@ -378,9 +381,11 @@ def submit(args): metagenome_scientific_name = enaSearching.search_scientific_name_by_sample(assembly_sample_accession, args.development_service) except: - # This is a workaround because I keep getting - # false negatives on the development server - # for samples that exist on both servers. + # This is a workaround for times where the ENA development + # API does not work. If the sample is registered on the + # production server we can still continue submitting. 
+ # Included because the situation came up multiple times + # during development. metagenome_scientific_name = enaSearching.search_scientific_name_by_sample(assembly_sample_accession, False) submit_mags(config, diff --git a/submg/preflight.py b/submg/preflight.py index 027dc5a..af327bf 100644 --- a/submg/preflight.py +++ b/submg/preflight.py @@ -2,7 +2,7 @@ from datetime import datetime -from submg import loggingC, utility, enaSearching +from submg import loggingC, utility, enaSearching, binSubmission, taxQuery from submg.statConf import staticConfig from submg.webinWrapper import find_webin_cli_jar from submg.taxQuery import taxid_from_scientific_name @@ -22,6 +22,7 @@ def __check_tsv(tsvfile: str, required_columns: A list of column names that must be present in the .tsv file. """ + global checks_failed if not os.path.isfile(tsvfile): err = f"\nERROR: The .tsv file '{tsvfile}' does not exist." loggingC.message(err, threshold=-1) @@ -30,7 +31,7 @@ def __check_tsv(tsvfile: str, header = f.readline().strip().split('\t') for col in required_columns: if col not in header: - err = f"\nWARNING: The .tsv file '{tsvfile}' is missing the column '{col}'." + err = f"\nERROR: The .tsv file '{tsvfile}' is missing the column '{col}'." loggingC.message(err, threshold=-1) checks_failed = True if 'Bin_ids' in header: @@ -545,11 +546,50 @@ def __check_bins(arguments: dict, checks_failed = True __check_tsv(quality_file, staticConfig.bin_quality_columns.split(';')) + # Check if the quality filtering criteria are defined and whether they + # look right (they are positive, between 0 and 100, they are not floats < 1) + if 'MIN_COMPLETENESS' in bin_data.keys(): + try: + min_completeness = float(bin_data['MIN_COMPLETENESS']) + if min_completeness < 0 or min_completeness > 100: + err = f"ERROR: The MIN_COMPLETENESS value is not between 0 and 100." 
+ loggingC.message(err, threshold=-1) + checks_failed = True + if min_completeness < 1: + err = f"ERROR: The MIN_COMPLETENESS value is smaller than 1. Completeness needs to be defined as percent points (0-100)." + loggingC.message(err, threshold=-1) + checks_failed = True + except ValueError: + err = f"ERROR: The MIN_COMPLETENESS value {bin_data['MIN_COMPLETENESS']} is not a number." + loggingC.message(err, threshold=-1) + checks_failed = True + + if 'MAX_CONTAMINATION' in bin_data.keys(): + try: + max_contamination = float(bin_data['MAX_CONTAMINATION']) + if max_contamination < 0 or max_contamination > 100: + err = f"ERROR: The MAX_CONTAMINATION value is not between 0 and 100." + loggingC.message(err, threshold=-1) + checks_failed = True + if max_contamination < 1: + err = f"ERROR: The MAX_CONTAMINATION value is smaller than 1. Contamination needs to be defined as percent points (0-100)." + loggingC.message(err, threshold=-1) + checks_failed = True + except ValueError: + err = f"ERROR: The MAX_CONTAMINATION value {bin_data['MAX_CONTAMINATION']} is not a number." + loggingC.message(err, threshold=-1) + checks_failed = True + # Check if at least one NCBI_TAXONOMY_FILE or MANUAL_TAXONOMY_FILE exists tax_files = [] if 'NCBI_TAXONOMY_FILES' in bin_data.keys(): ncbi_tax_files = bin_data['NCBI_TAXONOMY_FILES'] + if ncbi_tax_files is None: + err = f"\nERROR: The field NCBI_TAXONOMY_FILES in the BINS section is empty." + err += f"\nPlease provide a valid file path or remove the field from the config file." 
+ loggingC.message(err, threshold=-1) + exit(1) # We cannot carry out the rest of the preflight checks if not isinstance(ncbi_tax_files, list): ncbi_tax_files = [ncbi_tax_files] tax_files.extend(ncbi_tax_files) @@ -581,10 +621,12 @@ def __check_bins(arguments: dict, loggingC.message(err, threshold=-1) checks_failed = True __check_tsv(bin_data['MANUAL_TAXONOMY_FILE'], staticConfig.manual_taxonomy_columns.split(';')) + if not taxQuery.check_manual_taxonomies(bin_data['MANUAL_TAXONOMY_FILE']): + checks_failed = True tax_files.append(bin_data['MANUAL_TAXONOMY_FILE']) # And if the headers of the MANUAL_TAXONOMY_FILE are correct - # Now check there actual are tax files + # Now check if there actually are tax files if len(tax_files) == 0: err = f"\nERROR: You chose to submit bins, but did not provide any taxonomy files." loggingC.message(err, threshold=-1) @@ -639,6 +681,7 @@ def __check_mags(arguments: dict, utility.stamped_from_config(config, 'PROJECT_NAME') # Check the metadata file + all_mag_bins = set() metadata_file = utility.from_config(config, 'MAGS', 'MAG_METADATA_FILE') if metadata_file is None or metadata_file == '': err = f"\nERROR: No MAG_METADATA_FILE was provided in the MAGS section." @@ -650,6 +693,7 @@ def __check_mags(arguments: dict, reader = csv.DictReader(f, delimiter='\t') for row in reader: bin_id = row['Bin_id'].strip() + all_mag_bins.add(bin_id) if bin_id == '' or bin_id is None: err = f"\nERROR: The metadata file '{metadata_file}' contains an empty bin_id field."
loggingC.message(err, threshold=-1) @@ -665,6 +709,15 @@ def __check_mags(arguments: dict, loggingC.message(err, threshold=-1) exit(1) + # Check if all MAGs bins pass the filtering that is being applied to bins + bin_quality = binSubmission.get_bin_quality(config, silent=True) + filtered_bins = utility.filter_bins(bin_quality, config) + for bin_id in all_mag_bins: + if bin_id not in filtered_bins: + err = f"\nERROR: The bin {bin_id} in the MAG_METADATA_FILE does not pass the filtering criteria for bins (MIN_COMPLETENESS / MAX_CONTAMINATION)." + loggingC.message(err, threshold=-1) + exit(1) + # Check fields in ASSEMBLY section assembly_data = utility.from_config(config, 'ASSEMBLY') mandatory_fields = [('ASSEMBLY_SOFTWARE', str), @@ -828,11 +881,11 @@ def preflight_checks(arguments: dict) -> None: exit(1) if checks_failed: - msg = f"Some preflight checks failed. If you are sure that the data " \ + msg = f"\nSome preflight checks failed. If you are sure that the data " \ "you provided is correct, you can skip these checks by using " \ "the --skip_checks flag. If any ERROR messages are not " \ - "adressed, they are likely to cause failure after partial " \ - "submission." + "adressed, they are likely to cause failure, sometimes after " \ + "partial submission of the data." loggingC.message(msg, threshold=-1) exit(1) else: diff --git a/submg/statConf.py b/submg/statConf.py index 9f7f1c3..3428052 100644 --- a/submg/statConf.py +++ b/submg/statConf.py @@ -108,9 +108,9 @@ class staticConfig: 'COVERAGE_VALUE': "Read coverage of the assembly.", 'COVERAGE_FILE': ".tsv file containing the coverage values of each bin. Columns must be 'Bin_id' and 'Coverage'.", 'INSERT_SIZE': "Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html)", - 'MAG_METADATA_FILE': "A .tsv specifying 'Bin_id', 'Sample_id', 'Quality_category', 'Flatfile_path', 'Chromosomes_path' and 'Unlocalised_path' for all MAGs. See README for more details." 
- 'MIN_COMPLETENESS': "Bins with smaller completeness value will be discarded (in percent, 0-100). Remove this row to ignore bin completeness". - 'MAX_CONTAMINATION': "Bins with larger contamination value will be discarded (in percent, 0-100). Remove this row to ignore bin contamination", + 'MAG_METADATA_FILE': "A .tsv specifying 'Bin_id', 'Sample_id', 'Quality_category', 'Flatfile_path', 'Chromosomes_path' and 'Unlocalised_path' for all MAGs. See README for more details.", + 'MIN_COMPLETENESS': "Bins with smaller completeness value will be discarded (in percent, 0-100). Remove this row to ignore bin completeness.", + 'MAX_CONTAMINATION': "Bins with larger contamination value will be discarded (in percent, 0-100). Remove this row to ignore bin contamination (>100% contamination bins will still be discarded).", } YAMLEXAMPLES = { diff --git a/submg/taxQuery.py b/submg/taxQuery.py index 46e020b..f75fd1e 100644 --- a/submg/taxQuery.py +++ b/submg/taxQuery.py @@ -4,7 +4,7 @@ import time from tqdm import tqdm -from submg import utility, loggingC +from submg import utility, loggingC, binSubmission from submg.statConf import staticConfig @@ -51,41 +51,43 @@ def __report_tax_issues(issues): def __check_bin_coherence(bin_basenames: list, + bin_quality_data: dict, annotated_bin_taxonomies: dict, upload_taxonomy_data: dict): """ - Check if the bins in the BINS_DIRECTORY are consistent with the taxonomy - files. If a bin is in the BINS_DIRECTORY but not in the taxonomy files, or - vice versa, the program will exit with an error message. + Check if all the sources of bin names are coherent (e.g. the set of bins in + the quality file, the fasta files and the taxonomy files has to be + identical.) 
Args: bin_basenames (list): A list of the basenames of the bins in the BINS_DIRECTORY + bin_quality_data (dict): A dictionary with the quality data for each bin annotated_bin_taxonomies (dict): A dictionary with the taxid and scientific name for each bin in the taxonomy files upload_taxonomy_data (dict): A dictionary with the taxid and scientific name for each bin in the manual taxonomy file """ - ids_from_taxonomies = set(upload_taxonomy_data.keys()) - ids_from_taxonomies.update(set(annotated_bin_taxonomies.keys())) + # Get the set of bin_ids from different sources + ids_from_quality = set(bin_quality_data.keys()) + ids_from_taxonomies = set(upload_taxonomy_data.keys()).union(set(annotated_bin_taxonomies.keys())) ids_from_fasta = set(bin_basenames) - only_in_fasta = ids_from_fasta.difference(ids_from_taxonomies) - only_in_taxonomies = ids_from_taxonomies.difference(ids_from_fasta) - - if len(only_in_fasta) > 0: - err = f"\nERROR: The following bins are in the BINS_DIRECTORY but not in the taxonomy files:" - loggingC.message(err, threshold=-1) - for b in only_in_fasta: - msg = f"\t{b}" - loggingC.message(msg, threshold=0) - exit(1) - if len(only_in_taxonomies) > 0: - err = f"\nERROR: The following bins are in the taxonomy files but not in the BINS_DIRECTORY:" - loggingC.message(err, threshold=-1) - for b in only_in_taxonomies: - msg = f"\t{b}" - loggingC.message(msg, threshold=0) + # Check for discrepancies between the sets + missing_in_fasta = ids_from_taxonomies.union(ids_from_quality).difference(ids_from_fasta) + missing_in_taxonomies = ids_from_fasta.union(ids_from_quality).difference(ids_from_taxonomies) + missing_in_quality = ids_from_fasta.union(ids_from_taxonomies).difference(ids_from_quality) + + # Log errors and exit if there are any + if missing_in_fasta or missing_in_taxonomies or missing_in_quality: + msg = "\n>ERROR: Bin sources are not coherent." 
+ if missing_in_fasta: + msg += f"\nBins missing in fasta files: {', '.join(missing_in_fasta)}" + if missing_in_taxonomies: + msg += f"\nBins missing in taxonomy files: {', '.join(missing_in_taxonomies)}" + if missing_in_quality: + msg += f"\nBins missing in quality data: {', '.join(missing_in_quality)}" + loggingC.message(msg, threshold=-1) exit(1) @@ -114,6 +116,66 @@ def __read_manual_taxonomy_file(manual_taxonomy_file: str) -> dict: return result +def __report_manual_tax_issues(issues): + """ + When the tool fails because of issues with the manual taxonomy file, this + function reports the issues in detail. + + Args: + issues (list): A list of dictionaries with the issues encountered. + """ + # Log the error + err = f"\nERROR: Manual taxonomy file contains issues for {len(issues)} bins:" + loggingC.message(err, threshold=-1) + + # Give a detailed listing of the issues + for i in issues: + bin = i['bin'] + scientific_name = i['scientific_name'] + tax_id = i['tax_id'] + ena_tax_id = i['ena_tax_id'] + if ena_tax_id == 'N/A': + msg = f"\tbin {bin} - no taxid found for scientific name {scientific_name}" + loggingC.message(msg, threshold=-1) + else: + msg = f"\tbin {bin} - tax_id {tax_id} does not match ENA tax_id {ena_tax_id} for scientific name {scientific_name}" + loggingC.message(msg, threshold=-1) + + +def check_manual_taxonomies(manual_taxonomy_file: str) -> bool: + """ + Read in the manual taxonomy file. Perform an ENA query for each bin to + make sure the taxid matches up with the scientific name. + + Args: + manual_taxonomy_file (str): Path to the manual taxonomy file. 
+ """ + manual_taxonomies = __read_manual_taxonomy_file(manual_taxonomy_file) + issues = [] + for bin_name, data in manual_taxonomies.items(): + tax_id = data['tax_id'] + scientific_name = data['scientific_name'] + ena_tax_id = taxid_from_scientific_name(scientific_name) + if not ena_tax_id: + issues.append({ + 'bin': bin_name, + 'scientific_name': scientific_name, + 'tax_id': tax_id, + 'ena_tax_id': 'N/A', + }) + elif not ena_tax_id == tax_id: + issues.append({ + 'bin': bin_name, + 'scientific_name': scientific_name, + 'tax_id': tax_id, + 'ena_tax_id': ena_tax_id, + }) + if len(issues) > 0: + __report_manual_tax_issues(issues) + return False + return True + + def __read_ncbi_taxonomy(ncbi_taxonomy_file: str) -> dict: """ Read the output of GTDB-TKs 'gtdb_to_ncbi_majority_vote.py' or a file using @@ -329,12 +391,14 @@ def __parse_classification_tsvs(ncbi_taxonomy_files: list) -> dict: return all_classifications -def get_bin_taxonomy(config) -> dict: +def get_bin_taxonomy(filtered_bins, config) -> dict: """ Based on the NCBI taxonomy files and manual taxonomy file defined in the config, derive the taxid and scientific name for each bin Args: + filtered_bins (list): List of bins that was filtered to remove + bins with bad completeness / contamination config (dict): The configuration dictionary Returns: @@ -381,6 +445,7 @@ def get_bin_taxonomy(config) -> dict: # Make sure that, for each bin showing up in the taxonomy files, we have # a corresponding fasta file __check_bin_coherence(bin_basenames, + binSubmission.get_bin_quality(config, silent=True), annotated_bin_taxonomies, upload_taxonomy_data) @@ -393,6 +458,9 @@ def get_bin_taxonomy(config) -> dict: last_request_time = time.time() - min_interval for bin_name, taxonomy in tqdm(annotated_bin_taxonomies.items(), leave=False): + # Only check the bins that we actually want to submit + if not bin_name in filtered_bins: + continue # Make sure we don't run into the ENA API rate limit current_time = time.time() 
time_since_last_request = current_time - last_request_time @@ -426,17 +494,17 @@ def get_bin_taxonomy(config) -> dict: 'suggestions': all_ena_suggestions, }) - # Add any bins that only show up in the files to the issues as unclassified - for basename in bin_basenames: - if not basename in upload_taxonomy_data: + # Add any bins that are missing from the taxonomy files als unclassified + for bin_name in filtered_bins: + if not bin_name in upload_taxonomy_data: is_issue = False for i in issues: - if i['mag_bin'] == basename: + if i['mag_bin'] == bin_name: is_issue = True if is_issue: continue issues.append({ - 'mag_bin': basename, + 'mag_bin': bin_name, 'level': 'unclassified', 'classification': 'N/A', 'suggestions': [], diff --git a/submg/utility.py b/submg/utility.py index 25f0aac..d788977 100644 --- a/submg/utility.py +++ b/submg/utility.py @@ -136,6 +136,7 @@ def api_response_check(response: requests.Response): loggingC.message(err, threshold=-1) exit(1) + def calculate_md5(fname): hash_md5 = hashlib.md5() with open(fname, "rb") as f: @@ -143,6 +144,7 @@ def calculate_md5(fname): hash_md5.update(chunk) return hash_md5.hexdigest() + def get_login(): """ Reads ENA login credentials from environmental variables and returns them. @@ -157,6 +159,7 @@ def get_login(): exit(1) return os.environ['ENA_USER'], os.environ['ENA_PASSWORD'] + def read_yaml(file_path): try: with open(file_path, 'r') as yaml_file: @@ -171,6 +174,7 @@ def read_yaml(file_path): loggingC.message(err, threshold=-1) exit(1) + def __strcast(value): """ Cast integers and floats to string. 
If the input is a list, set or dict, @@ -199,7 +203,8 @@ def prepdir(parent_path, name): newdir = os.path.join(parent_path, name) os.makedirs(newdir, exist_ok=False) return newdir - + + def from_config(config, key, subkey=None, subsubkey=None, supress_errors=False): """ Extracts a value from the dict that was created based on the @@ -259,6 +264,7 @@ def from_config(config, key, subkey=None, subsubkey=None, supress_errors=False): return __strcast(config[key][subkey]) return __strcast(config[key]) + def optional_from_config(config, key, subkey=None, subsubkey=None): """ Calls from config but returns None if the key is missing. @@ -274,6 +280,7 @@ def optional_from_config(config, key, subkey=None, subsubkey=None): except: return None + def stamped_from_config(config, key, subkey=None, subsubkey=None): """ Calls from config but adds a timestamp to relevant fields if timestamping @@ -337,6 +344,7 @@ def is_fasta(filepath, extensions=staticConfig.fasta_extensions.split(';')) -> s basename = filename.rsplit('.', 1)[0] return basename + def check_fasta(fasta_path) -> tuple: """ Checks if the fasta file exists, has a valid extension and whether it is @@ -368,6 +376,52 @@ def check_fasta(fasta_path) -> tuple: return fasta_path, gzipped +def filter_bins(quality_data, config): + """ + Filter bins based on the quality data. + + Args: + quality_data (dict): The quality data for the bins. + """ + filtered_bins = [] + + # Check arguments in config + if 'MIN_COMPLETENESS' in config['BINS']: + min_completeness = config['BINS']['MIN_COMPLETENESS'] + msg = f">Filtering bins based on minimum completeness of {min_completeness}." + else: + min_completeness = 0 + msg = ">No MIN_COMPLETENESS specified, bins will not be filtered for completeness." + loggingC.message(msg, threshold=0) + if 'MAX_CONTAMINATION' in config['BINS']: + max_contamination = config['BINS']['MAX_CONTAMINATION'] + msg = f">Filtering bins based on maximum contamination of {max_contamination}." 
+ else: + max_contamination = 100 + msg = ">No MAX_CONTAMINATION specified, maximum contamination is set to 100." + loggingC.message(msg, threshold=0) + + # Filtering + filtered_out = [] + for bin in quality_data: + if quality_data[bin]['completeness'] >= min_completeness and quality_data[bin]['contamination'] <= max_contamination: + filtered_bins.append(bin) + else: + filtered_out.append(bin) + if len(filtered_out) > 0: + msg = f">WARNING: {len(filtered_out)} bins have been excluded from submission due to quality thresholds:" + loggingC.message(msg, threshold=0) + time.sleep(5) # Give user some extra time to notice message + for bin in filtered_out: + msg = f"\t{bin} (completeness {quality_data[bin]['completeness']}, contamination {quality_data[bin]['contamination']})" + loggingC.message(msg, threshold=0) + if len(filtered_bins) == 0: + err = "\nERROR: No bins left after filtering. Please adjust the quality thresholds." + loggingC.message(err, threshold=-1) + exit(1) + return filtered_bins + + def check_bam(bam_file, num_threads=4) -> str: """ @@ -408,6 +462,7 @@ def check_bam(bam_file, return sorted_bam_file + def make_depth_file(bam_file, outdir, num_threads=4): """ Uses pysam.depth to call samtools depth and create a depth file with @@ -426,6 +481,7 @@ def make_depth_file(bam_file, outdir, num_threads=4): pysam.depth("-@", str(num_threads), "-a", sorted_bam_file, "-o", outfile) return outfile + def contigs_coverage(depth_file): """ Calculates the coverage per contig from a depth file. 
@@ -452,6 +508,7 @@ def contigs_coverage(depth_file): contig_length[contig] = position return contig_coverage, contig_length + def calculate_coverage(depth_files: list, target_contigs: set = None, threads=4, @@ -589,7 +646,7 @@ def validate_parameter_combination(submit_samples: bool, is_valid = True if not is_valid: - # Dont use loggingC here because this might be called from configGen + # Dont use loggingC here, because this might be called from configGen print(f"\nERROR: The combination of parameters you have specified is not valid.") print(staticConfig.submission_modes_message) exit(1) diff --git a/todo.txt b/todo.txt index 0ac0d91..20331c1 100644 --- a/todo.txt +++ b/todo.txt @@ -15,9 +15,10 @@ [x] Preflight checks überarbeiten, so dass ich mehrere ERRORs auf einmal sehen kann >> RELEASE 1.0.0 +[ ] Refactor main.py [ ] Antworten auf meine ENA tickets [ ] minitest-option - wir uploaden nur 1 readfile, nur 1 bin, simulieren coverage werte -[ ] enaSearching überarbeiten - kann ich wirklich nicht auf dem dev server suchen? +[x] enaSearching überarbeiten - kann ich wirklich nicht auf dem dev server suchen? >> RELEASE irgendwann [ ] preflight macht erst alle preflight checks, reported alle probleme. Erst dann passiert exit(1).- From 47f3c04136ac795b778f821af60ecbba93330a97 Mon Sep 17 00:00:00 2001 From: ttubb Date: Fri, 10 May 2024 13:48:31 +0000 Subject: [PATCH 08/12] unclassified organisms --- examples/localtest.yaml | 12 ++++++------ submg/main.py | 10 ++++++---- submg/preflight.py | 2 +- submg/taxQuery.py | 10 ++++++---- submg/utility.py | 2 +- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/examples/localtest.yaml b/examples/localtest.yaml index f50da8f..0bbd0be 100644 --- a/examples/localtest.yaml +++ b/examples/localtest.yaml @@ -9,29 +9,29 @@ METAGENOME_SCIENTIFIC_NAME: 'ant fungus garden metagenome' METAGENOME_TAXID: '797283' # Taxonomic identifier of the assembly. 
Must match SPECIES_SCIENTIFIC_NAME >>EXAMPLE: "718289" SEQUENCING_PLATFORMS: ['ILLUMINA'] # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"] NEW_SAMPLES: # These samples will be created in ENA according to the data entered below. Your assembly MUST BE BASED ON ALL OF THESE. -- TITLE: 'dlhr2_example02_sample01' # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample" +- TITLE: '4YzFv_example02_sample01' # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample" collection date: '2012' # Any ISO compliant time. Can be truncated from the right (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" geographic location (country and/or sea): 'missing: data agreement established pre-2023' # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment SINGLE_READS: -- NAME: 'dlhr2_pe_reads_01_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" +- NAME: '4YzFv_pe_reads_01_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" SEQUENCING_INSTRUMENT: 'Illumina MiSeq' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] LIBRARY_SOURCE: 'METAGENOMIC' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" LIBRARY_SELECTION: 'RANDOM' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" LIBRARY_STRATEGY: 'WGS' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" FASTQ_FILE: "data/reads/fwd1.fastq" # Path to the fastq file >>EXAMPLE: 
"/mnt/data/reads.fastq.gz" - RELATED_SAMPLE_TITLE: 'dlhr2_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" + RELATED_SAMPLE_TITLE: '4YzFv_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest -- NAME: 'dlhr2_pe_reads_02_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" +- NAME: '4YzFv_pe_reads_02_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" SEQUENCING_INSTRUMENT: 'Illumina MiSeq' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] LIBRARY_SOURCE: 'METAGENOMIC' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" LIBRARY_SELECTION: 'RANDOM' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" LIBRARY_STRATEGY: 'WGS' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" FASTQ_FILE: "data/reads/fwd2.fastq" # Path to the fastq file >>EXAMPLE: "/mnt/data/reads.fastq.gz" - RELATED_SAMPLE_TITLE: 'dlhr2_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" + RELATED_SAMPLE_TITLE: '4YzFv_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest ASSEMBLY: - ASSEMBLY_NAME: 'dlhr2_e02_asm' # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_NAME: '4YzFv_e02_asm' # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" ASSEMBLY_SOFTWARE: 'metaSPAdes' # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: 'ant fungus garden' # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" diff --git a/submg/main.py b/submg/main.py index acfa5e1..6c6ae8b 100644 --- a/submg/main.py +++ b/submg/main.py @@ -246,7 +246,7 @@ def submit(args): else: loggingC.message((">Initializing a LIVE SUBMISSION to " \ "the ENA production server."), 0) - time.sleep(5) + time.sleep(5) # Give user some extra time to notice message if not args.skip_checks: utility.validate_parameter_combination(args.submit_samples, @@ -263,7 +263,7 @@ def submit(args): if args.submit_bins or args.submit_mags: bin_quality = get_bin_quality(config, silent=True) # If there are quality cutoffs, make a list of bins to submit - filtered_bins = utility.filter_bins(bin_quality, config) + filtered_bins = utility.quality_filter_bins(bin_quality, config) # Test if there are bins which are too contaminated for name in filtered_bins: contamination = bin_quality[name]['contamination'] @@ -280,8 +280,10 @@ def submit(args): ) loggingC.message(err, threshold=-1) exit(1) - bin_taxonomy = taxQuery.get_bin_taxonomy(filtered_bins, config) - + # Query the taxonomy of bins + bin_taxonomy = taxQuery.get_bin_taxonomy(filtered_bins, + config) + # Construct depth files if there are .bam files in the config if 'BAM_FILES' in config.keys(): bam_files = utility.from_config(config, 'BAM_FILES') diff --git a/submg/preflight.py b/submg/preflight.py index af327bf..dfed94d 100644 --- a/submg/preflight.py +++ b/submg/preflight.py @@ -711,7 +711,7 
@@ def __check_mags(arguments: dict, # Check if all MAGs bins pass the filtering that is being applied to bins bin_quality = binSubmission.get_bin_quality(config, silent=True) - filtered_bins = utility.filter_bins(bin_quality, config) + filtered_bins = utility.quality_filter_bins(bin_quality, config) for bin_id in all_mag_bins: if bin_id not in filtered_bins: err = f"\nERROR: The bin {bin_id} in the MAG_METADATA_FILE does not pass the filtering criteria for bins (MIN_COMPLETENESS / MAX_CONTAMINATION)." diff --git a/submg/taxQuery.py b/submg/taxQuery.py index f75fd1e..4edf243 100644 --- a/submg/taxQuery.py +++ b/submg/taxQuery.py @@ -24,7 +24,9 @@ def __report_tax_issues(issues): problematic_bins = set(problematic_bins) msg = '\n'.join(problematic_bins) loggingC.message(msg, threshold=-1) - msg = "Please consult the README. You can manually enter taxonomy data for these bins into a .tsv file and specify it in the MANUAL_TAXONOMY_FILE field in the config file." + msg = "Please consult the README. You can manually enter taxonomy data for these bins into a .tsv file and specify it in the MANUAL_TAXONOMY_FILE field in the config file." + msg += " If your annotation process failed to classify a bin even at domain level, consider excluding it from your submission. If you want to submit it anyways," + msg += " you may choose to add it to the MANUAL_TAXONOMY_FILES using taxid 155900 (unclassified organism)." 
loggingC.message(msg, threshold=-1) # Give a detailed listing of the issues @@ -278,7 +280,7 @@ def __best_classification(ncbi_classifications: dict) -> dict: return result -def ena_taxonomy_suggestion(level: str, +def __ena_taxonomy_suggestion(level: str, domain: str, classification: str, filtered: bool = True) -> list: @@ -472,7 +474,7 @@ def get_bin_taxonomy(filtered_bins, config) -> dict: if bin_name in upload_taxonomy_data: loggingC.message(f">INFO: Bin {bin_name} was found in the manual taxonomy file and will be skipped.", threshold=1) continue - suggestions = ena_taxonomy_suggestion(taxonomy['level'], + suggestions = __ena_taxonomy_suggestion(taxonomy['level'], taxonomy['domain'], taxonomy['classification'], filtered=True) @@ -482,7 +484,7 @@ def get_bin_taxonomy(filtered_bins, config) -> dict: 'tax_id': suggestions[0]['tax_id'], } else: - all_ena_suggestions = ena_taxonomy_suggestion(taxonomy['level'], + all_ena_suggestions = __ena_taxonomy_suggestion(taxonomy['level'], taxonomy['domain'], taxonomy['classification'], filtered=False) diff --git a/submg/utility.py b/submg/utility.py index d788977..33d4485 100644 --- a/submg/utility.py +++ b/submg/utility.py @@ -376,7 +376,7 @@ def check_fasta(fasta_path) -> tuple: return fasta_path, gzipped -def filter_bins(quality_data, config): +def quality_filter_bins(quality_data, config): """ Filter bins based on the quality data. 
From 11cf2f9b62f5e05c893ac1704b78133b47b00467 Mon Sep 17 00:00:00 2001 From: ttubb Date: Mon, 13 May 2024 11:13:32 +0000 Subject: [PATCH 09/12] updated taxonomy error handling --- README.md | 12 ++++++++++-- examples/localtest.yaml | 15 +++++++-------- submg/taxQuery.py | 5 +++-- submg/utility.py | 3 ++- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index bf3826a..b08a999 100644 --- a/README.md +++ b/README.md @@ -129,13 +129,21 @@ Using the table below, MAG `m1` will be submitted as a medium quality contig ass A submission can take several hours to complete. We recommend using [nohup](https://en.wikipedia.org/wiki/Nohup), [tmux](https://github.com/tmux/tmux/wiki) or something similar to prevent the process from being interrupted. # Taxonomy Assignment -Assemblies and bins need a valid NCBI taxonomy (scientific name and taxonomic identifier) for submission. If you did taxonomic annotation of bins based on [GTDB](https://gtdb.ecogenomic.org/), you can use the `gtdb_to_ncbi_majority_vote.py` script of the [GTDB-Toolkit](https://github.com/Ecogenomics/GTDBTk) to translate your results to NCBI taxonomy. +Assemblies and bins need a valid NCBI taxonomy (scientific name and taxonomic identifier) for submission. While in most cases the assignment works automatically, it is important to note that [environmental organism-level taxonomy](https://ena-docs.readthedocs.io/en/latest/faq/taxonomy.html#environmental-organism-level-taxonomy) has to be used for metagenome submissions. For example: Consider a bin that was classified only on the class level and was determined to belong to class `Clostridia`. The taxonomy id of the class `Clostridia` is `186801`. However, the correct environmental organism-level taxonomy for the bin is `uncultured Clostridia bacterium` with the taxid `244328`. 
+## GTDB-Toolkit Taxonomy +If you did taxonomic annotation of bins based on [GTDB](https://gtdb.ecogenomic.org/), you can use the `gtdb_to_ncbi_majority_vote.py` script of the [GTDB-Toolkit](https://github.com/Ecogenomics/GTDBTk) to translate your results to NCBI taxonomy. + +## NCBI-Taxonomy You can provide tables with NCBI taxonomy information for each bin (see `./tests/bacteria_taxonomy.tsv` for an example - the output of `gtdb_to_ncbi_majority_vote.py` has the correct format already). submg will use ENAs [suggest-for-submission-sendpoint](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access/taxon-api.html) to derive taxids that follow the [rules for bin taxonomy](https://ena-docs.readthedocs.io/en/latest/faq/taxonomy.html). +## Manually Specified Taxonomy Either in addition to those files, or as an alternative you can provide a `MANUAL_TAXONOMY` table. This should specify the correct taxids and scientific names for bins. An example of such a document can be found in `./examples/data/taxonomy/manual_taxonomy_3bins.tsv`. If a bin is present in this document, the taxonomic data from the NCBI taxonomy tables will be ignored. -In some cases submg will be unable to assign a valid taxonomy to a bin. The submission will be aborted and you will be informed which bins are causing problems. In such cases you have to determine the correct scientific name and taxid for the bin and specify it in the `MANUAL_TAXONOMY` field of your config file. Sometimes the reason for a failed taxonomic assignment is that no proper taxid exists yet. You can [create a taxon request](https://ena-docs.readthedocs.io/en/latest/faq/taxonomy_requests.html) in the ENA Webin Portal to register the taxon. +## Taxonomy Assignment Failure +In some cases submg will be unable to assign a valid taxonomy to a bin. The submission will be aborted and you will be informed which bins are causing problems. 
In such cases you have to determine the correct scientific name and taxid for the bin and specify it in the `MANUAL_TAXONOMY` field of your config file. + +A possible reason for a failed taxonomic assignment is that no proper taxid exists yet. This happens more often than one might expect. You can [create a taxon request](https://ena-docs.readthedocs.io/en/latest/faq/taxonomy_requests.html) in the ENA Webin Portal to register the taxon. ## NCBI Taxonomy File This file contains the NCBI taxonomy for bins. You can provide multiple taxonomy files covering different bins. If you created it with `gtdb_to_ncbi_majority_vote.py` of the [GTDB-Toolkit](https://github.com/Ecogenomics/GTDBTk) it will have the following, compatible format already. Alternatively, provide a .tsv file with the columns 'Bin_id' and 'NCBI_taxonomy'. The string in the 'NCBI_taxonomy' column has to adhere to the format shown below. Taxonomic ranks are separated by semicolons. On each rank, a letter indicating the rank is followed by two underscores and the classification at that rank. The ranks have to be in the order 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species'. If a classification at a certain rank is unavailable, the rank itself still needs to be present in the string (e.g. "s__"). diff --git a/examples/localtest.yaml b/examples/localtest.yaml index 0bbd0be..bbb584c 100644 --- a/examples/localtest.yaml +++ b/examples/localtest.yaml @@ -9,29 +9,29 @@ METAGENOME_SCIENTIFIC_NAME: 'ant fungus garden metagenome' METAGENOME_TAXID: '797283' # Taxonomic identifier of the assembly. Must match SPECIES_SCIENTIFIC_NAME >>EXAMPLE: "718289" SEQUENCING_PLATFORMS: ['ILLUMINA'] # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"] NEW_SAMPLES: # These samples will be created in ENA according to the data entered below. Your assembly MUST BE BASED ON ALL OF THESE. 
-- TITLE: '4YzFv_example02_sample01' # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample" +- TITLE: 'rv7ck_example02_sample01' # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample" collection date: '2012' # Any ISO compliant time. Can be truncated from the right (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" geographic location (country and/or sea): 'missing: data agreement established pre-2023' # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment SINGLE_READS: -- NAME: '4YzFv_pe_reads_01_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" +- NAME: 'rv7ck_pe_reads_01_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" SEQUENCING_INSTRUMENT: 'Illumina MiSeq' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] LIBRARY_SOURCE: 'METAGENOMIC' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" LIBRARY_SELECTION: 'RANDOM' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" LIBRARY_STRATEGY: 'WGS' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" FASTQ_FILE: "data/reads/fwd1.fastq" # Path to the fastq file >>EXAMPLE: "/mnt/data/reads.fastq.gz" - RELATED_SAMPLE_TITLE: '4YzFv_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" + RELATED_SAMPLE_TITLE: 'rv7ck_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that 
will be written to the manifest -- NAME: '4YzFv_pe_reads_02_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" +- NAME: 'rv7ck_pe_reads_02_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" SEQUENCING_INSTRUMENT: 'Illumina MiSeq' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] LIBRARY_SOURCE: 'METAGENOMIC' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" LIBRARY_SELECTION: 'RANDOM' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" LIBRARY_STRATEGY: 'WGS' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" FASTQ_FILE: "data/reads/fwd2.fastq" # Path to the fastq file >>EXAMPLE: "/mnt/data/reads.fastq.gz" - RELATED_SAMPLE_TITLE: '4YzFv_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" + RELATED_SAMPLE_TITLE: 'rv7ck_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest ASSEMBLY: - ASSEMBLY_NAME: '4YzFv_e02_asm' # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" + ASSEMBLY_NAME: 'rv7ck_e02_asm' # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "Northern Germany biogas digester metagenome" ASSEMBLY_SOFTWARE: 'metaSPAdes' # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: 'ant fungus garden' # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" @@ -44,9 +44,8 @@ ASSEMBLY: BINS: BINS_DIRECTORY: "data/3bins" # Directory containing the fasta files of all bins/MAGs >>EXAMPLE: "/mnt/data/bins" COMPLETENESS_SOFTWARE: "CheckM" - NCBI_TAXONOMY_FILES: "taxotest/archaea_taxonomy.tsv" # Software used to calculate completeness >>EXAMPLE: "CheckM" + NCBI_TAXONOMY_FILES: ["taxotest/archaea_taxonomy.tsv", "taxotest/bacteria_taxonomy.tsv"] # Software used to calculate completeness >>EXAMPLE: "CheckM" QUALITY_FILE: "taxotest/checkm_quality_3bins.tsv" # tsv file containing quality values of each bin. Header must include 'Bin_id', 'Completeness', 'Contamination'. A CheckM output table will work here. >>EXAMPLE: "/mnt/data/checkm_quality.tsv" - MANUAL_TAXONOMY_FILE: "taxotest/manual_taxonomy_3bins.tsv" # Scientific names and taxids for bins. See example file for the structure. Columns must be 'Bin_id', 'Tax_id' and 'Scientific_name'. Consult the README for more information. >>EXAMPLE: "/mnt/data/manual_tax.tsv" BINNING_SOFTWARE: 'VAMB' MIN_COMPLETENESS: 1 # Bins with smaller completeness value will be discarded (values in percent, 0-100). Remove this row to ignore bin completeness. >>EXAMPLE: "90" MAX_CONTAMINATION: 100 # Bins with larger contamination value will be discarded (values in percent, 0-100). Remove this row to ignore bin contamination (>100% contamination bins will still be discarded). 
>>EXAMPLE: "5" diff --git a/submg/taxQuery.py b/submg/taxQuery.py index 4edf243..7b194e7 100644 --- a/submg/taxQuery.py +++ b/submg/taxQuery.py @@ -22,9 +22,10 @@ def __report_tax_issues(issues): loggingC.message(err, threshold=-1) problematic_bins = [x['mag_bin'] for x in issues] problematic_bins = set(problematic_bins) - msg = '\n'.join(problematic_bins) + msg = "\t"+'\n\t'.join(problematic_bins) loggingC.message(msg, threshold=-1) - msg = "Please consult the README. You can manually enter taxonomy data for these bins into a .tsv file and specify it in the MANUAL_TAXONOMY_FILE field in the config file." + msg = "Please consult the Taxonomy Assignment section of the README." + msg += "\nYou can manually enter taxonomy data for these bins into a .tsv file and specify it in the MANUAL_TAXONOMY_FILE field in the config file." msg += " If your annotation process failed to classify a bin even at domain level, consider excluding it from your submission. If you want to submit it anyways," msg += " you may choose to add it to the MANUAL_TAXONOMY_FILES using taxid 155900 (unclassified organism)." loggingC.message(msg, threshold=-1) diff --git a/submg/utility.py b/submg/utility.py index 33d4485..7a7febc 100644 --- a/submg/utility.py +++ b/submg/utility.py @@ -411,10 +411,11 @@ def quality_filter_bins(quality_data, config): if len(filtered_out) > 0: msg = f">WARNING: {len(filtered_out)} bins have been excluded from submission due to quality thresholds:" loggingC.message(msg, threshold=0) - time.sleep(5) # Give user some extra time to notice message for bin in filtered_out: msg = f"\t{bin} (completeness {quality_data[bin]['completeness']}, contamination {quality_data[bin]['contamination']})" loggingC.message(msg, threshold=0) + if len(filtered_out) > 0: + time.sleep(5) # Give user some extra time to notice message if len(filtered_bins) == 0: err = "\nERROR: No bins left after filtering. Please adjust the quality thresholds." 
loggingC.message(err, threshold=-1) From 47bafdb3c616539590e8582908507e0cbee34ab7 Mon Sep 17 00:00:00 2001 From: ttubb Date: Mon, 13 May 2024 12:58:04 +0000 Subject: [PATCH 10/12] added minitest option --- examples/localtest.yaml | 134 ++++++++++++++++++++++++---------------- submg/main.py | 29 +++++++-- submg/readSubmission.py | 13 +++- 3 files changed, 117 insertions(+), 59 deletions(-) diff --git a/examples/localtest.yaml b/examples/localtest.yaml index bbb584c..4d61715 100644 --- a/examples/localtest.yaml +++ b/examples/localtest.yaml @@ -1,55 +1,85 @@ -# ABOUT: This is a config for submitting 1 sample, 2 sets of single-end reads, an assembly and bins. -# ABOUT: Coverage is derived from one unsorted .bam file. -# ABOUT: Taxonomy is derived from `gtdb_to_ncbi_majority_vote.py` output +# ABOUT: This a config for submitting samples, paired-end reads, an assembly, bins and MAGs +# ABOUT: Coverage is derived from 2 bam files. +# ABOUT: Taxonomy is derived from `gtdb_to_ncbi_majority_vote.py` output and a MANUAL_TAXONOMY_FILE # USAGE: navigate to the directory -# USAGE: submg submit --config 02_samples_reads_assembly_bins.yaml --staging_dir --logging_dir --submit_samples --submit_reads --submit_assembly --submit_bins - -STUDY: "PRJEB71644" # The accession of your study (which has to already exist in ENA) >>EXAMPLE: "PRJEB71644" -METAGENOME_SCIENTIFIC_NAME: 'ant fungus garden metagenome' # Taxonomic identifier of the metagenome. Check the ENA metagenome taxonomy tree to find a taxonomy ID and species name fitting your sample >>EXAMPLE: "biogas fermenter metagenome" -METAGENOME_TAXID: '797283' # Taxonomic identifier of the assembly. Must match SPECIES_SCIENTIFIC_NAME >>EXAMPLE: "718289" -SEQUENCING_PLATFORMS: ['ILLUMINA'] # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"] -NEW_SAMPLES: # These samples will be created in ENA according to the data entered below. 
Your assembly MUST BE BASED ON ALL OF THESE. -- TITLE: 'rv7ck_example02_sample01' # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample" - collection date: '2012' # Any ISO compliant time. Can be truncated from the right (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" - geographic location (country and/or sea): 'missing: data agreement established pre-2023' # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" - ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment -SINGLE_READS: -- NAME: 'rv7ck_pe_reads_01_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" - SEQUENCING_INSTRUMENT: 'Illumina MiSeq' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] - LIBRARY_SOURCE: 'METAGENOMIC' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" - LIBRARY_SELECTION: 'RANDOM' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" - LIBRARY_STRATEGY: 'WGS' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" - FASTQ_FILE: "data/reads/fwd1.fastq" # Path to the fastq file >>EXAMPLE: "/mnt/data/reads.fastq.gz" - RELATED_SAMPLE_TITLE: 'rv7ck_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest -- NAME: 'rv7ck_pe_reads_02_sample01' # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" - SEQUENCING_INSTRUMENT: 'Illumina MiSeq' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: 
["Illumina HiSeq 1500", "GridION"] - LIBRARY_SOURCE: 'METAGENOMIC' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" - LIBRARY_SELECTION: 'RANDOM' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" - LIBRARY_STRATEGY: 'WGS' # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" - FASTQ_FILE: "data/reads/fwd2.fastq" # Path to the fastq file >>EXAMPLE: "/mnt/data/reads.fastq.gz" - RELATED_SAMPLE_TITLE: 'rv7ck_example02_sample01' # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest +# USAGE: submg submit --config 01_samples_reads_assembly_bins_mags.yaml --staging_dir --logging_dir --submit_samples --submit_reads --submit_assembly --submit_bins --submit_mags + +STUDY: "PRJEB71644" # The accession of your study (which has to already exist in ENA) >>EXAMPLE: "PRJEB71644" +METAGENOME_SCIENTIFIC_NAME: "biogas fermenter metagenome" # Taxonomic identifier of the metagenome. Check the ENA metagenome taxonomy tree to find a taxonomy ID and species name fitting your sample >>EXAMPLE: "biogas fermenter metagenome" +METAGENOME_TAXID: "718289" # Taxonomic identifier of the assembly. Must match SPECIES_SCIENTIFIC_NAME >>EXAMPLE: "718289" +SEQUENCING_PLATFORMS: ["ILLUMINA"] # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"] +PROJECT_NAME: "7yyIc_AgRFex 2 Survey" # Name of the project within which the sequencing was organized >>EXAMPLE: "AgRFex 2 Biogas Survey" +NEW_SAMPLES: # These samples will be created in ENA according to the data entered below. Your assembly MUST BE BASED ON ALL OF THESE. 
+- TITLE: "7yyIc_hydrolysis digester sample" # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample" + collection date: "2022-07-12" # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" + geographic location (country and/or sea): "Germany" # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" + ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment + geographic location (latitude): 52.51 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85" + geographic location (longitude): 8.77 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65" + broad-scale environmental context: "tropical biome" # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome" + local environmental context: "tropical marine upwelling biome" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome" + environmental medium: "grass silage|animal waste material|anoxic water" # Pipe separated! For more information consult an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water" +- TITLE: "7yyIc_main digester sample" # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample" + collection date: "2022-07-12" # Any ISO compliant time. Can be truncated from the righ (e.g. 
'2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" + geographic location (country and/or sea): "Germany" # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" + ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment + geographic location (latitude): 52.51 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85" + geographic location (longitude): 8.77 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65" + broad-scale environmental context: "tropical biome" # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome" + local environmental context: "tropical marine upwelling biome" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome" + environmental medium: "grass silage|animal waste material|anoxic water" # Pipe separated! For more information consult an ENA samplesheet template (e.g. 
https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water" +PAIRED_END_READS: +- NAME: "7yyIc_dh_pe" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" + SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] + LIBRARY_SOURCE: "METAGENOMIC" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" + LIBRARY_SELECTION: "RANDOM" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" + LIBRARY_STRATEGY: "WGS" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" + INSERT_SIZE: "300" # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300" + FASTQ1_FILE: "data/reads/fwd1.fastq" # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz" + FASTQ2_FILE: "data/reads/rev1.fastq" # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz" + RELATED_SAMPLE_TITLE: "7yyIc_hydrolysis digester sample" # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" + ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest +- NAME: "7yyIc_dm_pe" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" + SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] + LIBRARY_SOURCE: "GENOMIC" # One of 
https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" + LIBRARY_SELECTION: "RANDOM" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" + LIBRARY_STRATEGY: "WGS" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" + INSERT_SIZE: "300" # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300" + FASTQ1_FILE: "data/reads/fwd2.fastq" # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz" + FASTQ2_FILE: "data/reads/rev2.fastq" # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz" + RELATED_SAMPLE_TITLE: "7yyIc_main digester sample" # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" + ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest ASSEMBLY: - ASSEMBLY_NAME: 'rv7ck_e02_asm' # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "Northern Germany biogas digester metagenome" - ASSEMBLY_SOFTWARE: 'metaSPAdes' # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" - ISOLATION_SOURCE: 'ant fungus garden' # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" - FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" - collection date: '2012-02' # Any ISO compliant time. Can be truncated from the right (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" - geographic location (country and/or sea): 'missing: data agreement established pre-2023' # See ENA checklists (e.g. 
https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" - ADDITIONAL_SAMPLESHEET_FIELDS: - geographic location (latitude): 52.51 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85" - geographic location (longitude): 8.77 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65" - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest + ASSEMBLY_NAME: "7yyIc_e01_coasm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg" + ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" + ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" + FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" + collection date: "2022-07-12" # Any ISO compliant time. Can be truncated from the right (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" + geographic location (country and/or sea): "Germany" # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" + ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment + geographic location (latitude): 52.51 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85" + geographic location (longitude): 8.77 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. 
https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65" + broad-scale environmental context: "tropical biome" # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome" + local environmental context: "tropical marine upwelling biome" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome" + environmental medium: "grass silage|animal waste material|anoxic water" # Pipe separated! For more information consult an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water" + ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest BINS: - BINS_DIRECTORY: "data/3bins" # Directory containing the fasta files of all bins/MAGs >>EXAMPLE: "/mnt/data/bins" - COMPLETENESS_SOFTWARE: "CheckM" - NCBI_TAXONOMY_FILES: ["taxotest/archaea_taxonomy.tsv", "taxotest/bacteria_taxonomy.tsv"] # Software used to calculate completeness >>EXAMPLE: "CheckM" - QUALITY_FILE: "taxotest/checkm_quality_3bins.tsv" # tsv file containing quality values of each bin. Header must include 'Bin_id', 'Completeness', 'Contamination'. A CheckM output table will work here. >>EXAMPLE: "/mnt/data/checkm_quality.tsv" - BINNING_SOFTWARE: 'VAMB' - MIN_COMPLETENESS: 1 # Bins with smaller completeness value will be discarded (values in percent, 0-100). Remove this row to ignore bin completeness. >>EXAMPLE: "90" - MAX_CONTAMINATION: 100 # Bins with larger contamination value will be discarded (values in percent, 0-100). Remove this row to ignore bin contamination (>100% contamination bins will still be discarded). 
>>EXAMPLE: "5" - ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest -BAM_FILES: - - "data/mapping/1.unsorted.bam" \ No newline at end of file + BINS_DIRECTORY: "data/3bins" # Directory containing the fasta files of all bins/MAGs >>EXAMPLE: "/mnt/data/bins" + COMPLETENESS_SOFTWARE: "CheckM" # Software used to calculate completeness >>EXAMPLE: "CheckM" + QUALITY_FILE: "data/checkm_quality_3bins.tsv" # tsv file containing quality values of each bin. Header must include 'Bin_id', 'Completeness', 'Contamination'. A CheckM output table will work here. >>EXAMPLE: "/mnt/data/checkm_quality.tsv" + NCBI_TAXONOMY_FILES: # A list of files with NCBI taxonomy information about the bins. Consult the README to see how they should be structured. >>EXAMPLE: ["/mnt/data/bacteria_tax.tsv","/mnt/data/archaea_tax.tsv"] + - "data/taxonomy/archaea_taxonomy.tsv" + - "data/taxonomy/bacteria_taxonomy.tsv" + MANUAL_TAXONOMY_FILE: "data/taxonomy/manual_taxonomy.tsv" # Scientific names and taxids for bins. See example file for the structure. Columns must be 'Bin_id', 'Tax_id' and 'Scientific_name'. Consult the README for more information. >>EXAMPLE: "/mnt/data/manual_tax.tsv" + BINNING_SOFTWARE: "metabat2" # The program that was used for binning. >>EXAMPLE: "metabat2" + ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment + binning parameters: "default" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000047) >>EXAMPLE: "default" + taxonomic identity marker: "multi marker approach" # For more information consult an appropriate ENA samplesheet template (e.g. 
https://www.ebi.ac.uk/ena/browser/view/ERC000047) >>EXAMPLE: "multi marker approach" + ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest +MAGS: + MAG_METADATA_FILE: "data/mag_metadata/mag_metadata.tsv" # A .tsv specifying 'Bin_id', 'Sample_id', 'Quality_category', 'Flatfile_path' and 'Unlocalised_path' for all MAGs. See README for more details. >>EXAMPLE: "/mnt/data/mag_data.tsv" + ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment + ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest +BAM_FILES: # The reads from your experiment mapped back to the assembly + - "data/mapping/1.sorted.bam" + - "data/mapping/2.sorted.bam" diff --git a/submg/main.py b/submg/main.py index 6c6ae8b..d1b8403 100644 --- a/submg/main.py +++ b/submg/main.py @@ -70,6 +70,12 @@ def init_argparse(): default=1, help="Make submissions to the ENA development " "test server. [default 1/true]") + parser_submit.add_argument("-i", + "--minitest", + action="store_true", + help="Run a minimal test submission using just " + "a fraction of your dataset. Intended for quick " + "troubleshooting. 
[default false]") parser_submit.add_argument("-t", "--threads", type=int, @@ -236,7 +242,11 @@ def submit(args): if args.timestamps or (args.timestamps is None and args.development_service): utility.set_up_timestamps(vars(args)) - try: + if args.minitest and not args.development_service: + loggingC.message("ERROR: The --minitest mode cannot be used for a submission to the ENA production server.", + threshold=-1) + + try: sver = staticConfig.submg_version wver = staticConfig.webin_cli_version loggingC.message(f">Running submg {sver} with webin-cli {wver}", 0) @@ -281,14 +291,22 @@ def submit(args): loggingC.message(err, threshold=-1) exit(1) # Query the taxonomy of bins - bin_taxonomy = taxQuery.get_bin_taxonomy(filtered_bins, - config) + bin_taxonomy = taxQuery.get_bin_taxonomy(filtered_bins, config) + if args.minitest: + msg = f">Minitest: Discarding every bin except {filtered_bins[0]}" + loggingC.message(msg, threshold=0) + filtered_bins = filtered_bins[0:1] # Construct depth files if there are .bam files in the config if 'BAM_FILES' in config.keys(): bam_files = utility.from_config(config, 'BAM_FILES') + if not isinstance(bam_files, list): bam_files = [bam_files] + if args.minitest: + msg = f">Minitest: Ignoring bam files except for {bam_files[0]}" + loggingC.message(msg, threshold=0) + bam_files = bam_files[0:1] depth_files = utility.construct_depth_files(args.staging_dir, args.threads, bam_files) @@ -325,7 +343,8 @@ def submit(args): sample_accession_data, prepdir(args.staging_dir, 'reads'), prepdir(args.logging_dir, 'reads'), - test=args.development_service) + test=args.development_service, + minitest=args.minitest) else: run_accessions = utility.from_config(config, 'ASSEMBLY', 'RUN_ACCESSIONS') if not isinstance(run_accessions, list): @@ -431,7 +450,7 @@ def submit(args): os.remove(depth_file) except Exception: - err = "\n\nTERMINATING BECAUSE AN UNHANDLED EXCEPTION OCCURED:\n" + err = "\n\nTERMINATING BECAUSE AN UNEXPECTED ERROR OCCURED:\n" 
loggingC.message(err, threshold=-1) exc_info = traceback.format_exc() loggingC.message(exc_info, threshold=-1) diff --git a/submg/readSubmission.py b/submg/readSubmission.py index ca8ad17..c3958d2 100644 --- a/submg/readSubmission.py +++ b/submg/readSubmission.py @@ -143,7 +143,8 @@ def submit_reads(config, sample_accession_data, staging_dir, logging_dir, - test=True): + test=True, + minitest=False): """ Submits the specified reads to ENA. @@ -180,6 +181,10 @@ def submit_reads(config, if not name in read_manifests: read_manifests[name] = manifest counter = i + 1 + if minitest: + msg = ">Minitest: Only submitting the first paired-end read set." + loggingC.message(msg, threshold=0) + break if 'SINGLE_READS' in config.keys(): loggingC.message(">Staging single-end reads for submission. This might take a while.", threshold=0) @@ -197,7 +202,11 @@ def submit_reads(config, read_set_logging_dir) if not name in read_manifests: - read_manifests[name] = manifest + read_manifests[name] = manifest + if minitest: + msg = ">Minitest: Only submitting the first single-end read set." + loggingC.message(msg, threshold=0) + break # Upload the reads loggingC.message(f">Using ENA Webin-CLI to submit reads.", threshold=0) From a4b7301c45556dde8779393b025cc7a724e49c31 Mon Sep 17 00:00:00 2001 From: ttubb Date: Mon, 13 May 2024 13:31:34 +0000 Subject: [PATCH 11/12] bugfix --- examples/localtest.yaml | 61 +++++++++++------------------------------ submg/magSubmission.py | 9 +++--- 2 files changed, 21 insertions(+), 49 deletions(-) diff --git a/examples/localtest.yaml b/examples/localtest.yaml index 4d61715..b98bfcc 100644 --- a/examples/localtest.yaml +++ b/examples/localtest.yaml @@ -1,35 +1,17 @@ -# ABOUT: This a config for submitting samples, paired-end reads, an assembly, bins and MAGs -# ABOUT: Coverage is derived from 2 bam files. +# ABOUT: This is a config for submitting 2 set of paired end reads, an assembly and bins +# ABOUT: Coverage is known. 
# ABOUT: Taxonomy is derived from `gtdb_to_ncbi_majority_vote.py` output and a MANUAL_TAXONOMY_FILE # USAGE: navigate to the directory -# USAGE: submg submit --config 01_samples_reads_assembly_bins_mags.yaml --staging_dir --logging_dir --submit_samples --submit_reads --submit_assembly --submit_bins --submit_mags +# USAGE: submg submit --config 05_reads_assembly_bins.yaml --staging_dir --logging_dir --submit_reads --submit_assembly --submit_bins + STUDY: "PRJEB71644" # The accession of your study (which has to already exist in ENA) >>EXAMPLE: "PRJEB71644" METAGENOME_SCIENTIFIC_NAME: "biogas fermenter metagenome" # Taxonomic identifier of the metagenome. Check the ENA metagenome taxonomy tree to find a taxonomy ID and species name fitting your sample >>EXAMPLE: "biogas fermenter metagenome" METAGENOME_TAXID: "718289" # Taxonomic identifier of the assembly. Must match SPECIES_SCIENTIFIC_NAME >>EXAMPLE: "718289" SEQUENCING_PLATFORMS: ["ILLUMINA"] # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#platform >>EXAMPLE: ["ILLUMINA","OXFORD_NANOPORE"] -PROJECT_NAME: "7yyIc_AgRFex 2 Survey" # Name of the project within which the sequencing was organized >>EXAMPLE: "AgRFex 2 Biogas Survey" -NEW_SAMPLES: # These samples will be created in ENA according to the data entered below. Your assembly MUST BE BASED ON ALL OF THESE. -- TITLE: "7yyIc_hydrolysis digester sample" # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample" - collection date: "2022-07-12" # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" - geographic location (country and/or sea): "Germany" # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" - ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment - geographic location (latitude): 52.51 # Use WGS84. 
For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85" - geographic location (longitude): 8.77 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65" - broad-scale environmental context: "tropical biome" # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome" - local environmental context: "tropical marine upwelling biome" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome" - environmental medium: "grass silage|animal waste material|anoxic water" # Pipe separated! For more information consult an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water" -- TITLE: "7yyIc_main digester sample" # A unique title for your sample >>EXAMPLE: "Bioreactor_2_sample" - collection date: "2022-07-12" # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" - geographic location (country and/or sea): "Germany" # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" - ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment - geographic location (latitude): 52.51 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85" - geographic location (longitude): 8.77 # Use WGS84. 
For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65" - broad-scale environmental context: "tropical biome" # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome" - local environmental context: "tropical marine upwelling biome" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome" - environmental medium: "grass silage|animal waste material|anoxic water" # Pipe separated! For more information consult an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water" +SAMPLE_ACCESSIONS: ['SAMEA113417017', 'SAMEA113417018'] # These samples exist in ENA. Your assembly is based on them. 
>>EXAMPLE: ["ERS15898933","ERS15898932"] PAIRED_END_READS: -- NAME: "7yyIc_dh_pe" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" +- NAME: "AKQ4G_ex05_rp1" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] LIBRARY_SOURCE: "METAGENOMIC" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" LIBRARY_SELECTION: "RANDOM" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" @@ -37,31 +19,27 @@ PAIRED_END_READS: INSERT_SIZE: "300" # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300" FASTQ1_FILE: "data/reads/fwd1.fastq" # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz" FASTQ2_FILE: "data/reads/rev1.fastq" # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz" - RELATED_SAMPLE_TITLE: "7yyIc_hydrolysis digester sample" # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" + RELATED_SAMPLE_ACCESSION: 'SAMEA113417017' # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest -- NAME: "7yyIc_dm_pe" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" +- NAME: "AKQ4G_ex05_rp2" # Choose a unique name >>EXAMPLE: "Bioreactor_2_replicate_1" SEQUENCING_INSTRUMENT: "Illumina HiSeq 1500" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#instrument >>EXAMPLE: ["Illumina HiSeq 1500", "GridION"] - LIBRARY_SOURCE: "GENOMIC" # One of 
https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" + LIBRARY_SOURCE: "METAGENOMIC" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "GENOMIC" LIBRARY_SELECTION: "RANDOM" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-source >>EXAMPLE: "RANDOM" LIBRARY_STRATEGY: "WGS" # One of https://ena-docs.readthedocs.io/en/latest/submit/reads/webin-cli.html#permitted-values-for-library-strategy >>EXAMPLE: "WGS" INSERT_SIZE: "300" # Insert size of the paired-end reads (https://www.ebi.ac.uk/fg/annotare/help/seq_lib_spec.html) >>EXAMPLE: "300" FASTQ1_FILE: "data/reads/fwd2.fastq" # Path to the fastq file with forward reads >>EXAMPLE: "/mnt/data/reads_R1.fastq.gz" FASTQ2_FILE: "data/reads/rev2.fastq" # Path to the fastq file with reverse reads >>EXAMPLE: "/mnt/data/reads_R2.fastq.gz" - RELATED_SAMPLE_TITLE: "7yyIc_main digester sample" # The title of the sample that these reads originate from >>EXAMPLE: "Bioreactor_2_sample" + RELATED_SAMPLE_ACCESSION: 'SAMEA113417018' # The accession of the sample that these reads originate from >>EXAMPLE: "ERS15898933" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest ASSEMBLY: - ASSEMBLY_NAME: "7yyIc_e01_coasm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. >>EXAMPLE: "SGMA project mg" + ASSEMBLY_NAME: "AKQ4G_e05_coasm" # Choose a name, even if your assembly has been uploaded already. Will only be used for naming assembly and bins/MAGs. 
>>EXAMPLE: "SGMA project mg" ASSEMBLY_SOFTWARE: "MEGAHIT" # Software used to generate the assembly >>EXAMPLE: "MEGAHIT" ISOLATION_SOURCE: "biogas plant anaerobic digester" # Describe where your sample was taken from >>EXAMPLE: "biogas plant anaerobic digester" FASTA_FILE: "data/assembly.fasta" # Path to the fasta file >>EXAMPLE: "/mnt/data/assembly.fasta.gz" - collection date: "2022-07-12" # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" + collection date: "2024-01-01" # Any ISO compliant time. Can be truncated from the righ (e.g. '2023-12-27T16:07' or '2023-12') >>EXAMPLE: "2023-03" geographic location (country and/or sea): "Germany" # See ENA checklists (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000011) for valid values >>EXAMPLE: "Germany" + COVERAGE_VALUE: 128.27 # Read coverage of the assembly. ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment - geographic location (latitude): 52.51 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "41.85" - geographic location (longitude): 8.77 # Use WGS84. For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "-87.65" - broad-scale environmental context: "tropical biome" # For more information consult appropriate an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical biome" - local environmental context: "tropical marine upwelling biome" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) >>EXAMPLE: "tropical marine upwelling biome" - environmental medium: "grass silage|animal waste material|anoxic water" # Pipe separated! 
For more information consult an ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000050) and https://github.com/EnvironmentOntology/envo/wiki/Using-ENVO-with-MIxS >>EXAMPLE: "grass silage|animal waste material|anoxic water" ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest BINS: BINS_DIRECTORY: "data/3bins" # Directory containing the fasta files of all bins/MAGs >>EXAMPLE: "/mnt/data/bins" @@ -69,17 +47,10 @@ BINS: QUALITY_FILE: "data/checkm_quality_3bins.tsv" # tsv file containing quality values of each bin. Header must include 'Bin_id', 'Completeness', 'Contamination'. A CheckM output table will work here. >>EXAMPLE: "/mnt/data/checkm_quality.tsv" NCBI_TAXONOMY_FILES: # A list of files with NCBI taxonomy information about the bins. Consult the README to see how they should be structured. >>EXAMPLE: ["/mnt/data/bacteria_tax.tsv","/mnt/data/archaea_tax.tsv"] - "data/taxonomy/archaea_taxonomy.tsv" - - "data/taxonomy/bacteria_taxonomy.tsv" + - "data/taxonomy/bacteria_taxonomy.tsv" MANUAL_TAXONOMY_FILE: "data/taxonomy/manual_taxonomy.tsv" # Scientific names and taxids for bins. See example file for the structure. Columns must be 'Bin_id', 'Tax_id' and 'Scientific_name'. Consult the README for more information. >>EXAMPLE: "/mnt/data/manual_tax.tsv" BINNING_SOFTWARE: "metabat2" # The program that was used for binning. >>EXAMPLE: "metabat2" - ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment - binning parameters: "default" # For more information consult an appropriate ENA samplesheet template (e.g. https://www.ebi.ac.uk/ena/browser/view/ERC000047) >>EXAMPLE: "default" - taxonomic identity marker: "multi marker approach" # For more information consult an appropriate ENA samplesheet template (e.g. 
https://www.ebi.ac.uk/ena/browser/view/ERC000047) >>EXAMPLE: "multi marker approach" - ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest -MAGS: - MAG_METADATA_FILE: "data/mag_metadata/mag_metadata.tsv" # A .tsv specifying 'Bin_id', 'Sample_id', 'Quality_category', 'Flatfile_path' and 'Unlocalised_path' for all MAGs. See README for more details. >>EXAMPLE: "/mnt/data/mag_data.tsv" ADDITIONAL_SAMPLESHEET_FIELDS: # Please add more fields from the ENA samplesheet that most closely matches your experiment ADDITIONAL_MANIFEST_FIELDS: # You can add additional fields that will be written to the manifest -BAM_FILES: # The reads from your experiment mapped back to the assembly - - "data/mapping/1.sorted.bam" - - "data/mapping/2.sorted.bam" + COVERAGE_FILE: "data/bin_coverage.tsv" # .tsv file containing the coverage values of each bin. Columns must be 'Bin_id' and 'Coverage'. + \ No newline at end of file diff --git a/submg/magSubmission.py b/submg/magSubmission.py index 0046c9e..74fb9bf 100644 --- a/submg/magSubmission.py +++ b/submg/magSubmission.py @@ -399,11 +399,12 @@ def submit_mags(config: dict, bin_files = binSubmission.get_bins_in_dir(bins_directory) if not depth_files is None: bin_coverages = binSubmission.bin_coverage_from_depth(depth_files, - bin_files, - threads=threads) + bin_files, + threads=threads) elif not bin_coverage_file is None: - bin_coverages = binSubmission.bin_coverage_from_tsv(bin_coverage_file, - bin_files) + bin_coverages = binSubmission.bin_coverage_from_tsv(mag_metadata.keys(), + bin_coverage_file, + bin_files) # Make a samplesheet for all MAGs loggingC.message(">Making MAG samplesheet", threshold=1) From 9acb471fda2eba1cd50e68c7722614444b08eb69 Mon Sep 17 00:00:00 2001 From: ttubb Date: Tue, 14 May 2024 08:55:24 +0000 Subject: [PATCH 12/12] no build on pull request --- .github/workflows/container.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/container.yml 
b/.github/workflows/container.yml index 4e86558..6d97b21 100644 --- a/.github/workflows/container.yml +++ b/.github/workflows/container.yml @@ -3,8 +3,6 @@ name: Build and Push Docker Image on: push: branches: [ "main" ] - pull_request: - branches: [ "main" ] jobs: build-and-push: