diff --git a/templates b/templates
deleted file mode 160000
index 1f5ae7e..0000000
--- a/templates
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 1f5ae7efac621fd8155de110f4d6a819f3faa790
diff --git a/templates/__init__.py b/templates/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/templates/assemblerflow_utils/__init__.py b/templates/assemblerflow_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/templates/assemblerflow_utils/__pycache__/__init__.cpython-35.pyc b/templates/assemblerflow_utils/__pycache__/__init__.cpython-35.pyc
new file mode 100644
index 0000000..c299e17
Binary files /dev/null and b/templates/assemblerflow_utils/__pycache__/__init__.cpython-35.pyc differ
diff --git a/templates/assemblerflow_utils/__pycache__/assemblerflow_base.cpython-35.pyc b/templates/assemblerflow_utils/__pycache__/assemblerflow_base.cpython-35.pyc
new file mode 100644
index 0000000..8af7e00
Binary files /dev/null and b/templates/assemblerflow_utils/__pycache__/assemblerflow_base.cpython-35.pyc differ
diff --git a/templates/assemblerflow_utils/assemblerflow_base.py b/templates/assemblerflow_utils/assemblerflow_base.py
new file mode 100644
index 0000000..0d14721
--- /dev/null
+++ b/templates/assemblerflow_utils/assemblerflow_base.py
@@ -0,0 +1,125 @@
+"""
+Common utilities for assemblerflow template scripts: logger setup, Nextflow
+status reporting and the :class:`MainWrapper` decorator.
+"""
+
+import os
+import sys
+import json
+import logging
+import traceback
+
+from time import gmtime, strftime
+
+
+def get_logger(filepath, level=logging.DEBUG):
+    # create logger named after the template file
+    logger = logging.getLogger(os.path.basename(filepath))
+    logger.setLevel(level)
+    # create console handler and set level to debug
+    ch = logging.StreamHandler()
+    ch.setLevel(level)
+    # create formatter
+    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+    # add formatter to ch
+    ch.setFormatter(formatter)
+    # add ch to logger
+    logger.addHandler(ch)
+
+    return logger
+
+
+def log_error():
+    """Nextflow specific function that logs an error upon unexpected failure
+    """
+
+    with open(".status", "w") as status_fh:
+        status_fh.write("error")
+
+
+class MainWrapper:
+
+    def __init__(self, f):
+
+        self.f = f
+        self.context = self.f.__globals__
+        self.logger = self.context.get("logger", None)
+
+    def __call__(self, *args, **kwargs):
+
+        if self.logger:
+            self.logger.debug("Starting template at {}".format(
+                strftime("%Y-%m-%d %H:%M:%S", gmtime())))
+            self.logger.debug("Working directory: {}".format(os.getcwd()))
+
+        try:
+            self.build_versions()
+            self.f(*args, **kwargs)
+        except SystemExit as e:
+            sys.exit(e)
+        except Exception:
+            if self.logger:
+                self.logger.error("Module exited unexpectedly with error:"
+                                  "\n{}".format(traceback.format_exc()))
+            log_error()
+
+        if self.logger:
+            self.logger.debug("Finished template at {}".format(
+                strftime("%Y-%m-%d %H:%M:%S", gmtime())))
+
+    def build_versions(self):
+        """Writes a versions JSON for a template file
+
+        This method creates the JSON file ``.versions`` based on the
+        metadata and specific functions that are present in a given template
+        script.
+
+        It starts by fetching the template metadata, which can be specified
+        via the ``__version__``, ``__template__`` and ``__build__``
+        attributes. If all of these attributes exist, the template version
+        is added to a JSON/dict array (note that the absence of any one of
+        them will prevent the template version from being written).
+
+        Then, it searches the template scope for functions whose name starts
+        with the substring ``__get_version`` (for example
+        ``def __get_version_fastqc()``). These functions should gather the
+        version of an arbitrary program and return a JSON/dict object with
+        the following information::
+
+            {
+                "program": <program_name>,
+                "version": <version>,
+                "build": <build>
+            }
+
+        This JSON/dict object is then written to the ``.versions`` file.
+        """
+
+        version_storage = []
+
+        template_version = self.context.get("__version__", None)
+        template_program = self.context.get("__template__", None)
+        template_build = self.context.get("__build__", None)
+
+        if template_version and template_program and template_build:
+            if self.logger:
+                self.logger.debug("Adding template version: {}; {}; "
+                                  "{}".format(template_program,
+                                              template_version,
+                                              template_build))
+            version_storage.append({
+                "program": template_program,
+                "version": template_version,
+                "build": template_build
+            })
+
+        for var, obj in self.context.items():
+            if var.startswith("__get_version"):
+                ver = obj()
+                version_storage.append(ver)
+                if self.logger:
+                    self.logger.debug("Found additional software version: "
+                                      "{}".format(ver))
+
+        with open(".versions", "w") as fh:
+            fh.write(json.dumps(version_storage, separators=(",", ":")))
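For context, a minimal sketch of the contract that ``build_versions`` expects from a template script, assuming the package layout introduced by this patch is importable; the FastQC names and command are illustrative only, not part of the patch:

    # Hypothetical template script; the fastqc names are illustrative.
    import subprocess

    from templates.assemblerflow_utils.assemblerflow_base import get_logger, \
        MainWrapper

    __version__ = "1.0.0"
    __build__ = "01012018"
    __template__ = "example-nf"

    logger = get_logger(__file__)


    def __get_version_fastqc():
        # gathers the version of an arbitrary program and returns the
        # JSON/dict object that build_versions collects
        version = subprocess.check_output(["fastqc", "--version"]) \
            .decode("utf8").strip().split()[-1]
        return {"program": "FastQC", "version": version, "build": None}


    @MainWrapper
    def main():
        logger.info("template body runs here")


    if __name__ == "__main__":
        main()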
diff --git a/templates/mapping2json.py b/templates/mapping2json.py
new file mode 100644
index 0000000..968219f
--- /dev/null
+++ b/templates/mapping2json.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+
+
+"""
+Purpose
+-------
+
+This module is intended to generate a json output for mapping results that
+can be imported in pATLAS.
+
+Expected input
+--------------
+
+The following variables are expected whether using NextFlow or the
+:py:func:`main` executor.
+
+- ``depth_file`` : String with the name of the samtools depth output file.
+    - e.g.: ``'samtoolsDepthOutput_sampleA.txt'``
+- ``json_dict`` : the file that contains the dictionary with keys and
+  values for accessions and their respective lengths.
+    - e.g.: ``'reads_sample_result_length.json'``
+- ``cutoff`` : The cutoff used to trim the unwanted matches for the
+  minimum coverage results from mapping. This value may range between 0
+  and 1.
+    - e.g.: ``0.6``
+
+
+Code documentation
+------------------
+
+"""
+
+__version__ = "1.0.1"
+__build__ = "20022018"
+__template__ = "mapping2json-nf"
+
+import os
+import json
+
+from templates.assemblerflow_utils.assemblerflow_base import get_logger, \
+    MainWrapper
+
+logger = get_logger(__file__)
+
+if __file__.endswith(".command.sh"):
+    DEPTH_TXT = '$depthFile'
+    JSON_LENGTH = '$lengthJson'
+    CUTOFF = '$params.cov_cutoff'
+    logger.debug("Running {} with parameters:".format(
+        os.path.basename(__file__)))
+    logger.debug("DEPTH_TXT: {}".format(DEPTH_TXT))
+    logger.debug("JSON_LENGTH: {}".format(JSON_LENGTH))
+    logger.debug("CUTOFF: {}".format(CUTOFF))
+
+
+def depthfilereader(depth_file, plasmid_length, cutoff):
+    '''
+    Parses a samtools depth file and builds the dictionaries required to
+    write the json output of this script, which may be imported by pATLAS.
+
+    Parameters
+    ----------
+    depth_file: file
+        the open file handle of the depth file for each sample
+    plasmid_length: dict
+        a dictionary that stores the length of all plasmids in the fasta
+        given as input
+    cutoff: str
+        the cutoff used to trim the unwanted matches for the minimum
+        coverage results from mapping. It is converted into a float within
+        this function in order to be compared with perc_value_per_ref.
+
+    Returns
+    -------
+    percentage_basescovered: dict
+        stores, for each reference/accession (plasmid), the fraction of
+        its sequence that is covered by the mapping
+    '''
+    depth_dic_coverage = {}
+    for line in depth_file:
+        tab_split = line.split()  # split by any white space
+        # store only the accession for the reference
+        reference = "_".join(tab_split[0].strip().split("_")[0:3])
+        position = tab_split[1]
+        numreadsalign = float(tab_split[2].rstrip())
+        if reference not in depth_dic_coverage:
+            depth_dic_coverage[reference] = {}
+        depth_dic_coverage[reference][position] = numreadsalign
+
+    percentage_basescovered = {}
+    for ref in depth_dic_coverage:
+        # calculates the percentage value per each reference
+        perc_value_per_ref = float(len(depth_dic_coverage[ref])) / \
+            float(plasmid_length[ref])
+        # checks if the percentage value is higher than or equal to the
+        # defined cutoff
+        if perc_value_per_ref >= float(cutoff):
+            percentage_basescovered[ref] = perc_value_per_ref
+
+    return percentage_basescovered
+
+
+@MainWrapper
+def main(depth_file, json_dict, cutoff):
+    '''
+    Function that handles the inputs required to parse depth files from
+    bowtie and dumps a dict to a json file that can be imported into
+    pATLAS.
+
+    Parameters
+    ----------
+    depth_file: str
+        the path to the depth file for each sample
+    json_dict: str
+        the file that contains the dictionary with keys and values for
+        accessions and their respective lengths
+    cutoff: str
+        the cutoff used to trim the unwanted matches for the minimum
+        coverage results from mapping. This value may range between 0
+        and 1.
+
+    '''
+
+    # check for an appropriate value for the coverage cutoff
+    try:
+        cutoff_val = float(cutoff)
+    except ValueError:
+        logger.error("Cutoff value should be a string such as: '0.6'. "
+                     "The provided value: {}. Make sure to provide an "
+                     "appropriate value for --cov_cutoff".format(cutoff))
+        # re-raise so that MainWrapper logs the error status
+        raise
+
+    # loads dict from file, this file is provided in the docker image
+    plasmid_length = json.load(open(json_dict))
+
+    # read depth file
+    depth_file_reader = open(depth_file)
+
+    # first reads the depth file and generates dictionaries to handle the
+    # input in a simpler format
+    logger.info("Reading depth file and creating dictionary to dump")
+    percentage_basescovered = depthfilereader(depth_file_reader,
+                                              plasmid_length, cutoff_val)
+
+    # then dump to file
+    output_json = open("{}_mapping.json".format(depth_file), "w")
+    logger.info("Dumping to {}_mapping.json".format(depth_file))
+    output_json.write(json.dumps(percentage_basescovered))
+    output_json.close()
+
+
+if __name__ == "__main__":

+    main(DEPTH_TXT, JSON_LENGTH, CUTOFF)
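A toy, self-contained rendering of what ``depthfilereader`` computes; the accession names, depths and the 0.6 cutoff below are made up:

    from io import StringIO

    # a samtools depth line is "<reference>\t<position>\t<coverage>"
    toy_depth = StringIO(
        "ACC_NC_000001_1\t1\t5\n"
        "ACC_NC_000001_1\t2\t3\n"
        "ACC_NC_000001_1\t3\t4\n"
    )
    toy_lengths = {"ACC_NC_000001": 4}

    covered = {}
    for line in toy_depth:
        fields = line.split()
        ref = "_".join(fields[0].split("_")[0:3])
        covered.setdefault(ref, set()).add(fields[1])

    # 3 of the 4 bases are covered: 3/4 = 0.75 >= 0.6, so it is reported
    result = {ref: len(positions) / toy_lengths[ref]
              for ref, positions in covered.items()
              if len(positions) / toy_lengths[ref] >= 0.6}
    print(result)  # {'ACC_NC_000001': 0.75}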
diff --git a/templates/mashdist2json.py b/templates/mashdist2json.py
new file mode 100644
index 0000000..e46d416
--- /dev/null
+++ b/templates/mashdist2json.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+
+
+"""
+Purpose
+-------
+
+This module is intended to generate a json output for mash dist results
+that can be imported in pATLAS.
+
+Expected input
+--------------
+
+The following variables are expected whether using NextFlow or the
+:py:func:`main` executor.
+
+- ``mash_output`` : String with the name of the mash dist output file.
+    - e.g.: ``'fastaFileA_mashdist.txt'``
+
+
+Code documentation
+------------------
+
+"""
+
+__version__ = "1.2.0"
+__build__ = "17052018"
+__template__ = "mashdist2json-nf"
+
+import os
+import json
+
+from templates.assemblerflow_utils.assemblerflow_base import get_logger, \
+    MainWrapper
+
+logger = get_logger(__file__)
+
+if __file__.endswith(".command.sh"):
+    MASH_TXT = '$mashtxt'
+    HASH_CUTOFF = '$params.shared_hashes'
+    logger.debug("Running {} with parameters:".format(
+        os.path.basename(__file__)))
+    logger.debug("MASH_TXT: {}".format(MASH_TXT))
+    logger.debug("HASH_CUTOFF: {}".format(HASH_CUTOFF))
+
+
+def send_to_output(master_dict, mash_output):
+    """Send dictionary to output json file
+
+    This function sends the master_dict dictionary to a json file if
+    master_dict is populated with entries, otherwise it won't create the
+    file.
+
+    Parameters
+    ----------
+    master_dict: dict
+        dictionary that stores all entries for a specific query sequence
+        in the multi-fasta given to mash dist as input against the pATLAS
+        database
+    mash_output: str
+        the name/path of the input file to the main function, i.e., the
+        name/path of the mash dist output txt file.
+    """
+    # create a new file only if master_dict is populated
+    if master_dict:
+        out_file = open("{}.json".format(mash_output.split(".")[0]), "w")
+        out_file.write(json.dumps(master_dict))
+        out_file.close()
+
+
+@MainWrapper
+def main(mash_output, hash_cutoff):
+    '''
+    Main function that allows to dump a mash dist txt file to a json file
+
+    Parameters
+    ----------
+    mash_output: str
+        A string with the input file.
+    hash_cutoff: str
+        the minimum percentage of shared hashes between the query and the
+        reference for a match to be reported to the json file.
+
+    '''
+    input_f = open(mash_output, "r")
+
+    master_dict = {}
+
+    for line in input_f:
+
+        tab_split = line.split("\t")
+        current_seq = tab_split[1].strip()
+        ref_accession = "_".join(tab_split[0].strip().split("_")[0:3])
+        mash_dist = tab_split[2].strip()
+        hashes_list = tab_split[-1].strip().split("/")
+
+        # creates a percentage of the shared hashes between the sample and
+        # the reference
+        perc_hashes = float(hashes_list[0]) / float(hashes_list[1])
+
+        # assures that only the matches with a given shared hashes
+        # percentage are reported to the json file
+        if perc_hashes > float(hash_cutoff):
+
+            master_dict[ref_accession] = [1 - float(mash_dist), perc_hashes,
+                                          current_seq]
+
+    # writes the output json file, if any match passed the cutoff
+    send_to_output(master_dict, mash_output)
+
+
+if __name__ == "__main__":

+    main(MASH_TXT, HASH_CUTOFF)
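To make the parsing above concrete, one made-up mash dist row and the two metrics derived from it:

    # mash dist columns: reference, query, distance, p-value, shared-hashes
    line = "ACC_NC_000002_1\tsampleA\t0.05\t0.0\t700/1000"
    fields = line.split("\t")
    shared, total = fields[-1].strip().split("/")

    perc_hashes = float(shared) / float(total)  # 0.7
    identity = 1 - float(fields[2])             # 0.95
    # with a shared_hashes cutoff of 0.8 this match would be dropped,
    # since 0.7 is not greater than 0.8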
diff --git a/templates/mashscreen2json.py b/templates/mashscreen2json.py
new file mode 100644
index 0000000..9e85909
--- /dev/null
+++ b/templates/mashscreen2json.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+
+"""
+Purpose
+-------
+
+This module is intended to generate a json output for mash screen results
+that can be imported in pATLAS.
+
+Expected input
+--------------
+
+The following variables are expected whether using NextFlow or the
+:py:func:`main` executor.
+
+- ``mash_output`` : String with the name of the mash screen output file.
+    - e.g.: ``'sortedMashScreenResults_SampleA.txt'``
+
+
+Code documentation
+------------------
+
+"""
+
+__version__ = "1.0.1"
+__build__ = "20022018"
+__template__ = "mashscreen2json-nf"
+
+from statistics import median
+import os
+import json
+
+from templates.assemblerflow_utils.assemblerflow_base import get_logger, \
+    MainWrapper
+
+logger = get_logger(__file__)
+
+if __file__.endswith(".command.sh"):
+    MASH_TXT = '$mashtxt'
+    logger.debug("Running {} with parameters:".format(
+        os.path.basename(__file__)))
+    logger.debug("MASH_TXT: {}".format(MASH_TXT))
+
+
+@MainWrapper
+def main(mash_output):
+    '''
+    Converts the top results from the mash screen txt output to json format
+
+    Parameters
+    ----------
+    mash_output: str
+        the path to the mash screen output file
+
+    '''
+    logger.info("Reading file: {}".format(mash_output))
+    read_mash_output = open(mash_output)
+
+    dic = {}
+    median_list = []
+    filtered_dic = {}
+
+    logger.info("Generating dictionary and list to pre-process the final "
+                "json")
+    for line in read_mash_output:
+        # mash screen columns: identity, shared-hashes,
+        # median-multiplicity, p-value, query-id (the query-comment column
+        # is not used here)
+        tab_split = line.split("\t")
+        identity = tab_split[0]
+        median_multiplicity = tab_split[2]
+        query_id = tab_split[4]
+
+        # identity is what in fact matters to report to the json, but
+        # median_multiplicity is also important since it gives a rough
+        # estimate of the coverage depth for each plasmid.
+        # Plasmids should have higher coverage depth due to their increased
+        # copy number in relation to the chromosome.
+        dic[query_id] = [identity, median_multiplicity]
+        median_list.append(float(median_multiplicity))
+
+    output_json = open(".".join(mash_output.split(".")[:-1]) + ".json", "w")
+
+    # the median cutoff is the median of all median_multiplicity values
+    # reported by mash screen. Since the plasmid database has ~9k entries
+    # and a read set should match far fewer sequences, this is a reasonable
+    # heuristic.
+    if len(median_list) > 0:
+        # this statement assures that median_list indeed has entries
+        median_cutoff = median(median_list)
+        logger.info("Generating final json to dump to a file")
+        for k, v in dic.items():
+            # estimated copy number
+            copy_number = int(float(v[1]) / median_cutoff)
+            # assures that the plasmid has a coverage depth above the median
+            if float(v[1]) > median_cutoff:
+                filtered_dic["_".join(k.split("_")[0:3])] = \
+                    [v[0], str(copy_number)]
+        logger.info(
+            "Exported dictionary has {} entries".format(len(filtered_dic)))
+    else:
+        # if no entries were found, log an error
+        logger.error("No matches were found using mash screen for the "
+                     "queried reads")
+
+    output_json.write(json.dumps(filtered_dic))
+    output_json.close()
+
+
+if __name__ == "__main__":

+    main(MASH_TXT)
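A small worked example of the median cutoff and copy-number estimate used above, with made-up multiplicity values:

    from statistics import median

    # query-id -> median_multiplicity, as collected from mash screen rows
    multiplicities = {"ACC_1_x": 2.0, "ACC_2_x": 8.0, "ACC_3_x": 3.0}
    median_cutoff = median(multiplicities.values())  # 3.0

    # only ACC_2_x exceeds the median; its estimated copy number is
    # int(8.0 / 3.0) == 2
    kept = {k: int(v / median_cutoff)
            for k, v in multiplicities.items() if v > median_cutoff}
    print(kept)  # {'ACC_2_x': 2}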
diff --git a/templates/pATLAS_consensus_json.py b/templates/pATLAS_consensus_json.py
new file mode 100644
index 0000000..939bb96
--- /dev/null
+++ b/templates/pATLAS_consensus_json.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+
+"""
+Purpose
+-------
+
+This module is intended to generate a json output from the consensus
+results of all the approaches available through options (mapping,
+assembly, mash screen).
+
+Expected input
+--------------
+
+The following variables are expected whether using NextFlow or the
+:py:func:`main` executor.
+
+- ``mapping_json`` : String with the name of the json file with mapping
+  results.
+    - e.g.: ``'mapping_SampleA.json'``
+- ``dist_json`` : String with the name of the json file with mash dist
+  results.
+    - e.g.: ``'mash_dist_SampleA.json'``
+- ``screen_json`` : String with the name of the json file with mash screen
+  results.
+    - e.g.: ``'mash_screen_sampleA.json'``
+
+
+Code documentation
+------------------
+
+"""
+
+__version__ = "0.1.0"
+__build__ = "24022018"
+__template__ = "pATLAS_consensus_json-nf"
+
+import os
+import json
+
+from templates.assemblerflow_utils.assemblerflow_base import get_logger, \
+    MainWrapper
+
+logger = get_logger(__file__)
+
+if __file__.endswith(".command.sh"):
+    LIST_OF_FILES = '$infile_list'.split()
+    logger.debug("Running {} with parameters:".format(
+        os.path.basename(__file__)))
+    logger.debug("LIST_OF_FILES: {}".format(LIST_OF_FILES))
+
+
+@MainWrapper
+def main(list_of_jsons):
+    """
+    Merges the input json files into a single consensus json file, keeping
+    only the accessions that are present in all of them.
+
+    Parameters
+    ----------
+    list_of_jsons: list
+        A list of files provided by the fullConsensus process from nextflow
+
+    """
+
+    # first, gather a collection of the input files and their
+    # corresponding dicts
+    file_correspondence = {}
+
+    for infile in list_of_jsons:
+        file_dict = json.load(open(infile))
+        file_correspondence[infile] = file_dict
+
+    # then keep only the accessions that are shared by all input files
+    json_dict = {}
+    for accession in list(file_correspondence.values())[0]:
+        if all(accession in f_dict
+               for f_dict in file_correspondence.values()):
+            accession_dict = {}
+            for infile in file_correspondence.keys():
+                accession_dict[infile] = \
+                    file_correspondence[infile][accession]
+
+            json_dict[accession] = accession_dict
+
+    out_file = open("consensus_{}.json".format(
+        list_of_jsons[0].split(".")[0].split("_")[-1]), "w")
+
+    out_file.write(json.dumps(json_dict))
+    out_file.close()
+
+
+if __name__ == "__main__":
+    main(LIST_OF_FILES)
\ No newline at end of file
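Finally, a toy sketch of the intersection performed by this template's ``main``; the file names, accessions and values are made up:

    file_correspondence = {
        "mapping_SampleA.json": {"ACC_1": 0.9, "ACC_2": 0.7},
        "mash_dist_SampleA.json": {"ACC_1": [0.95, 0.8, "seq1"]},
    }

    # only ACC_1 appears in every input, so it is the single consensus entry
    consensus = {
        acc: {f: d[acc] for f, d in file_correspondence.items()}
        for acc in list(file_correspondence.values())[0]
        if all(acc in d for d in file_correspondence.values())
    }
    print(consensus)
    # {'ACC_1': {'mapping_SampleA.json': 0.9,
    #            'mash_dist_SampleA.json': [0.95, 0.8, 'seq1']}}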