removed submodule
tiagofilipe12 committed Sep 7, 2018
1 parent b1187ed commit 480a341
Showing 10 changed files with 582 additions and 1 deletion.
1 change: 0 additions & 1 deletion templates
Submodule templates deleted from 1f5ae7
Empty file added templates/__init__.py
Binary file not shown.
Binary file not shown.
123 changes: 123 additions & 0 deletions templates/assemblerflow_utils/assemblerflow_base.py
@@ -0,0 +1,123 @@
"""
"""

import os
import sys
import json
import logging
import traceback

from time import gmtime, strftime


def get_logger(filepath, level=logging.DEBUG):
# create logger
logger = logging.getLogger(os.path.basename(filepath))
logger.setLevel(level)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(level)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)

return logger


def log_error():
"""Nextflow specific function that logs an error upon unexpected failing
"""

with open(".status", "w") as status_fh:
status_fh.write("error")


class MainWrapper:

def __init__(self, f):

self.f = f
self.context = self.f.__globals__
self.logger = self.context.get("logger", None)

def __call__(self, *args, **kwargs):

self.logger.debug("Starting template at {}".format(
strftime("%Y-%m-%d %H:%M:%S", gmtime())))
self.logger.debug("Working directory: {}".format(os.getcwd()))

try:
self.build_versions()
self.f(*args, **kwargs)
        except SystemExit as e:
            sys.exit(e.code)
        except Exception:
            if self.logger:
                self.logger.error("Module exited unexpectedly with error:"
                                  "\n{}".format(traceback.format_exc()))
            log_error()

self.logger.debug("Finished template at {}".format(
strftime("%Y-%m-%d %H:%M:%S", gmtime())))

def build_versions(self):
"""Writes versions JSON for a template file
This method creates the JSON file ``.versions`` based on the metadata
and specific functions that are present in a given template script.
It starts by fetching the template metadata, which can be specified
via the ``__version__``, ``__template__`` and ``__build__``
attributes. If all of these attributes exist, it starts to populate
a JSON/dict array (Note that the absence of any one of them will
prevent the version from being written).
Then, it will search the
template scope for functions that start with the substring
``__set_version`` (For example ``def __set_version_fastqc()`).
These functions should gather the version of
an arbitrary program and return a JSON/dict object with the following
information::
{
"program": <program_name>,
"version": <version>
"build": <build>
}
This JSON/dict object is then written in the ``.versions`` file.
"""

version_storage = []

template_version = self.context.get("__version__", None)
template_program = self.context.get("__template__", None)
template_build = self.context.get("__build__", None)

if template_version and template_program and template_build:
if self.logger:
self.logger.debug("Adding template version: {}; {}; "
"{}".format(template_program,
template_version,
template_build))
version_storage.append({
"program": template_program,
"version": template_version,
"build": template_build
})

for var, obj in self.context.items():
if var.startswith("__get_version"):
ver = obj()
version_storage.append(ver)
if self.logger:
self.logger.debug("Found additional software version"
"{}".format(ver))

with open(".versions", "w") as fh:
fh.write(json.dumps(version_storage, separators=(",", ":")))

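The version-collection protocol that ``build_versions`` describes is easiest to see in a small, hypothetical template. The ``__get_version_fastqc`` helper and the ``fastqc --version`` call below are illustrative assumptions, not part of this commit; the decorator only requires that the function's name starts with ``__get_version`` and that it returns a dict with ``program``, ``version`` and ``build`` keys.

#!/usr/bin/env python3
"""Hypothetical template sketching the MainWrapper version protocol."""

import subprocess

from templates.assemblerflow_utils.assemblerflow_base import get_logger, MainWrapper

__version__ = "0.0.1"
__build__ = "01012018"
__template__ = "example-nf"

logger = get_logger(__file__)


def __get_version_fastqc():
    # Discovered by build_versions through its "__get_version" name prefix;
    # the returned dict is appended to the ".versions" json array.
    out = subprocess.check_output(["fastqc", "--version"]).decode().strip()
    return {"program": "FastQC", "version": out.split()[-1], "build": None}


@MainWrapper
def main():
    # Runs after build_versions has already written ".versions"
    logger.info("template body goes here")


if __name__ == "__main__":
    main()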
149 changes: 149 additions & 0 deletions templates/mapping2json.py
@@ -0,0 +1,149 @@
#!/usr/bin/env python3


"""
Purpose
-------
This module is intended to generate a json output for mapping results that
can be imported in pATLAS.
Expected input
--------------
The following variables are expected, whether using Nextflow or the
:py:func:`main` executor.
- ``depth_file`` : String with the name of the samtools depth output file.
- e.g.: ``'samtoolsDepthOutput_sampleA.txt'``
- ``json_dict`` : the file that contains the dictionary with keys and values for
accessions and their respective lengths.
- e.g.: ``'reads_sample_result_length.json'``
- ``cutoff`` : The cutoff used to trim the unwanted matches for the minimum
coverage results from mapping. This value may range between 0 and 1.
- e.g.: ``0.6``
Code documentation
------------------
"""

__version__ = "1.0.1"
__build__ = "20022018"
__template__ = "mapping2json-nf"

import os
import sys
import json

from templates.assemblerflow_utils.assemblerflow_base import get_logger, MainWrapper

logger = get_logger(__file__)

if __file__.endswith(".command.sh"):
DEPTH_TXT = '$depthFile'
JSON_LENGTH = '$lengthJson'
CUTOFF = '$params.cov_cutoff'
logger.debug("Running {} with parameters:".format(
os.path.basename(__file__)))
logger.debug("DEPTH_TXT: {}".format(DEPTH_TXT))
logger.debug("JSON_LENGHT: {}".format(JSON_LENGTH))
logger.debug("CUTOFF: {}".format(CUTOFF))


def depthfilereader(depth_file, plasmid_length, cutoff):
    '''
    Parses a samtools depth file and builds the dictionaries needed for the
    outputs of this script: the tabular file and the json file that may be
    imported by pATLAS.

    Parameters
    ----------
    depth_file: file handle or iterable of str
        the opened depth file for each sample
    plasmid_length: dict
        a dictionary that stores the length of all plasmids in the fasta
        given as input
    cutoff: str or float
        the cutoff used to trim the unwanted matches for the minimum
        coverage results from mapping. It is converted to a float within
        this function so it can be compared with perc_value_per_ref.

    Returns
    -------
    percentage_basescovered: dict
        stores, per reference/accession (plasmid), the fraction of its
        sequence covered by at least one read
    '''
depth_dic_coverage = {}
for line in depth_file:
tab_split = line.split() # split by any white space
reference = "_".join(tab_split[0].strip().split("_")[0:3]) # store
# only the gi for the reference
position = tab_split[1]
numreadsalign = float(tab_split[2].rstrip())
if reference not in depth_dic_coverage:
depth_dic_coverage[reference] = {}
depth_dic_coverage[reference][position] = numreadsalign

percentage_basescovered = {}
for ref in depth_dic_coverage:
# calculates the percentage value per each reference
perc_value_per_ref = float(len(depth_dic_coverage[ref])) / \
float(plasmid_length[ref])
# checks if percentage value is higher or equal to the cutoff defined
if perc_value_per_ref >= float(cutoff):
percentage_basescovered[ref] = perc_value_per_ref

return percentage_basescovered

@MainWrapper
def main(depth_file, json_dict, cutoff):
'''
Function that handles the inputs required to parse depth files from bowtie
and dumps a dict to a json file that can be imported into pATLAS.
Parameters
----------
depth_file: str
the path to depth file for each sample
json_dict: str
the file that contains the dictionary with keys and values for accessions
and their respective lengths
cutoff: str
the cutoff used to trim the unwanted matches for the minimum coverage
results from mapping. This value may range between 0 and 1.
'''

    # check that the cutoff value for the coverage results is a valid number
    try:
        cutoff_val = float(cutoff)
    except ValueError:
        logger.error("Cutoff value should be a number between 0 and 1, "
                     "e.g.: '0.6'. Instead got: {}. Make sure to provide "
                     "an appropriate value for --cov_cutoff".format(cutoff))
        sys.exit(1)

    # loads dict from file; this file is provided in the docker image
    with open(json_dict) as json_fh:
        plasmid_length = json.load(json_fh)

    # read the depth file and generate dictionaries to handle the input
    # in a simpler format
    logger.info("Reading depth file and creating dictionary to dump")
    with open(depth_file) as depth_file_reader:
        percentage_basescovered = depthfilereader(depth_file_reader,
                                                  plasmid_length, cutoff_val)

    # then dump to file
    output_json_name = "{}_mapping.json".format(depth_file)
    logger.info("Dumping to {}".format(output_json_name))
    with open(output_json_name, "w") as output_json:
        output_json.write(json.dumps(percentage_basescovered))


if __name__ == "__main__":

main(DEPTH_TXT, JSON_LENGTH, CUTOFF)
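As a quick sanity check of the coverage arithmetic, here is a hedged, self-contained sketch of ``depthfilereader`` on an invented three-line depth excerpt: three covered positions out of a hypothetical 4 bp plasmid give 3/4 = 0.75, which passes a 0.6 cutoff. The accession, length and import path are assumptions for illustration.

from templates.mapping2json import depthfilereader

# Invented samtools depth lines: reference, 1-based position, read depth
depth_lines = [
    "ACC_NUM_1\t1\t12\n",
    "ACC_NUM_1\t2\t7\n",
    "ACC_NUM_1\t3\t3\n",
]
# Hypothetical plasmid length for the accession above
plasmid_length = {"ACC_NUM_1": 4}

# 3 covered positions / 4 bp = 0.75 >= 0.6, so the accession is kept
print(depthfilereader(depth_lines, plasmid_length, "0.6"))
# {'ACC_NUM_1': 0.75}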
120 changes: 120 additions & 0 deletions templates/mashdist2json.py
@@ -0,0 +1,120 @@
#!/usr/bin/env python3


"""
Purpose
-------
This module is intended to generate a json output for mash dist results that
can be imported in pATLAS.
Expected input
--------------
The following variables are expected, whether using Nextflow or the
:py:func:`main` executor.
- ``mash_output`` : String with the name of the mash dist output file.
- e.g.: ``'fastaFileA_mashdist.txt'``
Code documentation
------------------
"""

__version__ = "1.2.0"
__build__ = "17052018"
__template__ = "mashsdist2json-nf"

import os
import json

from templates.assemblerflow_utils.assemblerflow_base import get_logger, MainWrapper

logger = get_logger(__file__)

if __file__.endswith(".command.sh"):
MASH_TXT = '$mashtxt'
HASH_CUTOFF = '$params.shared_hashes'
logger.debug("Running {} with parameters:".format(
os.path.basename(__file__)))
logger.debug("MASH_TXT: {}".format(MASH_TXT))
logger.debug("HASH_CUTOFF: {}".format(HASH_CUTOFF))


def send_to_output(master_dict, mash_output):
"""Send dictionary to output json file
This function sends master_dict dictionary to a json file if master_dict is
populated with entries, otherwise it won't create the file
Parameters
----------
master_dict: dict
dictionary that stores all entries for a specific query sequence
in multi-fasta given to mash dist as input against patlas database
last_seq: str
string that stores the last sequence that was parsed before writing to
file and therefore after the change of query sequence between different
rows on the input file
mash_output: str
the name/path of input file to main function, i.e., the name/path of
the mash dist output txt file.
Returns
-------
"""
    # create a new file only if master_dict is populated
    if master_dict:
        with open("{}.json".format(mash_output.split(".")[0]), "w") as out_file:
            out_file.write(json.dumps(master_dict))


@MainWrapper
def main(mash_output, hash_cutoff):
    '''
    Main function that dumps a mash dist txt file to a json file

    Parameters
    ----------
    mash_output: str
        A string with the name of the mash dist input file.
    hash_cutoff: str
        the minimum fraction of shared hashes for a match to be reported,
        between 0 and 1.
    '''
    input_f = open(mash_output, "r")

    master_dict = {}

for line in input_f:

tab_split = line.split("\t")
current_seq = tab_split[1].strip()
ref_accession = "_".join(tab_split[0].strip().split("_")[0:3])
mash_dist = tab_split[2].strip()
hashes_list = tab_split[-1].strip().split("/")

# creates a percentage of the shared hashes between the sample and the
# reference
perc_hashes = float(hashes_list[0]) / float(hashes_list[1])

# assures that only the hashes with a given shared percentage are
# reported to json file
if perc_hashes > float(hash_cutoff):

master_dict[ref_accession] = [1 - float(mash_dist), perc_hashes,
current_seq]

    # write the accumulated results to the output json file after parsing
    # every line of the input
    send_to_output(master_dict, mash_output)


if __name__ == "__main__":

main(MASH_TXT, HASH_CUTOFF)
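A hedged end-to-end sketch of the filtering above, under the assumption that mash dist rows are tab-separated as reference, query, distance, p-value and shared hashes (the file name, accession and values are invented): a row sharing 857/1000 hashes passes a 0.8 cutoff and is stored as [identity, shared-hash fraction, query name].

import json

from templates.mashdist2json import main as mashdist2json

# One invented mash dist row for query "sampleA" against plasmid ACC_NUM_1
with open("fastaFileA_mashdist.txt", "w") as fh:
    fh.write("ACC_NUM_1\tsampleA\t0.0291\t0.0\t857/1000\n")

# perc_hashes = 857/1000 = 0.857 > 0.8, so the entry is kept
mashdist2json("fastaFileA_mashdist.txt", "0.8")

with open("fastaFileA_mashdist.json") as fh:
    print(json.load(fh))
# -> {'ACC_NUM_1': [0.9709, 0.857, 'sampleA']}
#    (identity = 1 - mash distance, up to float rounding)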