removed submodule
tiagofilipe12 committed Sep 7, 2018
1 parent b1187ed commit 480a341
Showing 10 changed files with 582 additions and 1 deletion.
1 change: 0 additions & 1 deletion templates
Submodule templates deleted from 1f5ae7
Empty file added templates/__init__.py
Binary file not shown.
Binary file not shown.
123 changes: 123 additions & 0 deletions templates/assemblerflow_utils/assemblerflow_base.py
@@ -0,0 +1,123 @@
"""
"""

import os
import sys
import json
import logging
import traceback

from time import gmtime, strftime


def get_logger(filepath, level=logging.DEBUG):
# create logger
logger = logging.getLogger(os.path.basename(filepath))
logger.setLevel(level)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(level)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)

return logger


def log_error():
"""Nextflow specific function that logs an error upon unexpected failing
"""

with open(".status", "w") as status_fh:
status_fh.write("error")


class MainWrapper:

def __init__(self, f):

self.f = f
self.context = self.f.__globals__
self.logger = self.context.get("logger", None)

def __call__(self, *args, **kwargs):

self.logger.debug("Starting template at {}".format(
strftime("%Y-%m-%d %H:%M:%S", gmtime())))
self.logger.debug("Working directory: {}".format(os.getcwd()))

try:
self.build_versions()
self.f(*args, **kwargs)
        except SystemExit as e:
            sys.exit(e.code)
        except Exception:
            if self.logger:
                self.logger.error("Module exited unexpectedly with error:"
                                  "\n{}".format(traceback.format_exc()))
            log_error()

self.logger.debug("Finished template at {}".format(
strftime("%Y-%m-%d %H:%M:%S", gmtime())))

def build_versions(self):
"""Writes versions JSON for a template file
This method creates the JSON file ``.versions`` based on the metadata
and specific functions that are present in a given template script.
It starts by fetching the template metadata, which can be specified
via the ``__version__``, ``__template__`` and ``__build__``
attributes. If all of these attributes exist, it starts to populate
a JSON/dict array (Note that the absence of any one of them will
prevent the version from being written).
Then, it will search the
template scope for functions that start with the substring
``__set_version`` (For example ``def __set_version_fastqc()`).
These functions should gather the version of
an arbitrary program and return a JSON/dict object with the following
information::
{
"program": <program_name>,
"version": <version>
"build": <build>
}
This JSON/dict object is then written in the ``.versions`` file.
"""

version_storage = []

template_version = self.context.get("__version__", None)
template_program = self.context.get("__template__", None)
template_build = self.context.get("__build__", None)

if template_version and template_program and template_build:
if self.logger:
self.logger.debug("Adding template version: {}; {}; "
"{}".format(template_program,
template_version,
template_build))
version_storage.append({
"program": template_program,
"version": template_version,
"build": template_build
})

for var, obj in self.context.items():
if var.startswith("__get_version"):
ver = obj()
version_storage.append(ver)
if self.logger:
self.logger.debug("Found additional software version"
"{}".format(ver))

with open(".versions", "w") as fh:
fh.write(json.dumps(version_storage, separators=(",", ":")))

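The version-collection protocol that ``build_versions`` describes is easiest to see in a small, hypothetical template. The ``__get_version_fastqc`` helper and the ``fastqc --version`` call below are illustrative assumptions, not part of this commit; the decorator only requires that the function's name starts with ``__get_version`` and that it returns a dict with ``program``, ``version`` and ``build`` keys.

#!/usr/bin/env python3
"""Hypothetical template sketching the MainWrapper version protocol."""

import subprocess

from templates.assemblerflow_utils.assemblerflow_base import get_logger, MainWrapper

__version__ = "0.0.1"
__build__ = "01012018"
__template__ = "example-nf"

logger = get_logger(__file__)


def __get_version_fastqc():
    # Discovered by build_versions through its "__get_version" name prefix;
    # the returned dict is appended to the ".versions" json array.
    out = subprocess.check_output(["fastqc", "--version"]).decode().strip()
    return {"program": "FastQC", "version": out.split()[-1], "build": None}


@MainWrapper
def main():
    # Runs after build_versions has already written ".versions"
    logger.info("template body goes here")


if __name__ == "__main__":
    main()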
149 changes: 149 additions & 0 deletions templates/mapping2json.py
@@ -0,0 +1,149 @@
#!/usr/bin/env python3


"""
Purpose
-------
This module is intended to generate a json output for mapping results that
can be imported in pATLAS.
Expected input
--------------
The following variables are expected, whether using Nextflow or the
:py:func:`main` executor.
- ``depth_file`` : String with the name of the samtools depth output file.
- e.g.: ``'samtoolsDepthOutput_sampleA.txt'``
- ``json_dict`` : the file that contains the dictionary with keys and values for
accessions and their respective lengths.
- e.g.: ``'reads_sample_result_length.json'``
- ``cutoff`` : The cutoff used to trim the unwanted matches for the minimum
coverage results from mapping. This value may range between 0 and 1.
- e.g.: ``0.6``
Code documentation
------------------
"""

__version__ = "1.0.1"
__build__ = "20022018"
__template__ = "mapping2json-nf"

import os
import sys
import json

from templates.assemblerflow_utils.assemblerflow_base import get_logger, MainWrapper

logger = get_logger(__file__)

if __file__.endswith(".command.sh"):
DEPTH_TXT = '$depthFile'
JSON_LENGTH = '$lengthJson'
CUTOFF = '$params.cov_cutoff'
logger.debug("Running {} with parameters:".format(
os.path.basename(__file__)))
logger.debug("DEPTH_TXT: {}".format(DEPTH_TXT))
logger.debug("JSON_LENGHT: {}".format(JSON_LENGTH))
logger.debug("CUTOFF: {}".format(CUTOFF))


def depthfilereader(depth_file, plasmid_length, cutoff):
    '''
    Parses a samtools depth file and builds the dictionaries needed for the
    outputs of this script: the tabular file and the json file that may be
    imported by pATLAS.

    Parameters
    ----------
    depth_file: file handle or iterable of str
        the opened depth file for each sample
    plasmid_length: dict
        a dictionary that stores the length of all plasmids in the fasta
        given as input
    cutoff: str or float
        the cutoff used to trim the unwanted matches for the minimum
        coverage results from mapping. It is converted to a float within
        this function so it can be compared with perc_value_per_ref.

    Returns
    -------
    percentage_basescovered: dict
        stores, per reference/accession (plasmid), the fraction of its
        sequence covered by at least one read
    '''
depth_dic_coverage = {}
for line in depth_file:
tab_split = line.split() # split by any white space
reference = "_".join(tab_split[0].strip().split("_")[0:3]) # store
# only the gi for the reference
position = tab_split[1]
numreadsalign = float(tab_split[2].rstrip())
if reference not in depth_dic_coverage:
depth_dic_coverage[reference] = {}
depth_dic_coverage[reference][position] = numreadsalign

percentage_basescovered = {}
for ref in depth_dic_coverage:
# calculates the percentage value per each reference
perc_value_per_ref = float(len(depth_dic_coverage[ref])) / \
float(plasmid_length[ref])
# checks if percentage value is higher or equal to the cutoff defined
if perc_value_per_ref >= float(cutoff):
percentage_basescovered[ref] = perc_value_per_ref

return percentage_basescovered

@MainWrapper
def main(depth_file, json_dict, cutoff):
'''
Function that handles the inputs required to parse depth files from bowtie
and dumps a dict to a json file that can be imported into pATLAS.
Parameters
----------
depth_file: str
the path to depth file for each sample
json_dict: str
the file that contains the dictionary with keys and values for accessions
and their respective lengths
cutoff: str
the cutoff used to trim the unwanted matches for the minimum coverage
results from mapping. This value may range between 0 and 1.
'''

    # check that the cutoff value for the coverage results is a valid number
    try:
        cutoff_val = float(cutoff)
    except ValueError:
        logger.error("Cutoff value should be a number between 0 and 1, "
                     "e.g.: '0.6'. Instead got: {}. Make sure to provide "
                     "an appropriate value for --cov_cutoff".format(cutoff))
        sys.exit(1)

    # loads dict from file; this file is provided in the docker image
    with open(json_dict) as json_fh:
        plasmid_length = json.load(json_fh)

    # read the depth file and generate dictionaries to handle the input
    # in a simpler format
    logger.info("Reading depth file and creating dictionary to dump")
    with open(depth_file) as depth_file_reader:
        percentage_basescovered = depthfilereader(depth_file_reader,
                                                  plasmid_length, cutoff_val)

    # then dump to file
    output_json_name = "{}_mapping.json".format(depth_file)
    logger.info("Dumping to {}".format(output_json_name))
    with open(output_json_name, "w") as output_json:
        output_json.write(json.dumps(percentage_basescovered))


if __name__ == "__main__":

main(DEPTH_TXT, JSON_LENGTH, CUTOFF)
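As a quick sanity check of the coverage arithmetic, here is a hedged, self-contained sketch of ``depthfilereader`` on an invented three-line depth excerpt: three covered positions out of a hypothetical 4 bp plasmid give 3/4 = 0.75, which passes a 0.6 cutoff. The accession, length and import path are assumptions for illustration.

from templates.mapping2json import depthfilereader

# Invented samtools depth lines: reference, 1-based position, read depth
depth_lines = [
    "ACC_NUM_1\t1\t12\n",
    "ACC_NUM_1\t2\t7\n",
    "ACC_NUM_1\t3\t3\n",
]
# Hypothetical plasmid length for the accession above
plasmid_length = {"ACC_NUM_1": 4}

# 3 covered positions / 4 bp = 0.75 >= 0.6, so the accession is kept
print(depthfilereader(depth_lines, plasmid_length, "0.6"))
# {'ACC_NUM_1': 0.75}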
120 changes: 120 additions & 0 deletions templates/mashdist2json.py
@@ -0,0 +1,120 @@
#!/usr/bin/env python3


"""
Purpose
-------
This module is intended to generate a json output for mash dist results that
can be imported in pATLAS.
Expected input
--------------
The following variables are expected, whether using Nextflow or the
:py:func:`main` executor.
- ``mash_output`` : String with the name of the mash dist output file.
- e.g.: ``'fastaFileA_mashdist.txt'``
Code documentation
------------------
"""

__version__ = "1.2.0"
__build__ = "17052018"
__template__ = "mashsdist2json-nf"

import os
import json

from templates.assemblerflow_utils.assemblerflow_base import get_logger, MainWrapper

logger = get_logger(__file__)

if __file__.endswith(".command.sh"):
MASH_TXT = '$mashtxt'
HASH_CUTOFF = '$params.shared_hashes'
logger.debug("Running {} with parameters:".format(
os.path.basename(__file__)))
logger.debug("MASH_TXT: {}".format(MASH_TXT))
logger.debug("HASH_CUTOFF: {}".format(HASH_CUTOFF))


def send_to_output(master_dict, mash_output):
"""Send dictionary to output json file
This function sends master_dict dictionary to a json file if master_dict is
populated with entries, otherwise it won't create the file
Parameters
----------
master_dict: dict
dictionary that stores all entries for a specific query sequence
in multi-fasta given to mash dist as input against patlas database
last_seq: str
string that stores the last sequence that was parsed before writing to
file and therefore after the change of query sequence between different
rows on the input file
mash_output: str
the name/path of input file to main function, i.e., the name/path of
the mash dist output txt file.
Returns
-------
"""
    # create a new file only if master_dict is populated
    if master_dict:
        with open("{}.json".format(mash_output.split(".")[0]), "w") as out_file:
            out_file.write(json.dumps(master_dict))


@MainWrapper
def main(mash_output, hash_cutoff):
    '''
    Main function that dumps a mash dist txt file to a json file

    Parameters
    ----------
    mash_output: str
        A string with the name of the mash dist input file.
    hash_cutoff: str
        the minimum fraction of shared hashes for a match to be reported,
        between 0 and 1.
    '''
    input_f = open(mash_output, "r")

    master_dict = {}

for line in input_f:

tab_split = line.split("\t")
current_seq = tab_split[1].strip()
ref_accession = "_".join(tab_split[0].strip().split("_")[0:3])
mash_dist = tab_split[2].strip()
hashes_list = tab_split[-1].strip().split("/")

# creates a percentage of the shared hashes between the sample and the
# reference
perc_hashes = float(hashes_list[0]) / float(hashes_list[1])

# assures that only the hashes with a given shared percentage are
# reported to json file
if perc_hashes > float(hash_cutoff):

master_dict[ref_accession] = [1 - float(mash_dist), perc_hashes,
current_seq]

    # write the accumulated results to the output json file after parsing
    # every line of the input
    send_to_output(master_dict, mash_output)


if __name__ == "__main__":

main(MASH_TXT, HASH_CUTOFF)
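A hedged end-to-end sketch of the filtering above, under the assumption that mash dist rows are tab-separated as reference, query, distance, p-value and shared hashes (the file name, accession and values are invented): a row sharing 857/1000 hashes passes a 0.8 cutoff and is stored as [identity, shared-hash fraction, query name].

import json

from templates.mashdist2json import main as mashdist2json

# One invented mash dist row for query "sampleA" against plasmid ACC_NUM_1
with open("fastaFileA_mashdist.txt", "w") as fh:
    fh.write("ACC_NUM_1\tsampleA\t0.0291\t0.0\t857/1000\n")

# perc_hashes = 857/1000 = 0.857 > 0.8, so the entry is kept
mashdist2json("fastaFileA_mashdist.txt", "0.8")

with open("fastaFileA_mashdist.json") as fh:
    print(json.load(fh))
# -> {'ACC_NUM_1': [0.9709, 0.857, 'sampleA']}
#    (identity = 1 - mash distance, up to float rounding)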