Commit 480a341 (parent: b1187ed)
Showing 10 changed files with 582 additions and 1 deletion.
Two empty files added.
Binary file added (+3.89 KB, not shown): templates/assemblerflow_utils/__pycache__/assemblerflow_base.cpython-35.pyc
templates/assemblerflow_utils/assemblerflow_base.py
@@ -0,0 +1,123 @@
""" | ||
""" | ||
|
||
import os | ||
import sys | ||
import json | ||
import logging | ||
import traceback | ||
|
||
from time import gmtime, strftime | ||
|
||
|
||
def get_logger(filepath, level=logging.DEBUG): | ||
# create logger | ||
logger = logging.getLogger(os.path.basename(filepath)) | ||
logger.setLevel(level) | ||
# create console handler and set level to debug | ||
ch = logging.StreamHandler() | ||
ch.setLevel(level) | ||
# create formatter | ||
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') | ||
# add formatter to ch | ||
ch.setFormatter(formatter) | ||
# add ch to logger | ||
logger.addHandler(ch) | ||
|
||
return logger | ||
|
||
|
||
def log_error(): | ||
"""Nextflow specific function that logs an error upon unexpected failing | ||
""" | ||
|
||
with open(".status", "w") as status_fh: | ||
status_fh.write("error") | ||
|
||
|
||
class MainWrapper: | ||
|
||
def __init__(self, f): | ||
|
||
self.f = f | ||
self.context = self.f.__globals__ | ||
self.logger = self.context.get("logger", None) | ||
|
||
def __call__(self, *args, **kwargs): | ||
|
||
self.logger.debug("Starting template at {}".format( | ||
strftime("%Y-%m-%d %H:%M:%S", gmtime()))) | ||
self.logger.debug("Working directory: {}".format(os.getcwd())) | ||
|
||
try: | ||
self.build_versions() | ||
self.f(*args, **kwargs) | ||
except SystemExit as e: | ||
sys.exit(e) | ||
except: | ||
if self.logger: | ||
self.logger.error("Module exited unexpectedly with error:" | ||
"\\n{}".format(traceback.format_exc())) | ||
log_error() | ||
|
||
self.logger.debug("Finished template at {}".format( | ||
strftime("%Y-%m-%d %H:%M:%S", gmtime()))) | ||
|
||
def build_versions(self): | ||
"""Writes versions JSON for a template file | ||
This method creates the JSON file ``.versions`` based on the metadata | ||
and specific functions that are present in a given template script. | ||
It starts by fetching the template metadata, which can be specified | ||
via the ``__version__``, ``__template__`` and ``__build__`` | ||
attributes. If all of these attributes exist, it starts to populate | ||
a JSON/dict array (Note that the absence of any one of them will | ||
prevent the version from being written). | ||
Then, it will search the | ||
template scope for functions that start with the substring | ||
``__set_version`` (For example ``def __set_version_fastqc()`). | ||
These functions should gather the version of | ||
an arbitrary program and return a JSON/dict object with the following | ||
information:: | ||
{ | ||
"program": <program_name>, | ||
"version": <version> | ||
"build": <build> | ||
} | ||
This JSON/dict object is then written in the ``.versions`` file. | ||
""" | ||
|
||
version_storage = [] | ||
|
||
template_version = self.context.get("__version__", None) | ||
template_program = self.context.get("__template__", None) | ||
template_build = self.context.get("__build__", None) | ||
|
||
if template_version and template_program and template_build: | ||
if self.logger: | ||
self.logger.debug("Adding template version: {}; {}; " | ||
"{}".format(template_program, | ||
template_version, | ||
template_build)) | ||
version_storage.append({ | ||
"program": template_program, | ||
"version": template_version, | ||
"build": template_build | ||
}) | ||
|
||
for var, obj in self.context.items(): | ||
if var.startswith("__get_version"): | ||
ver = obj() | ||
version_storage.append(ver) | ||
if self.logger: | ||
self.logger.debug("Found additional software version" | ||
"{}".format(ver)) | ||
|
||
with open(".versions", "w") as fh: | ||
fh.write(json.dumps(version_storage, separators=(",", ":"))) | ||
|
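The ``__get_version`` convention that ``build_versions`` documents is easiest to see with a concrete template function. Below is a minimal sketch of one such getter, assuming a hypothetical FastQC template; the function name, the ``fastqc --version`` call and the output parsing are illustrative only and are not part of this commit:

    import subprocess


    def __get_version_fastqc():
        # Hypothetical version getter: returns the JSON/dict object that
        # build_versions() collects into the .versions file.
        try:
            # e.g. b"FastQC v0.11.7\n" -> "0.11.7" (illustrative parsing)
            raw = subprocess.check_output(["fastqc", "--version"])
            version = raw.decode().strip().split()[-1].lstrip("v")
        except Exception:
            # tool missing or unexpected output; record an undefined version
            version = "undefined"

        return {
            "program": "fastqc",
            "version": version,
            "build": version  # assumption: no separate build id for this tool
        }

Because ``build_versions`` scans the template's global scope, defining such a function at module level is all that is needed for its output to land in ``.versions``.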
templates/mapping2json.py
@@ -0,0 +1,149 @@
#!/usr/bin/env python3

"""
Purpose
-------
This module is intended to generate a JSON output for mapping results that
can be imported into pATLAS.

Expected input
--------------
The following variables are expected, whether using Nextflow or the
:py:func:`main` executor.

- ``depth_file`` : String with the name of the samtools depth output file.
    - e.g.: ``'samtoolsDepthOutput_sampleA.txt'``
- ``json_dict`` : The file that contains the dictionary with keys and values
  for accessions and their respective lengths.
    - e.g.: ``'reads_sample_result_length.json'``
- ``cutoff`` : The cutoff used to trim the unwanted matches for the minimum
  coverage results from mapping. This value may range between 0 and 1.
    - e.g.: ``0.6``

Code documentation
------------------
"""

__version__ = "1.0.1"
__build__ = "20022018"
__template__ = "mapping2json-nf"

import os
import sys
import json

from templates.assemblerflow_utils import get_logger, MainWrapper

logger = get_logger(__file__)

if __file__.endswith(".command.sh"):
    DEPTH_TXT = '$depthFile'
    JSON_LENGTH = '$lengthJson'
    CUTOFF = '$params.cov_cutoff'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("DEPTH_TXT: {}".format(DEPTH_TXT))
    logger.debug("JSON_LENGTH: {}".format(JSON_LENGTH))
    logger.debug("CUTOFF: {}".format(CUTOFF))


def depthfilereader(depth_file, plasmid_length, cutoff):
    '''
    Parses a samtools depth file and builds the dictionaries needed to
    produce the output of this script: the fraction of each reference
    covered by reads, which may be imported by pATLAS.

    Parameters
    ----------
    depth_file: iterable
        an open file handle (or any iterable of lines) of the depth file for
        each sample
    plasmid_length: dict
        a dictionary that stores the length of all plasmids in the fasta
        given as input
    cutoff: str
        the cutoff used to trim the unwanted matches for the minimum coverage
        results from mapping. This is converted into a float within this
        function in order to compare it with the value of perc_value_per_ref.

    Returns
    -------
    percentage_basescovered: dict
        stores the percentage of the total sequence of a
        reference/accession (plasmid) that is covered by reads
    '''
    depth_dic_coverage = {}
    for line in depth_file:
        tab_split = line.split()    # split by any white space
        # store only the accession for the reference
        reference = "_".join(tab_split[0].strip().split("_")[0:3])
        position = tab_split[1]
        numreadsalign = float(tab_split[2].rstrip())
        if reference not in depth_dic_coverage:
            depth_dic_coverage[reference] = {}
        depth_dic_coverage[reference][position] = numreadsalign

    percentage_basescovered = {}
    for ref in depth_dic_coverage:
        # calculates the percentage of covered bases for each reference
        perc_value_per_ref = float(len(depth_dic_coverage[ref])) / \
            float(plasmid_length[ref])
        # checks if the percentage value is higher than or equal to the
        # defined cutoff
        if perc_value_per_ref >= float(cutoff):
            percentage_basescovered[ref] = perc_value_per_ref

    return percentage_basescovered


@MainWrapper
def main(depth_file, json_dict, cutoff):
    '''
    Function that handles the inputs required to parse depth files from bowtie
    and dumps a dict to a JSON file that can be imported into pATLAS.

    Parameters
    ----------
    depth_file: str
        the path to the depth file for each sample
    json_dict: str
        the file that contains the dictionary with keys and values for
        accessions and their respective lengths
    cutoff: str
        the cutoff used to trim the unwanted matches for the minimum coverage
        results from mapping. This value may range between 0 and 1.
    '''

    # check that an appropriate value was provided for the coverage cutoff
    try:
        cutoff_val = float(cutoff)
    except ValueError:
        logger.error("Cutoff value should be a number such as: '0.6'. "
                     "The provided value was: {}. Make sure to provide an "
                     "appropriate value for --cov_cutoff".format(cutoff))
        sys.exit(1)

    # loads the length dict from file; this file is provided in the docker
    # image
    with open(json_dict) as json_fh:
        plasmid_length = json.load(json_fh)

    # first reads the depth file and generates dictionaries to convert the
    # input into a simpler format
    logger.info("Reading depth file and creating dictionary to dump")
    with open(depth_file) as depth_file_reader:
        percentage_basescovered = depthfilereader(depth_file_reader,
                                                  plasmid_length, cutoff_val)

    # then dump to file
    logger.info("Dumping to {}".format("{}_mapping.json".format(depth_file)))
    with open("{}_mapping.json".format(depth_file), "w") as output_json:
        output_json.write(json.dumps(percentage_basescovered))


if __name__ == "__main__":

    main(DEPTH_TXT, JSON_LENGTH, CUTOFF)
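As a sanity check of the parsing above, here is a small usage sketch. It assumes ``depthfilereader`` from this file is in scope; the accession, positions and length are made-up illustration values (samtools depth emits three whitespace-separated columns: reference, 1-based position, depth):

    import json

    # hypothetical depth excerpt: ACC_1_X covered at 6 of its 10 positions
    depth_lines = ["ACC_1_X\t{}\t5\n".format(pos) for pos in range(1, 7)]
    plasmid_length = {"ACC_1_X": 10}

    # 6/10 covered positions -> 0.6, which passes a 0.6 cutoff
    result = depthfilereader(depth_lines, plasmid_length, "0.6")
    print(json.dumps(result))  # {"ACC_1_X": 0.6}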
templates/mashdist2json.py
@@ -0,0 +1,120 @@
#!/usr/bin/env python3

"""
Purpose
-------
This module is intended to generate a JSON output for mash dist results that
can be imported into pATLAS.

Expected input
--------------
The following variables are expected, whether using Nextflow or the
:py:func:`main` executor.

- ``mash_output`` : String with the name of the mash dist output file.
    - e.g.: ``'fastaFileA_mashdist.txt'``

Code documentation
------------------
"""

__version__ = "1.2.0"
__build__ = "17052018"
__template__ = "mashsdist2json-nf"

import os
import json

from templates.assemblerflow_utils import get_logger, MainWrapper

logger = get_logger(__file__)

if __file__.endswith(".command.sh"):
    MASH_TXT = '$mashtxt'
    HASH_CUTOFF = '$params.shared_hashes'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("MASH_TXT: {}".format(MASH_TXT))
    logger.debug("HASH_CUTOFF: {}".format(HASH_CUTOFF))


def send_to_output(master_dict, mash_output):
    """Send dictionary to the output json file

    This function writes the master_dict dictionary to a json file if
    master_dict is populated with entries; otherwise it won't create the
    file.

    Parameters
    ----------
    master_dict: dict
        dictionary that stores all entries for a specific query sequence
        in the multi-fasta given to mash dist as input against the pATLAS
        database
    mash_output: str
        the name/path of the input file to the main function, i.e., the
        name/path of the mash dist output txt file.
    """
    # create a new file only if master_dict is populated
    if master_dict:
        with open("{}.json".format(mash_output.split(".")[0]),
                  "w") as out_file:
            out_file.write(json.dumps(master_dict))


@MainWrapper
def main(mash_output, hash_cutoff):
    '''
    Main function that dumps a mash dist txt file to a json file

    Parameters
    ----------
    mash_output: str
        A string with the input file.
    hash_cutoff: str
        The minimum fraction of shared hashes for a match to be reported
        to the output json file.
    '''
    master_dict = {}

    with open(mash_output, "r") as input_f:

        for line in input_f:

            tab_split = line.split("\t")
            current_seq = tab_split[1].strip()
            ref_accession = "_".join(tab_split[0].strip().split("_")[0:3])
            mash_dist = tab_split[2].strip()
            hashes_list = tab_split[-1].strip().split("/")

            # computes the fraction of shared hashes between the sample and
            # the reference
            perc_hashes = float(hashes_list[0]) / float(hashes_list[1])

            # assures that only the matches above a given shared-hashes
            # fraction are reported to the json file
            if perc_hashes > float(hash_cutoff):

                master_dict[ref_accession] = [1 - float(mash_dist),
                                              perc_hashes, current_seq]

    # write the results, if any, to the output json file
    send_to_output(master_dict, mash_output)


if __name__ == "__main__":

    main(MASH_TXT, HASH_CUTOFF)
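To see what the loop above extracts, consider one made-up mash dist row (the columns in mash dist tab output are reference-ID, query-ID, distance, p-value and shared-hashes); the accession and the numbers are illustrative only:

    # hypothetical row: reference, query, mash distance, p-value, hashes
    line = "ACC_1_X\tsampleA\t0.05\t0.0\t900/1000\n"

    tab_split = line.split("\t")
    hashes_list = tab_split[-1].strip().split("/")
    perc_hashes = float(hashes_list[0]) / float(hashes_list[1])  # 0.9

    # with a shared_hashes cutoff of 0.8 this row would be stored as
    # master_dict["ACC_1_X"] = [0.95, 0.9, "sampleA"]  (0.95 = 1 - 0.05)
    print(perc_hashes > 0.8)  # True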