Skip to content

Commit

Permalink
roll back test input to state of origin/master; add functions to util…
Browse files Browse the repository at this point in the history
….file to sanitize fasta file IDs

Each fasta ID is sanitized before Picard CreateSequenceDictionary runs so that it adheres to the character set restrictions in the SAM/BAM RNAME spec. See: samtools/hts-specs#333

Roll back the test input now that we are doing the sanitization. As of this commit, sanitization is only performed on fasta files destined for CreateSequenceDictionary.
  • Loading branch information
tomkinsc committed Aug 9, 2019
1 parent 9f53977 commit 42d420d
Show file tree
Hide file tree
Showing 10 changed files with 107 additions and 44 deletions.
Binary file modified test/input/TestBlastnDbBuild/expected/TestBlastnDbBuild.nhr
Binary file not shown.
Binary file modified test/input/TestBlastnDbBuild/expected/TestBlastnDbBuild.nin
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
20
Bundibugyo_ebolavirus_complete_genome
Bundibugyo_ebolavirus_isolate_EboBund-14_2012_complete_genome
Cote_dIvoire_ebolavirus_complete_genome
Reston_ebolavirus_isolate_RESTV/M.fascicularis-tc/PHL-USA/1996/Ferlite_Philippines/Alice_TX_complete_genome
Reston_ebolavirus_isolate_RESTV/Sus-wt/PHL/2009/09A_Farm_A_complete_genome
Reston_ebolavirus_-_Reston_strain_Reston08-C_complete_genome
Reston_ebolavirus_-_Reston_strain_Reston08-E_complete_genome
Reston_Ebola_virus_strain_Pennsylvania_complete_genome
Sudan_ebolavirus_isolate_EboSud-609_2012_complete_genome
Sudan_ebolavirus_isolate_EboSud-639_complete_genome
Sudan_ebolavirus_isolate_EBOV-S-2004_from_Sudan_complete_genome
Sudan_ebolavirus_-_Nakisamata_complete_genome
Sudan_ebolavirus_strain_Gulu_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/COD/1977/Bonduni_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/COD/2007/43_Luebo_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/1Eko_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/2Nza_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/Ilembe_complete_genome
Zaire_ebolavirus_isolate_H.sapiens-wt/GIN/2014/Gueckedou-C05_complete_genome
Zaire_ebolavirus_strain_Zaire_1995_complete_genome
Bundibugyo_ebolavirus,_complete_genome
Bundibugyo_ebolavirus_isolate_EboBund-14_2012,_complete_genome
Cote_d'Ivoire_ebolavirus,_complete_genome
Reston_ebolavirus_isolate_RESTV/M.fascicularis-tc/PHL-USA/1996/Ferlite,_Philippines/Alice,_TX,_complete_genome
Reston_ebolavirus_isolate_RESTV/Sus-wt/PHL/2009/09A_Farm_A,_complete_genome
Reston_ebolavirus_-_Reston_strain_Reston08-C,_complete_genome
Reston_ebolavirus_-_Reston_strain_Reston08-E,_complete_genome
Reston_Ebola_virus_strain_Pennsylvania,_complete_genome
Sudan_ebolavirus_isolate_EboSud-609_2012,_complete_genome
Sudan_ebolavirus_isolate_EboSud-639,_complete_genome
Sudan_ebolavirus_isolate_EBOV-S-2004_from_Sudan,_complete_genome
Sudan_ebolavirus_-_Nakisamata,_complete_genome
Sudan_ebolavirus_strain_Gulu,_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/COD/1977/Bonduni,_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/COD/2007/43_Luebo,_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/1Eko,_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/2Nza,_complete_genome
Zaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/Ilembe,_complete_genome
Zaire_ebolavirus_isolate_H.sapiens-wt/GIN/2014/Gueckedou-C05,_complete_genome
Zaire_ebolavirus_strain_Zaire_1995,_complete_genome
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Bundibugyo_ebolavirus_complete_genomeBundibugyo_ebolavirus_isolate_EboBund-14_2012_complete_genomeCote_dIvoire_ebolavirus_complete_genomeReston_ebolavirus_isolate_RESTV/M.fascicularis-tc/PHL-USA/1996/Ferlite_Philippines/Alice_TX_complete_genomeReston_ebolavirus_isolate_RESTV/Sus-wt/PHL/2009/09A_Farm_A_complete_genomeReston_ebolavirus_-_Reston_strain_Reston08-C_complete_genomeReston_ebolavirus_-_Reston_strain_Reston08-E_complete_genomeReston_Ebola_virus_strain_Pennsylvania_complete_genomeSudan_ebolavirus_isolate_EboSud-609_2012_complete_genomeSudan_ebolavirus_isolate_EboSud-639_complete_genomeSudan_ebolavirus_isolate_EBOV-S-2004_from_Sudan_complete_genomeSudan_ebolavirus_-_Nakisamata_complete_genomeSudan_ebolavirus_strain_Gulu_complete_genomeZaire_ebolavirus_isolate_EBOV/H.sapiens-tc/COD/1977/Bonduni_complete_genomeZaire_ebolavirus_isolate_EBOV/H.sapiens-tc/COD/2007/43_Luebo_complete_genomeZaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/1Eko_complete_genomeZaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/2Nza_complete_genomeZaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/Ilembe_complete_genomeZaire_ebolavirus_isolate_H.sapiens-wt/GIN/2014/Gueckedou-C05_complete_genomeZaire_ebolavirus_strain_Zaire_1995_complete_genome
Bundibugyo_ebolavirus,_complete_genomeBundibugyo_ebolavirus_isolate_EboBund-14_2012,_complete_genomeCote_d'Ivoire_ebolavirus,_complete_genomeReston_ebolavirus_isolate_RESTV/M.fascicularis-tc/PHL-USA/1996/Ferlite,_Philippines/Alice,_TX,_complete_genomeReston_ebolavirus_isolate_RESTV/Sus-wt/PHL/2009/09A_Farm_A,_complete_genomeReston_ebolavirus_-_Reston_strain_Reston08-C,_complete_genomeReston_ebolavirus_-_Reston_strain_Reston08-E,_complete_genomeReston_Ebola_virus_strain_Pennsylvania,_complete_genomeSudan_ebolavirus_isolate_EboSud-609_2012,_complete_genomeSudan_ebolavirus_isolate_EboSud-639,_complete_genomeSudan_ebolavirus_isolate_EBOV-S-2004_from_Sudan,_complete_genomeSudan_ebolavirus_-_Nakisamata,_complete_genomeSudan_ebolavirus_strain_Gulu,_complete_genomeZaire_ebolavirus_isolate_EBOV/H.sapiens-tc/COD/1977/Bonduni,_complete_genomeZaire_ebolavirus_isolate_EBOV/H.sapiens-tc/COD/2007/43_Luebo,_complete_genomeZaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/1Eko,_complete_genomeZaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/2Nza,_complete_genomeZaire_ebolavirus_isolate_EBOV/H.sapiens-tc/GAB/1996/Ilembe,_complete_genomeZaire_ebolavirus_isolate_H.sapiens-wt/GIN/2014/Gueckedou-C05,_complete_genomeZaire_ebolavirus_strain_Zaire_1995,_complete_genome
Binary file modified test/input/TestLastalDbBuild/expected/TestLastalDbBuild.sds
Binary file not shown.
40 changes: 20 additions & 20 deletions test/input/ebola.fasta

Large diffs are not rendered by default.

Binary file modified test/input/ebola.fasta.gz
Binary file not shown.
Binary file modified test/input/ebola.fasta.lz4
Binary file not shown.
7 changes: 4 additions & 3 deletions tools/picard.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import util.misc

TOOL_NAME = "picard"
TOOL_VERSION = '2.20.3'
TOOL_VERSION = '2.20.5'
TOOL_URL = 'https://github.com/broadinstitute/picard/releases/download/' \
+ '{ver}/picard-tools-{ver}.zip'.format(ver=TOOL_VERSION)
# Note: /seq/software/picard/{versionnumber}/ does not correspond with github release numbers!
Expand Down Expand Up @@ -408,8 +408,9 @@ def execute(
os.unlink(outDict)
else:
return
opts = ['REFERENCE=' + inFasta, 'OUTPUT=' + outDict]
PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory)
with util.file.fastas_with_sanitized_ids(inFasta, use_tmp=False) as sanitized_fastas:
opts = ['REFERENCE=' + sanitized_fastas[0], 'OUTPUT=' + outDict]
PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory)


class BuildBamIndexTool(PicardTools):
Expand Down
62 changes: 62 additions & 0 deletions util/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO import FastaIO

# imports needed for download_file() and webfile_readlines()
import re
Expand Down Expand Up @@ -625,6 +626,67 @@ def max_path_length(file_system_path):
"""Return the maximum valid length of a path on the given filesystem."""
return _get_pathconf(file_system_path, '_PATH_MAX', 255)-1

# See character set restrictions in the SAM/BAM RNAME spec:
#   https://samtools.github.io/hts-specs/SAMv1.pdf
#   [0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*
# The spec allows '*' and '=' after the first character; here we are being
# conservative and treat them as disallowed everywhere:
#   [^0-9A-Za-z!#$%&+./:;?@^_|~-]
# Compiled once at import time rather than on every call.
_SAM_RNAME_DISALLOWED_CHAR_RE = re.compile(r'[^0-9A-Za-z!#$%&+./:;?@^_|~-]')
_UNDERSCORE_RUN_RE = re.compile(r'_{2,}')


def sanitize_id_for_sam_rname(string_in):
    """Return `string_in` rewritten to be usable as a SAM/BAM RNAME.

    Every character outside the conservative RNAME character set is replaced
    with an underscore, runs of two or more underscores (including runs that
    were already present in the input) are condensed to a single underscore,
    and an empty result is replaced by "_" so the name is never empty.

    Args:
        string_in: the sequence ID to sanitize.

    Returns:
        The sanitized ID as a non-empty string.
    """
    # Local import: this function only needs logging for the diagnostic below.
    import logging

    string_value = _SAM_RNAME_DISALLOWED_CHAR_RE.sub("_", string_in)

    # condense runs of underscores
    string_value = _UNDERSCORE_RUN_RE.sub("_", string_value)

    # ensure all the character removals did not make the name empty
    string_value = string_value or '_'

    # Report rewrites through logging (debug level) instead of printing to
    # stdout, so sanitizing a large FASTA does not flood or corrupt stdout.
    if string_value != string_in:
        logging.getLogger(__name__).debug(
            "sanitizing: %s ====> %s", string_in, string_value
        )
    return string_value

def write_fasta_with_sanitized_ids(fasta_in, out_filepath):
    """Write a copy of a FASTA file in which every record ID is sanitized.

    Each record parsed from `fasta_in` has its `id` replaced by the result of
    sanitize_id_for_sam_rname() and is then written to `out_filepath`, which
    is opened for writing (an existing file at that path is overwritten).

    Args:
        fasta_in: FASTA input handed to Bio.SeqIO.parse.
        out_filepath: path of the FASTA file to write.

    Returns:
        `out_filepath`, unchanged.
    """
    with open(out_filepath, "w") as out_handle:
        # wrap=None: presumably disables sequence line wrapping in
        # Biopython's FastaWriter -- confirm against the Biopython docs.
        writer = FastaIO.FastaWriter(out_handle, wrap=None)
        writer.write_header()
        for seq_record in SeqIO.parse(fasta_in, "fasta"):
            # Only the ID is rewritten; the rest of the record is untouched.
            seq_record.id = sanitize_id_for_sam_rname(seq_record.id)
            writer.write_record(seq_record)
    return out_filepath

def _sanitized_companion_path(fasta_in):
    """Return the path of the sanitized companion file written beside `fasta_in`."""
    out_basedir = os.path.realpath(os.path.dirname(fasta_in))
    # The final extension is stripped before the suffix checks below, so the
    # '.fa'/'.fasta' branches only match doubly-suffixed names
    # (e.g. "ref.fasta.gz" -> "ref.sanitized_ids.fasta"); a plain "ref.fa"
    # or "ref.fasta" becomes "ref.sanitized_ids.fasta".
    new_basename = os.path.splitext(os.path.basename(fasta_in))[0]
    lowered = new_basename.lower()
    if lowered.endswith('.fa'):
        new_basename = new_basename[:-3] + '.sanitized_ids.fa'
    elif lowered.endswith('.fasta'):
        new_basename = new_basename[:-6] + '.sanitized_ids.fasta'
    else:
        new_basename = new_basename + '.sanitized_ids.fasta'
    return os.path.join(out_basedir, new_basename)


@contextlib.contextmanager
def fastas_with_sanitized_ids(input_fasta_paths, use_tmp=False):
    """ Context manager yielding a list of file paths for fasta files with
        sanitized IDs
        ( Suitable for Picard; see: https://github.com/samtools/hts-specs/pull/333 )

        input_fasta_paths is either a single fasta file path or a list/tuple
          of fasta file paths; the yielded list has one entry per input,
          in the same order.
        if use_tmp==False, companion fasta files with ".sanitized_ids" inserted
          before the fasta extension are created in the same location as the
          input (resolved with os.path.realpath). These files are not removed
          when the context exits.
        if use_tmp==True, files obtained from tempfnames() are written instead,
          named with the suffix "<input basename>.sanitized_id.fasta"
          (presumably cleaned up by tempfnames() on exit -- confirm).
    """
    # Accept a bare path as well as a list of paths: existing callers pass a
    # single path, while the documented contract is a list.
    if isinstance(input_fasta_paths, (list, tuple)):
        fasta_paths = list(input_fasta_paths)
    else:
        fasta_paths = [input_fasta_paths]

    if use_tmp:
        temp_suffixes = [
            "{inf_name}.sanitized_id.fasta".format(inf_name=os.path.basename(inf_path))
            for inf_path in fasta_paths
        ]
        with tempfnames(temp_suffixes) as temp_fasta_paths:
            # Yield inside the `with` so the temp files exist for the caller.
            yield [
                write_fasta_with_sanitized_ids(fasta_in, out_filepath)
                for fasta_in, out_filepath in zip(fasta_paths, temp_fasta_paths)
            ]
    else:
        yield [
            write_fasta_with_sanitized_ids(fasta_in, _sanitized_companion_path(fasta_in))
            for fasta_in in fasta_paths
        ]

def string_to_file_name(string_value, file_system_path=None, length_margin=0):
"""Constructs a valid file name from a given string, replacing or deleting invalid characters.
If `file_system_path` is given, makes sure the file name is valid on that file system.
Expand Down

0 comments on commit 42d420d

Please sign in to comment.