Skip to content

Commit

Permalink
#48: Readability suggested changes. Altered split read count to have split reads and forced split reads distinct without overlap
Browse files Browse the repository at this point in the history
  • Loading branch information
creisle committed Feb 8, 2018
1 parent 629005c commit e40fdbf
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 74 deletions.
2 changes: 1 addition & 1 deletion mavis/annotate/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
'min_domain_mapping_match', 0.9, cast_type=float_fraction,
defn='a number between 0 and 1 representing the minimum percent match a domain must map to the fusion transcript '
'to be displayed')
DEFAULTS.add('min_orf_size', 300, defn='the minimum length (in amino acids) to retain a putative open reading frame (ORF)')
DEFAULTS.add('min_orf_size', 300, defn='the minimum length (in base pairs) to retain a putative open reading frame (ORF)')
DEFAULTS.add('max_orf_cap', 3, defn='the maximum number of ORFs to return (best putative ORFs will be retained)')
DEFAULTS.add(
'annotation_filters', 'choose_more_annotated,choose_transcripts_by_priority',
Expand Down
22 changes: 12 additions & 10 deletions mavis/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,27 +302,29 @@ def parse_overlay_args(parser, required, optional):
parser.error('argument --marker: start and end must be integers: {}'.format(marker))

defaults = [None, None, 1, None, True]
bam_file, bin_size, ymax, stranded = range(1, 5)

for plot in args.read_depth_plots:
for i, d in enumerate(defaults):
if i >= len(plot):
plot.append(d)
if not os.path.exists(plot[1]):
parser.error('argument --read_depth_plots: the bam file given does not exist: {}'.format(plot[1]))
if not os.path.exists(plot[bam_file]):
parser.error('argument --read_depth_plots: the bam file given does not exist: {}'.format(plot[bam_file]))
try:
plot[2] = int(plot[2])
plot[bin_size] = int(plot[bin_size])
except ValueError:
parser.error('argument --read_depth_plots: bin size must be an integer: {}'.format(plot[2]))
parser.error('argument --read_depth_plots: bin size must be an integer: {}'.format(plot[bin_size]))
try:
if str(plot[3]).lower() in ['null', 'none']:
plot[3] = None
if str(plot[ymax]).lower() in ['null', 'none']:
plot[ymax] = None
else:
plot[3] = int(plot[3])
plot[ymax] = int(plot[ymax])
except ValueError:
parser.error('argument --read_depth_plots: ymax must be an integer: {}'.format(plot[3]))
parser.error('argument --read_depth_plots: ymax must be an integer: {}'.format(plot[ymax]))
try:
plot[4] = tab.cast_boolean(plot[4])
plot[stranded] = tab.cast_boolean(plot[stranded])
except TypeError:
parser.error('argument --read_depth_plots: stranded must be an boolean: {}'.format(plot[4]))
parser.error('argument --read_depth_plots: stranded must be an boolean: {}'.format(plot[stranded]))
return args


Expand Down
21 changes: 8 additions & 13 deletions mavis/summary/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,20 +251,15 @@ def filter_by_evidence(
removed.append(bpp)
continue
elif bpp.call_method == CALL_METHOD.SPLIT:
linking_split_reads = bpp.linking_split_reads
if bpp.event_type == SVTYPE.INS:
linking_split_reads += bpp.flanking_pairs
if any([
bpp.break1_split_reads < filter_min_split_reads,
bpp.break2_split_reads < filter_min_split_reads,
all([
bpp.event_type != SVTYPE.INS,
bpp.linking_split_reads < filter_min_linking_split_reads
]),
all([
bpp.event_type == SVTYPE.INS,
bpp.flanking_pairs < filter_min_linking_split_reads,
bpp.break2_split_reads_forced + bpp.break1_split_reads_forced < filter_min_linking_split_reads
]),
bpp.break1_split_reads - bpp.break1_split_reads_forced < 1,
bpp.break2_split_reads - bpp.break2_split_reads_forced < 1
bpp.break1_split_reads + bpp.break1_split_reads_forced < filter_min_split_reads,
bpp.break2_split_reads + bpp.break2_split_reads_forced < filter_min_split_reads,
linking_split_reads < filter_min_linking_split_reads,
bpp.break1_split_reads < 1,
bpp.break2_split_reads < 1
]):
removed.append(bpp)
continue
Expand Down
96 changes: 48 additions & 48 deletions mavis/validate/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,30 +249,44 @@ def flanking_metrics(self):
stdev = math.sqrt(err)
return median, stdev

def break1_tgt_align_split_read_names(self):
def break1_split_read_names(self, tgt=False, both=False):
    """Collect the query names of split reads supporting the first breakpoint.

    Args:
        tgt (bool): return only target re-aligned read names
        both (bool): return both original alignments and target-realigned

    Returns:
        set: query names of the selected split reads
    """
    names = set()
    for read in self.break1_split_reads:
        # a read counts as target re-aligned (forced) when the flag tag is present and truthy
        forced = read.has_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT) and read.get_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT)
        if both or (forced if tgt else not forced):
            names.add(read.query_name)
    return names

def break2_tgt_align_split_read_names(self):
def break2_split_read_names(self, tgt=False, both=False):
    """Collect the query names of split reads supporting the second breakpoint.

    Args:
        tgt (bool): return only target re-aligned read names
        both (bool): return both original alignments and target-realigned

    Returns:
        set: query names of the selected split reads
    """
    names = set()
    for read in self.break2_split_reads:
        # a read counts as target re-aligned (forced) when the flag tag is present and truthy
        forced = read.has_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT) and read.get_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT)
        if both or (forced if tgt else not forced):
            names.add(read.query_name)
    return names

def linking_split_read_names(self):
    """Return the query names of split reads that support both breakpoints.

    A read links the event when its name appears among the split reads
    (original or target re-aligned) of break1 AND break2, so the result is
    the set intersection of the two full name sets.

    Returns:
        set: query names common to both breakpoints' split reads
    """
    # NOTE: the scraped diff contained the superseded manual loop body
    # followed by an unreachable duplicate return; only the reachable,
    # equivalent intersection is kept here.
    return self.break1_split_read_names(both=True) & self.break2_split_read_names(both=True)

@staticmethod
def characterize_repeat_region(event, reference_genome):
Expand Down Expand Up @@ -353,33 +367,15 @@ def flatten(self):
COLUMNS.flanking_pairs_read_names: ';'.join(sorted(list(flank)))
})

b1 = set()
b1_tgt = set()
b2 = set()
b2_tgt = set()

for read in self.break1_split_reads:
name = read.query_name
b1.add(name)
if read.has_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT) and read.get_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT):
b1_tgt.add(name)
for read in self.break2_split_reads:
name = read.query_name
b2.add(name)
if read.has_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT) and read.get_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT):
b2_tgt.add(name)

linking = b1 & b2

row.update({
COLUMNS.break1_split_reads: len(b1),
COLUMNS.break1_split_reads_forced: len(b1_tgt),
COLUMNS.break1_split_read_names: ';'.join(sorted(b1)),
COLUMNS.break2_split_reads: len(b2),
COLUMNS.break2_split_reads_forced: len(b2_tgt),
COLUMNS.break2_split_read_names: ';'.join(sorted(b2)),
COLUMNS.linking_split_reads: len(linking),
COLUMNS.linking_split_read_names: ';'.join(sorted(linking)),
COLUMNS.break1_split_reads: len(self.break1_split_read_names()),
COLUMNS.break1_split_reads_forced: len(self.break1_split_read_names(tgt=True)),
COLUMNS.break1_split_read_names: ';'.join(sorted(self.break1_split_read_names(both=True))),
COLUMNS.break2_split_reads: len(self.break2_split_read_names()),
COLUMNS.break2_split_reads_forced: len(self.break2_split_read_names(tgt=True)),
COLUMNS.break2_split_read_names: ';'.join(sorted(self.break2_split_read_names(both=True))),
COLUMNS.linking_split_reads: len(self.linking_split_read_names()),
COLUMNS.linking_split_read_names: ';'.join(sorted(self.linking_split_read_names())),
COLUMNS.spanning_reads: len(self.spanning_reads),
COLUMNS.spanning_read_names: ';'.join(sorted([r.query_name for r in self.spanning_reads]))
})
Expand Down Expand Up @@ -838,15 +834,15 @@ def _call_by_supporting_reads(evidence, event_type, consumed_evidence=None):
continue

# check if any of the aligned reads are 'double' aligned
double_aligned = {}
double_aligned = dict()
for read in pos1[first] | pos2[second]:
seq_key = tuple(sorted([
read.query_name, read.query_sequence, reverse_complement(read.query_sequence)
])) # seq and revseq are equal
double_aligned.setdefault(seq_key, []).append(read)

# now create calls using the double aligned split read pairs if possible (to resolve untemplated sequence)
resolved_calls = {}
resolved_calls = dict()
event_types = {event_type}
if event_type in {SVTYPE.DUP, SVTYPE.INS}:
event_types.update({SVTYPE.DUP, SVTYPE.INS})
Expand Down Expand Up @@ -893,17 +889,21 @@ def _call_by_supporting_reads(evidence, event_type, consumed_evidence=None):
call.add_break1_split_read(read)
for read in uncons_break2_reads - consumed_evidence:
call.add_break2_split_read(read)
linking_reads = len(call.linking_split_read_names())
if call.event_type == SVTYPE.INS: # may not expect linking split reads for insertions
linking_reads += len(call.flanking_pairs)
# does it pass the requirements?
if any([
len(call.break1_split_reads) < evidence.min_splits_reads_resolution,
len(call.break2_split_reads) < evidence.min_splits_reads_resolution,
len(call.linking_split_read_names()) < evidence.min_linking_split_reads
if not any([
len(call.break1_split_read_names(both=True)) < evidence.min_splits_reads_resolution,
len(call.break2_split_read_names(both=True)) < evidence.min_splits_reads_resolution,
len(call.break1_split_read_names()) < 1,
len(call.break2_split_read_names()) < 1,
linking_reads < evidence.min_linking_split_reads,
]):
continue
linked_pairings.append(call)
# consume the evidence
consumed_evidence.update(call.break1_split_reads)
consumed_evidence.update(call.break2_split_reads)
linked_pairings.append(call)
# consume the evidence
consumed_evidence.update(call.break1_split_reads)
consumed_evidence.update(call.break2_split_reads)
except ValueError: # incompatible types
continue

Expand Down
4 changes: 2 additions & 2 deletions mavis/validate/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,8 +232,8 @@ def main(
', flanking pairs: {}{}'.format(
0 if not call.contig else len(call.contig.input_reads),
len(call.spanning_reads),
len(call.break1_split_reads), len(call.break1_tgt_align_split_read_names()),
len(call.break2_split_reads), len(call.break2_tgt_align_split_read_names()),
len(call.break1_split_read_names()), len(call.break1_split_read_names(tgt=True)),
len(call.break2_split_read_names()), len(call.break2_split_read_names(tgt=True)),
len(call.linking_split_read_names()),
len(call.flanking_pairs),
'' if not call.has_compatible else '(' + str(len(call.compatible_flanking_pairs)) + ')'
Expand Down

0 comments on commit e40fdbf

Please sign in to comment.