diff --git a/mavis/annotate/constants.py b/mavis/annotate/constants.py
index 994b9a37..88247bbe 100644
--- a/mavis/annotate/constants.py
+++ b/mavis/annotate/constants.py
@@ -17,7 +17,7 @@
     'min_domain_mapping_match', 0.9, cast_type=float_fraction,
     defn='a number between 0 and 1 representing the minimum percent match a domain must map to the fusion transcript '
     'to be displayed')
-DEFAULTS.add('min_orf_size', 300, defn='the minimum length (in amino acids) to retain a putative open reading frame (ORF)')
+DEFAULTS.add('min_orf_size', 300, defn='the minimum length (in base pairs) to retain a putative open reading frame (ORF)')
 DEFAULTS.add('max_orf_cap', 3, defn='the maximum number of ORFs to return (best putative ORFs will be retained)')
 DEFAULTS.add(
     'annotation_filters', 'choose_more_annotated,choose_transcripts_by_priority',
diff --git a/mavis/main.py b/mavis/main.py
index d7cc3e77..13bed8f7 100644
--- a/mavis/main.py
+++ b/mavis/main.py
@@ -302,27 +302,29 @@ def parse_overlay_args(parser, required, optional):
             parser.error('argument --marker: start and end must be integers: {}'.format(marker))

     defaults = [None, None, 1, None, True]
+    bam_file, bin_size, ymax, stranded = range(1, 5)
+
     for plot in args.read_depth_plots:
         for i, d in enumerate(defaults):
             if i >= len(plot):
                 plot.append(d)
-        if not os.path.exists(plot[1]):
-            parser.error('argument --read_depth_plots: the bam file given does not exist: {}'.format(plot[1]))
+        if not os.path.exists(plot[bam_file]):
+            parser.error('argument --read_depth_plots: the bam file given does not exist: {}'.format(plot[bam_file]))
         try:
-            plot[2] = int(plot[2])
+            plot[bin_size] = int(plot[bin_size])
         except ValueError:
-            parser.error('argument --read_depth_plots: bin size must be an integer: {}'.format(plot[2]))
+            parser.error('argument --read_depth_plots: bin size must be an integer: {}'.format(plot[bin_size]))
         try:
-            if str(plot[3]).lower() in ['null', 'none']:
-                plot[3] = None
+            if str(plot[ymax]).lower() in ['null', 'none']:
+                plot[ymax] = None
             else:
-                plot[3] = int(plot[3])
+                plot[ymax] = int(plot[ymax])
         except ValueError:
-            parser.error('argument --read_depth_plots: ymax must be an integer: {}'.format(plot[3]))
+            parser.error('argument --read_depth_plots: ymax must be an integer: {}'.format(plot[ymax]))
         try:
-            plot[4] = tab.cast_boolean(plot[4])
+            plot[stranded] = tab.cast_boolean(plot[stranded])
         except TypeError:
-            parser.error('argument --read_depth_plots: stranded must be an boolean: {}'.format(plot[4]))
+            parser.error('argument --read_depth_plots: stranded must be an boolean: {}'.format(plot[stranded]))

     return args
diff --git a/mavis/summary/summary.py b/mavis/summary/summary.py
index de7635c7..2c6898a5 100644
--- a/mavis/summary/summary.py
+++ b/mavis/summary/summary.py
@@ -251,20 +251,15 @@ def filter_by_evidence(
                 removed.append(bpp)
                 continue
         elif bpp.call_method == CALL_METHOD.SPLIT:
+            linking_split_reads = bpp.linking_split_reads
+            if bpp.event_type == SVTYPE.INS:
+                linking_split_reads += bpp.flanking_pairs
             if any([
-                bpp.break1_split_reads < filter_min_split_reads,
-                bpp.break2_split_reads < filter_min_split_reads,
-                all([
-                    bpp.event_type != SVTYPE.INS,
-                    bpp.linking_split_reads < filter_min_linking_split_reads
-                ]),
-                all([
-                    bpp.event_type == SVTYPE.INS,
-                    bpp.flanking_pairs < filter_min_linking_split_reads,
-                    bpp.break2_split_reads_forced + bpp.break1_split_reads_forced < filter_min_linking_split_reads
-                ]),
-                bpp.break1_split_reads - bpp.break1_split_reads_forced < 1,
-                bpp.break2_split_reads - bpp.break2_split_reads_forced < 1
+                bpp.break1_split_reads + bpp.break1_split_reads_forced < filter_min_split_reads,
+                bpp.break2_split_reads + bpp.break2_split_reads_forced < filter_min_split_reads,
+                linking_split_reads < filter_min_linking_split_reads,
+                bpp.break1_split_reads < 1,
+                bpp.break2_split_reads < 1
             ]):
                 removed.append(bpp)
                 continue
diff --git a/mavis/validate/call.py b/mavis/validate/call.py
index 497d6c8b..f796b112 100644
--- a/mavis/validate/call.py
+++ b/mavis/validate/call.py
@@ -249,30 +249,44 @@ def flanking_metrics(self):
             stdev = math.sqrt(err)
         return median, stdev

-    def break1_tgt_align_split_read_names(self):
+    def break1_split_read_names(self, tgt=False, both=False):
+        """
+        Args:
+            tgt (bool): return only target re-aligned read names
+            both (bool): return both original alignments and target-realigned
+        """
         reads = set()
         for read in self.break1_split_reads:
             if read.has_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT) and read.get_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT):
+                if tgt:
+                    reads.add(read.query_name)
+            elif not tgt:
+                reads.add(read.query_name)
+
+            if both:
                 reads.add(read.query_name)
         return reads

-    def break2_tgt_align_split_read_names(self):
+    def break2_split_read_names(self, tgt=False, both=False):
+        """
+        Args:
+            tgt (bool): return only target re-aligned read names
+            both (bool): return both original alignments and target-realigned
+        """
         reads = set()
         for read in self.break2_split_reads:
             if read.has_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT) and read.get_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT):
+                if tgt:
+                    reads.add(read.query_name)
+            elif not tgt:
+                reads.add(read.query_name)
+
+            if both:
                 reads.add(read.query_name)
         return reads

     def linking_split_read_names(self):
-        reads1 = set()
-        for read in self.break1_split_reads:
-            reads1.add(read.query_name)
-
-        reads2 = set()
-        for read in self.break2_split_reads:
-            reads2.add(read.query_name)
-
-        return reads1 & reads2
+        return self.break1_split_read_names(both=True) & self.break2_split_read_names(both=True)

     @staticmethod
     def characterize_repeat_region(event, reference_genome):
@@ -353,33 +367,15 @@ def flatten(self):
             COLUMNS.flanking_pairs_read_names: ';'.join(sorted(list(flank)))
         })

-        b1 = set()
-        b1_tgt = set()
-        b2 = set()
-        b2_tgt = set()
-
-        for read in self.break1_split_reads:
-            name = read.query_name
-            b1.add(name)
-            if read.has_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT) and read.get_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT):
-                b1_tgt.add(name)
-        for read in self.break2_split_reads:
-            name = read.query_name
-            b2.add(name)
-            if read.has_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT) and read.get_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT):
-                b2_tgt.add(name)
-
-        linking = b1 & b2
-
         row.update({
-            COLUMNS.break1_split_reads: len(b1),
-            COLUMNS.break1_split_reads_forced: len(b1_tgt),
-            COLUMNS.break1_split_read_names: ';'.join(sorted(b1)),
-            COLUMNS.break2_split_reads: len(b2),
-            COLUMNS.break2_split_reads_forced: len(b2_tgt),
-            COLUMNS.break2_split_read_names: ';'.join(sorted(b2)),
-            COLUMNS.linking_split_reads: len(linking),
-            COLUMNS.linking_split_read_names: ';'.join(sorted(linking)),
+            COLUMNS.break1_split_reads: len(self.break1_split_read_names()),
+            COLUMNS.break1_split_reads_forced: len(self.break1_split_read_names(tgt=True)),
+            COLUMNS.break1_split_read_names: ';'.join(sorted(self.break1_split_read_names(both=True))),
+            COLUMNS.break2_split_reads: len(self.break2_split_read_names()),
+            COLUMNS.break2_split_reads_forced: len(self.break2_split_read_names(tgt=True)),
+            COLUMNS.break2_split_read_names: ';'.join(sorted(self.break2_split_read_names(both=True))),
+            COLUMNS.linking_split_reads: len(self.linking_split_read_names()),
+            COLUMNS.linking_split_read_names: ';'.join(sorted(self.linking_split_read_names())),
             COLUMNS.spanning_reads: len(self.spanning_reads),
             COLUMNS.spanning_read_names: ';'.join(sorted([r.query_name for r in self.spanning_reads]))
         })
@@ -838,7 +834,7 @@ def _call_by_supporting_reads(evidence, event_type, consumed_evidence=None):
                 continue

            # check if any of the aligned reads are 'double' aligned
-            double_aligned = {}
+            double_aligned = dict()
            for read in pos1[first] | pos2[second]:
                seq_key = tuple(sorted([
                    read.query_name, read.query_sequence, reverse_complement(read.query_sequence)
@@ -846,7 +842,7 @@ def _call_by_supporting_reads(evidence, event_type, consumed_evidence=None):
                ]))
                double_aligned.setdefault(seq_key, []).append(read)
            # now create calls using the double aligned split read pairs if possible (to resolve untemplated sequence)
-            resolved_calls = {}
+            resolved_calls = dict()
            event_types = {event_type}
            if event_type in {SVTYPE.DUP, SVTYPE.INS}:
                event_types.update({SVTYPE.DUP, SVTYPE.INS})
@@ -893,17 +889,21 @@ def _call_by_supporting_reads(evidence, event_type, consumed_evidence=None):
                call.add_break1_split_read(read)
            for read in uncons_break2_reads - consumed_evidence:
                call.add_break2_split_read(read)

+            linking_reads = len(call.linking_split_read_names())
+            if call.event_type == SVTYPE.INS:  # may not expect linking split reads for insertions
+                linking_reads += len(call.flanking_pairs)
            # does it pass the requirements?
-            if any([
-                len(call.break1_split_reads) < evidence.min_splits_reads_resolution,
-                len(call.break2_split_reads) < evidence.min_splits_reads_resolution,
-                len(call.linking_split_read_names()) < evidence.min_linking_split_reads
+            if not any([
+                len(call.break1_split_read_names(both=True)) < evidence.min_splits_reads_resolution,
+                len(call.break2_split_read_names(both=True)) < evidence.min_splits_reads_resolution,
+                len(call.break1_split_read_names()) < 1,
+                len(call.break2_split_read_names()) < 1,
+                linking_reads < evidence.min_linking_split_reads,
            ]):
-                continue
-            linked_pairings.append(call)
-            # consume the evidence
-            consumed_evidence.update(call.break1_split_reads)
-            consumed_evidence.update(call.break2_split_reads)
+                linked_pairings.append(call)
+                # consume the evidence
+                consumed_evidence.update(call.break1_split_reads)
+                consumed_evidence.update(call.break2_split_reads)
        except ValueError:  # incompatible types
            continue
diff --git a/mavis/validate/main.py b/mavis/validate/main.py
index 24e76f54..7f3a3208 100644
--- a/mavis/validate/main.py
+++ b/mavis/validate/main.py
@@ -232,8 +232,8 @@ def main(
                    ', flanking pairs: {}{}'.format(
                        0 if not call.contig else len(call.contig.input_reads),
                        len(call.spanning_reads),
-                        len(call.break1_split_reads), len(call.break1_tgt_align_split_read_names()),
-                        len(call.break2_split_reads), len(call.break2_tgt_align_split_read_names()),
+                        len(call.break1_split_read_names()), len(call.break1_split_read_names(tgt=True)),
+                        len(call.break2_split_read_names()), len(call.break2_split_read_names(tgt=True)),
                        len(call.linking_split_read_names()), len(call.flanking_pairs),
                        '' if not call.has_compatible else '(' + str(len(call.compatible_flanking_pairs)) + ')'