Merge pull request #181 from bcgsc/develop
Release 2.2.2 scope review
calchoo authored Apr 5, 2019
2 parents 2d11a9f + de96c7a commit 7821a56
Showing 8 changed files with 456 additions and 20 deletions.
16 changes: 8 additions & 8 deletions mavis/assemble.py
@@ -93,7 +93,7 @@ def trim_tails_by_freq(self, min_weight):
Args:
min_weight (int): the minimum weight for an edge to be retained
"""
- ends = {n for n in self.nodes() if self.out_degree(n) == 0 or self.in_degree(n) == 0}
+ ends = sorted([n for n in self.nodes() if self.out_degree(n) == 0 or self.in_degree(n) == 0])
visited = set()

while ends:
@@ -107,9 +107,9 @@ def trim_tails_by_freq(self, min_weight):
if data['freq'] < min_weight:
self.remove_edge(src, tgt)
if src not in visited:
- ends.add(src)
+ ends.append(src)
if tgt not in visited:
- ends.add(tgt)
+ ends.append(tgt)

# remove any resulting singlets
for node in visited:
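Note: this hunk, like several below, replaces set iteration with an explicitly ordered list so that tail trimming visits graph endpoints in a reproducible order. With CPython's string-hash randomization (PYTHONHASHSEED), iterating a set of k-mer strings can differ between runs. A minimal illustration, not MAVIS code:

```python
# Run twice with different PYTHONHASHSEED values: the raw set order can
# change between interpreter runs, while sorted() is always stable.
nodes = {'AGT', 'CCA', 'TTG', 'GAC'}
print(list(nodes))     # order may vary across runs
print(sorted(nodes))   # ['AGT', 'CCA', 'GAC', 'TTG'] every time
```

Since `ends` also changes from a set to a list, the `add` calls above become `append`, with the `visited` check guarding against duplicates.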
@@ -124,7 +124,7 @@ def trim_forks_by_freq(self, min_weight):
edges has freq < min_weight. then that outgoing edge is deleted
"""
nodes = [n for n in self.nodes() if self.degree(n) > 2]
- for node in nodes:
+ for node in sorted(nodes):
if self.out_degree(node) > 1:
outgoing_edges = self.out_edges(node, data=True)
best = max([e[2]['freq'] for e in outgoing_edges])
@@ -253,6 +253,7 @@ def pull_contigs_from_component(
"""
path_scores = {} # path_str => score_int
w = min_edge_trim_weight

unresolved_components = [component]

while unresolved_components:
@@ -278,8 +279,7 @@ def pull_contigs_from_component(
unresolved_components.extend(digraph_connected_components(assembly, component))
else:
for source, sink in itertools.product(assembly.get_sources(component), assembly.get_sinks(component)):
- paths = list(nx.all_simple_paths(assembly, source, sink))
- for path in paths:
+ for path in nx.all_simple_paths(assembly, source, sink):
s = path[0] + ''.join([p[-1] for p in path[1:]])
score = 0
for i in range(0, len(path) - 1):
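Note: the rewrite consumes nx.all_simple_paths lazily instead of materializing the full list first. The function returns a generator, and the number of simple paths can grow exponentially with graph size, so streaming avoids holding every path in memory at once. A self-contained sketch of the behaviour relied on:

```python
import networkx as nx

g = nx.DiGraph([('a', 'b'), ('b', 'd'), ('a', 'c'), ('c', 'd')])
# all_simple_paths returns a generator; wrapping it in list() would
# build every path up front, which the rewrite now avoids.
for path in nx.all_simple_paths(g, 'a', 'd'):
    print(path)   # ['a', 'b', 'd'] and ['a', 'c', 'd']
```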
@@ -389,11 +389,11 @@ def assemble(
assembly.trim_noncutting_paths_by_freq(min_edge_trim_weight)

path_scores = {}

for component in digraph_connected_components(assembly):

# pull the path scores
path_scores.update(pull_contigs_from_component(
- assembly, component,
+ assembly.subgraph(component), component,
min_edge_trim_weight=min_edge_trim_weight,
assembly_max_paths=assembly_max_paths,
log=log
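Note: passing assembly.subgraph(component) instead of the full assembly hands pull_contigs_from_component an induced-subgraph view, so source/sink detection and path enumeration cannot reach nodes outside the component. A small sketch of the networkx behaviour assumed here:

```python
import networkx as nx

g = nx.DiGraph([('a', 'b'), ('c', 'd')])   # two disconnected components
sub = g.subgraph(['a', 'b'])               # induced-subgraph view
print(list(sub.edges()))                   # [('a', 'b')] only
```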
2 changes: 1 addition & 1 deletion mavis/validate/base.py
@@ -764,7 +764,7 @@ def assemble_contig(self, log=DEVNULL):
rseq = reverse_complement(contig.seq)
if contig.seq not in filtered_contigs and rseq not in filtered_contigs:
filtered_contigs[contig.seq] = contig
- self.contigs = list(filtered_contigs.values())
+ self.contigs = sorted(list(filtered_contigs.values()), key=lambda x: (x.remap_score() * -1, x.seq))

def load_evidence(self, log=DEVNULL):
"""
16 changes: 8 additions & 8 deletions mavis/validate/call.py
@@ -159,6 +159,7 @@ def add_flanking_support(self, flanking_pairs, is_compatible=False):
self.source_evidence.min_expected_fragment_size + Interval.dist(self.break1, self.break2),
self.source_evidence.max_expected_fragment_size])
max_frag = len(self.break1 | self.break2) + self.source_evidence.max_expected_fragment_size

for read, mate in flanking_pairs:
# check that the fragment size is reasonable
fragment_size = self.source_evidence.compute_fragment_size(read, mate)
@@ -704,7 +705,7 @@ def _compute_coverage_intervals(pairs):
cover2 = Interval(min(second_positions), max(second_positions))
return cover1, cover2

- for read, mate in available_flanking_pairs:
+ for read, mate in sorted(available_flanking_pairs, key=lambda r: (r[0].key(), r[1].key())):
# check that the fragment size is reasonable
fragment_size = evidence.compute_fragment_size(read, mate)
if event_type == SVTYPE.DEL:
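Note: the sorted() added above pins the order in which flanking pairs are processed using each read's stable key, so any downstream tie-breaking is reproducible. A self-contained sketch with a hypothetical Read class standing in for pysam reads:

```python
# Hypothetical Read class: sorting pairs by a stable per-read key removes
# any dependence on set/hash iteration order.
class Read:
    def __init__(self, name, start):
        self.name, self.start = name, start

    def key(self):
        return (self.name, self.start)

pairs = [(Read('r2', 10), Read('r2', 90)), (Read('r1', 5), Read('r1', 80))]
for read, mate in sorted(pairs, key=lambda p: (p[0].key(), p[1].key())):
    print(read.name, mate.start)   # r1 80, then r2 90, on every run
```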
@@ -737,8 +738,7 @@ def _compute_coverage_intervals(pairs):
# remove the farthest outlier from the pairs wrt fragment size (most likely to belong to a different event)
average = Interval(
sum([f.start for f in fragments]) / len(fragments),
- sum([f.end for f in fragments]) / len(fragments)
- )
+ sum([f.end for f in fragments]) / len(fragments))
farthest = max(fragments, key=lambda f: abs(Interval.dist(f, average)))
fragments = [f for f in fragments if f != farthest]
selected_flanking_pairs = [(r, m) for r, m in selected_flanking_pairs if evidence.compute_fragment_size(r, m) != farthest]
@@ -827,7 +827,7 @@ def _call_by_split_reads(evidence, event_type, consumed_evidence=None):

linked_pairings = []
# now pair up the breakpoints with their putative partners
- for first, second in itertools.product(pos1, pos2):
+ for first, second in itertools.product(sorted(pos1.keys()), sorted(pos2.keys())):
if evidence.break1.chr == evidence.break2.chr:
if first >= second:
continue
@@ -863,13 +863,13 @@ def _call_by_split_reads(evidence, event_type, consumed_evidence=None):
# now create calls using the double aligned split read pairs if possible (to resolve untemplated sequence)
resolved_calls = dict()
for reads in [d for d in double_aligned.values() if len(d) > 1]:
- for read1, read2 in itertools.combinations(reads, 2):
+ for read1, read2 in itertools.combinations(sorted(list(reads), key=lambda x: x.key()), 2):
try:
call = call_paired_read_event(read1, read2, is_stranded=evidence.bam_cache.stranded)
# check the type later, we want this to fail if wrong type
resolved_calls.setdefault(call, (set(), set()))
- resolved_calls[call][0].add(read1)
- resolved_calls[call][1].add(read2)
+ resolved_calls[call][0].add(call.read1)
+ resolved_calls[call][1].add(call.read2)
except AssertionError:
pass # will be thrown if the reads do not actually belong together
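Note: resolved_calls maps each candidate call to a pair of read sets, created on first sight via setdefault; the switch to call.read1/call.read2 stores the reads as held by the call object itself, which call_paired_read_event presumably may reorder relative to the loop variables. A sketch of the accumulation pattern only, with hypothetical keys:

```python
# One (set, set) pair per key, created lazily with setdefault.
resolved = {}
for key, item in [('c1', 'r1'), ('c1', 'r2'), ('c2', 'r1')]:
    resolved.setdefault(key, (set(), set()))
    resolved[key][0].add(item)
print(resolved)   # {'c1': ({'r1', 'r2'}, set()), 'c2': ({'r1'}, set())}
                  # (set repr order may vary)
```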

@@ -884,7 +884,6 @@ def _call_by_split_reads(evidence, event_type, consumed_evidence=None):

uncons_break1_reads = evidence.split_reads[0] - consumed_evidence
uncons_break2_reads = evidence.split_reads[1] - consumed_evidence

for call, (reads1, reads2) in sorted(
resolved_calls.items(),
key=lambda x: (len(x[1][0]) + len(x[1][1]), x[0]),
@@ -901,6 +900,7 @@ def _call_by_split_reads(evidence, event_type, consumed_evidence=None):
else:
call.break1_split_reads.update(reads1 - consumed_evidence)
call.break2_split_reads.update(reads2 - consumed_evidence)

call.add_flanking_support(available_flanking_pairs)
if call.has_compatible:
call.add_flanking_support(available_flanking_pairs, is_compatible=True)
2 changes: 0 additions & 2 deletions mavis/validate/main.py
@@ -2,7 +2,6 @@
import itertools
import os
import re
- import subprocess
import time
import warnings

@@ -14,7 +13,6 @@
from .evidence import GenomeEvidence, TranscriptomeEvidence
from ..align import align_sequences, select_contig_alignments, SUPPORTED_ALIGNER
from ..annotate.base import BioInterval
- from ..annotate import file_io as _file_io
from ..bam import cigar as _cigar
from ..bam.cache import BamCache
from ..breakpoint import BreakpointPair
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@
import re


- VERSION = '2.2.1'
+ VERSION = '2.2.2'


def parse_md_readme():
13 changes: 13 additions & 0 deletions tests/integration/__init__.py
@@ -69,6 +69,7 @@ def __init__(
is_read1=True,
is_paired=True,
is_unmapped=False,
+ is_supplementary=False,
mate_is_unmapped=False,
mapping_quality=NA_MAPPING_QUALITY,
**kwargs
@@ -121,6 +122,7 @@ def __init__(
self.is_paired = is_paired
self.is_unmapped = is_unmapped
self.mate_is_unmapped = mate_is_unmapped
+ self.is_supplementary = is_supplementary
if self.reference_start and self.reference_end:
if not cigar:
self.cigar = [(CIGAR.M, self.reference_end - self.reference_start)]
@@ -157,6 +159,17 @@ def __str__(self):
return '{}(ref_id={}, start={}, end={}, seq={})'.format(
self.__class__.__name__, self.reference_id, self.reference_start, self.reference_end, self.query_sequence)

+ def key(self):
+     """
+     Uses the stored _key attribute, if available. This avoids the hash changing when, for example,
+     the reference start is modified, while still allowing the key to be computed for non-SamRead objects.
+     To change the hash behaviour, the user must therefore be explicit and use the set_key method.
+     """
+     if hasattr(self, '_key') and self._key is not None:
+         return self._key
+     return (self.query_name, self.query_sequence, self.reference_id, self.reference_start, self.is_supplementary)
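Note: the key() added above follows a freeze-on-demand pattern: once _key is set, sorting and hashing are shielded from later attribute mutation; otherwise the key is derived from identifying fields (now including is_supplementary). A condensed, self-contained sketch of the pattern, with a hypothetical set_key mirroring the method the docstring references:

```python
class KeyedRead:
    def __init__(self, query_name, reference_start):
        self.query_name = query_name
        self.reference_start = reference_start
        self._key = None

    def set_key(self):
        # freeze the key so later mutation cannot change sort/hash order
        self._key = (self.query_name, self.reference_start)

    def key(self):
        if self._key is not None:
            return self._key
        return (self.query_name, self.reference_start)

read = KeyedRead('r1', 100)
read.set_key()
read.reference_start = 200   # mutate after freezing
print(read.key())            # ('r1', 100): the frozen key wins
```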


class MockBamFileHandle:
def __init__(self, chrom_to_tid={}):
(2 remaining changed files not shown)
