Skip to content

Commit

Permalink
Merge pull request #349 from bcgsc/Release/v3.1.1
Browse files Browse the repository at this point in the history
Release/v3.1.1
  • Loading branch information
calchoo authored Mar 21, 2023
2 parents dbbd9a3 + fb4b645 commit 0af520a
Show file tree
Hide file tree
Showing 72 changed files with 845 additions and 128 deletions.
5 changes: 5 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,8 @@ concurrency = multiprocessing
[html]
directory = coverage
title = mavis coverage report

[report]
exclude_lines =
pragma: no cover
if TYPE_CHECKING:
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:

jobs:
build:
runs-on: ubuntu-latest
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10"]
Expand Down
10 changes: 7 additions & 3 deletions .github/workflows/quick-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on: [push]

jobs:
build:
runs-on: ubuntu-latest
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10"]
Expand All @@ -26,12 +26,16 @@ jobs:
run: |
pip install flake8
# stop the build if there are Python syntax errors or undefined names
flake8 src/mavis --count --select=E9,F63,F7,F82 --show-source --statistics
flake8 src tests --count --show-source --statistics
- name: Lint with black
run: |
pip install black
# stop the build if black needs to be run
black src/mavis -S -l 100 --check
black src tests -S -l 100 --check
- name: Lint with isort
run: |
pip install isort
isort src tests --check
- name: install bwa
run: |
git clone https://github.com/lh3/bwa.git
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ junit
*eggs/
.mypy_cache
.snakemake
.venv*

# aligners
blat
Expand Down
12 changes: 12 additions & 0 deletions docs/outputs/columns.md
Original file line number Diff line number Diff line change
Expand Up @@ -610,3 +610,15 @@ non-specific events.
Flag to indicate if the
current event was a supplementary call, meaning a call that was
found as a result of validating another event.

## dgv

**type**: `str`

ID(s) of SVs from the DGV database matched to an SV call in the summary step

## known\_sv\_count

**type**: `int`

Number of known SVs matched to a call in the summary step
6 changes: 5 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = mavis
version = 3.1.0
version = 3.1.1
url = https://github.com/bcgsc/mavis.git
download_url = https://github.com/bcgsc/mavis/archive/v2.2.10.tar.gz
description = A Structural Variant Post-Processing Package
Expand All @@ -25,6 +25,9 @@ statistics = True
[flake8]
ignore = E501,W503,E203

[isort]
profile = black

[options]
packages = find:
package_dir =
Expand Down Expand Up @@ -71,6 +74,7 @@ test =
dev =
black
flake8
isort
twine
wheel
timeout-decorator>=0.3.3
Expand Down
82 changes: 73 additions & 9 deletions src/mavis/annotate/file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import json
import os
import re
from typing import Callable, Dict, List, Optional
import warnings
from typing import TYPE_CHECKING, Callable, Dict, List, Optional

import pandas as pd
from Bio import SeqIO
Expand All @@ -13,34 +14,37 @@
from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, STRAND, translate
from ..interval import Interval
from ..types import ReferenceAnnotations, ReferenceGenome
from ..util import logger
from ..util import logger, read_bpp_from_input_file
from .base import BioInterval, ReferenceName
from .genomic import Exon, Gene, PreTranscript, Template, Transcript
from .protein import Domain, Translation

if TYPE_CHECKING:
from ..breakpoint import BreakpointPair


def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]:
"""
reads a file of regions. The expect input format for the file is tab-delimited and
the header should contain the following columns
- chr: the chromosome
- start: start of the region, 1-based inclusive
- end: end of the region, 1-based inclusive
- name: the name/label of the region
For example:
.. code-block:: text
#chr start end name
chr20 25600000 27500000 centromere
Args:
filepath: path to the input tab-delimited file
Returns:
a dictionary keyed by chromosome name with values of lists of regions on the chromosome
"""
warnings.warn(
"BED file support will be deprecated in future versions.",
category=DeprecationWarning,
stacklevel=2,
)
regions: Dict[str, List[BioInterval]] = {}
for filepath in filepaths:
df = pd.read_csv(
Expand All @@ -58,6 +62,67 @@ def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]:
return regions


def load_known_sv(*filepaths: str) -> Dict[str, List["BreakpointPair"]]:
    """
    Load one or more standard MAVIS or BED files into a mapping of known SVs.

    A file whose header contains both ``break1_chromosome`` and
    ``break2_chromosome`` is treated as a standard MAVIS file and parsed with
    ``read_bpp_from_input_file``; any other file is treated as a BED file
    (BED support is deprecated, see the warning below).

    Standard BED file requirements: the expected input format for the file is
    tab-delimited and the header should contain the following columns

    - chr: the chromosome
    - start: start of the region, 1-based inclusive
    - end: end of the region, 1-based inclusive
    - name: the name/label of the region

    For example:

    .. code-block:: text

        #chr start end name
        chr20 25600000 27500000 centromere

    Args:
        filepaths: paths to standard MAVIS format and/or BED files

    Returns:
        a dictionary of lists of known SVs. NOTE(review): the annotated return
        type is narrower than what is built here — for MAVIS input the key is
        the ``(break1.chr, break2.chr)`` tuple and the values are
        ``BreakpointPair`` objects; for BED input the key is the region's
        reference object (a ``ReferenceName``) and the values are
        ``BioInterval`` objects.

    Raises:
        KeyError: if a BED input is missing one of the required columns
    """
    regions = {}
    for filepath in filepaths:
        # sniff only the first row to decide which of the two formats this is
        header = set(pd.read_csv(filepath, nrows=1, sep='\t').columns)
        mavis_header = {'break1_chromosome', 'break2_chromosome'}
        bed_header = {'chr', 'start', 'end', 'name'}
        if mavis_header.issubset(header):
            # standard MAVIS input: expand ambiguous orientation/SV-type
            # combinations into individual BreakpointPair records
            bpps = read_bpp_from_input_file(filepath, expand_orient=True, expand_svtype=True)
            for bpp in bpps:
                # key by the chromosome pair so both breakpoints can be matched
                chr_list = [bpp.break1.chr, bpp.break2.chr]
                regions.setdefault(tuple(chr_list), []).append(bpp)

        else:
            warnings.warn(
                "BED file support will be deprecated in future versions.",
                category=DeprecationWarning,
                stacklevel=2,
            )

            df = pd.read_csv(
                filepath, sep='\t', dtype={'chr': str, 'start': int, 'end': int, 'name': str}
            )
            # validate required BED columns before any row processing
            for col in bed_header:
                if col not in df:
                    raise KeyError(f'missing required column ({col})')
            # wrap chromosome names in ReferenceName — presumably normalizes
            # 'chr' prefix handling; confirm against ReferenceName semantics
            df['chr'] = df['chr'].apply(lambda c: ReferenceName(c))
            for row in df.to_dict('records'):
                known_sv_region = BioInterval(
                    reference_object=row['chr'],
                    start=row['start'],
                    end=row['end'],
                    name=row['name'],
                )
                # key by single chromosome (unlike the tuple key used above)
                regions.setdefault(known_sv_region.reference_object, []).append(known_sv_region)

    return regions


def load_annotations(
*filepaths: str,
reference_genome: Optional[ReferenceGenome] = None,
Expand Down Expand Up @@ -117,7 +182,6 @@ def parse_annotations_json(
domain_errors = 0

for gene_dict in data['genes']:

gene = Gene(
chr=gene_dict['chr'],
start=gene_dict['start'],
Expand Down Expand Up @@ -346,7 +410,7 @@ class ReferenceFile:
'reference_genome': load_reference_genome,
'masking': load_masking_regions,
'template_metadata': load_templates,
'dgv_annotation': load_masking_regions,
'dgv_annotation': load_known_sv,
'aligner_reference': None,
}
"""dict: Mapping of file types (based on ENV name) to load functions"""
Expand Down
1 change: 0 additions & 1 deletion src/mavis/annotate/variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,6 @@ def __init__(self, refseq: str, mutseq: str):
self.ins_seq = self.mut_seq[: 0 - self.cterm_aligned]

elif len(self.ref_seq) - self.cterm_aligned + 1 <= self.nterm_aligned:

# repeat region
diff = len(self.mut_seq) - len(self.ref_seq)
if diff > 0:
Expand Down
3 changes: 3 additions & 0 deletions src/mavis/cluster/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
__all__ = ['merge_breakpoint_pairs']


from .cluster import merge_breakpoint_pairs
6 changes: 5 additions & 1 deletion src/mavis/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ class CIGAR(MavisNamespace):
"""

M = 0
I = 1
I = 1 # noqa
D = 2
N = 3
S = 4
Expand Down Expand Up @@ -386,6 +386,7 @@ class COLUMNS(MavisNamespace):
library: str = 'library'
cluster_id: str = 'cluster_id'
cluster_size: str = 'cluster_size'
dgv: str = 'dgv'
validation_id: str = 'validation_id'
annotation_id: str = 'annotation_id'
product_id: str = 'product_id'
Expand Down Expand Up @@ -463,6 +464,7 @@ class COLUMNS(MavisNamespace):
contig_strand_specific: str = 'contig_strand_specific'
contigs_assembled: str = 'contigs_assembled'
call_sequence_complexity: str = 'call_sequence_complexity'
known_sv_count: str = 'known_sv_count'
spanning_reads: str = 'spanning_reads'
spanning_read_names: str = 'spanning_read_names'
flanking_median_fragment_size: str = 'flanking_median_fragment_size'
Expand Down Expand Up @@ -555,4 +557,6 @@ def sort_columns(input_columns):
COLUMNS.tools,
COLUMNS.tools,
COLUMNS.tracking_id,
COLUMNS.dgv,
COLUMNS.known_sv_count,
}
9 changes: 0 additions & 9 deletions src/mavis/convert/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,23 +89,18 @@ def _convert_tool_row(
SUPPORTED_TOOL.BREAKSEQ,
SUPPORTED_TOOL.STRELKA,
]:

std_row.update(row)

elif file_type == SUPPORTED_TOOL.CHIMERASCAN:

std_row.update(_parse_chimerascan(row))

elif file_type == SUPPORTED_TOOL.CNVNATOR:

std_row.update(_parse_cnvnator(row))

elif file_type == SUPPORTED_TOOL.STARFUSION:

std_row.update(_parse_starfusion(row))

elif file_type == SUPPORTED_TOOL.DEFUSE:

std_row[COLUMNS.break1_orientation] = (
ORIENT.LEFT if row['genomic_strand1'] == STRAND.POS else ORIENT.RIGHT
)
Expand All @@ -126,11 +121,9 @@ def _convert_tool_row(
std_row[TRACKING_COLUMN] = '{}-{}'.format(file_type, row['cluster_id'])

elif file_type == SUPPORTED_TOOL.TA:

std_row.update(_parse_transabyss(row))

elif file_type == SUPPORTED_TOOL.BREAKDANCER:

std_row.update(
{
COLUMNS.event_type: row['Type'],
Expand All @@ -145,11 +138,9 @@ def _convert_tool_row(
)

elif file_type == SUPPORTED_TOOL.ARRIBA:

std_row.update(_parse_arriba(row))

elif file_type == SUPPORTED_TOOL.STRAGLR:

std_row.update(_parse_straglr(row))

else:
Expand Down
4 changes: 1 addition & 3 deletions src/mavis/convert/arriba.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
from ..constants import COLUMNS, ORIENT, STRAND

from .constants import TRACKING_COLUMN, SUPPORTED_TOOL
from ..constants import COLUMNS, ORIENT


def get_orient(string):
Expand Down
1 change: 0 additions & 1 deletion src/mavis/convert/transabyss.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import re

from ..constants import COLUMNS

from .constants import SUPPORTED_TOOL, TRACKING_COLUMN


Expand Down
5 changes: 3 additions & 2 deletions src/mavis/convert/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from ..interval import Interval

import pandas as pd

from ..interval import Interval

try:
# TypedDict added to typing package directly in later versions
from typing import TypedDict
Expand Down Expand Up @@ -335,7 +337,6 @@ def parse_info(info_field):

rows = []
for _, row in df.iterrows():

rows.append(
VcfRecordType(
id=row['ID'],
Expand Down
11 changes: 9 additions & 2 deletions src/mavis/illustrate/diagram.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,22 @@
"""
from typing import Iterable, List, Optional

from svgwrite import Drawing

from mavis.annotate.genomic import Gene, Template
from mavis.annotate.variant import Annotation
from mavis.types import ReferenceGenome
from svgwrite import Drawing

from ..annotate.genomic import IntergenicRegion
from ..interval import Interval
from .constants import DiagramSettings
from .elements import draw_exon_track, draw_genes, draw_template, draw_ustranscript, draw_vmarker
from .elements import (
draw_exon_track,
draw_genes,
draw_template,
draw_ustranscript,
draw_vmarker,
)
from .scatter import ScatterPlot, draw_scatter
from .util import LabelMapping, generate_interval_mapping

Expand Down
Loading

0 comments on commit 0af520a

Please sign in to comment.