Skip to content

Commit

Permalink
Merge pull request #349 from bcgsc/Release/v3.1.1
Browse files Browse the repository at this point in the history
Release/v3.1.1
  • Loading branch information
calchoo authored Mar 21, 2023
2 parents dbbd9a3 + fb4b645 commit 0af520a
Show file tree
Hide file tree
Showing 72 changed files with 845 additions and 128 deletions.
5 changes: 5 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,8 @@ concurrency = multiprocessing
[html]
directory = coverage
title = mavis coverage report

[report]
exclude_lines =
pragma: no cover
if TYPE_CHECKING:
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:

jobs:
build:
runs-on: ubuntu-latest
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10"]
Expand Down
10 changes: 7 additions & 3 deletions .github/workflows/quick-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on: [push]

jobs:
build:
runs-on: ubuntu-latest
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10"]
Expand All @@ -26,12 +26,16 @@ jobs:
run: |
pip install flake8
# stop the build if there are Python syntax errors or undefined names
flake8 src/mavis --count --select=E9,F63,F7,F82 --show-source --statistics
flake8 src tests --count --show-source --statistics
- name: Lint with black
run: |
pip install black
# stop the build if black needs to be run
black src/mavis -S -l 100 --check
black src tests -S -l 100 --check
- name: Lint with isort
run: |
pip install isort
isort src tests --check
- name: install bwa
run: |
git clone https://github.com/lh3/bwa.git
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ junit
*eggs/
.mypy_cache
.snakemake
.venv*

# aligners
blat
Expand Down
12 changes: 12 additions & 0 deletions docs/outputs/columns.md
Original file line number Diff line number Diff line change
Expand Up @@ -610,3 +610,15 @@ non-specific events.
Flag to indicate if the
current event was a supplementary call, meaning a call that was
found as a result of validating another event.

## dgv

**type**: `str`

ID(s) of SVs from the DGV database matched to an SV call in the summary step

## known\_sv\_count

**type**: `int`

Number of known SVs matched to a call in the summary step
6 changes: 5 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = mavis
version = 3.1.0
version = 3.1.1
url = https://github.com/bcgsc/mavis.git
download_url = https://github.com/bcgsc/mavis/archive/v2.2.10.tar.gz
description = A Structural Variant Post-Processing Package
Expand All @@ -25,6 +25,9 @@ statistics = True
[flake8]
ignore = E501,W503,E203

[isort]
profile = black

[options]
packages = find:
package_dir =
Expand Down Expand Up @@ -71,6 +74,7 @@ test =
dev =
black
flake8
isort
twine
wheel
timeout-decorator>=0.3.3
Expand Down
82 changes: 73 additions & 9 deletions src/mavis/annotate/file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import json
import os
import re
from typing import Callable, Dict, List, Optional
import warnings
from typing import TYPE_CHECKING, Callable, Dict, List, Optional

import pandas as pd
from Bio import SeqIO
Expand All @@ -13,34 +14,37 @@
from ..constants import CODON_SIZE, GIEMSA_STAIN, START_AA, STOP_AA, STRAND, translate
from ..interval import Interval
from ..types import ReferenceAnnotations, ReferenceGenome
from ..util import logger
from ..util import logger, read_bpp_from_input_file
from .base import BioInterval, ReferenceName
from .genomic import Exon, Gene, PreTranscript, Template, Transcript
from .protein import Domain, Translation

if TYPE_CHECKING:
from ..breakpoint import BreakpointPair


def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]:
"""
reads a file of regions. The expect input format for the file is tab-delimited and
the header should contain the following columns
- chr: the chromosome
- start: start of the region, 1-based inclusive
- end: end of the region, 1-based inclusive
- name: the name/label of the region
For example:
.. code-block:: text
#chr start end name
chr20 25600000 27500000 centromere
Args:
filepath: path to the input tab-delimited file
Returns:
a dictionary keyed by chromosome name with values of lists of regions on the chromosome
"""
warnings.warn(
"BED file support will be deprecated in future versions.",
category=DeprecationWarning,
stacklevel=2,
)
regions: Dict[str, List[BioInterval]] = {}
for filepath in filepaths:
df = pd.read_csv(
Expand All @@ -58,6 +62,67 @@ def load_masking_regions(*filepaths: str) -> Dict[str, List[BioInterval]]:
return regions


def load_known_sv(*filepaths: str) -> Dict[str, List["BreakpointPair"]]:
    """
    Load one or more standard MAVIS or BED files into a mapping of known SVs.

    A file whose header contains both ``break1_chromosome`` and
    ``break2_chromosome`` is treated as a standard MAVIS file and parsed with
    ``read_bpp_from_input_file``; any other file is treated as a BED file
    (BED support is deprecated, see the warning below).

    Standard BED file requirements: the expected input format for the file is
    tab-delimited and the header should contain the following columns

    - chr: the chromosome
    - start: start of the region, 1-based inclusive
    - end: end of the region, 1-based inclusive
    - name: the name/label of the region

    For example:

    .. code-block:: text

        #chr start end name
        chr20 25600000 27500000 centromere

    Args:
        filepaths: paths to standard MAVIS format and/or BED files

    Returns:
        a dictionary of lists of known SVs. NOTE(review): the annotated return
        type is narrower than what is built here — for MAVIS input the key is
        the ``(break1.chr, break2.chr)`` tuple and the values are
        ``BreakpointPair`` objects; for BED input the key is the region's
        reference object (a ``ReferenceName``) and the values are
        ``BioInterval`` objects.

    Raises:
        KeyError: if a BED input is missing one of the required columns
    """
    regions = {}
    for filepath in filepaths:
        # sniff only the first row to decide which of the two formats this is
        header = set(pd.read_csv(filepath, nrows=1, sep='\t').columns)
        mavis_header = {'break1_chromosome', 'break2_chromosome'}
        bed_header = {'chr', 'start', 'end', 'name'}
        if mavis_header.issubset(header):
            # standard MAVIS input: expand ambiguous orientation/SV-type
            # combinations into individual BreakpointPair records
            bpps = read_bpp_from_input_file(filepath, expand_orient=True, expand_svtype=True)
            for bpp in bpps:
                # key by the chromosome pair so both breakpoints can be matched
                chr_list = [bpp.break1.chr, bpp.break2.chr]
                regions.setdefault(tuple(chr_list), []).append(bpp)

        else:
            warnings.warn(
                "BED file support will be deprecated in future versions.",
                category=DeprecationWarning,
                stacklevel=2,
            )

            df = pd.read_csv(
                filepath, sep='\t', dtype={'chr': str, 'start': int, 'end': int, 'name': str}
            )
            # validate required BED columns before any row processing
            for col in bed_header:
                if col not in df:
                    raise KeyError(f'missing required column ({col})')
            # wrap chromosome names in ReferenceName — presumably normalizes
            # 'chr' prefix handling; confirm against ReferenceName semantics
            df['chr'] = df['chr'].apply(lambda c: ReferenceName(c))
            for row in df.to_dict('records'):
                known_sv_region = BioInterval(
                    reference_object=row['chr'],
                    start=row['start'],
                    end=row['end'],
                    name=row['name'],
                )
                # key by single chromosome (unlike the tuple key used above)
                regions.setdefault(known_sv_region.reference_object, []).append(known_sv_region)

    return regions


def load_annotations(
*filepaths: str,
reference_genome: Optional[ReferenceGenome] = None,
Expand Down Expand Up @@ -117,7 +182,6 @@ def parse_annotations_json(
domain_errors = 0

for gene_dict in data['genes']:

gene = Gene(
chr=gene_dict['chr'],
start=gene_dict['start'],
Expand Down Expand Up @@ -346,7 +410,7 @@ class ReferenceFile:
'reference_genome': load_reference_genome,
'masking': load_masking_regions,
'template_metadata': load_templates,
'dgv_annotation': load_masking_regions,
'dgv_annotation': load_known_sv,
'aligner_reference': None,
}
"""dict: Mapping of file types (based on ENV name) to load functions"""
Expand Down
1 change: 0 additions & 1 deletion src/mavis/annotate/variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,6 @@ def __init__(self, refseq: str, mutseq: str):
self.ins_seq = self.mut_seq[: 0 - self.cterm_aligned]

elif len(self.ref_seq) - self.cterm_aligned + 1 <= self.nterm_aligned:

# repeat region
diff = len(self.mut_seq) - len(self.ref_seq)
if diff > 0:
Expand Down
3 changes: 3 additions & 0 deletions src/mavis/cluster/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
__all__ = ['merge_breakpoint_pairs']


from .cluster import merge_breakpoint_pairs
6 changes: 5 additions & 1 deletion src/mavis/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ class CIGAR(MavisNamespace):
"""

M = 0
I = 1
I = 1 # noqa
D = 2
N = 3
S = 4
Expand Down Expand Up @@ -386,6 +386,7 @@ class COLUMNS(MavisNamespace):
library: str = 'library'
cluster_id: str = 'cluster_id'
cluster_size: str = 'cluster_size'
dgv: str = 'dgv'
validation_id: str = 'validation_id'
annotation_id: str = 'annotation_id'
product_id: str = 'product_id'
Expand Down Expand Up @@ -463,6 +464,7 @@ class COLUMNS(MavisNamespace):
contig_strand_specific: str = 'contig_strand_specific'
contigs_assembled: str = 'contigs_assembled'
call_sequence_complexity: str = 'call_sequence_complexity'
known_sv_count: str = 'known_sv_count'
spanning_reads: str = 'spanning_reads'
spanning_read_names: str = 'spanning_read_names'
flanking_median_fragment_size: str = 'flanking_median_fragment_size'
Expand Down Expand Up @@ -555,4 +557,6 @@ def sort_columns(input_columns):
COLUMNS.tools,
COLUMNS.tools,
COLUMNS.tracking_id,
COLUMNS.dgv,
COLUMNS.known_sv_count,
}
9 changes: 0 additions & 9 deletions src/mavis/convert/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,23 +89,18 @@ def _convert_tool_row(
SUPPORTED_TOOL.BREAKSEQ,
SUPPORTED_TOOL.STRELKA,
]:

std_row.update(row)

elif file_type == SUPPORTED_TOOL.CHIMERASCAN:

std_row.update(_parse_chimerascan(row))

elif file_type == SUPPORTED_TOOL.CNVNATOR:

std_row.update(_parse_cnvnator(row))

elif file_type == SUPPORTED_TOOL.STARFUSION:

std_row.update(_parse_starfusion(row))

elif file_type == SUPPORTED_TOOL.DEFUSE:

std_row[COLUMNS.break1_orientation] = (
ORIENT.LEFT if row['genomic_strand1'] == STRAND.POS else ORIENT.RIGHT
)
Expand All @@ -126,11 +121,9 @@ def _convert_tool_row(
std_row[TRACKING_COLUMN] = '{}-{}'.format(file_type, row['cluster_id'])

elif file_type == SUPPORTED_TOOL.TA:

std_row.update(_parse_transabyss(row))

elif file_type == SUPPORTED_TOOL.BREAKDANCER:

std_row.update(
{
COLUMNS.event_type: row['Type'],
Expand All @@ -145,11 +138,9 @@ def _convert_tool_row(
)

elif file_type == SUPPORTED_TOOL.ARRIBA:

std_row.update(_parse_arriba(row))

elif file_type == SUPPORTED_TOOL.STRAGLR:

std_row.update(_parse_straglr(row))

else:
Expand Down
4 changes: 1 addition & 3 deletions src/mavis/convert/arriba.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
from ..constants import COLUMNS, ORIENT, STRAND

from .constants import TRACKING_COLUMN, SUPPORTED_TOOL
from ..constants import COLUMNS, ORIENT


def get_orient(string):
Expand Down
1 change: 0 additions & 1 deletion src/mavis/convert/transabyss.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import re

from ..constants import COLUMNS

from .constants import SUPPORTED_TOOL, TRACKING_COLUMN


Expand Down
5 changes: 3 additions & 2 deletions src/mavis/convert/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from ..interval import Interval

import pandas as pd

from ..interval import Interval

try:
# TypedDict added to typing package directly in later versions
from typing import TypedDict
Expand Down Expand Up @@ -335,7 +337,6 @@ def parse_info(info_field):

rows = []
for _, row in df.iterrows():

rows.append(
VcfRecordType(
id=row['ID'],
Expand Down
11 changes: 9 additions & 2 deletions src/mavis/illustrate/diagram.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,22 @@
"""
from typing import Iterable, List, Optional

from svgwrite import Drawing

from mavis.annotate.genomic import Gene, Template
from mavis.annotate.variant import Annotation
from mavis.types import ReferenceGenome
from svgwrite import Drawing

from ..annotate.genomic import IntergenicRegion
from ..interval import Interval
from .constants import DiagramSettings
from .elements import draw_exon_track, draw_genes, draw_template, draw_ustranscript, draw_vmarker
from .elements import (
draw_exon_track,
draw_genes,
draw_template,
draw_ustranscript,
draw_vmarker,
)
from .scatter import ScatterPlot, draw_scatter
from .util import LabelMapping, generate_interval_mapping

Expand Down
Loading

0 comments on commit 0af520a

Please sign in to comment.