From 9f288e6dce62d4b8692e63a93930abb464a76877 Mon Sep 17 00:00:00 2001 From: zhemingfan <43304373+zhemingfan@users.noreply.github.com> Date: Mon, 28 Feb 2022 10:23:36 -0800 Subject: [PATCH 01/56] Update README.md Wheel needs to be upgraded for numpy to successfully install. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b6e6ef45..bf57b2de 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ The simplest way to use MAVIS is via Singularity. The MAVIS docker container use by singularity will take care of installing the aligner as well. ```bash -pip install -U setuptools pip +pip install -U setuptools pip wheel pip install mavis_config # also installs snakemake ``` From 0ecece556593f0d085fdca8c35b13cdd8cf22b4b Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 3 Mar 2022 19:44:00 -0800 Subject: [PATCH 02/56] Add straglr to built-in converters --- src/mavis/convert/__init__.py | 5 +++++ src/mavis/convert/constants.py | 1 + src/mavis/convert/straglr.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 src/mavis/convert/straglr.py diff --git a/src/mavis/convert/__init__.py b/src/mavis/convert/__init__.py index 2dd0bc72..23917bca 100644 --- a/src/mavis/convert/__init__.py +++ b/src/mavis/convert/__init__.py @@ -13,6 +13,7 @@ from .cnvnator import convert_row as _parse_cnvnator from .constants import SUPPORTED_TOOL, TOOL_SVTYPE_MAPPING, TRACKING_COLUMN from .starfusion import convert_row as _parse_starfusion +from .straglr import convert_row as _parse_straglr from .transabyss import convert_row as _parse_transabyss from .vcf import convert_file as read_vcf @@ -142,6 +143,10 @@ def _convert_tool_row( {k: v for k, v in row.items() if k not in {'Type', 'Chr1', 'Chr2', 'Pos1', 'Pos2'}} ) + elif file_type == SUPPORTED_TOOL.STRAGLR: + + std_row.update(_parse_straglr(row)) + else: raise NotImplementedError('unsupported file type', file_type) diff --git a/src/mavis/convert/constants.py b/src/mavis/convert/constants.py index 821d6a6a..fe59ad68 100644 --- a/src/mavis/convert/constants.py +++ b/src/mavis/convert/constants.py @@ -27,6 +27,7 @@ class SUPPORTED_TOOL(MavisNamespace): CNVNATOR = 'cnvnator' STRELKA = 'strelka' STARFUSION = 'starfusion' + STRAGLR = 'straglr' TOOL_SVTYPE_MAPPING = {v: [v] for v in SVTYPE.values()} # type: ignore diff --git a/src/mavis/convert/straglr.py b/src/mavis/convert/straglr.py new file mode 100644 index 00000000..aa50db65 --- /dev/null +++ b/src/mavis/convert/straglr.py @@ -0,0 +1,32 @@ +from typing import Dict + +from ..constants import COLUMNS, SVTYPE + + +def convert_row(row: Dict) -> Dict: + """ + Converts the fields from the original STRAGLR BED output into MAVIS definitions of an SV + Since STRAGLR defines regions where short tandem repeats exist we make the definitions here fairly + non-specific + + See their github page for more details: https://github.com/bcgsc/straglr + + BED Columns + - chrom: chromosome name + - start: start coordinate of locus + - end: end coordinate of locus + - repeat_unit: repeat motif + - allele.size: where N={1,2,3...} depending on --max_num_clusters e.g. 
N={1,2} if --max_num_clusters==2 (default) + - allele.copy_number + - allele.support + """ + return { + COLUMNS.break1_chromosome: row['chrom'], + COLUMNS.break2_chromosome: row['chrom'], + COLUMNS.break1_position_start: row['start'], + COLUMNS.break1_position_end: row['end'], + COLUMNS.break2_position_start: row['start'], + COLUMNS.break2_position_end: row['end'], + COLUMNS.untemplated_seq: None, + COLUMNS.event_type: SVTYPE.INS, + } From ad9a7f9c02a8f89047192a5e55685c8542b63636 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 3 Mar 2022 19:46:29 -0800 Subject: [PATCH 03/56] Update SV callers in docs --- docs/inputs/.pages | 4 + docs/inputs/non_python_dependencies.md | 28 +++++ docs/inputs/reference.md | 16 +-- docs/inputs/sv_callers.md | 157 +++++++++++++++++++++++++ 4 files changed, 197 insertions(+), 8 deletions(-) create mode 100644 docs/inputs/.pages create mode 100644 docs/inputs/non_python_dependencies.md create mode 100644 docs/inputs/sv_callers.md diff --git a/docs/inputs/.pages b/docs/inputs/.pages new file mode 100644 index 00000000..886e3d79 --- /dev/null +++ b/docs/inputs/.pages @@ -0,0 +1,4 @@ +nav: + - reference.md + - standard.md + - ... diff --git a/docs/inputs/non_python_dependencies.md b/docs/inputs/non_python_dependencies.md new file mode 100644 index 00000000..fdfdc4e9 --- /dev/null +++ b/docs/inputs/non_python_dependencies.md @@ -0,0 +1,28 @@ +# Non-python Dependencies + +MAVIS integrates with +[SV callers](./sv_callers.md), +[job schedulers](#job-schedulers), and +[aligners](#aligners). While some of +these dependencies are optional, all currently supported options are +detailed below. The versions column in the tables below list all the +versions which were tested for each tool. Each version listed is known +to be compatible with MAVIS. + +## Job Schedulers + +MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling + +## Aligners + +Two aligners are supported [bwa](../../glossary/#bwa) and +[blat](../../glossary/#blat) (default). These are both included in the docker image by default. + +| Name | Version(s) | Environment Setting | +| ---------------------------------------------- | ----------------------- | ------------------------- | +| [blat](../../glossary/#blat) | `36x2` `36` | `MAVIS_ALIGNER=blat` | +| [bwa mem ](../../glossary/#bwa mem ) | `0.7.15-r1140` `0.7.12` | `MAVIS_ALIGNER='bwa mem'` | + +!!! note + When setting the aligner you will also need to set the + [aligner_reference](../../configuration/settings/#aligner_reference) to match diff --git a/docs/inputs/reference.md b/docs/inputs/reference.md index 8c7f0f36..224091c2 100644 --- a/docs/inputs/reference.md +++ b/docs/inputs/reference.md @@ -10,16 +10,16 @@ To improve the install experience for the users, different configurations of the MAVIS annotations file have been made available. These files can be downloaded below, or if the required configuration is not available, -(instructions on generating the annotations file)[/inputs/reference/#generating-the-annotations-from-ensembl] can be found below. +[instructions on generating the annotations file](/inputs/reference/#generating-the-annotations-from-ensembl) can be found below. 
-| File Name (Type/Format) | Environment Variable | Download | -| --------------------------------------------------------------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [reference genome](../../inputs/reference/#reference-genome) ([fasta](../../glossary/#fasta)) | `MAVIS_REFERENCE_GENOME` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.tar.gz) | +| File Name (Type/Format) | Environment Variable | Download | +| --------------------------------------------------------------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [reference genome](../../inputs/reference/#reference-genome) ([fasta](../../glossary/#fasta)) | `MAVIS_REFERENCE_GENOME` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.tar.gz) | | [annotations](../../inputs/reference/#annotations) ([JSON](../../glossary/#json)) | `MAVIS_ANNOTATIONS` | [![](../images/get_app-24px.svg) GRCh37/Hg19 + Ensembl69](http://www.bcgsc.ca/downloads/mavis/v3/ensembl69_hg19_annotations.v3.json.gz)
[![](../images/get_app-24px.svg) GRCh38 + Ensembl79](http://www.bcgsc.ca/downloads/mavis/v3/ensembl79_hg38_annotations.v3.json.gz) | -| [masking](../../inputs/reference/#masking-file) (text/tabbed) | `MAVIS_MASKING` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/hg19_masking.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/GRCh38_masking.tab) | -| [template metadata](../../inputs/reference/#template-metadata) (text/tabbed) | `MAVIS_TEMPLATE_METADATA` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz) | -| [DGV annotations](../../inputs/reference/#dgv-database-of-genomic-variants) (text/tabbed) | `MAVIS_DGV_ANNOTATION` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/dgv_hg19_variants.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/dgv_hg38_variants.tab) | -| [aligner reference](../../inputs/reference/#aligner-reference) | `MAVIS_ALIGNER_REFERENCE` | [![](../images/get_app-24px.svg) GRCh37/Hg19 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit)
[![](../images/get_app-24px.svg) GRCh38 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit) | +| [masking](../../inputs/reference/#masking-file) (text/tabbed) | `MAVIS_MASKING` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/hg19_masking.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/GRCh38_masking.tab) | +| [template metadata](../../inputs/reference/#template-metadata) (text/tabbed) | `MAVIS_TEMPLATE_METADATA` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz) | +| [DGV annotations](../../inputs/reference/#dgv-database-of-genomic-variants) (text/tabbed) | `MAVIS_DGV_ANNOTATION` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/dgv_hg19_variants.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/dgv_hg38_variants.tab) | +| [aligner reference](../../inputs/reference/#aligner-reference) | `MAVIS_ALIGNER_REFERENCE` | [![](../images/get_app-24px.svg) GRCh37/Hg19 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit)
[![](../images/get_app-24px.svg) GRCh38 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit) | If the environment variables above are set they will be used as the default values when any step of the pipeline script is called (including diff --git a/docs/inputs/sv_callers.md b/docs/inputs/sv_callers.md new file mode 100644 index 00000000..abc5fcfd --- /dev/null +++ b/docs/inputs/sv_callers.md @@ -0,0 +1,157 @@ +# SV Callers + +MAVIS supports output from a wide-variety of SV callers. Assumptions are made for each tool based on interpretation of the output and the publications for each tool. + +## Configuring Conversions + +Adding a conversion step to your MAVIS run is as simple as adding that section to the input JSON config. + +The general structure of this section is as follows + +```jsonc +{ + "convert": { + "": { + "file_type": "", + "name": "", // optional field for supported tools + "inputs": [ + "/path/to/tool/output/file" + ] + } + } +} +``` + +A full version of the input configuration file specification can be found in the [configuration](../configuration/general.md) section. + +## Supported Tools + +The tools and versions currently supported are given below. Versions listed indicate the version of the tool for which output files have been tested as input into MAVIS. MAVIS also supports a [general VCF input](#general-vcf-inputs). + +| SV Caller | Version(s) Tested | Files used as MAVIS input | +| --------------------------------------------------------------------------- | ----------------- | --------------------------------------------- | +| [BreakDancer (Chen, 2009)](../../background/citations#chen-2009) | `1.4.5` | `Tools main output file(s)` | +| [BreakSeq (Abyzov, 2015)](../../background/citations#abyzov-2015) | `2.2` | `work/breakseq.vcf.gz` | +| [Chimerascan (Iyer, 2011)](../../background/citations#iyer-2011) | `0.4.5` | `*.bedpe` | +| [CNVnator (Abyzov, 2011)](../../background/citations#abyzov-2011) | `0.3.3` | `Tools main output file(s)` | +| [CuteSV (Jiang, 2020)](../../background/citations#jiang-2020) | `1.0.10` | `*.vcf` | +| [DeFuse (McPherson. 2011)](../../background/citations#mcpherson-2011) | `0.6.2` | `results/results.classify.tsv` | +| [DELLY (Rausch, 2012)](../../background/citations#rausch-2012) | `0.6.1` `0.7.3` | `combined.vcf` (converted from bcf) | +| [Manta (Chen, 2016)](../../background/citations#chen-2016) | `1.0.0` | `{diploidSV,somaticSV}.vcf` | +| [Pindel (Ye, 2009)](../../background/citations#ye-2009) | `0.2.5b9` | `Tools main output file(s)` | +| [Sniffles (Sedlazeck, 2018)](../../background/citations#sedlazeck-2018) | `1.0.12b` | `*.vcf` | +| [STAR-Fusion (Haas, 2017)](../../background/citations#haas-2017) | `1.4.0` | `star-fusion.fusion_predictions.abridged.tsv` | +| [Strelka (Saunders, 2012)](../../background/citations#saunders-2012) | `1.0.6` | `passed.somatic.indels.vcf` | +| [Trans-ABySS (Robertson, 2010)](../../background/citations/#robertson-2010) | `1.4.8 (custom)` | `{indels/events_novel_exons,fusions/*}.tsv` | + +!!! note + [Trans-ABySS](../../glossary/#trans-abyss): The trans-abyss version + used was an in-house dev version. However the output columns are + compatible with 1.4.8 as that was the version branched from. + Additionally, although indels can be used from both genome and + transcriptome outputs of Trans-ABySS, it is recommended to only use the + genome indel calls as the transcriptome indels calls (for versions + tested) introduce a very high number of false positives. This will slow + down validation. 
It is much faster to simply use the genome indels for + both genome and transcriptome. + +## [DELLY](../../glossary/#delly) Post-processing + +Some post-processing on the delly output files is generally done prior +to input. The output BCF files are converted to a VCF file + +```bash +bcftools concat -f /path/to/file/with/vcf/list --allow-overlaps --output-type v --output combined.vcf +``` + +## General VCF inputs + +Assuming that the tool outputting the VCF file follows standard +conventions, then it is possible to use a +[general VCF conversion](../../package/mavis/tools/vcf) +that is not tool-specific. Given the wide variety in content for VCF files, +MAVIS makes a number of assumptions and the VCF conversion may not work +for all VCFs. In general MAVIS follows the [VCF 4.2 +specification](https://samtools.github.io/hts-specs/VCFv4.2.pdf). If the +input tool you are using differs, it would be better to use a +[custom conversion script](#custom-conversions). + +Using the general VCF tool with a non-standard tool can be done as follows + +```json +{ + "convert": { + "my_tool_alias": { + "file_type": "vcf", + "name": "my_tool", + "inputs": ["/path/to/my_tool/output.vcf"] + } + } +} +``` + +### Assumptions on non-standard INFO fields + +- `PRECISE` if given, Confidence intervals are ignored if given in favour of exact breakpoint calls using pos and END as the breakpoint positions +- `CT` values if given are representative of the breakpoint orientations. +- `CHR2` is given for all interchromosomal events + +### Translating BND type Alt fields + +There are four possible configurations for the alt field of a BND type structural variant +based on the VCF specification. These correspond 1-1 to the orientation types for MAVIS +translocation structural variants. + +```text +r = reference base/seq +u = untemplated sequence/alternate sequence +p = chromosome:position +``` + +| alt format | orients | +| ---------- | ------- | +| `ru[p[` | LR | +| `[p[ur` | RR | +| `]p]ur` | RL | +| `ru]p]` | LL | + +## Custom Conversions + +If there is a tool that is not yet supported by MAVIS and you would like it to be, you can either add a [feature request](https://github.com/bcgsc/mavis/issues) to our GitHub page or tackle writing the conversion script yourself. Either way there are a few things you will need + +- A sample output from the tool in question +- Tool metadata for the citation, version, etc + +### Logic Example - [Chimerascan](../../glossary/#chimerascan) + +The following is a description of how the conversion script for +[Chimerascan](../../background/citations/#iyer-2011) was generated. +While this is a built-in conversion command now, the logic could also +have been put in an external script. As mentioned above, there are a +number of assumptions that had to be made about the tools output to +convert it to the +[standard mavis format](../../inputs/standard/). Assumptions were then verified by reviewing at a series of +called events in [IGV](../../glossary/#igv). In the current +example, [Chimerascan](../../background/citations/#iyer-2011) output +has six columns of interest that were used in the conversion + +- start3p +- end3p +- strand3p +- start5p +- end5p +- strand5p + +The above columns describe two segments which are joined. MAVIS requires +the position of the join. It was assumed that the segments are always +joined as a [sense fusion](../../glossary/#sense-fusion). Using this +assumption there are four logical cases to determine the position of the +breakpoints. + +i.e. 
the first case would be: If both strands are positive, then the end +of the five-prime segment (end5p) is the first breakpoint and the start +of the three-prime segment is the second breakpoint + +### Calling a Custom Conversion Script + +Since MAVIS v3+ is run using [snakemake](https://snakemake.readthedocs.io/en/stable/) the simplest way to incorporate your custom conversion scripts is to modify the Snakefile and add them as rules. From 17b7ebd108c30d4c61def694749ad170a902760d Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 3 Mar 2022 20:02:46 -0800 Subject: [PATCH 04/56] Add straglr tests --- tests/data/straglr.bed | 10 ++++++++++ tests/test_mavis/convert/test_convert.py | 10 ++++++++++ 2 files changed, 20 insertions(+) create mode 100644 tests/data/straglr.bed diff --git a/tests/data/straglr.bed b/tests/data/straglr.bed new file mode 100644 index 00000000..bb325e59 --- /dev/null +++ b/tests/data/straglr.bed @@ -0,0 +1,10 @@ +#chrom start end repeat_unit allele1:size allele1:copy_number allele1:support allele2:size allele2:copy_number allele2:support +chr11 776686 778078 CT 100.0 150.0 10 100.0 100.0 1 +chr10 3079216 3079421 AGAGGTCACCACCCCTTCCCAACAATCCAGTAACAATCC 100.0 150.0 10 100.0 100.0 1 +chr9 2080637 2081030 CTCCTTCCCTCCGCCCCCACCTCGGTCCCTGT 100.0 150.0 10 100.0 100.0 1 +chrX 244719 245293 CCCCGGGAACCGCCT 100.0 150.0 10 - - - +chr7 284096 284233 GGT 100.0 150.0 10 - - - +chr8 288173 290242 CCCTGCTCCGT 100.0 150.0 10 100.0 100.0 1 +chr3 2382228 2382908 CCGTGGGGGAGGCTGAGGCTATGGGGACT 100.0 100.0 10 - - - +chr2 2427285 2427528 CCTCC 100.0 150.0 10 - - - +chr2 2427953 2428216 GGAGG 100.0 150.0 10 100.0 100.0 1 diff --git a/tests/test_mavis/convert/test_convert.py b/tests/test_mavis/convert/test_convert.py index 4470bc66..3ce64c4a 100644 --- a/tests/test_mavis/convert/test_convert.py +++ b/tests/test_mavis/convert/test_convert.py @@ -1,3 +1,4 @@ +import itertools import os import shutil import sys @@ -47,6 +48,15 @@ def run_main(self, inputfile, file_type, strand_specific=False): result.setdefault(pair.data['tracking_id'], []).append(pair) return result + def test_straglr(self): + result = self.run_main(get_data('straglr.bed'), SUPPORTED_TOOL.STRAGLR, False) + assert len(result) == 9 + for bpp in itertools.chain(*result.values()): + assert bpp.break1.chr == bpp.break2.chr + assert bpp.break1.start == bpp.break2.start + assert bpp.break1.end == bpp.break2.end + assert bpp.event_type == SVTYPE.INS + def test_chimerascan(self): self.run_main(get_data('chimerascan_output.bedpe'), SUPPORTED_TOOL.CHIMERASCAN, False) From 7702fdbb01b686149cafaeb65e2dccbbc4d21d7d Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 3 Mar 2022 20:04:27 -0800 Subject: [PATCH 05/56] Check altSeq is not set --- tests/test_mavis/convert/test_convert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_mavis/convert/test_convert.py b/tests/test_mavis/convert/test_convert.py index 3ce64c4a..44b8aa24 100644 --- a/tests/test_mavis/convert/test_convert.py +++ b/tests/test_mavis/convert/test_convert.py @@ -56,6 +56,7 @@ def test_straglr(self): assert bpp.break1.start == bpp.break2.start assert bpp.break1.end == bpp.break2.end assert bpp.event_type == SVTYPE.INS + assert bpp.untemplated_seq is None def test_chimerascan(self): self.run_main(get_data('chimerascan_output.bedpe'), SUPPORTED_TOOL.CHIMERASCAN, False) From af96ac56c7d1fd9d566efd3f8ba0b0ede81086d3 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 3 Mar 2022 20:08:49 -0800 Subject: [PATCH 06/56] Add straglr to docs --- 
docs/background/citations.md | 5 +++++ docs/inputs/sv_callers.md | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/background/citations.md b/docs/background/citations.md index 742344f8..11194b04 100644 --- a/docs/background/citations.md +++ b/docs/background/citations.md @@ -23,6 +23,11 @@ Chen,X. et al. (2016) Manta: rapid detection of structural variants and indels for germline and cancer sequencing applications. Bioinformatics, 32, 1220--1222. +## Chiu-2021 + +Chiu,R. et al. (2021) Straglr: discovering and genotyping tandem repeat + expansions using whole genome long-read sequences. Genome Biol., 22, 224. + ## Haas-2017 Haas,B et al. (2017) STAR-Fusion: Fast and Accurate Fusion diff --git a/docs/inputs/sv_callers.md b/docs/inputs/sv_callers.md index abc5fcfd..521d76cd 100644 --- a/docs/inputs/sv_callers.md +++ b/docs/inputs/sv_callers.md @@ -41,8 +41,9 @@ The tools and versions currently supported are given below. Versions listed indi | [Pindel (Ye, 2009)](../../background/citations#ye-2009) | `0.2.5b9` | `Tools main output file(s)` | | [Sniffles (Sedlazeck, 2018)](../../background/citations#sedlazeck-2018) | `1.0.12b` | `*.vcf` | | [STAR-Fusion (Haas, 2017)](../../background/citations#haas-2017) | `1.4.0` | `star-fusion.fusion_predictions.abridged.tsv` | +| [Straglr (Chiu, 2021)](../../background/citations#chiu-2021) | | | [Strelka (Saunders, 2012)](../../background/citations#saunders-2012) | `1.0.6` | `passed.somatic.indels.vcf` | -| [Trans-ABySS (Robertson, 2010)](../../background/citations/#robertson-2010) | `1.4.8 (custom)` | `{indels/events_novel_exons,fusions/*}.tsv` | +| [Trans-ABySS (Robertson, 2010)](../../background/citations/#robertson-2010) | `1.4.8 (custom)` | `{indels/events_novel_exons,fusions/*}.tsv` | `.bed` | !!! 
note [Trans-ABySS](../../glossary/#trans-abyss): The trans-abyss version From f3a86d3616440bfc2bc4880f9f85efa42904ea1c Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Thu, 3 Mar 2022 20:10:43 -0800 Subject: [PATCH 07/56] added change for CILEN to support cuteSV --- src/mavis/convert/vcf.py | 22 ++++++++++++++++-- tests/test_mavis/convert/test_tools_vcf.py | 27 ++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index c1c07faf..72aa5ddd 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -37,6 +37,7 @@ class VcfInfoType(TypedDict, total=False): CHR2: str CIPOS: Tuple[int, int] CIEND: Tuple[int, int] + CILEN: Tuple[int, int] CT: str END: Optional[int] PRECISE: bool @@ -192,6 +193,19 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end, } ) + elif ( + info.get('CILEN') != None + ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating break2 + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max(1, end + info.get('CILEN', (0, 0))[0]), + COLUMNS.break2_position_end: end + info.get('CILEN', (0, 0))[1], + } + ) else: std_row.update( { @@ -220,7 +234,11 @@ def convert_record(record: VcfRecordType) -> List[Dict]: except KeyError: pass std_row.update( - {k: v for k, v in info.items() if k not in {'CHR2', 'SVTYPE', 'CIPOS', 'CIEND', 'CT'}} + { + k: v + for k, v in info.items() + if k not in {'CHR2', 'SVTYPE', 'CIPOS', 'CIEND', 'CILEN', 'CT'} + } ) records.append(std_row) return records @@ -238,7 +256,7 @@ def parse_info(info_field): # convert info types for key in info: - if key in {'CIPOS', 'CIEND'}: + if key in {'CIPOS', 'CIEND', 'CILEN'}: ci_start, ci_end = info[key].split(',') info[key] = (int(ci_start), int(ci_end)) elif key == 'END': diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index f846d41a..e59a48c3 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -59,3 +59,30 @@ def test_convert_record(): assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' + + +def test_convert_record_cuteSV(): + variant_cilen = VcfRecordType( + id='vcf-cuteSV.INS', + pos=1853407, + chrom='chr5', + alts=[ + 'AGGATCTATGTGGCTGTTGCAGGGTGACCCGAGGTCACGAGAGGCAAGGTCAGAGGACGATGTGAGGGCTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGAGACCCAGGTCACGCAGGCAAGGTCAGAGGACGATGTGAGGGAGTTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGTGACCCGAGGTCA' + ], + ref='A', + info=VcfInfoType( + CHR2="chr5", + IMPRECISE=True, + SVMETHOD="cuteSV-1.0.12", + SVTYPE="INS", + SUPTYPE="None", + STRANDS="None", + CIPOS=(-30, 30), + CILEN=(-65, 65), + ), + ) + variant_ins_cutesv = convert_record(variant_cilen) + assert len(variant_ins_cutesv) == 1 + variant_ins_cutesv = variant_ins_cutesv[0] + assert variant_ins_cutesv.get('break2_position_start') == 1853342 + assert variant_ins_cutesv.get('break2_position_end') == 1853472 From 608b3cd6a7278856e51e7fe5330608fe556cacb2 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Fri, 4 Mar 2022 13:03:10 -0800 Subject: [PATCH 08/56] new checks implemented to prevent bp sorting in edge cases --- src/mavis/breakpoint.py | 16 
++++++++++------ src/mavis/summary/summary.py | 4 +++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index ac3b188f..7ea5245a 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -259,11 +259,14 @@ def __init__( >>> BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) >>> BreakpointPair(Breakpoint('1', 1, strand='+'), Breakpoint('1', 9999, strand='-')) """ - - if b1.key[:3] > b2.key[:3]: - self.break1 = b2 - self.break2 = b1 - else: + if b1.chr != b2.chr or not Interval.overlaps(b1, b2): + if b1.key[:3] > b2.key[:3]: + self.break1 = b2 + self.break2 = b1 + else: + self.break1 = b1 + self.break2 = b2 + else: # handles case whereby overlapping calls should skip sorting https://github.com/bcgsc/mavis/issues/319 self.break1 = b1 self.break2 = b2 self.stranded = stranded @@ -310,11 +313,12 @@ def column(self, colname: str): return self.data.get(COLUMNS[colname]) def __str__(self): - return 'BPP({}, {}{}{})'.format( + return 'BPP({}, {}{}{}{})'.format( str(self.break1), str(self.break2), ', opposing={}'.format(self.opposing_strands) if not self.stranded else '', ', seq=' + repr(self.untemplated_seq) if self.untemplated_seq is not None else '', + ', tracking_id=' + self.tracking_id if self.tracking_id else '', ) def flatten(self): diff --git a/src/mavis/summary/summary.py b/src/mavis/summary/summary.py index f30089b4..4c022e97 100644 --- a/src/mavis/summary/summary.py +++ b/src/mavis/summary/summary.py @@ -129,7 +129,9 @@ def group_events(events): bpp.break2.strand != new_bpp.break2.strand, ] ): - raise AssertionError('cannot group events differing on key elements', bpp, new_bpp) + raise AssertionError( + 'cannot group events differing on key elements', str(bpp), str(new_bpp) + ) # Note: There are some attributes that shouldn't be lost if different, currently appending the information # The evidence could be better off as a max instead of a join From 165fc6439ac7df77783da050cd0227a5971198d1 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Fri, 4 Mar 2022 15:19:24 -0800 Subject: [PATCH 09/56] added file check prior to convertion --- src/mavis/convert/vcf.py | 11 ++++++++-- tests/test_mavis/convert/test_tools_vcf.py | 25 ++++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 72aa5ddd..1ce63ff5 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -195,7 +195,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: ) elif ( info.get('CILEN') != None - ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating break2 + ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 std_row.update( { COLUMNS.break1_position_start: max( @@ -206,6 +206,10 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CILEN', (0, 0))[1], } ) + if std_row.get("break1_position_end") > std_row.get( + "break2_position_end" + ): # truncate breakpoint1 to max lenght of breakpoint2 + std_row.update({COLUMNS.break1_position_end: std_row.get("break2_position_end")}) else: std_row.update( { @@ -217,6 +221,10 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], } ) + if std_row.get("break1_position_end") > std_row.get( + "break2_position_end" + ): # truncate breakpoint1 to max lenght 
of breakpoint2 + std_row.update({COLUMNS.break1_position_end: std_row.get("break2_position_end")}) if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 std_row.update({'break1_position_end': 1}) @@ -225,7 +233,6 @@ def convert_record(record: VcfRecordType) -> List[Dict]: if 'SVTYPE' in info: std_row[COLUMNS.event_type] = info['SVTYPE'] - try: orient1, orient2 = info['CT'].split('to') connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index e59a48c3..7cdd9631 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -86,3 +86,28 @@ def test_convert_record_cuteSV(): variant_ins_cutesv = variant_ins_cutesv[0] assert variant_ins_cutesv.get('break2_position_start') == 1853342 assert variant_ins_cutesv.get('break2_position_end') == 1853472 + + variant_cilen2 = VcfRecordType( + id='vcf-cuteSV.INS', + pos=1853407, + chrom='chr5', + alts=[ + 'AGGATCTATGTGGCTGTTGCAGGGTGACCCGAGGTCACGAGAGGCAAGGTCAGAGGACGATGTGAGGGCTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGAGACCCAGGTCACGCAGGCAAGGTCAGAGGACGATGTGAGGGAGTTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGTGACCCGAGGTCA' + ], + ref='A', + info=VcfInfoType( + CHR2="chr5", + IMPRECISE=True, + SVMETHOD="cuteSV-1.0.12", + SVTYPE="INS", + SUPTYPE="None", + STRANDS="None", + CIPOS=(-30, 9999), + CILEN=(-65, 65), + ), + ) + variant_ins_cutesv_2 = convert_record(variant_cilen2) + assert len(variant_ins_cutesv_2) == 1 + variant_ins_cutesv_2 = variant_ins_cutesv_2[0] + assert variant_ins_cutesv_2.get('break2_position_end') == 1853472 + assert variant_ins_cutesv_2.get('break1_position_end') == 1853472 From 7051fc9b9cbcc8b44b680d0de7a29c78096e2513 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 7 Mar 2022 15:15:59 -0800 Subject: [PATCH 10/56] quick bugfixes of INS errors --- src/mavis/breakpoint.py | 25 ++-------- src/mavis/convert/vcf.py | 57 ++++++++++++++++++---- tests/test_mavis/convert/test_tools_vcf.py | 32 ++++++++++-- 3 files changed, 81 insertions(+), 33 deletions(-) diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index 7ea5245a..695f2416 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -39,7 +39,6 @@ def __init__( orient (ORIENT): the orientation (which side is retained at the break) strand (STRAND): the strand seq: the seq - Examples: >>> Breakpoint('1', 1, 2) >>> Breakpoint('1', 1) @@ -251,22 +250,17 @@ def __init__( opposing_strands: are the strands at the breakpoint opposite? i.e. 
+/- instead of +/+ untemplated_seq: seq between the breakpoints that is not part of either breakpoint data: optional dictionary of attributes associated with this pair - Note: untemplated_seq should always be given wrt to the positive/forward reference strand - Example: >>> BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) >>> BreakpointPair(Breakpoint('1', 1, strand='+'), Breakpoint('1', 9999, strand='-')) """ - if b1.chr != b2.chr or not Interval.overlaps(b1, b2): - if b1.key[:3] > b2.key[:3]: - self.break1 = b2 - self.break2 = b1 - else: - self.break1 = b1 - self.break2 = b2 - else: # handles case whereby overlapping calls should skip sorting https://github.com/bcgsc/mavis/issues/319 + + if b1.key[:3] > b2.key[:3]: + self.break1 = b2 + self.break2 = b1 + else: self.break1 = b1 self.break2 = b2 self.stranded = stranded @@ -355,13 +349,11 @@ def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: """ uses the chr, orientations and strands to determine the possible structural_variant types that this pair could support - Args: pair (BreakpointPair): the pair to classify distance: if defined, will be passed to net size to use in narrowing the list of putative types (del vs ins) Returns: a list of possible SVTYPE - Example: >>> bpp = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) >>> BreakpointPair.classify(bpp) @@ -369,7 +361,6 @@ def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: >>> bpp = BreakpointPair(Breakpoint('1', 1, orient='L'), Breakpoint('1', 9999, orient='R'), opposing_strands=False) >>> BreakpointPair.classify(bpp) {'deletion', 'insertion'} - Note: see [related theory documentation](/background/theory/#classifying-events) """ @@ -446,22 +437,16 @@ def breakpoint_sequence_homology(self, reference_genome: ReferenceGenome): this sequence comparison is done with reference to a reference genome and does not use novel or untemplated sequence in the comparison. For this reason, insertions will never return any homologous sequence - - small duplication event CTT => CTTCTT - GATACATTTCTTCTTGAAAA reference ---------<========== first breakpoint ===========>-------- second breakpoint ---------CT-CT------ first break homology -------TT-TT-------- second break homology - Args: reference_genome: dict of reference sequence by template/chr name - Returns: Tuple[str,str]: homologous sequence at the first breakpoint and second breakpoints - Raises: AttributeError: for non specific breakpoints """ diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 1ce63ff5..683addc2 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,6 +123,46 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) +def handle_edge_case(std_row: Dict) -> Dict: + """ + Checks for known errors caused by callers in IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. + + bp1_s = breakpoint1 start + bp1_e = breakpoint1 end + bp2_s = breakpoint2 start + bp2_e = breakpoint2 end + + Insertion edge cases include: + Scenario 1: bp1_s = 1890, bp1_e = 2000, bp2_s = 1900, bp2_e = 1900. bp1_e > bp2_s + Scenario 2: bp1_s = 1890, bp1_e = 2000, bp2_s = 1800, bp2_e = 2000. bp1_e > bp1_s + Scenario 3: bp1_s = 1950, bp1_e = 2000, bp2_s = 1900, bp2_e = 3000. 
bp1_s > bp2_s + """ + + try: + if ( + std_row['break1_position_end'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # Scenario 1 + std_row.update({'break1_position_end': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # Scenario 2 + std_row.update({'break1_position_start': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_start'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # Scenario 2 + std_row.update({'break2_position_start': std_row['break1_position_start']}) + return std_row + + except KeyError as err: + logger.warn(f'Missing required column inputs: {err}') + + def convert_record(record: VcfRecordType) -> List[Dict]: """ converts a vcf record @@ -182,6 +222,10 @@ def convert_record(record: VcfRecordType) -> List[Dict]: elif size < 0: std_row[COLUMNS.event_type] = SVTYPE.DEL std_row.update({COLUMNS.break1_chromosome: record.chrom, COLUMNS.break2_chromosome: chr2}) + + if 'SVTYPE' in info: + std_row[COLUMNS.event_type] = info['SVTYPE'] + if info.get( 'PRECISE', False ): # DELLY CI only apply when split reads were not used to refine the breakpoint which is then flagged @@ -206,10 +250,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CILEN', (0, 0))[1], } ) - if std_row.get("break1_position_end") > std_row.get( - "break2_position_end" - ): # truncate breakpoint1 to max lenght of breakpoint2 - std_row.update({COLUMNS.break1_position_end: std_row.get("break2_position_end")}) + handle_edge_case(std_row) else: std_row.update( { @@ -221,18 +262,14 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], } ) - if std_row.get("break1_position_end") > std_row.get( - "break2_position_end" - ): # truncate breakpoint1 to max lenght of breakpoint2 - std_row.update({COLUMNS.break1_position_end: std_row.get("break2_position_end")}) + handle_edge_case(std_row) + if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 std_row.update({'break1_position_end': 1}) if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: std_row.update({'break2_position_end': 1}) - if 'SVTYPE' in info: - std_row[COLUMNS.event_type] = info['SVTYPE'] try: orient1, orient2 = info['CT'].split('to') connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index 7cdd9631..563f8d2c 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -10,7 +10,7 @@ def test_read_vcf(): assert df.shape[0] == 106 -def test_convert_record(): +def test_convert_record_sniffle(): variant_imprecise = VcfRecordType( id='mock-BND-imprecise', pos=0, @@ -60,6 +60,30 @@ def test_convert_record(): assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' + variant_cilen3 = VcfRecordType( + id='Sniffle.INS', + pos=11184, + chrom='chr2', + alts=[ + 
'AGGAGGCGCACCGGGGCACCGCGCAGGCGCAGAGAGGCGCACCGCGCCCGCGCAGGCGCAGAGAGGCGCACCGCGCCCGCGCAGGCGGCGAGAGAGGCGCGACCGCGCCCGCGTAGGCGCAGAGAGGCGCACCGCGCCCCGCGCAGGCGCAGGCGCACCCGCGCCCGCGCAGGCGCAGAGAGGCGTGACCCGCGCCCGCGCAGG' + ], + ref='N', + info=VcfInfoType( + CHR2="chr2", + IMPRECISE=True, + SVMETHOD="Snifflesv1.0.12", + SVTYPE="INS", + SUPTYPE="AL", + STRANDS="+-", + END=11183, + ), + ) + variant_ins_cutesv_3 = convert_record(variant_cilen3) + assert len(variant_ins_cutesv_3) == 1 + variant_ins_cutesv_3 = variant_ins_cutesv_3[0] + assert variant_ins_cutesv_3.get('break2_position_end') == 11183 + assert variant_ins_cutesv_3.get('break1_position_end') == 11183 + def test_convert_record_cuteSV(): variant_cilen = VcfRecordType( @@ -84,7 +108,7 @@ def test_convert_record_cuteSV(): variant_ins_cutesv = convert_record(variant_cilen) assert len(variant_ins_cutesv) == 1 variant_ins_cutesv = variant_ins_cutesv[0] - assert variant_ins_cutesv.get('break2_position_start') == 1853342 + assert variant_ins_cutesv.get('break2_position_start') == 1853377 assert variant_ins_cutesv.get('break2_position_end') == 1853472 variant_cilen2 = VcfRecordType( @@ -109,5 +133,7 @@ def test_convert_record_cuteSV(): variant_ins_cutesv_2 = convert_record(variant_cilen2) assert len(variant_ins_cutesv_2) == 1 variant_ins_cutesv_2 = variant_ins_cutesv_2[0] - assert variant_ins_cutesv_2.get('break2_position_end') == 1853472 + assert variant_ins_cutesv_2.get('break1_position_start') == 1853377 assert variant_ins_cutesv_2.get('break1_position_end') == 1853472 + assert variant_ins_cutesv_2.get('break2_position_end') == 1853472 + assert variant_ins_cutesv_2.get('break2_position_start') == 1853377 From 81ff13c2a5bd712caba99e7e9e430bf32b6fbae7 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 7 Mar 2022 16:27:41 -0800 Subject: [PATCH 11/56] added to description and logic --- src/mavis/breakpoint.py | 9 ++++++ src/mavis/convert/vcf.py | 66 +++++++++++++++++++++++----------------- 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index 695f2416..b3010357 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -349,11 +349,13 @@ def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: """ uses the chr, orientations and strands to determine the possible structural_variant types that this pair could support + Args: pair (BreakpointPair): the pair to classify distance: if defined, will be passed to net size to use in narrowing the list of putative types (del vs ins) Returns: a list of possible SVTYPE + Example: >>> bpp = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) >>> BreakpointPair.classify(bpp) @@ -361,6 +363,7 @@ def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: >>> bpp = BreakpointPair(Breakpoint('1', 1, orient='L'), Breakpoint('1', 9999, orient='R'), opposing_strands=False) >>> BreakpointPair.classify(bpp) {'deletion', 'insertion'} + Note: see [related theory documentation](/background/theory/#classifying-events) """ @@ -437,16 +440,22 @@ def breakpoint_sequence_homology(self, reference_genome: ReferenceGenome): this sequence comparison is done with reference to a reference genome and does not use novel or untemplated sequence in the comparison. 
For this reason, insertions will never return any homologous sequence + + small duplication event CTT => CTTCTT + GATACATTTCTTCTTGAAAA reference ---------<========== first breakpoint ===========>-------- second breakpoint ---------CT-CT------ first break homology -------TT-TT-------- second break homology + Args: reference_genome: dict of reference sequence by template/chr name + Returns: Tuple[str,str]: homologous sequence at the first breakpoint and second breakpoints + Raises: AttributeError: for non specific breakpoints """ diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 683addc2..92066112 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,7 +123,7 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def handle_edge_case(std_row: Dict) -> Dict: +def convert_imprecise_break(std_row: Dict, info: Dict) -> Dict: """ Checks for known errors caused by callers in IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. @@ -133,31 +133,43 @@ def handle_edge_case(std_row: Dict) -> Dict: bp2_e = breakpoint2 end Insertion edge cases include: - Scenario 1: bp1_s = 1890, bp1_e = 2000, bp2_s = 1900, bp2_e = 1900. bp1_e > bp2_s - Scenario 2: bp1_s = 1890, bp1_e = 2000, bp2_s = 1800, bp2_e = 2000. bp1_e > bp1_s - Scenario 3: bp1_s = 1950, bp1_e = 2000, bp2_s = 1900, bp2_e = 3000. bp1_s > bp2_s + Scenario 1 - in which bp1_e > bp2_s + E.g bp1_s = 1890, bp1_e = 2000, bp2_s = 1900, bp2_e = 1900. + break1 ------------------------=======================-------------- + break2 ------------------------==========--------------------------- + + Scenario 2 - in which bp1_e > bp1_s + E.g bp1_s = 1890, bp1_e = 1800, bp2_s = 1800, bp2_e = 1800. + break1 ------------------------==----------------------------------- + break2 ------------------------=------------------------------------ + + Scenario 3 - in which bp1_s > bp2_s + E.g bp1_s = 1950, bp1_e = 2000, bp2_s = 1900, bp2_e = 3000. 
+ break1 ------------------------==----------------------------------- + break2 -----------------------========------------------------------ """ try: - if ( - std_row['break1_position_end'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # Scenario 1 - std_row.update({'break1_position_end': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # Scenario 2 - std_row.update({'break1_position_start': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_start'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # Scenario 2 - std_row.update({'break2_position_start': std_row['break1_position_start']}) - return std_row + if info.get('IMPRECISE', True): + if ( + std_row['break1_position_end'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # bp1_e > bp2_s + std_row.update({'break1_position_end': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # bp1_e > bp1_s + std_row.update({'break1_position_start': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_start'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row["event_type"] == 'INS' + ): # bp1_s > bp2_s + std_row.update({'break2_position_start': std_row['break1_position_start']}) + return std_row except KeyError as err: logger.warn(f'Missing required column inputs: {err}') @@ -237,8 +249,8 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end, } ) - elif ( - info.get('CILEN') != None + elif info.get( + 'CILEN' ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 std_row.update( { @@ -250,7 +262,6 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CILEN', (0, 0))[1], } ) - handle_edge_case(std_row) else: std_row.update( { @@ -262,8 +273,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], } ) - handle_edge_case(std_row) - + convert_imprecise_break(std_row, info) if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 std_row.update({'break1_position_end': 1}) From 59e0167d06c0e9f633c476b62a16c818cdb0e125 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 8 Mar 2022 11:19:29 -0800 Subject: [PATCH 12/56] Add missing trailing pipe character --- docs/inputs/sv_callers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/inputs/sv_callers.md b/docs/inputs/sv_callers.md index 521d76cd..4c5ae23e 100644 --- a/docs/inputs/sv_callers.md +++ b/docs/inputs/sv_callers.md @@ -41,7 +41,7 @@ The tools and versions currently supported are given below. 
Versions listed indi | [Pindel (Ye, 2009)](../../background/citations#ye-2009) | `0.2.5b9` | `Tools main output file(s)` | | [Sniffles (Sedlazeck, 2018)](../../background/citations#sedlazeck-2018) | `1.0.12b` | `*.vcf` | | [STAR-Fusion (Haas, 2017)](../../background/citations#haas-2017) | `1.4.0` | `star-fusion.fusion_predictions.abridged.tsv` | -| [Straglr (Chiu, 2021)](../../background/citations#chiu-2021) | | +| [Straglr (Chiu, 2021)](../../background/citations#chiu-2021) | | | | [Strelka (Saunders, 2012)](../../background/citations#saunders-2012) | `1.0.6` | `passed.somatic.indels.vcf` | | [Trans-ABySS (Robertson, 2010)](../../background/citations/#robertson-2010) | `1.4.8 (custom)` | `{indels/events_novel_exons,fusions/*}.tsv` | `.bed` | From 42585abf95c3370f45826b62746be44af66e076c Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Wed, 9 Mar 2022 21:34:03 -0800 Subject: [PATCH 13/56] imprecise calls handling put into one function --- src/mavis/convert/vcf.py | 105 ++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 92066112..d5c0f9b2 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,53 +123,80 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_imprecise_break(std_row: Dict, info: Dict) -> Dict: +def convert_imprecise_break(std_row: Dict, info: Dict, record: List) -> Dict: """ - Checks for known errors caused by callers in IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. + Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. bp1_s = breakpoint1 start bp1_e = breakpoint1 end bp2_s = breakpoint2 start bp2_e = breakpoint2 end - Insertion edge cases include: - Scenario 1 - in which bp1_e > bp2_s + Insertion and deletion edge case - in which bp1_e > bp2_s E.g bp1_s = 1890, bp1_e = 2000, bp2_s = 1900, bp2_e = 1900. break1 ------------------------=======================-------------- break2 ------------------------==========--------------------------- - Scenario 2 - in which bp1_e > bp1_s + Insertion edge case - in which bp1_e > bp1_s E.g bp1_s = 1890, bp1_e = 1800, bp2_s = 1800, bp2_e = 1800. break1 ------------------------==----------------------------------- break2 ------------------------=------------------------------------ - Scenario 3 - in which bp1_s > bp2_s + Insertion edge case - in which bp1_s > bp2_s E.g bp1_s = 1950, bp1_e = 2000, bp2_s = 1900, bp2_e = 3000. 
break1 ------------------------==----------------------------------- break2 -----------------------========------------------------------ """ try: - if info.get('IMPRECISE', True): - if ( - std_row['break1_position_end'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_e > bp2_s - std_row.update({'break1_position_end': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_e > bp1_s - std_row.update({'break1_position_start': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_start'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_s > bp2_s - std_row.update({'break2_position_start': std_row['break1_position_start']}) - return std_row + if info.get( + 'CILEN' + ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max( + 1, record.stop + info.get('CILEN', (0, 0))[0] + ), + COLUMNS.break2_position_end: record.stop + info.get('CILEN', (0, 0))[1], + } + ) + else: + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max( + 1, record.stop + info.get('CIEND', (0, 0))[0] + ), + COLUMNS.break2_position_end: record.stop + info.get('CIEND', (0, 0))[1], + } + ) + + if ( + std_row['break1_position_end'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and (std_row["event_type"] == 'INS') + ): # bp1_e > bp2_e + std_row.update({'break1_position_end': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_end'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and (std_row["event_type"] == 'INS') + ): # bp1_s > bp2_e + std_row.update({'break1_position_start': std_row['break2_position_end']}) + if ( + std_row['break1_position_start'] > std_row['break2_position_start'] + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and (std_row["event_type"] == 'INS') + ): # bp1_s > bp2_s + std_row.update({'break2_position_start': std_row['break1_position_start']}) + return std_row except KeyError as err: logger.warn(f'Missing required column inputs: {err}') @@ -249,31 +276,9 @@ def convert_record(record: VcfRecordType) -> List[Dict]: COLUMNS.break2_position_end: end, } ) - elif info.get( - 'CILEN' - ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 - std_row.update( - { - COLUMNS.break1_position_start: max( - 1, record.pos + info.get('CIPOS', (0, 0))[0] - ), - COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max(1, end + info.get('CILEN', (0, 0))[0]), - COLUMNS.break2_position_end: end + info.get('CILEN', (0, 0))[1], - } - ) else: - std_row.update( - { - COLUMNS.break1_position_start: max( - 1, record.pos + info.get('CIPOS', (0, 0))[0] - ), - 
COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max(1, end + info.get('CIEND', (0, 0))[0]), - COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], - } - ) - convert_imprecise_break(std_row, info) + convert_imprecise_break(std_row, info, record) + if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 std_row.update({'break1_position_end': 1}) From 32f7bec264c5bb49367ae2bba942ad5be6665977 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 14 Mar 2022 15:07:57 -0700 Subject: [PATCH 14/56] fixes for ins handling --- src/mavis/breakpoint.py | 3 +++ src/mavis/convert/vcf.py | 24 +++++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index b3010357..6d29fa4b 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -39,6 +39,7 @@ def __init__( orient (ORIENT): the orientation (which side is retained at the break) strand (STRAND): the strand seq: the seq + Examples: >>> Breakpoint('1', 1, 2) >>> Breakpoint('1', 1) @@ -250,8 +251,10 @@ def __init__( opposing_strands: are the strands at the breakpoint opposite? i.e. +/- instead of +/+ untemplated_seq: seq between the breakpoints that is not part of either breakpoint data: optional dictionary of attributes associated with this pair + Note: untemplated_seq should always be given wrt to the positive/forward reference strand + Example: >>> BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) >>> BreakpointPair(Breakpoint('1', 1, strand='+'), Breakpoint('1', 9999, strand='-')) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index d5c0f9b2..8e723edb 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,7 +123,9 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_imprecise_break(std_row: Dict, info: Dict, record: List) -> Dict: +def convert_imprecise_break( + std_row: Dict, info: Dict, record: List[VcfRecordType], bp_end: int +) -> Dict: """ Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. 
@@ -158,10 +160,8 @@ def convert_imprecise_break(std_row: Dict, info: Dict, record: List) -> Dict: 1, record.pos + info.get('CIPOS', (0, 0))[0] ), COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max( - 1, record.stop + info.get('CILEN', (0, 0))[0] - ), - COLUMNS.break2_position_end: record.stop + info.get('CILEN', (0, 0))[1], + COLUMNS.break2_position_start: max(1, bp_end + info.get('CILEN', (0, 0))[0]), + COLUMNS.break2_position_end: bp_end + info.get('CILEN', (0, 0))[1], } ) else: @@ -171,29 +171,27 @@ def convert_imprecise_break(std_row: Dict, info: Dict, record: List) -> Dict: 1, record.pos + info.get('CIPOS', (0, 0))[0] ), COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max( - 1, record.stop + info.get('CIEND', (0, 0))[0] - ), - COLUMNS.break2_position_end: record.stop + info.get('CIEND', (0, 0))[1], + COLUMNS.break2_position_start: max(1, bp_end + info.get('CIEND', (0, 0))[0]), + COLUMNS.break2_position_end: bp_end + info.get('CIEND', (0, 0))[1], } ) if ( std_row['break1_position_end'] > std_row['break2_position_end'] and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and (std_row["event_type"] == 'INS') + and std_row["event_type"] == 'INS' ): # bp1_e > bp2_e std_row.update({'break1_position_end': std_row['break2_position_end']}) if ( std_row['break1_position_start'] > std_row['break2_position_end'] and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and (std_row["event_type"] == 'INS') + and std_row["event_type"] == 'INS' ): # bp1_s > bp2_e std_row.update({'break1_position_start': std_row['break2_position_end']}) if ( std_row['break1_position_start'] > std_row['break2_position_start'] and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and (std_row["event_type"] == 'INS') + and std_row["event_type"] == 'INS' ): # bp1_s > bp2_s std_row.update({'break2_position_start': std_row['break1_position_start']}) return std_row @@ -277,7 +275,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: } ) else: - convert_imprecise_break(std_row, info, record) + convert_imprecise_break(std_row, info, record, end) if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 From ef2dfeb0240d1559279ae285e784a5db3206a089 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 14 Mar 2022 17:49:44 -0700 Subject: [PATCH 15/56] removed info signature --- src/mavis/convert/vcf.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 8e723edb..20ebd8b1 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,9 +123,7 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_imprecise_break( - std_row: Dict, info: Dict, record: List[VcfRecordType], bp_end: int -) -> Dict: +def convert_imprecise_break(std_row: Dict, record: List[VcfRecordType], bp_end: int) -> Dict: """ Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. 
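# Illustrative aside, not part of the patch itself: which INFO field drives the
# second breakpoint's confidence interval. Mirroring the branch in the
# surrounding hunks, CILEN, when present, takes priority over CIEND
# (https://github.com/samtools/hts-specs/issues/615). The helper name break2_ci
# is hypothetical.

def break2_ci(info):
    return info['CILEN'] if info.get('CILEN') else info.get('CIEND', (0, 0))

assert break2_ci({'CILEN': (-65, 65), 'CIEND': (-10, 10)}) == (-65, 65)
assert break2_ci({'CIEND': (-10, 10)}) == (-10, 10)
assert break2_ci({}) == (0, 0)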
@@ -151,28 +149,32 @@ def convert_imprecise_break( """ try: - if info.get( + if record.info.get( 'CILEN' ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 std_row.update( { COLUMNS.break1_position_start: max( - 1, record.pos + info.get('CIPOS', (0, 0))[0] + 1, record.pos + record.info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max( + 1, bp_end + record.info.get('CILEN', (0, 0))[0] ), - COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max(1, bp_end + info.get('CILEN', (0, 0))[0]), - COLUMNS.break2_position_end: bp_end + info.get('CILEN', (0, 0))[1], + COLUMNS.break2_position_end: bp_end + record.info.get('CILEN', (0, 0))[1], } ) else: std_row.update( { COLUMNS.break1_position_start: max( - 1, record.pos + info.get('CIPOS', (0, 0))[0] + 1, record.pos + record.info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max( + 1, bp_end + record.info.get('CIEND', (0, 0))[0] ), - COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max(1, bp_end + info.get('CIEND', (0, 0))[0]), - COLUMNS.break2_position_end: bp_end + info.get('CIEND', (0, 0))[1], + COLUMNS.break2_position_end: bp_end + record.info.get('CIEND', (0, 0))[1], } ) @@ -275,7 +277,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: } ) else: - convert_imprecise_break(std_row, info, record, end) + convert_imprecise_break(std_row, record, end) if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 From 939f56323c1b0d4a5c992bb708500f6e3cf96684 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 29 Mar 2022 18:25:48 -0700 Subject: [PATCH 16/56] shorten code --- src/mavis/convert/vcf.py | 98 ++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 53 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 20ebd8b1..7b6eb53b 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from typing import Dict, List, Optional, Tuple - +from ..interval import Interval import pandas as pd try: @@ -123,7 +123,7 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_imprecise_break(std_row: Dict, record: List[VcfRecordType], bp_end: int) -> Dict: +def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_end: int) -> Dict: """ Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. 
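# Illustrative aside, not part of the patch itself: the clamping this refactor
# applies when the two confidence windows overlap on the same chromosome for a
# non-BND call (the module also applies it when break1 starts past break2).
# MAVIS uses its own Interval.overlaps; the tuple-based helpers below are
# hypothetical stand-ins so the sketch runs on its own.

def overlaps(a, b):
    return a[0] <= b[1] and b[0] <= a[1]

def clamp(break1, break2):
    """Force break1 to end no later than break2 when the windows overlap."""
    if overlaps(break1, break2):
        break2 = (max(break1[0], break2[0]), break2[1])
        break1 = (min(break1[0], break2[1]), min(break1[1], break2[1]))
    return break1, break2

# e.g. windows (970, 1030) and (935, 1065) overlap and are clamped to:
assert clamp((970, 1030), (935, 1065)) == ((970, 1030), (970, 1065))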
@@ -148,58 +148,50 @@ def convert_imprecise_break(std_row: Dict, record: List[VcfRecordType], bp_end: break2 -----------------------========------------------------------ """ - try: - if record.info.get( - 'CILEN' - ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 - std_row.update( - { - COLUMNS.break1_position_start: max( - 1, record.pos + record.info.get('CIPOS', (0, 0))[0] - ), - COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max( - 1, bp_end + record.info.get('CILEN', (0, 0))[0] - ), - COLUMNS.break2_position_end: bp_end + record.info.get('CILEN', (0, 0))[1], - } + if record.info.get( + 'CILEN' + ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + record.info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max(1, bp_end + record.info.get('CILEN', (0, 0))[0]), + COLUMNS.break2_position_end: bp_end + record.info.get('CILEN', (0, 0))[1], + } + ) + else: + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + record.info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max(1, bp_end + record.info.get('CIEND', (0, 0))[0]), + COLUMNS.break2_position_end: bp_end + record.info.get('CIEND', (0, 0))[1], + } + ) + + if std_row["break1_chromosome"] == std_row["break2_chromosome"] and ( + Interval.overlaps( + (std_row['break1_position_start'], std_row['break1_position_end']), + (std_row['break2_position_start'], std_row['break2_position_end']), + ) + or std_row['break1_position_start'] > std_row['break2_position_start'] + ): + if 'event_type' in std_row and std_row['event_type'] != 'BND': + std_row['break2_position_start'] = max( + std_row['break1_position_start'], std_row['break2_position_start'] ) - else: - std_row.update( - { - COLUMNS.break1_position_start: max( - 1, record.pos + record.info.get('CIPOS', (0, 0))[0] - ), - COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max( - 1, bp_end + record.info.get('CIEND', (0, 0))[0] - ), - COLUMNS.break2_position_end: bp_end + record.info.get('CIEND', (0, 0))[1], - } + std_row['break1_position_end'] = min( + std_row['break1_position_end'], std_row['break2_position_end'] + ) + std_row['break1_position_start'] = min( + std_row['break1_position_start'], std_row['break2_position_end'] ) - if ( - std_row['break1_position_end'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_e > bp2_e - std_row.update({'break1_position_end': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_end'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_s > bp2_e - std_row.update({'break1_position_start': std_row['break2_position_end']}) - if ( - std_row['break1_position_start'] > std_row['break2_position_start'] - and std_row["break1_chromosome"] == std_row["break2_chromosome"] - and std_row["event_type"] == 'INS' - ): # bp1_s > bp2_s - std_row.update({'break2_position_start': std_row['break1_position_start']}) - return 
std_row - - except KeyError as err: - logger.warn(f'Missing required column inputs: {err}') + return std_row def convert_record(record: VcfRecordType) -> List[Dict]: @@ -277,7 +269,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: } ) else: - convert_imprecise_break(std_row, record, end) + convert_imprecise_breakend(std_row, record, end) if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 @@ -324,7 +316,7 @@ def parse_info(info_field): return info df['info'] = df['INFO'].apply(parse_info) - df['alts'] = df['ALT'].apply(lambda a: a.split(',')) + df['alts'] = df['ALT'].apply(lambda a: str(a).split(',')) rows = [] for _, row in df.iterrows(): From e76c27f20d1068a8a2c57540fb97664aaf4dca35 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 19 Apr 2022 11:02:45 -0700 Subject: [PATCH 17/56] changed naming and updated breakend case in vcf conversion --- src/mavis/convert/vcf.py | 8 +- tests/test_mavis/convert/test_tools_vcf.py | 106 ++++++++++++--------- 2 files changed, 66 insertions(+), 48 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 7b6eb53b..3a5f1c35 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -123,7 +123,7 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) -def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_end: int) -> Dict: +def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_end: int): """ Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. @@ -179,6 +179,7 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en (std_row['break2_position_start'], std_row['break2_position_end']), ) or std_row['break1_position_start'] > std_row['break2_position_start'] + or std_row['break2_position_start'] > std_row['break2_position_end'] ): if 'event_type' in std_row and std_row['event_type'] != 'BND': std_row['break2_position_start'] = max( @@ -190,8 +191,9 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en std_row['break1_position_start'] = min( std_row['break1_position_start'], std_row['break2_position_end'] ) - - return std_row + std_row['break2_position_start'] = min( + std_row['break2_position_start'], std_row['break2_position_end'] + ) def convert_record(record: VcfRecordType) -> List[Dict]: diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index 563f8d2c..aa822ed3 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -10,7 +10,7 @@ def test_read_vcf(): assert df.shape[0] == 106 -def test_convert_record_sniffle(): +def test_convert_telomeric_region(): variant_imprecise = VcfRecordType( id='mock-BND-imprecise', pos=0, @@ -60,39 +60,36 @@ def test_convert_record_sniffle(): assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' - variant_cilen3 = VcfRecordType( - id='Sniffle.INS', - pos=11184, - chrom='chr2', - alts=[ - 'AGGAGGCGCACCGGGGCACCGCGCAGGCGCAGAGAGGCGCACCGCGCCCGCGCAGGCGCAGAGAGGCGCACCGCGCCCGCGCAGGCGGCGAGAGAGGCGCGACCGCGCCCGCGTAGGCGCAGAGAGGCGCACCGCGCCCCGCGCAGGCGCAGGCGCACCCGCGCCCGCGCAGGCGCAGAGAGGCGTGACCCGCGCCCGCGCAGG' - ], - ref='N', + +def 
test_convert_intrachromosomal_imprecise_breakend(): + # breakpoint_2_start < breakpoint_1_start + variant_imprecise = VcfRecordType( + id='vcf-cuteSV.INS', + pos=1853407, + chrom='chr5', + alts=['AGG'], + ref='A', info=VcfInfoType( - CHR2="chr2", + CHR2="chr5", IMPRECISE=True, - SVMETHOD="Snifflesv1.0.12", + SVMETHOD="cuteSV-1.0.12", SVTYPE="INS", - SUPTYPE="AL", - STRANDS="+-", - END=11183, + CIPOS=(-30, 30), + CILEN=(-65, 65), ), ) - variant_ins_cutesv_3 = convert_record(variant_cilen3) - assert len(variant_ins_cutesv_3) == 1 - variant_ins_cutesv_3 = variant_ins_cutesv_3[0] - assert variant_ins_cutesv_3.get('break2_position_end') == 11183 - assert variant_ins_cutesv_3.get('break1_position_end') == 11183 - + variant_ins_imprecise = convert_record(variant_imprecise) + assert len(variant_ins_imprecise) == 1 + variant_ins_imprecise = variant_ins_imprecise[0] + assert variant_ins_imprecise.get('break2_position_start') == 1853377 + assert variant_ins_imprecise.get('break2_position_end') == 1853472 -def test_convert_record_cuteSV(): - variant_cilen = VcfRecordType( + # breakpoint_1_end > breakpoint_2_end + variant_imprecise2 = VcfRecordType( id='vcf-cuteSV.INS', pos=1853407, chrom='chr5', - alts=[ - 'AGGATCTATGTGGCTGTTGCAGGGTGACCCGAGGTCACGAGAGGCAAGGTCAGAGGACGATGTGAGGGCTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGAGACCCAGGTCACGCAGGCAAGGTCAGAGGACGATGTGAGGGAGTTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGTGACCCGAGGTCA' - ], + alts=['AGG'], ref='A', info=VcfInfoType( CHR2="chr5", @@ -101,39 +98,58 @@ def test_convert_record_cuteSV(): SVTYPE="INS", SUPTYPE="None", STRANDS="None", - CIPOS=(-30, 30), + CIPOS=(-30, 9999), CILEN=(-65, 65), ), ) - variant_ins_cutesv = convert_record(variant_cilen) - assert len(variant_ins_cutesv) == 1 - variant_ins_cutesv = variant_ins_cutesv[0] - assert variant_ins_cutesv.get('break2_position_start') == 1853377 - assert variant_ins_cutesv.get('break2_position_end') == 1853472 + variant_ins_imprecise_2 = convert_record(variant_imprecise2) + assert len(variant_ins_imprecise_2) == 1 + variant_ins_imprecise_2 = variant_ins_imprecise_2[0] + assert variant_ins_imprecise_2.get('break1_position_start') == 1853377 + assert variant_ins_imprecise_2.get('break1_position_end') == 1853472 + assert variant_ins_imprecise_2.get('break2_position_end') == 1853472 + assert variant_ins_imprecise_2.get('break2_position_start') == 1853377 - variant_cilen2 = VcfRecordType( - id='vcf-cuteSV.INS', + # breakpoint_2_start > breakpoint_2_end + variant_imprecise3 = VcfRecordType( + id='mock-INS-imprecise', pos=1853407, chrom='chr5', - alts=[ - 'AGGATCTATGTGGCTGTTGCAGGGTGACCCGAGGTCACGAGAGGCAAGGTCAGAGGACGATGTGAGGGCTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGAGACCCAGGTCACGCAGGCAAGGTCAGAGGACGATGTGAGGGAGTTGCAGGGTGACCCGAGGTCACGTAGGGCAAGGTCAGAGGACGATGTGGCGGTTGCAGGGTGACCCGAGGTCA' - ], + alts=['AGG'], ref='A', info=VcfInfoType( CHR2="chr5", IMPRECISE=True, - SVMETHOD="cuteSV-1.0.12", + SVMETHOD="Snifflesv1.0.11", SVTYPE="INS", SUPTYPE="None", STRANDS="None", CIPOS=(-30, 9999), - CILEN=(-65, 65), + CILEN=(70, 65), + ), + ) + variant_ins_imprecise_3 = convert_record(variant_imprecise3) + assert len(variant_ins_imprecise_3) == 1 + variant_ins_imprecise_3 = variant_ins_imprecise_3[0] + assert variant_ins_imprecise_3.get('break2_position_end') == 1853472 + assert variant_ins_imprecise_3.get('break2_position_start') == 1853472 + + # breakpoint_1_start > breakpoint_1_end + variant_cilen4 = VcfRecordType( + id='Sniffle.INS', + pos=11184, + chrom='chr2', + alts=['AGG'], + 
ref='N', + info=VcfInfoType( + CHR2="chr2", + IMPRECISE=True, + SVTYPE="INS", + END=11183, ), ) - variant_ins_cutesv_2 = convert_record(variant_cilen2) - assert len(variant_ins_cutesv_2) == 1 - variant_ins_cutesv_2 = variant_ins_cutesv_2[0] - assert variant_ins_cutesv_2.get('break1_position_start') == 1853377 - assert variant_ins_cutesv_2.get('break1_position_end') == 1853472 - assert variant_ins_cutesv_2.get('break2_position_end') == 1853472 - assert variant_ins_cutesv_2.get('break2_position_start') == 1853377 + variant_ins_imprecise_4 = convert_record(variant_cilen4) + assert len(variant_ins_imprecise_4) == 1 + variant_ins_imprecise_4 = variant_ins_imprecise_4[0] + assert variant_ins_imprecise_4.get('break2_position_end') == 11183 + assert variant_ins_imprecise_4.get('break1_position_end') == 11183 From de14f6a5db6df631c6dc0d437ff0e077c44f4030 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 19 Apr 2022 11:16:14 -0700 Subject: [PATCH 18/56] added warning to improper formatting --- src/mavis/convert/vcf.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 3a5f1c35..434353af 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -179,7 +179,6 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en (std_row['break2_position_start'], std_row['break2_position_end']), ) or std_row['break1_position_start'] > std_row['break2_position_start'] - or std_row['break2_position_start'] > std_row['break2_position_end'] ): if 'event_type' in std_row and std_row['event_type'] != 'BND': std_row['break2_position_start'] = max( @@ -195,6 +194,18 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en std_row['break2_position_start'], std_row['break2_position_end'] ) + elif ( + std_row["break1_chromosome"] == std_row["break2_chromosome"] + and std_row['break2_position_start'] > std_row['break2_position_end'] + ): + if 'event_type' in std_row and std_row['event_type'] != 'BND': + std_row['break2_position_start'] = min( + std_row['break2_position_start'], std_row['break2_position_end'] + ) + logger.warning( + f'Improper entry. Breakpoint2 start: {std_row["break2_position_start"]} is greater than Breakpoint2 end: {std_row["break2_position_end"]}\n Breakpoint2 start position coerced to breakpoint2 end position.' 
+ ) + def convert_record(record: VcfRecordType) -> List[Dict]: """ From d335c1f4034f85b8a9dce70aaadde76baca924a2 Mon Sep 17 00:00:00 2001 From: Caleb Choo Date: Wed, 20 Apr 2022 16:17:41 -0700 Subject: [PATCH 19/56] #325 Adding arriba support --- mavis/tools/__init__.py | 5 ++++ mavis/tools/arriba.py | 37 ++++++++++++++++++++++++ mavis/tools/constants.py | 1 + tests/unit/test_tool.py | 62 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 105 insertions(+) create mode 100644 mavis/tools/arriba.py diff --git a/mavis/tools/__init__.py b/mavis/tools/__init__.py index db7be5cc..c5c7c9ff 100644 --- a/mavis/tools/__init__.py +++ b/mavis/tools/__init__.py @@ -13,6 +13,7 @@ from .cnvnator import convert_row as _parse_cnvnator from .vcf import convert_file as read_vcf from .breakdancer import convert_file as _convert_breakdancer_file +from .arriba import convert_row as _parse_arriba from .starfusion import convert_row as _parse_starfusion from .chimerascan import convert_row as _parse_chimerascan @@ -142,6 +143,10 @@ def _convert_tool_row(row, file_type, stranded, assume_no_untemplated=True): {k: v for k, v in row.items() if k not in {'Type', 'Chr1', 'Chr2', 'Pos1', 'Pos2'}} ) + elif file_type == SUPPORTED_TOOL.ARRIBA: + + std_row.update(_parse_arriba(row)) + else: raise NotImplementedError('unsupported file type', file_type) diff --git a/mavis/tools/arriba.py b/mavis/tools/arriba.py new file mode 100644 index 00000000..17c19132 --- /dev/null +++ b/mavis/tools/arriba.py @@ -0,0 +1,37 @@ +from ..constants import COLUMNS, ORIENT, STRAND + +from .constants import TRACKING_COLUMN, SUPPORTED_TOOL + +def get_orient(string): + if string == "downstream": + return ORIENT.LEFT + elif string == "upstream": + return ORIENT.RIGHT + return ORIENT.NS + +def convert_row(row): + """ + transforms the aribba output into the common format for expansion. 
Maps the input column + names to column names which MAVIS can read + """ + std_row = {} + + try: + std_row[COLUMNS.break1_chromosome], b1_start = row["breakpoint1"].split(":") + std_row[COLUMNS.break2_chromosome], b2_start = row["breakpoint2"].split(":") + + std_row[COLUMNS.break1_strand] = row["strand1(gene/fusion)"].split("/")[1] + std_row[COLUMNS.break2_strand] = row["strand2(gene/fusion)"].split("/")[1] + std_row[COLUMNS.event_type] = row["type"].split("/")[0] + std_row[COLUMNS.break1_orientation] = get_orient(row["direction1"]) + std_row[COLUMNS.break2_orientation] = get_orient(row["direction2"]) + + std_row[COLUMNS.break1_position_start] = std_row[COLUMNS.break1_position_end] = b1_start + std_row[COLUMNS.break2_position_start] = std_row[COLUMNS.break2_position_end] = b2_start + except (ValueError, TypeError): + raise AssertionError( + 'Could not parse the breakpoint from the Arriba row: {}, {}'.format( + row['breakpoint1'], row['breakpoint2'] + ) + ) + return std_row diff --git a/mavis/tools/constants.py b/mavis/tools/constants.py index 0969bf3e..e828ce3b 100644 --- a/mavis/tools/constants.py +++ b/mavis/tools/constants.py @@ -9,6 +9,7 @@ MAVIS='mavis', DEFUSE='defuse', BREAKDANCER='breakdancer', + ARRIBA='arriba', VCF='vcf', BREAKSEQ='breakseq', CNVNATOR='cnvnator', diff --git a/tests/unit/test_tool.py b/tests/unit/test_tool.py index c0aa5ab2..2b43614d 100644 --- a/tests/unit/test_tool.py +++ b/tests/unit/test_tool.py @@ -105,6 +105,68 @@ def test_convert_duplication(self): self.assertEqual('1', bpp.break2.chr) + +class TestArriba(unittest.TestCase): + def test_convert_standard_event(self): + row = { + 'breakpoint1': '13:114529969', + 'breakpoint2': '13:114751269', + 'type': 'inversion', + 'strand1(gene/fusion)': '+/+', + 'strand2(gene/fusion)': '-/-', + 'direction1': 'downstream', + 'direction2': 'downstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + self.assertEqual(1, len(bpp_list)) + bpp = bpp_list[0] + self.assertEqual('13', bpp.break1.chr) + self.assertEqual('13', bpp.break2.chr) + self.assertEqual(114529969, bpp.break1.start) + self.assertEqual(114751269, bpp.break2.start) + self.assertEqual(SVTYPE.INV, bpp.event_type) + self.assertEqual('L', bpp.break1.orient) + self.assertEqual('L', bpp.break2.orient) + self.assertEqual(True, bpp.opposing_strands) + + def test_convert_translocation(self): + row = { + 'breakpoint1': '17:69313092', + 'breakpoint2': '20:58272875', + 'type': 'translocation', + 'strand1(gene/fusion)': '-/-', + 'strand2(gene/fusion)': '-/-', + 'direction1': 'upstream', + 'direction2': 'downstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + self.assertEqual(1, len(bpp_list)) + bpp = bpp_list[0] + self.assertEqual('chr17', bpp.break1.chr) + self.assertEqual('chr20', bpp.break2.chr) + self.assertEqual(69313092, bpp.break1.start) + self.assertEqual(58272875, bpp.break2.start) + self.assertEqual(SVTYPE.TRANS, bpp.event_type) + self.assertEqual('R', bpp.break1.orient) + self.assertEqual('L', bpp.break2.orient) + self.assertEqual(False, bpp.opposing_strands) + + + def test_malformed(self): + row = { + 'breakpoint1': '', + 'breakpoint2': None, + 'type': 'translocation', + 'strand1(gene/fusion)': '-/-', + 'strand2(gene/fusion)': '-/-', + 'direction1': 'upstream', + 'direction2': 'downstream', + } + with self.assertRaises(AssertionError): + _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, False) + class TestStarFusion(unittest.TestCase): def test_convert_standard_event(self): row = { From 
fd413fa2c443103602258e7199008a972b991ed2 Mon Sep 17 00:00:00 2001 From: Caleb Choo Date: Wed, 20 Apr 2022 17:02:57 -0700 Subject: [PATCH 20/56] fix lint on arriba.py --- mavis/tools/arriba.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mavis/tools/arriba.py b/mavis/tools/arriba.py index 17c19132..e523bfd6 100644 --- a/mavis/tools/arriba.py +++ b/mavis/tools/arriba.py @@ -2,6 +2,7 @@ from .constants import TRACKING_COLUMN, SUPPORTED_TOOL + def get_orient(string): if string == "downstream": return ORIENT.LEFT @@ -9,6 +10,7 @@ def get_orient(string): return ORIENT.RIGHT return ORIENT.NS + def convert_row(row): """ transforms the aribba output into the common format for expansion. Maps the input column @@ -30,8 +32,8 @@ def convert_row(row): std_row[COLUMNS.break2_position_start] = std_row[COLUMNS.break2_position_end] = b2_start except (ValueError, TypeError): raise AssertionError( - 'Could not parse the breakpoint from the Arriba row: {}, {}'.format( - row['breakpoint1'], row['breakpoint2'] + "Could not parse the breakpoint from the Arriba row: {}, {}".format( + row["breakpoint1"], row["breakpoint2"] ) ) return std_row From 4e6f5dd9844f855b3fa13b4ca10eb9a9819a59b1 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Fri, 22 Apr 2022 14:36:07 -0700 Subject: [PATCH 21/56] Clean up tests to split into separate cases --- tests/test_mavis/convert/test_tools_vcf.py | 106 +++++++++------------ 1 file changed, 47 insertions(+), 59 deletions(-) diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index aa822ed3..605508e9 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -1,3 +1,4 @@ +import pytest from mavis.convert import SUPPORTED_TOOL, _convert_tool_row from mavis.convert.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf @@ -61,33 +62,44 @@ def test_convert_telomeric_region(): assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' -def test_convert_intrachromosomal_imprecise_breakend(): - # breakpoint_2_start < breakpoint_1_start - variant_imprecise = VcfRecordType( - id='vcf-cuteSV.INS', - pos=1853407, - chrom='chr5', - alts=['AGG'], - ref='A', - info=VcfInfoType( - CHR2="chr5", - IMPRECISE=True, - SVMETHOD="cuteSV-1.0.12", - SVTYPE="INS", - CIPOS=(-30, 30), - CILEN=(-65, 65), - ), - ) - variant_ins_imprecise = convert_record(variant_imprecise) - assert len(variant_ins_imprecise) == 1 - variant_ins_imprecise = variant_ins_imprecise[0] - assert variant_ins_imprecise.get('break2_position_start') == 1853377 - assert variant_ins_imprecise.get('break2_position_end') == 1853472 +TEST_POS = 1853407 - # breakpoint_1_end > breakpoint_2_end - variant_imprecise2 = VcfRecordType( + +@pytest.mark.parametrize( + 'pos,break1_ci,break2_ci,break1,break2', + [ + [ + TEST_POS, + (-30, 30), + (-65, 65), + (TEST_POS - 30, TEST_POS + 30), + (TEST_POS - 30, TEST_POS + 65), + ], + [ + TEST_POS, + (-30, 99999), + (-10, 65), + (TEST_POS - 30, TEST_POS + 65), + (TEST_POS - 10, TEST_POS + 65), + ], + [ + TEST_POS, + (-30, 99999), + (70, 65), + (TEST_POS - 30, TEST_POS + 65), + (TEST_POS + 65, TEST_POS + 65), + ], + ], + ids=[ + 'breakpoint_2_start < breakpoint_1_start', + 'breakpoint_1_end > breakpoint_2_end', + 'breakpoint_2_start > breakpoint_2_end', + ], +) +def test_convert_intrachromosomal_imprecise_breakend(pos, break1_ci, break2_ci, break1, break2): + variant_vcf = VcfRecordType( id='vcf-cuteSV.INS', - pos=1853407, + pos=pos, chrom='chr5', 
alts=['AGG'], ref='A', @@ -96,44 +108,20 @@ def test_convert_intrachromosomal_imprecise_breakend(): IMPRECISE=True, SVMETHOD="cuteSV-1.0.12", SVTYPE="INS", - SUPTYPE="None", - STRANDS="None", - CIPOS=(-30, 9999), - CILEN=(-65, 65), + CIPOS=break1_ci, + CILEN=break2_ci, ), ) - variant_ins_imprecise_2 = convert_record(variant_imprecise2) - assert len(variant_ins_imprecise_2) == 1 - variant_ins_imprecise_2 = variant_ins_imprecise_2[0] - assert variant_ins_imprecise_2.get('break1_position_start') == 1853377 - assert variant_ins_imprecise_2.get('break1_position_end') == 1853472 - assert variant_ins_imprecise_2.get('break2_position_end') == 1853472 - assert variant_ins_imprecise_2.get('break2_position_start') == 1853377 + result = convert_record(variant_vcf) + assert len(result) == 1 + variant = result[0] + assert variant.get('break1_position_start') == break1[0] + assert variant.get('break1_position_end') == break1[1] + assert variant.get('break2_position_start') == break2[0] + assert variant.get('break2_position_end') == break2[1] - # breakpoint_2_start > breakpoint_2_end - variant_imprecise3 = VcfRecordType( - id='mock-INS-imprecise', - pos=1853407, - chrom='chr5', - alts=['AGG'], - ref='A', - info=VcfInfoType( - CHR2="chr5", - IMPRECISE=True, - SVMETHOD="Snifflesv1.0.11", - SVTYPE="INS", - SUPTYPE="None", - STRANDS="None", - CIPOS=(-30, 9999), - CILEN=(70, 65), - ), - ) - variant_ins_imprecise_3 = convert_record(variant_imprecise3) - assert len(variant_ins_imprecise_3) == 1 - variant_ins_imprecise_3 = variant_ins_imprecise_3[0] - assert variant_ins_imprecise_3.get('break2_position_end') == 1853472 - assert variant_ins_imprecise_3.get('break2_position_start') == 1853472 +def test_convert_intrachromosomal_imprecise_breakend_no_ci(): # breakpoint_1_start > breakpoint_1_end variant_cilen4 = VcfRecordType( id='Sniffle.INS', From f6903c7c2de97f066d3c7dbedee60272678cd721 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 25 Apr 2022 13:13:58 -0700 Subject: [PATCH 22/56] drop calls which violate VCF norms --- src/mavis/convert/vcf.py | 41 +++++++++++----------- tests/test_mavis/convert/test_tools_vcf.py | 37 +++++++++++++------ 2 files changed, 48 insertions(+), 30 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 434353af..47d1646c 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -178,7 +178,6 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en (std_row['break1_position_start'], std_row['break1_position_end']), (std_row['break2_position_start'], std_row['break2_position_end']), ) - or std_row['break1_position_start'] > std_row['break2_position_start'] ): if 'event_type' in std_row and std_row['event_type'] != 'BND': std_row['break2_position_start'] = max( @@ -190,21 +189,27 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en std_row['break1_position_start'] = min( std_row['break1_position_start'], std_row['break2_position_end'] ) - std_row['break2_position_start'] = min( - std_row['break2_position_start'], std_row['break2_position_end'] - ) + if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: + # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 + std_row.update({'break1_position_end': 1}) + if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: + std_row.update({'break2_position_end': 1}) - elif ( - std_row["break1_chromosome"] == std_row["break2_chromosome"] - and 
std_row['break2_position_start'] > std_row['break2_position_end'] + if std_row["break1_chromosome"] == std_row["break2_chromosome"] and ( + std_row['break2_position_start'] > std_row['break2_position_end'] + or std_row['break1_position_start'] > std_row['break1_position_end'] ): if 'event_type' in std_row and std_row['event_type'] != 'BND': - std_row['break2_position_start'] = min( - std_row['break2_position_start'], std_row['break2_position_end'] - ) - logger.warning( - f'Improper entry. Breakpoint2 start: {std_row["break2_position_start"]} is greater than Breakpoint2 end: {std_row["break2_position_end"]}\n Breakpoint2 start position coerced to breakpoint2 end position.' + logger.error( + f'Improper entry. One of the following breakpoints start is greater than breakpoint end:\n Breakpoint1_start: {std_row["break1_position_start"]}, Breakpoint1_end: {std_row["break1_position_end"]}\n Breakpoint2_start: {std_row["break2_position_start"]}, Breakpoint2_end: {std_row["break2_position_end"]}\n This call has been dropped.' ) + std_row.clear() + + if not None in (record.pos, record.info.get('END')) and record.pos > record.info.get('END'): + logger.error( + f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}).\n This call has been dropped.' + ) + std_row.clear() def convert_record(record: VcfRecordType) -> List[Dict]: @@ -283,13 +288,9 @@ def convert_record(record: VcfRecordType) -> List[Dict]: ) else: convert_imprecise_breakend(std_row, record, end) - - if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: - # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 - std_row.update({'break1_position_end': 1}) - if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: - std_row.update({'break2_position_end': 1}) - + print(std_row) + if len(std_row) == 0: + continue try: orient1, orient2 = info['CT'].split('to') connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} @@ -329,7 +330,7 @@ def parse_info(info_field): return info df['info'] = df['INFO'].apply(parse_info) - df['alts'] = df['ALT'].apply(lambda a: str(a).split(',')) + df['alts'] = df['ALT'].apply(lambda a: a.split(',')) rows = [] for _, row in df.iterrows(): diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index aa822ed3..a01b3372 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -110,7 +110,7 @@ def test_convert_intrachromosomal_imprecise_breakend(): assert variant_ins_imprecise_2.get('break2_position_end') == 1853472 assert variant_ins_imprecise_2.get('break2_position_start') == 1853377 - # breakpoint_2_start > breakpoint_2_end + # breakpoint_2_start > breakpoint_2_end (invalid entry) variant_imprecise3 = VcfRecordType( id='mock-INS-imprecise', pos=1853407, @@ -129,12 +129,32 @@ def test_convert_intrachromosomal_imprecise_breakend(): ), ) variant_ins_imprecise_3 = convert_record(variant_imprecise3) - assert len(variant_ins_imprecise_3) == 1 - variant_ins_imprecise_3 = variant_ins_imprecise_3[0] - assert variant_ins_imprecise_3.get('break2_position_end') == 1853472 - assert variant_ins_imprecise_3.get('break2_position_start') == 1853472 + print(variant_ins_imprecise_3) + assert len(variant_ins_imprecise_3) == 0 - # breakpoint_1_start > breakpoint_1_end + # breakpoint_1_start > breakpoint_1_end (invalid entry) + variant_imprecise4 = VcfRecordType( + 
id='mock-INS-imprecise', + pos=1853407, + chrom='chr5', + alts=['AGG'], + ref='A', + info=VcfInfoType( + CHR2="chr5", + IMPRECISE=True, + SVMETHOD="Snifflesv1.0.11", + SVTYPE="INS", + SUPTYPE="None", + STRANDS="None", + CIPOS=(999, 100), + CILEN=(65, 65), + ), + ) + variant_ins_imprecise_4 = convert_record(variant_imprecise4) + print(variant_ins_imprecise_4) + assert len(variant_ins_imprecise_4) == 0 + + # pos > end (invalid entry) variant_cilen4 = VcfRecordType( id='Sniffle.INS', pos=11184, @@ -149,7 +169,4 @@ def test_convert_intrachromosomal_imprecise_breakend(): ), ) variant_ins_imprecise_4 = convert_record(variant_cilen4) - assert len(variant_ins_imprecise_4) == 1 - variant_ins_imprecise_4 = variant_ins_imprecise_4[0] - assert variant_ins_imprecise_4.get('break2_position_end') == 11183 - assert variant_ins_imprecise_4.get('break1_position_end') == 11183 + assert len(variant_ins_imprecise_4) == 0 From 7528e4a897135dd4b9bdf4f6e0c61c798329fe22 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 25 Apr 2022 15:01:00 -0700 Subject: [PATCH 23/56] drop calls that violate VCF standards in convert vcf --- src/mavis/convert/vcf.py | 1 - tests/test_mavis/convert/test_convert.py | 11 ++ tests/test_mavis/convert/test_tools_vcf.py | 138 +++++++++------------ 3 files changed, 68 insertions(+), 82 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 47d1646c..f5ace9d0 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -288,7 +288,6 @@ def convert_record(record: VcfRecordType) -> List[Dict]: ) else: convert_imprecise_breakend(std_row, record, end) - print(std_row) if len(std_row) == 0: continue try: diff --git a/tests/test_mavis/convert/test_convert.py b/tests/test_mavis/convert/test_convert.py index 4470bc66..44b8aa24 100644 --- a/tests/test_mavis/convert/test_convert.py +++ b/tests/test_mavis/convert/test_convert.py @@ -1,3 +1,4 @@ +import itertools import os import shutil import sys @@ -47,6 +48,16 @@ def run_main(self, inputfile, file_type, strand_specific=False): result.setdefault(pair.data['tracking_id'], []).append(pair) return result + def test_straglr(self): + result = self.run_main(get_data('straglr.bed'), SUPPORTED_TOOL.STRAGLR, False) + assert len(result) == 9 + for bpp in itertools.chain(*result.values()): + assert bpp.break1.chr == bpp.break2.chr + assert bpp.break1.start == bpp.break2.start + assert bpp.break1.end == bpp.break2.end + assert bpp.event_type == SVTYPE.INS + assert bpp.untemplated_seq is None + def test_chimerascan(self): self.run_main(get_data('chimerascan_output.bedpe'), SUPPORTED_TOOL.CHIMERASCAN, False) diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index a01b3372..2c5dcd1f 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -1,3 +1,4 @@ +import pytest from mavis.convert import SUPPORTED_TOOL, _convert_tool_row from mavis.convert.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf @@ -61,100 +62,75 @@ def test_convert_telomeric_region(): assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' -def test_convert_intrachromosomal_imprecise_breakend(): - # breakpoint_2_start < breakpoint_1_start - variant_imprecise = VcfRecordType( - id='vcf-cuteSV.INS', - pos=1853407, - chrom='chr5', - alts=['AGG'], - ref='A', - info=VcfInfoType( - CHR2="chr5", - IMPRECISE=True, - SVMETHOD="cuteSV-1.0.12", - SVTYPE="INS", - CIPOS=(-30, 30), - CILEN=(-65, 65), - ), - ) - variant_ins_imprecise = 
convert_record(variant_imprecise) - assert len(variant_ins_imprecise) == 1 - variant_ins_imprecise = variant_ins_imprecise[0] - assert variant_ins_imprecise.get('break2_position_start') == 1853377 - assert variant_ins_imprecise.get('break2_position_end') == 1853472 +TEST_POS = 1853407 - # breakpoint_1_end > breakpoint_2_end - variant_imprecise2 = VcfRecordType( - id='vcf-cuteSV.INS', - pos=1853407, - chrom='chr5', - alts=['AGG'], - ref='A', - info=VcfInfoType( - CHR2="chr5", - IMPRECISE=True, - SVMETHOD="cuteSV-1.0.12", - SVTYPE="INS", - SUPTYPE="None", - STRANDS="None", - CIPOS=(-30, 9999), - CILEN=(-65, 65), - ), - ) - variant_ins_imprecise_2 = convert_record(variant_imprecise2) - assert len(variant_ins_imprecise_2) == 1 - variant_ins_imprecise_2 = variant_ins_imprecise_2[0] - assert variant_ins_imprecise_2.get('break1_position_start') == 1853377 - assert variant_ins_imprecise_2.get('break1_position_end') == 1853472 - assert variant_ins_imprecise_2.get('break2_position_end') == 1853472 - assert variant_ins_imprecise_2.get('break2_position_start') == 1853377 - # breakpoint_2_start > breakpoint_2_end (invalid entry) - variant_imprecise3 = VcfRecordType( - id='mock-INS-imprecise', - pos=1853407, +@pytest.mark.parametrize( + 'pos,break1_ci,break2_ci,break1,break2,ids', + [ + [ + TEST_POS, + (-30, 30), + (-65, 65), + (TEST_POS - 30, TEST_POS + 30), + (TEST_POS - 30, TEST_POS + 65), + 'vcf-cuteSV.INS.breakpoint_2_start < breakpoint_1_start', + ], + [ + TEST_POS, + (-30, 99999), + (-10, 65), + (TEST_POS - 30, TEST_POS + 65), + (TEST_POS - 10, TEST_POS + 65), + 'vcf-cuteSV.INS.breakpoint_1_end > breakpoint_2_end', + ], + [ + TEST_POS, + (-30, 99999), + (70, 65), + (TEST_POS - 30, TEST_POS + 65), + (TEST_POS + 65, TEST_POS + 65), + 'vcf-cuteSV.INS.breakpoint_2_start > breakpoint_2_end', + ], + ], + ids=[ + 'breakpoint_2_start < breakpoint_1_start', + 'breakpoint_1_end > breakpoint_2_end', + 'breakpoint_2_start > breakpoint_2_end', + ], +) +def test_convert_intrachromosomal_imprecise_breakend( + pos, break1_ci, break2_ci, break1, break2, ids +): + variant_vcf = VcfRecordType( + id=ids, + pos=pos, chrom='chr5', alts=['AGG'], ref='A', info=VcfInfoType( CHR2="chr5", IMPRECISE=True, - SVMETHOD="Snifflesv1.0.11", + SVMETHOD="cuteSV-1.0.12", SVTYPE="INS", - SUPTYPE="None", - STRANDS="None", - CIPOS=(-30, 9999), - CILEN=(70, 65), + CIPOS=break1_ci, + CILEN=break2_ci, ), ) - variant_ins_imprecise_3 = convert_record(variant_imprecise3) - print(variant_ins_imprecise_3) - assert len(variant_ins_imprecise_3) == 0 + result = convert_record(variant_vcf) + if ids == "vcf-cuteSV.INS.breakpoint_2_start > breakpoint_2_end": + assert len(result) == 0 + else: + assert len(result) == 1 + variant = result[0] + assert variant.get('break1_position_start') == break1[0] + assert variant.get('break1_position_end') == break1[1] + assert variant.get('break2_position_start') == break2[0] + assert variant.get('break2_position_end') == break2[1] - # breakpoint_1_start > breakpoint_1_end (invalid entry) - variant_imprecise4 = VcfRecordType( - id='mock-INS-imprecise', - pos=1853407, - chrom='chr5', - alts=['AGG'], - ref='A', - info=VcfInfoType( - CHR2="chr5", - IMPRECISE=True, - SVMETHOD="Snifflesv1.0.11", - SVTYPE="INS", - SUPTYPE="None", - STRANDS="None", - CIPOS=(999, 100), - CILEN=(65, 65), - ), - ) - variant_ins_imprecise_4 = convert_record(variant_imprecise4) - print(variant_ins_imprecise_4) - assert len(variant_ins_imprecise_4) == 0 - # pos > end (invalid entry) +def test_convert_intrachromosomal_imprecise_breakend_no_ci(): 
+ # breakpoint_1_start > breakpoint_1_end variant_cilen4 = VcfRecordType( id='Sniffle.INS', pos=11184, From f769ce38183307db972982bf42c8c84495998212 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 25 Apr 2022 17:04:45 -0700 Subject: [PATCH 24/56] intrachromosomal test change --- src/mavis/convert/vcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index f5ace9d0..30ac2fe9 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -205,7 +205,7 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en ) std_row.clear() - if not None in (record.pos, record.info.get('END')) and record.pos > record.info.get('END'): + if not None in (record.pos, record.info.get('END')) and record.pos > record.info.get('END') and std_row["break1_chromosome"] == std_row["break2_chromosome"]: logger.error( f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}).\n This call has been dropped.' ) From 197a8147d68716ac8910d3d175acec079f411108 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Mon, 25 Apr 2022 18:15:12 -0700 Subject: [PATCH 25/56] reformmated file --- src/mavis/convert/vcf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 30ac2fe9..66cbdf52 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -205,7 +205,11 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en ) std_row.clear() - if not None in (record.pos, record.info.get('END')) and record.pos > record.info.get('END') and std_row["break1_chromosome"] == std_row["break2_chromosome"]: + if ( + not None in (record.pos, record.info.get('END')) + and record.pos > record.info.get('END') + and std_row["break1_chromosome"] == std_row["break2_chromosome"] + ): logger.error( f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}).\n This call has been dropped.' 
) From 9e64e59b934bb5c4485d4da390f1ebd90a79ac03 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 26 Apr 2022 12:03:44 -0700 Subject: [PATCH 26/56] style changes and change error handling logic --- src/mavis/convert/vcf.py | 26 +++++----- tests/test_mavis/convert/test_tools_vcf.py | 56 ++++++++++++++++------ 2 files changed, 53 insertions(+), 29 deletions(-) diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index 66cbdf52..c6447af5 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -147,6 +147,7 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en break1 ------------------------==----------------------------------- break2 -----------------------========------------------------------ """ + is_intrachromosomal = std_row["break1_chromosome"] == std_row["break2_chromosome"] if record.info.get( 'CILEN' @@ -173,13 +174,13 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en } ) - if std_row["break1_chromosome"] == std_row["break2_chromosome"] and ( + if is_intrachromosomal and ( Interval.overlaps( (std_row['break1_position_start'], std_row['break1_position_end']), (std_row['break2_position_start'], std_row['break2_position_end']), ) ): - if 'event_type' in std_row and std_row['event_type'] != 'BND': + if std_row.get('event_type') != 'BND': std_row['break2_position_start'] = max( std_row['break1_position_start'], std_row['break2_position_start'] ) @@ -195,25 +196,23 @@ def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_en if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: std_row.update({'break2_position_end': 1}) - if std_row["break1_chromosome"] == std_row["break2_chromosome"] and ( + if is_intrachromosomal and ( std_row['break2_position_start'] > std_row['break2_position_end'] or std_row['break1_position_start'] > std_row['break1_position_end'] ): if 'event_type' in std_row and std_row['event_type'] != 'BND': - logger.error( - f'Improper entry. One of the following breakpoints start is greater than breakpoint end:\n Breakpoint1_start: {std_row["break1_position_start"]}, Breakpoint1_end: {std_row["break1_position_end"]}\n Breakpoint2_start: {std_row["break2_position_start"]}, Breakpoint2_end: {std_row["break2_position_end"]}\n This call has been dropped.' + raise ValueError( + f'Improper entry. One of the following breakpoints start is greater than breakpoint end: Breakpoint1_start: {std_row["break1_position_start"]}, Breakpoint1_end: {std_row["break1_position_end"]} Breakpoint2_start: {std_row["break2_position_start"]}, Breakpoint2_end: {std_row["break2_position_end"]} This call has been dropped.' ) - std_row.clear() if ( - not None in (record.pos, record.info.get('END')) + None not in (record.pos, record.info.get('END')) and record.pos > record.info.get('END') - and std_row["break1_chromosome"] == std_row["break2_chromosome"] + and is_intrachromosomal ): - logger.error( - f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}).\n This call has been dropped.' + raise ValueError( + f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}). This call has been dropped.' 
) - std_row.clear() def convert_record(record: VcfRecordType) -> List[Dict]: @@ -292,8 +291,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: ) else: convert_imprecise_breakend(std_row, record, end) - if len(std_row) == 0: - continue + try: orient1, orient2 = info['CT'].split('to') connection_type = {'3': ORIENT.LEFT, '5': ORIENT.RIGHT, 'N': ORIENT.NS} @@ -413,6 +411,6 @@ def convert_file(input_file: str) -> List[Dict]: for variant_record in convert_pandas_rows_to_variants(data): try: rows.extend(convert_record(variant_record)) - except NotImplementedError as err: + except (NotImplementedError, ValueError) as err: logging.warning(str(err)) return rows diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index 2c5dcd1f..b1cf72bd 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -84,6 +84,42 @@ def test_convert_telomeric_region(): (TEST_POS - 10, TEST_POS + 65), 'vcf-cuteSV.INS.breakpoint_1_end > breakpoint_2_end', ], + ], + ids=[ + 'breakpoint_2_start < breakpoint_1_start', + 'breakpoint_1_end > breakpoint_2_end', + ], +) +def test_convert_intrachromosomal_imprecise_breakend( + pos, break1_ci, break2_ci, break1, break2, ids +): + variant_vcf = VcfRecordType( + id=ids, + pos=pos, + chrom='chr5', + alts=['AGG'], + ref='A', + info=VcfInfoType( + CHR2="chr5", + IMPRECISE=True, + SVMETHOD="cuteSV-1.0.12", + SVTYPE="INS", + CIPOS=break1_ci, + CILEN=break2_ci, + ), + ) + result = convert_record(variant_vcf) + assert len(result) == 1 + variant = result[0] + assert variant.get('break1_position_start') == break1[0] + assert variant.get('break1_position_end') == break1[1] + assert variant.get('break2_position_start') == break2[0] + assert variant.get('break2_position_end') == break2[1] + + +@pytest.mark.parametrize( + 'pos,break1_ci,break2_ci,break1,break2,ids', + [ [ TEST_POS, (-30, 99999), @@ -94,12 +130,10 @@ def test_convert_telomeric_region(): ], ], ids=[ - 'breakpoint_2_start < breakpoint_1_start', - 'breakpoint_1_end > breakpoint_2_end', 'breakpoint_2_start > breakpoint_2_end', ], ) -def test_convert_intrachromosomal_imprecise_breakend( +def test_error_on_convert_intrachromosomal_imprecise_breakend( pos, break1_ci, break2_ci, break1, break2, ids ): variant_vcf = VcfRecordType( @@ -117,16 +151,8 @@ def test_convert_intrachromosomal_imprecise_breakend( CILEN=break2_ci, ), ) - result = convert_record(variant_vcf) - if ids == "vcf-cuteSV.INS.breakpoint_2_start > breakpoint_2_end": - assert len(result) == 0 - else: - assert len(result) == 1 - variant = result[0] - assert variant.get('break1_position_start') == break1[0] - assert variant.get('break1_position_end') == break1[1] - assert variant.get('break2_position_start') == break2[0] - assert variant.get('break2_position_end') == break2[1] + with pytest.raises(ValueError): + convert_record(variant_vcf) def test_convert_intrachromosomal_imprecise_breakend_no_ci(): @@ -144,5 +170,5 @@ def test_convert_intrachromosomal_imprecise_breakend_no_ci(): END=11183, ), ) - variant_ins_imprecise_4 = convert_record(variant_cilen4) - assert len(variant_ins_imprecise_4) == 0 + with pytest.raises(ValueError): + convert_record(variant_cilen4) From 5ad23ca94b2d2d671982be5277cbb9bcb03f0189 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 27 Apr 2022 15:01:57 -0700 Subject: [PATCH 27/56] Split read gathering methods into a separate module --- src/mavis/validate/base.py | 714 ------------------- src/mavis/validate/gather.py | 746 
++++++++++++++++++++ src/mavis/validate/main.py | 3 +- tests/test_mavis/validate/test_evidence.py | 15 +- tests/test_mavis/validate/test_validate2.py | 64 +- 5 files changed, 789 insertions(+), 753 deletions(-) create mode 100644 src/mavis/validate/gather.py diff --git a/src/mavis/validate/base.py b/src/mavis/validate/base.py index 65a70bc5..9df8a978 100644 --- a/src/mavis/validate/base.py +++ b/src/mavis/validate/base.py @@ -281,515 +281,6 @@ def supporting_reads(self): result.update(self.spanning_reads) return result - def collect_spanning_read(self, read: pysam.AlignedSegment): - """ - spanning read: a read covering BOTH breakpoints - - This is only applicable to small events. Do not need to look for soft clipped reads - here since they will be collected already - - Args: - read: the putative spanning read - - Returns: - bool: - - True: the read was collected and stored in the current evidence object - - False: the read was not collected - """ - if self.interchromosomal: - return False - elif not Interval.overlaps(self.inner_window1, self.inner_window2): - # too far apart to call spanning reads - return False - - if self.stranded: - strand = _read.sequenced_strand(read, self.strand_determining_read) - if strand != self.break1.strand and strand != self.break2.strand: - return False - - combined = self.inner_window1 & self.inner_window2 - read_interval = Interval(read.reference_start + 1, read.reference_end) - - if Interval.overlaps(combined, read_interval): - - if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - - read = self.standardize_read(read) - # in the correct position, now determine if it can support the event types - for event_type in self.putative_event_types(): - if event_type in [SVTYPE.DUP, SVTYPE.INS]: - if CIGAR.I in [c[0] for c in read.cigar]: - self.spanning_reads.add(read) - return True - elif event_type == SVTYPE.DEL: - if CIGAR.D in [c[0] for c in read.cigar]: - self.spanning_reads.add(read) - return True - elif event_type == SVTYPE.INV: - if CIGAR.X in [c[0] for c in read.cigar]: - return True - return False - - def collect_compatible_flanking_pair( - self, read: pysam.AlignedSegment, mate: pysam.AlignedSegment, compatible_type: str - ) -> bool: - """ - checks if a given read meets the minimum quality criteria to be counted as evidence as stored as support for - this event - - Args: - read: the read to add - mate: the mate - compatible_type (SVTYPE): the type we are collecting for - - Returns: - bool: - - True: the pair was collected and stored in the current evidence object - - False: the pair was not collected - Raises: - ValueError: if the input reads are not a valid pair - - Note: - see [theory - types of flanking evidence](/background/theory/#compatible-flanking-pairs) - """ - if ( - read.is_unmapped - or mate.is_unmapped - or read.query_name != mate.query_name - or read.is_read1 == mate.is_read1 - ): - raise ValueError('input reads must be a mapped and mated pair') - if not self.compatible_window1: - raise ValueError('compatible windows were not given') - if self.interchromosomal: - raise NotImplementedError( - 'interchromosomal events do not have compatible flanking pairs' - ) - # check that the references are right for the pair - if read.reference_id != read.next_reference_id: - return False - elif ( - read.mapping_quality < self.min_mapping_quality - or mate.mapping_quality < self.min_mapping_quality - ): - return False - if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not 
read.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - read = self.standardize_read(read) - if not mate.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not mate.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - mate = self.standardize_read(mate) - # order the read pairs so that they are in the same order that we expect for the breakpoints - if read.reference_start > mate.reference_start: - read, mate = mate, read - - if ( - self.bam_cache.get_read_reference_name(read) != self.break1.chr - or self.bam_cache.get_read_reference_name(mate) != self.break2.chr - ): - return False - - # check if this read falls in the first breakpoint window - iread = Interval(read.reference_start + 1, read.reference_end) - imate = Interval(mate.reference_start + 1, mate.reference_end) - - if self.stranded: - strand1 = _read.sequenced_strand(read, self.strand_determining_read) - strand2 = _read.sequenced_strand(mate, self.strand_determining_read) - if strand1 != self.break1.strand or strand2 != self.break2.strand: - return False - - # check that the pair orientation is correct - if not _read.orientation_supports_type(read, compatible_type): - return False - - # check that the fragment size is reasonable - fragment_size = self.compute_fragment_size(read, mate) - - if compatible_type == SVTYPE.DEL: - if fragment_size.end <= self.max_expected_fragment_size: - return False - elif compatible_type == SVTYPE.INS: - if fragment_size.start >= self.min_expected_fragment_size: - return False - - # check that the positions of the reads and the strands make sense - if Interval.overlaps(iread, self.compatible_window1) and Interval.overlaps( - imate, self.compatible_window2 - ): - self.compatible_flanking_pairs.add((read, mate)) - return True - - return False - - def collect_flanking_pair(self, read: pysam.AlignedSegment, mate: pysam.AlignedSegment): - """ - checks if a given read meets the minimum quality criteria to be counted as evidence as stored as support for - this event - - Args: - read: the read to add - mate: the mate - - Returns: - bool: - - True: the pair was collected and stored in the current evidence object - - False: the pair was not collected - Raises: - ValueError: if the input reads are not a valid pair - - see [theory - types of flanking evidence](/background/theory/#types-of-flanking-evidence) - """ - if read.is_unmapped or mate.is_unmapped: - raise ValueError( - 'input reads must be a mapped and mated pair. One or both of the reads is unmapped' - ) - elif read.query_name != mate.query_name: - raise ValueError( - 'input reads must be a mapped and mated pair. The query names do not match' - ) - elif abs(read.template_length) != abs(mate.template_length): - raise ValueError( - 'input reads must be a mapped and mated pair. 
The template lengths (abs value) do not match', - abs(read.template_length), - abs(mate.template_length), - ) - elif ( - read.mapping_quality < self.min_mapping_quality - or mate.mapping_quality < self.min_mapping_quality - ): - return False # do not meet the minimum mapping quality requirements - # check that the references are right for the pair - if self.interchromosomal: - if read.reference_id == read.next_reference_id: - return False - elif read.reference_id != read.next_reference_id: - return False - - if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - read = self.standardize_read(read) - if not mate.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not mate.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - mate = self.standardize_read(mate) - # order the read pairs so that they are in the same order that we expect for the breakpoints - if read.reference_id != mate.reference_id: - if self.bam_cache.get_read_reference_name( - read - ) > self.bam_cache.get_read_reference_name(mate): - read, mate = mate, read - elif read.reference_start > mate.reference_start: - read, mate = mate, read - - if ( - self.bam_cache.get_read_reference_name(read) != self.break1.chr - or self.bam_cache.get_read_reference_name(mate) != self.break2.chr - ): - return False - if any( - [ - self.break1.orient == ORIENT.LEFT and read.is_reverse, - self.break1.orient == ORIENT.RIGHT and not read.is_reverse, - self.break2.orient == ORIENT.LEFT and mate.is_reverse, - self.break2.orient == ORIENT.RIGHT and not mate.is_reverse, - ] - ): - return False - # check if this read falls in the first breakpoint window - iread = Interval(read.reference_start + 1, read.reference_end) - imate = Interval(mate.reference_start + 1, mate.reference_end) - - if self.stranded: - strand1 = _read.sequenced_strand(read, self.strand_determining_read) - strand2 = _read.sequenced_strand(mate, self.strand_determining_read) - - if strand1 != self.break1.strand or strand2 != self.break2.strand: - return False - - for event_type in self.putative_event_types(): - - # check that the pair orientation is correct - if not _read.orientation_supports_type(read, event_type): - continue - - # check that the fragment size is reasonable - fragment_size = self.compute_fragment_size(read, mate) - - if event_type == SVTYPE.DEL: - if fragment_size.end <= self.max_expected_fragment_size: - continue - elif event_type == SVTYPE.INS: - if fragment_size.start >= self.min_expected_fragment_size: - continue - - # check that the positions of the reads and the strands make sense - if Interval.overlaps(iread, self.outer_window1) and Interval.overlaps( - imate, self.outer_window2 - ): - self.flanking_pairs.add((read, mate)) - return True - - return False - - def collect_half_mapped(self, read: pysam.AlignedSegment, mate: pysam.AlignedSegment): - """ - Args: - read: the read to add - mate: the unmapped mate - - Returns: - bool: - - True: the read was collected and stored in the current evidence object - - False: the read was not collected - Raises: - AssertionError: if the mate is not unmapped - """ - if not mate.is_unmapped: - raise AssertionError('expected the mate to be unmapped') - read_itvl = Interval(read.reference_start + 1, read.reference_end) - added = False - if self.break1.chr == read.reference_name and Interval.overlaps( - self.outer_window1, read_itvl - ): - self.half_mapped[0].add(mate) - added = True - if self.break2.chr == read.reference_name and Interval.overlaps( - self.outer_window2, 
read_itvl - ): - self.half_mapped[1].add(mate) - added = True - return added - - def collect_split_read(self, read: pysam.AlignedSegment, first_breakpoint: bool): - """ - adds a split read if it passes the criteria filters and raises a warning if it does not - - Args: - read: the read to add - first_breakpoint (bool): add to the first breakpoint (or second if false) - Returns: - bool: - - True: the read was collected and stored in the current evidence object - - False: the read was not collected - Raises: - NotSpecifiedError: if the breakpoint orientation is not specified - """ - breakpoint = self.break1 if first_breakpoint else self.break2 - window = self.inner_window1 if first_breakpoint else self.inner_window2 - opposite_breakpoint = self.break2 if first_breakpoint else self.break1 - opposite_window = self.inner_window2 if first_breakpoint else self.inner_window1 - - if read.cigar[0][0] != CIGAR.S and read.cigar[-1][0] != CIGAR.S: - return False # split read is not softclipped - elif breakpoint.orient == ORIENT.LEFT and read.cigar[-1][0] != CIGAR.S: - return False # split read is not softclipped - elif breakpoint.orient == ORIENT.RIGHT and read.cigar[0][0] != CIGAR.S: - return False # split read is not softclipped - - # the first breakpoint of a BreakpointPair is always the lower breakpoint - # if this is being added to the second breakpoint then we'll need to check if the - # read soft-clipping needs to be adjusted - - # check if the read falls within the evidence collection window - # recall that pysam is 0-indexed and the window is 1-indexed - if ( - read.reference_start > window.end - 1 - or read.reference_end < window.start - or self.bam_cache.get_read_reference_name(read) != breakpoint.chr - ): - return False # read not in breakpoint evidence window - # can only enforce strand if both the breakpoint and the bam are stranded - if self.stranded and self.bam_cache.stranded: - strand = _read.sequenced_strand( - read, strand_determining_read=self.strand_determining_read - ) - if strand != breakpoint.strand: - return False # split read not on the appropriate strand - unused = '' - primary = '' - clipped = '' - if breakpoint.orient == ORIENT.LEFT: - unused = read.query_sequence[: read.query_alignment_start] - primary = read.query_sequence[read.query_alignment_start : read.query_alignment_end] - # end is exclusive in pysam - clipped = read.query_sequence[read.query_alignment_end :] - elif breakpoint.orient == ORIENT.RIGHT: - clipped = read.query_sequence[: read.query_alignment_start] - primary = read.query_sequence[read.query_alignment_start : read.query_alignment_end] - unused = read.query_sequence[read.query_alignment_end :] - else: - raise NotSpecifiedError( - 'cannot assign split reads to a breakpoint where the orientation has not been specified' - ) - if len(primary) + len(clipped) + len(unused) != len(read.query_sequence): - raise AssertionError( - 'unused, primary, and clipped sequences should make up the original sequence', - unused, - primary, - clipped, - read.query_sequence, - len(read.query_sequence), - ) - - if ( - len(primary) < self.config['validate.min_anchor_exact'] - or len(clipped) < self.config['validate.min_softclipping'] - ): - # split read does not meet the minimum anchor criteria - return False - if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - read = self.standardize_read(read) - # data quality filters - if ( - _cigar.alignment_matches(read.cigar) - >= 
self.config['validate.min_sample_size_to_apply_percentage'] - and _cigar.match_percent(read.cigar) < self.config['validate.min_anchor_match'] - ): - return False # too poor quality of an alignment - if ( - _cigar.longest_exact_match(read.cigar) < self.config['validate.min_anchor_exact'] - and _cigar.longest_fuzzy_match( - read.cigar, self.config['validate.fuzzy_mismatch_number'] - ) - < self.config['validate.min_anchor_fuzzy'] - ): - return False # too poor quality of an alignment - else: - self.split_reads[0 if first_breakpoint else 1].add(read) - - # try mapping the soft-clipped portion to the other breakpoint - w = (opposite_window[0], opposite_window[1]) - opposite_breakpoint_ref = self.reference_genome[opposite_breakpoint.chr].seq[ - w[0] - 1 : w[1] - ] - - # figure out how much of the read must match when remaped - min_match_tgt = read.cigar[-1][1] if breakpoint.orient == ORIENT.LEFT else read.cigar[0][1] - min_match_tgt = min( - min_match_tgt * self.config['validate.min_anchor_match'], min_match_tgt - 1 - ) / len(read.query_sequence) - if not self.opposing_strands: # same strand - sc_align = _read.nsb_align( - opposite_breakpoint_ref, - read.query_sequence, - min_consecutive_match=self.config['validate.min_anchor_exact'], - min_match=min_match_tgt, - min_overlap_percent=min_match_tgt, - ) # split half to this side - - for alignment in sc_align: - alignment.flag = read.flag - putative_alignments = sc_align - else: - # should align opposite the current read - revcomp_sc_align = reverse_complement(read.query_sequence) - revcomp_sc_align = _read.nsb_align( - opposite_breakpoint_ref, - revcomp_sc_align, - min_consecutive_match=self.config['validate.min_anchor_exact'], - min_match=min_match_tgt, - min_overlap_percent=min_match_tgt, - ) - - for alignment in revcomp_sc_align: - alignment.flag = read.flag ^ PYSAM_READ_FLAGS.REVERSE # EXOR - putative_alignments = revcomp_sc_align - - scores = [] - for alignment in putative_alignments: # loop over the alignments - alignment.flag = alignment.flag | PYSAM_READ_FLAGS.SUPPLEMENTARY - # set this flag so we don't recompute the cigar multiple - alignment.set_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR, 1, value_type='i') - # add information from the original read - alignment.reference_start = w[0] - 1 + alignment.reference_start - alignment._reference_name = ( - opposite_breakpoint.chr - ) # must be set since not associated with an alignment file - alignment.reference_id = self.bam_cache.reference_id(opposite_breakpoint.chr) - alignment.query_name = read.query_name - alignment.set_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1, value_type='i') - alignment.next_reference_start = read.next_reference_start - alignment.next_reference_id = read.next_reference_id - alignment.mapping_quality = NA_MAPPING_QUALITY - try: - cigar, offset = _cigar.extend_softclipping( - alignment.cigar, self.config['validate.min_anchor_exact'] - ) - alignment.cigar = cigar - alignment.reference_start = alignment.reference_start + offset - except AttributeError: - # if the matches section is too small you can't extend the - # softclipping - pass - s = _cigar.score(alignment.cigar) - alignment.cigar = _cigar.join(alignment.cigar) - if alignment.reference_id == alignment.next_reference_id: - # https://samtools.github.io/hts-specs/SAMv1.pdf - # unsigned observed template length equals the number of bases from the leftmost - # mapped base to the rightmost mapped base - tlen = abs(alignment.reference_start - alignment.next_reference_start) + 1 - if alignment.reference_start < 
alignment.next_reference_start: - alignment.template_length = tlen - else: - alignment.template_length = -1 * tlen - else: - alignment.template_length = 0 - if ( - _cigar.alignment_matches(alignment.cigar) - >= self.config['validate.min_sample_size_to_apply_percentage'] - and _cigar.match_percent(alignment.cigar) < self.config['validate.min_anchor_match'] - ): - continue - if ( - _cigar.longest_exact_match(alignment.cigar) - < self.config['validate.min_anchor_exact'] - and _cigar.longest_fuzzy_match( - alignment.cigar, self.config['validate.fuzzy_mismatch_number'] - ) - < self.config['validate.min_anchor_fuzzy'] - ): - continue - if self.config['validate.max_sc_preceeding_anchor'] is not None: - if opposite_breakpoint.orient == ORIENT.LEFT: - if ( - alignment.cigar[0][0] == CIGAR.S - and alignment.cigar[0][1] > self.config['validate.max_sc_preceeding_anchor'] - ): - continue - elif opposite_breakpoint.orient == ORIENT.RIGHT: - if ( - alignment.cigar[-1][0] == CIGAR.S - and alignment.cigar[-1][1] - > self.config['validate.max_sc_preceeding_anchor'] - ): - continue - alignment.set_key() # set the hash key before we add the read as evidence - scores.append((s, _cigar.match_percent(alignment.cigar), alignment)) - - scores = sorted(scores, key=lambda x: (x[0], x[1]), reverse=True) if scores else [] - - if len(scores) > 1: - if scores[0][0] != scores[1][0] and scores[0][1] != scores[1][1]: - # not multimap, pick highest scoring alignment - clipped = scores[0][2] - self.split_reads[1 if first_breakpoint else 0].add( - clipped - ) # add to the opposite breakpoint - elif len(scores) == 1: - clipped = scores[0][2] - self.split_reads[1 if first_breakpoint else 0].add( - clipped - ) # add to the opposite breakpoint - return True - def decide_sequenced_strand(self, reads: Set[pysam.AlignedSegment]): """ given a set of reads, determines the sequenced strand (if possible) and then returns the majority @@ -978,211 +469,6 @@ def assemble_contig(self): list(filtered_contigs.values()), key=lambda x: (x.remap_score() * -1, x.seq) ) - def load_evidence(self): - """ - open the associated bam file and read and store the evidence - does some preliminary read-quality filtering - """ - - def cache_if_true(read): - if read.is_unmapped or read.mate_is_unmapped: - return True - elif any( - [ - self.config['validate.filter_secondary_alignments'] and read.is_secondary, - read.mapping_quality < self.min_mapping_quality, - ] - ): - return False - elif set([x[0] for x in read.cigar]) & {CIGAR.S, CIGAR.H}: - return True - elif not read.is_proper_pair: - if any( - [_read.orientation_supports_type(read, e) for e in self.putative_event_types()] - ): - return True - elif self.compatible_type and _read.orientation_supports_type( - read, self.compatible_type - ): - return True - elif not self.interchromosomal and not self.opposing_strands: - min_frag_est = ( - abs(read.reference_start - read.next_reference_start) - self.read_length - ) - max_frag_est = min_frag_est + 3 * self.read_length - if ( - min_frag_est < self.min_expected_fragment_size - or max_frag_est > self.max_expected_fragment_size - ): - return True - - return False - - def filter_if_true(read): - if not cache_if_true(read): - if any( - [ - self.config['validate.filter_secondary_alignments'] and read.is_secondary, - read.mapping_quality < self.min_mapping_quality, - ] - ): - return True - elif not self.interchromosomal and set([x[0] for x in read.cigar]) & { - CIGAR.I, - CIGAR.D, - }: - return False - return True - return False - - flanking_pairs = set() # collect 
putative pairs - half_mapped_partners1 = set() - half_mapped_partners2 = set() - - for read in self.bam_cache.fetch_from_bins( - '{0}'.format(self.break1.chr), - self.outer_window1[0], - self.outer_window1[1], - read_limit=self.config['validate.fetch_reads_limit'], - sample_bins=self.config['validate.fetch_reads_bins'], - min_bin_size=self.config['validate.fetch_min_bin_size'], - cache=True, - cache_if=cache_if_true, - filter_if=filter_if_true, - ): - if read.mapping_quality < self.min_mapping_quality: - continue - self.counts[0] += 1 - if read.is_unmapped: - continue - if not self.collect_split_read(read, True): - self.collect_spanning_read(read) - if read.mate_is_unmapped: - half_mapped_partners1.add(read) - elif ( - any( - [ - _read.orientation_supports_type(read, et) - for et in self.putative_event_types() - ] - ) - and (read.reference_id != read.next_reference_id) == self.interchromosomal - ): - flanking_pairs.add(read) - - for read in self.bam_cache.fetch_from_bins( - '{0}'.format(self.break2.chr), - self.outer_window2[0], - self.outer_window2[1], - read_limit=self.config['validate.fetch_reads_limit'], - sample_bins=self.config['validate.fetch_reads_bins'], - min_bin_size=self.config['validate.fetch_min_bin_size'], - cache=True, - cache_if=cache_if_true, - filter_if=filter_if_true, - ): - if read.mapping_quality < self.min_mapping_quality: - continue - - self.counts[1] += 1 - - if read.is_unmapped: - continue - if not self.collect_split_read(read, False): - self.collect_spanning_read(read) - if read.mate_is_unmapped: - half_mapped_partners2.add(read) - elif ( - any( - [ - _read.orientation_supports_type(read, et) - for et in self.putative_event_types() - ] - ) - and (read.reference_id != read.next_reference_id) == self.interchromosomal - ): - flanking_pairs.add(read) - for flanking_read in sorted( - flanking_pairs, key=lambda x: (x.query_name, x.reference_start) - ): - # try and get the mate from the cache - try: - mates = self.bam_cache.get_mate(flanking_read, allow_file_access=False) - for mate in mates: - if mate.is_unmapped: - logger.debug(f'ignoring unmapped mate {mate.query_name}') - continue - self.collect_flanking_pair(flanking_read, mate) - except KeyError: - pass - - if self.compatible_window1: - compatible_type = SVTYPE.DUP - if SVTYPE.DUP in self.putative_event_types(): - compatible_type = SVTYPE.INS - - compt_flanking = set() - for read in self.bam_cache.fetch_from_bins( - '{0}'.format(self.break1.chr), - self.compatible_window1[0], - self.compatible_window1[1], - read_limit=self.config['validate.fetch_reads_limit'], - sample_bins=self.config['validate.fetch_reads_bins'], - min_bin_size=self.config['validate.fetch_min_bin_size'], - cache=True, - cache_if=cache_if_true, - filter_if=filter_if_true, - ): - if _read.orientation_supports_type(read, compatible_type): - compt_flanking.add(read) - - for read in self.bam_cache.fetch_from_bins( - '{0}'.format(self.break2.chr), - self.compatible_window2[0], - self.compatible_window2[1], - read_limit=self.config['validate.fetch_reads_limit'], - sample_bins=self.config['validate.fetch_reads_bins'], - min_bin_size=self.config['validate.fetch_min_bin_size'], - cache=True, - cache_if=cache_if_true, - filter_if=filter_if_true, - ): - if _read.orientation_supports_type(read, compatible_type): - compt_flanking.add(read) - - for flanking_read in compt_flanking: - # try and get the mate from the cache - try: - mates = self.bam_cache.get_mate(flanking_read, allow_file_access=False) - for mate in mates: - if mate.is_unmapped: - 
logger.debug(f'ignoring unmapped mate {mate.query_name}') - continue - try: - self.collect_compatible_flanking_pair( - flanking_read, mate, compatible_type - ) - except ValueError: - pass - except KeyError: - pass - - # now collect the half mapped reads - logger.info( - f'collected {len(half_mapped_partners1 | half_mapped_partners2)} putative half mapped reads', - ) - mates_found = 0 - for read in half_mapped_partners1 | half_mapped_partners2: - # try and get the mate from the cache - try: - mates = self.bam_cache.get_mate(read, allow_file_access=False) - mates_found += 1 - for mate in mates: - self.collect_half_mapped(read, mate) - except KeyError: - pass - logger.info(f'{mates_found} half-mapped mates found') - def copy(self): raise NotImplementedError('not appropriate for copy of evidence') diff --git a/src/mavis/validate/gather.py b/src/mavis/validate/gather.py new file mode 100644 index 00000000..167f203b --- /dev/null +++ b/src/mavis/validate/gather.py @@ -0,0 +1,746 @@ +""" +Module which contains functions related to picking reads from a BAM file which support a putative event +""" +from .base import Evidence +import pysam +from ..constants import ( + CIGAR, + ORIENT, + PYSAM_READ_FLAGS, + reverse_complement, + NA_MAPPING_QUALITY, + SVTYPE, +) +from ..bam import cigar as _cigar +from ..bam import read as _read +from ..interval import Interval +from ..util import logger + + +def collect_split_read(evidence_bpp: Evidence, read: pysam.AlignedSegment, first_breakpoint: bool): + """ + adds a split read if it passes the criteria filters and raises a warning if it does not + + Args: + read: the read to add + first_breakpoint: add to the first breakpoint (or second if false) + Returns: + bool: + - True: the read was collected and stored in the current evidence object + - False: the read was not collected + Raises: + NotSpecifiedError: if the breakpoint orientation is not specified + """ + breakpoint = evidence_bpp.break1 if first_breakpoint else evidence_bpp.break2 + window = evidence_bpp.inner_window1 if first_breakpoint else evidence_bpp.inner_window2 + opposite_breakpoint = evidence_bpp.break2 if first_breakpoint else evidence_bpp.break1 + opposite_window = evidence_bpp.inner_window2 if first_breakpoint else evidence_bpp.inner_window1 + + if read.cigar[0][0] != CIGAR.S and read.cigar[-1][0] != CIGAR.S: + return False # split read is not softclipped + elif breakpoint.orient == ORIENT.LEFT and read.cigar[-1][0] != CIGAR.S: + return False # split read is not softclipped + elif breakpoint.orient == ORIENT.RIGHT and read.cigar[0][0] != CIGAR.S: + return False # split read is not softclipped + + # the first breakpoint of a BreakpointPair is always the lower breakpoint + # if this is being added to the second breakpoint then we'll need to check if the + # read soft-clipping needs to be adjusted + + # check if the read falls within the evidence collection window + # recall that pysam is 0-indexed and the window is 1-indexed + if ( + read.reference_start > window.end - 1 + or read.reference_end < window.start + or evidence_bpp.bam_cache.get_read_reference_name(read) != breakpoint.chr + ): + return False # read not in breakpoint evidence window + # can only enforce strand if both the breakpoint and the bam are stranded + if evidence_bpp.stranded and evidence_bpp.bam_cache.stranded: + strand = _read.sequenced_strand( + read, strand_determining_read=evidence_bpp.strand_determining_read + ) + if strand != breakpoint.strand: + return False # split read not on the appropriate strand + unused = '' + 
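+ # partition the read sequence relative to the breakpoint: primary = the aligned
+ # bases, clipped = the soft-clipped bases on the side facing the breakpoint
+ # (trailing clip for ORIENT.LEFT, leading clip for ORIENT.RIGHT), and unused = any
+ # soft-clip left over on the opposite side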
primary = '' + clipped = '' + if breakpoint.orient == ORIENT.LEFT: + unused = read.query_sequence[: read.query_alignment_start] + primary = read.query_sequence[read.query_alignment_start : read.query_alignment_end] + # end is exclusive in pysam + clipped = read.query_sequence[read.query_alignment_end :] + elif breakpoint.orient == ORIENT.RIGHT: + clipped = read.query_sequence[: read.query_alignment_start] + primary = read.query_sequence[read.query_alignment_start : read.query_alignment_end] + unused = read.query_sequence[read.query_alignment_end :] + else: + raise NotSpecifiedError( + 'cannot assign split reads to a breakpoint where the orientation has not been specified' + ) + if len(primary) + len(clipped) + len(unused) != len(read.query_sequence): + raise AssertionError( + 'unused, primary, and clipped sequences should make up the original sequence', + unused, + primary, + clipped, + read.query_sequence, + len(read.query_sequence), + ) + + if ( + len(primary) < evidence_bpp.config['validate.min_anchor_exact'] + or len(clipped) < evidence_bpp.config['validate.min_softclipping'] + ): + # split read does not meet the minimum anchor criteria + return False + if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + read = evidence_bpp.standardize_read(read) + # data quality filters + if ( + _cigar.alignment_matches(read.cigar) + >= evidence_bpp.config['validate.min_sample_size_to_apply_percentage'] + and _cigar.match_percent(read.cigar) < evidence_bpp.config['validate.min_anchor_match'] + ): + return False # too poor quality of an alignment + if ( + _cigar.longest_exact_match(read.cigar) < evidence_bpp.config['validate.min_anchor_exact'] + and _cigar.longest_fuzzy_match( + read.cigar, evidence_bpp.config['validate.fuzzy_mismatch_number'] + ) + < evidence_bpp.config['validate.min_anchor_fuzzy'] + ): + return False # too poor quality of an alignment + else: + evidence_bpp.split_reads[0 if first_breakpoint else 1].add(read) + + # try mapping the soft-clipped portion to the other breakpoint + w = (opposite_window[0], opposite_window[1]) + opposite_breakpoint_ref = evidence_bpp.reference_genome[opposite_breakpoint.chr].seq[ + w[0] - 1 : w[1] + ] + + # figure out how much of the read must match when remaped + min_match_tgt = read.cigar[-1][1] if breakpoint.orient == ORIENT.LEFT else read.cigar[0][1] + min_match_tgt = min( + min_match_tgt * evidence_bpp.config['validate.min_anchor_match'], min_match_tgt - 1 + ) / len(read.query_sequence) + if not evidence_bpp.opposing_strands: # same strand + sc_align = _read.nsb_align( + opposite_breakpoint_ref, + read.query_sequence, + min_consecutive_match=evidence_bpp.config['validate.min_anchor_exact'], + min_match=min_match_tgt, + min_overlap_percent=min_match_tgt, + ) # split half to this side + + for alignment in sc_align: + alignment.flag = read.flag + putative_alignments = sc_align + else: + # should align opposite the current read + revcomp_sc_align = reverse_complement(read.query_sequence) + revcomp_sc_align = _read.nsb_align( + opposite_breakpoint_ref, + revcomp_sc_align, + min_consecutive_match=evidence_bpp.config['validate.min_anchor_exact'], + min_match=min_match_tgt, + min_overlap_percent=min_match_tgt, + ) + + for alignment in revcomp_sc_align: + alignment.flag = read.flag ^ PYSAM_READ_FLAGS.REVERSE # EXOR + putative_alignments = revcomp_sc_align + + scores = [] + for alignment in putative_alignments: # loop over the alignments + alignment.flag = alignment.flag | 
PYSAM_READ_FLAGS.SUPPLEMENTARY + # set this flag so we don't recompute the cigar multiple + alignment.set_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR, 1, value_type='i') + # add information from the original read + alignment.reference_start = w[0] - 1 + alignment.reference_start + alignment._reference_name = ( + opposite_breakpoint.chr + ) # must be set since not associated with an alignment file + alignment.reference_id = evidence_bpp.bam_cache.reference_id(opposite_breakpoint.chr) + alignment.query_name = read.query_name + alignment.set_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1, value_type='i') + alignment.next_reference_start = read.next_reference_start + alignment.next_reference_id = read.next_reference_id + alignment.mapping_quality = NA_MAPPING_QUALITY + try: + cigar, offset = _cigar.extend_softclipping( + alignment.cigar, evidence_bpp.config['validate.min_anchor_exact'] + ) + alignment.cigar = cigar + alignment.reference_start = alignment.reference_start + offset + except AttributeError: + # if the matches section is too small you can't extend the + # softclipping + pass + s = _cigar.score(alignment.cigar) + alignment.cigar = _cigar.join(alignment.cigar) + if alignment.reference_id == alignment.next_reference_id: + # https://samtools.github.io/hts-specs/SAMv1.pdf + # unsigned observed template length equals the number of bases from the leftmost + # mapped base to the rightmost mapped base + tlen = abs(alignment.reference_start - alignment.next_reference_start) + 1 + if alignment.reference_start < alignment.next_reference_start: + alignment.template_length = tlen + else: + alignment.template_length = -1 * tlen + else: + alignment.template_length = 0 + if ( + _cigar.alignment_matches(alignment.cigar) + >= evidence_bpp.config['validate.min_sample_size_to_apply_percentage'] + and _cigar.match_percent(alignment.cigar) + < evidence_bpp.config['validate.min_anchor_match'] + ): + continue + if ( + _cigar.longest_exact_match(alignment.cigar) + < evidence_bpp.config['validate.min_anchor_exact'] + and _cigar.longest_fuzzy_match( + alignment.cigar, evidence_bpp.config['validate.fuzzy_mismatch_number'] + ) + < evidence_bpp.config['validate.min_anchor_fuzzy'] + ): + continue + if evidence_bpp.config['validate.max_sc_preceeding_anchor'] is not None: + if opposite_breakpoint.orient == ORIENT.LEFT: + if ( + alignment.cigar[0][0] == CIGAR.S + and alignment.cigar[0][1] + > evidence_bpp.config['validate.max_sc_preceeding_anchor'] + ): + continue + elif opposite_breakpoint.orient == ORIENT.RIGHT: + if ( + alignment.cigar[-1][0] == CIGAR.S + and alignment.cigar[-1][1] + > evidence_bpp.config['validate.max_sc_preceeding_anchor'] + ): + continue + alignment.set_key() # set the hash key before we add the read as evidence + scores.append((s, _cigar.match_percent(alignment.cigar), alignment)) + + scores = sorted(scores, key=lambda x: (x[0], x[1]), reverse=True) if scores else [] + + if len(scores) > 1: + if scores[0][0] != scores[1][0] and scores[0][1] != scores[1][1]: + # not multimap, pick highest scoring alignment + clipped = scores[0][2] + evidence_bpp.split_reads[1 if first_breakpoint else 0].add( + clipped + ) # add to the opposite breakpoint + elif len(scores) == 1: + clipped = scores[0][2] + evidence_bpp.split_reads[1 if first_breakpoint else 0].add( + clipped + ) # add to the opposite breakpoint + return True + + +def collect_half_mapped( + evidence_bpp: Evidence, read: pysam.AlignedSegment, mate: pysam.AlignedSegment +): + """ + Args: + read: the read to add + mate: the unmapped mate + + Returns: + bool: 
+ - True: the read was collected and stored in the current evidence object + - False: the read was not collected + Raises: + AssertionError: if the mate is not unmapped + """ + if not mate.is_unmapped: + raise AssertionError('expected the mate to be unmapped') + read_itvl = Interval(read.reference_start + 1, read.reference_end) + added = False + if evidence_bpp.break1.chr == read.reference_name and Interval.overlaps( + evidence_bpp.outer_window1, read_itvl + ): + evidence_bpp.half_mapped[0].add(mate) + added = True + if evidence_bpp.break2.chr == read.reference_name and Interval.overlaps( + evidence_bpp.outer_window2, read_itvl + ): + evidence_bpp.half_mapped[1].add(mate) + added = True + return added + + +def collect_flanking_pair( + evidence_bpp: Evidence, read: pysam.AlignedSegment, mate: pysam.AlignedSegment +): + """ + checks if a given read meets the minimum quality criteria to be counted as evidence as stored as support for + this event + + Args: + read: the read to add + mate: the mate + + Returns: + bool: + - True: the pair was collected and stored in the current evidence object + - False: the pair was not collected + Raises: + ValueError: if the input reads are not a valid pair + + see [theory - types of flanking evidence](/background/theory/#types-of-flanking-evidence) + """ + if read.is_unmapped or mate.is_unmapped: + raise ValueError( + 'input reads must be a mapped and mated pair. One or both of the reads is unmapped' + ) + elif read.query_name != mate.query_name: + raise ValueError( + 'input reads must be a mapped and mated pair. The query names do not match' + ) + elif abs(read.template_length) != abs(mate.template_length): + raise ValueError( + 'input reads must be a mapped and mated pair. The template lengths (abs value) do not match', + abs(read.template_length), + abs(mate.template_length), + ) + elif ( + read.mapping_quality < evidence_bpp.min_mapping_quality + or mate.mapping_quality < evidence_bpp.min_mapping_quality + ): + return False # do not meet the minimum mapping quality requirements + # check that the references are right for the pair + if evidence_bpp.interchromosomal: + if read.reference_id == read.next_reference_id: + return False + elif read.reference_id != read.next_reference_id: + return False + + if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + read = evidence_bpp.standardize_read(read) + if not mate.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not mate.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + mate = evidence_bpp.standardize_read(mate) + # order the read pairs so that they are in the same order that we expect for the breakpoints + if read.reference_id != mate.reference_id: + if evidence_bpp.bam_cache.get_read_reference_name( + read + ) > evidence_bpp.bam_cache.get_read_reference_name(mate): + read, mate = mate, read + elif read.reference_start > mate.reference_start: + read, mate = mate, read + + if ( + evidence_bpp.bam_cache.get_read_reference_name(read) != evidence_bpp.break1.chr + or evidence_bpp.bam_cache.get_read_reference_name(mate) != evidence_bpp.break2.chr + ): + return False + if any( + [ + evidence_bpp.break1.orient == ORIENT.LEFT and read.is_reverse, + evidence_bpp.break1.orient == ORIENT.RIGHT and not read.is_reverse, + evidence_bpp.break2.orient == ORIENT.LEFT and mate.is_reverse, + evidence_bpp.break2.orient == ORIENT.RIGHT and not mate.is_reverse, + ] + ): + return False + # check if this read falls in the first breakpoint window + iread = 
Interval(read.reference_start + 1, read.reference_end) + imate = Interval(mate.reference_start + 1, mate.reference_end) + + if evidence_bpp.stranded: + strand1 = _read.sequenced_strand(read, evidence_bpp.strand_determining_read) + strand2 = _read.sequenced_strand(mate, evidence_bpp.strand_determining_read) + + if strand1 != evidence_bpp.break1.strand or strand2 != evidence_bpp.break2.strand: + return False + + for event_type in evidence_bpp.putative_event_types(): + + # check that the pair orientation is correct + if not _read.orientation_supports_type(read, event_type): + continue + + # check that the fragment size is reasonable + fragment_size = evidence_bpp.compute_fragment_size(read, mate) + + if event_type == SVTYPE.DEL: + if fragment_size.end <= evidence_bpp.max_expected_fragment_size: + continue + elif event_type == SVTYPE.INS: + if fragment_size.start >= evidence_bpp.min_expected_fragment_size: + continue + + # check that the positions of the reads and the strands make sense + if Interval.overlaps(iread, evidence_bpp.outer_window1) and Interval.overlaps( + imate, evidence_bpp.outer_window2 + ): + evidence_bpp.flanking_pairs.add((read, mate)) + return True + + return False + + +def collect_compatible_flanking_pair( + evidence_bpp: Evidence, + read: pysam.AlignedSegment, + mate: pysam.AlignedSegment, + compatible_type: str, +) -> bool: + """ + checks if a given read meets the minimum quality criteria to be counted as evidence as stored as support for + this event + + Args: + read: the read to add + mate: the mate + compatible_type (SVTYPE): the type we are collecting for + + Returns: + bool: + - True: the pair was collected and stored in the current evidence object + - False: the pair was not collected + Raises: + ValueError: if the input reads are not a valid pair + + Note: + see [theory - types of flanking evidence](/background/theory/#compatible-flanking-pairs) + """ + if ( + read.is_unmapped + or mate.is_unmapped + or read.query_name != mate.query_name + or read.is_read1 == mate.is_read1 + ): + raise ValueError('input reads must be a mapped and mated pair') + if not evidence_bpp.compatible_window1: + raise ValueError('compatible windows were not given') + if evidence_bpp.interchromosomal: + raise NotImplementedError('interchromosomal events do not have compatible flanking pairs') + # check that the references are right for the pair + if read.reference_id != read.next_reference_id: + return False + elif ( + read.mapping_quality < evidence_bpp.min_mapping_quality + or mate.mapping_quality < evidence_bpp.min_mapping_quality + ): + return False + if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + read = evidence_bpp.standardize_read(read) + if not mate.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not mate.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + mate = evidence_bpp.standardize_read(mate) + # order the read pairs so that they are in the same order that we expect for the breakpoints + if read.reference_start > mate.reference_start: + read, mate = mate, read + + if ( + evidence_bpp.bam_cache.get_read_reference_name(read) != evidence_bpp.break1.chr + or evidence_bpp.bam_cache.get_read_reference_name(mate) != evidence_bpp.break2.chr + ): + return False + + # check if this read falls in the first breakpoint window + iread = Interval(read.reference_start + 1, read.reference_end) + imate = Interval(mate.reference_start + 1, mate.reference_end) + + if evidence_bpp.stranded: + strand1 = _read.sequenced_strand(read, 
evidence_bpp.strand_determining_read) + strand2 = _read.sequenced_strand(mate, evidence_bpp.strand_determining_read) + if strand1 != evidence_bpp.break1.strand or strand2 != evidence_bpp.break2.strand: + return False + + # check that the pair orientation is correct + if not _read.orientation_supports_type(read, compatible_type): + return False + + # check that the fragment size is reasonable + fragment_size = evidence_bpp.compute_fragment_size(read, mate) + + if compatible_type == SVTYPE.DEL: + if fragment_size.end <= evidence_bpp.max_expected_fragment_size: + return False + elif compatible_type == SVTYPE.INS: + if fragment_size.start >= evidence_bpp.min_expected_fragment_size: + return False + + # check that the positions of the reads and the strands make sense + if Interval.overlaps(iread, evidence_bpp.compatible_window1) and Interval.overlaps( + imate, evidence_bpp.compatible_window2 + ): + evidence_bpp.compatible_flanking_pairs.add((read, mate)) + return True + + return False + + +def collect_spanning_read(evidence_bpp: Evidence, read: pysam.AlignedSegment): + """ + spanning read: a read covering BOTH breakpoints + + This is only applicable to small events. Do not need to look for soft clipped reads + here since they will be collected already + + Args: + read: the putative spanning read + + Returns: + bool: + - True: the read was collected and stored in the current evidence object + - False: the read was not collected + """ + if evidence_bpp.interchromosomal: + return False + elif not Interval.overlaps(evidence_bpp.inner_window1, evidence_bpp.inner_window2): + # too far apart to call spanning reads + return False + + if evidence_bpp.stranded: + strand = _read.sequenced_strand(read, evidence_bpp.strand_determining_read) + if strand != evidence_bpp.break1.strand and strand != evidence_bpp.break2.strand: + return False + + combined = evidence_bpp.inner_window1 & evidence_bpp.inner_window2 + read_interval = Interval(read.reference_start + 1, read.reference_end) + + if Interval.overlaps(combined, read_interval): + + if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + + read = evidence_bpp.standardize_read(read) + # in the correct position, now determine if it can support the event types + for event_type in evidence_bpp.putative_event_types(): + if event_type in [SVTYPE.DUP, SVTYPE.INS]: + if CIGAR.I in [c[0] for c in read.cigar]: + evidence_bpp.spanning_reads.add(read) + return True + elif event_type == SVTYPE.DEL: + if CIGAR.D in [c[0] for c in read.cigar]: + evidence_bpp.spanning_reads.add(read) + return True + elif event_type == SVTYPE.INV: + if CIGAR.X in [c[0] for c in read.cigar]: + return True + return False + + +def load_evidence(evidence_bpp: Evidence): + """ + open the associated bam file and read and store the evidence + does some preliminary read-quality filtering + """ + + def cache_if_true(read): + if read.is_unmapped or read.mate_is_unmapped: + return True + elif any( + [ + evidence_bpp.config['validate.filter_secondary_alignments'] and read.is_secondary, + read.mapping_quality < evidence_bpp.min_mapping_quality, + ] + ): + return False + elif set([x[0] for x in read.cigar]) & {CIGAR.S, CIGAR.H}: + return True + elif not read.is_proper_pair: + if any( + [ + _read.orientation_supports_type(read, e) + for e in evidence_bpp.putative_event_types() + ] + ): + return True + elif evidence_bpp.compatible_type and _read.orientation_supports_type( + read, evidence_bpp.compatible_type + ): + return True + elif not 
evidence_bpp.interchromosomal and not evidence_bpp.opposing_strands: + min_frag_est = ( + abs(read.reference_start - read.next_reference_start) - evidence_bpp.read_length + ) + max_frag_est = min_frag_est + 3 * evidence_bpp.read_length + if ( + min_frag_est < evidence_bpp.min_expected_fragment_size + or max_frag_est > evidence_bpp.max_expected_fragment_size + ): + return True + + return False + + def filter_if_true(read): + if not cache_if_true(read): + if any( + [ + evidence_bpp.config['validate.filter_secondary_alignments'] + and read.is_secondary, + read.mapping_quality < evidence_bpp.min_mapping_quality, + ] + ): + return True + elif not evidence_bpp.interchromosomal and set([x[0] for x in read.cigar]) & { + CIGAR.I, + CIGAR.D, + }: + return False + return True + return False + + flanking_pairs = set() # collect putative pairs + half_mapped_partners1 = set() + half_mapped_partners2 = set() + + for read in evidence_bpp.bam_cache.fetch_from_bins( + '{0}'.format(evidence_bpp.break1.chr), + evidence_bpp.outer_window1[0], + evidence_bpp.outer_window1[1], + read_limit=evidence_bpp.config['validate.fetch_reads_limit'], + sample_bins=evidence_bpp.config['validate.fetch_reads_bins'], + min_bin_size=evidence_bpp.config['validate.fetch_min_bin_size'], + cache=True, + cache_if=cache_if_true, + filter_if=filter_if_true, + ): + if read.mapping_quality < evidence_bpp.min_mapping_quality: + continue + evidence_bpp.counts[0] += 1 + if read.is_unmapped: + continue + if not collect_split_read(evidence_bpp, read, True): + collect_spanning_read(evidence_bpp, read) + if read.mate_is_unmapped: + half_mapped_partners1.add(read) + elif ( + any( + [ + _read.orientation_supports_type(read, et) + for et in evidence_bpp.putative_event_types() + ] + ) + and (read.reference_id != read.next_reference_id) == evidence_bpp.interchromosomal + ): + flanking_pairs.add(read) + + for read in evidence_bpp.bam_cache.fetch_from_bins( + '{0}'.format(evidence_bpp.break2.chr), + evidence_bpp.outer_window2[0], + evidence_bpp.outer_window2[1], + read_limit=evidence_bpp.config['validate.fetch_reads_limit'], + sample_bins=evidence_bpp.config['validate.fetch_reads_bins'], + min_bin_size=evidence_bpp.config['validate.fetch_min_bin_size'], + cache=True, + cache_if=cache_if_true, + filter_if=filter_if_true, + ): + if read.mapping_quality < evidence_bpp.min_mapping_quality: + continue + + evidence_bpp.counts[1] += 1 + + if read.is_unmapped: + continue + if not collect_split_read(evidence_bpp, read, False): + collect_spanning_read(evidence_bpp, read) + if read.mate_is_unmapped: + half_mapped_partners2.add(read) + elif ( + any( + [ + _read.orientation_supports_type(read, et) + for et in evidence_bpp.putative_event_types() + ] + ) + and (read.reference_id != read.next_reference_id) == evidence_bpp.interchromosomal + ): + flanking_pairs.add(read) + for flanking_read in sorted(flanking_pairs, key=lambda x: (x.query_name, x.reference_start)): + # try and get the mate from the cache + try: + mates = evidence_bpp.bam_cache.get_mate(flanking_read, allow_file_access=False) + for mate in mates: + if mate.is_unmapped: + logger.debug(f'ignoring unmapped mate {mate.query_name}') + continue + collect_flanking_pair(evidence_bpp, flanking_read, mate) + except KeyError: + pass + + if evidence_bpp.compatible_window1: + compatible_type = SVTYPE.DUP + if SVTYPE.DUP in evidence_bpp.putative_event_types(): + compatible_type = SVTYPE.INS + + compt_flanking = set() + for read in evidence_bpp.bam_cache.fetch_from_bins( + '{0}'.format(evidence_bpp.break1.chr), + 
evidence_bpp.compatible_window1[0], + evidence_bpp.compatible_window1[1], + read_limit=evidence_bpp.config['validate.fetch_reads_limit'], + sample_bins=evidence_bpp.config['validate.fetch_reads_bins'], + min_bin_size=evidence_bpp.config['validate.fetch_min_bin_size'], + cache=True, + cache_if=cache_if_true, + filter_if=filter_if_true, + ): + if _read.orientation_supports_type(read, compatible_type): + compt_flanking.add(read) + + for read in evidence_bpp.bam_cache.fetch_from_bins( + '{0}'.format(evidence_bpp.break2.chr), + evidence_bpp.compatible_window2[0], + evidence_bpp.compatible_window2[1], + read_limit=evidence_bpp.config['validate.fetch_reads_limit'], + sample_bins=evidence_bpp.config['validate.fetch_reads_bins'], + min_bin_size=evidence_bpp.config['validate.fetch_min_bin_size'], + cache=True, + cache_if=cache_if_true, + filter_if=filter_if_true, + ): + if _read.orientation_supports_type(read, compatible_type): + compt_flanking.add(read) + + for flanking_read in compt_flanking: + # try and get the mate from the cache + try: + mates = evidence_bpp.bam_cache.get_mate(flanking_read, allow_file_access=False) + for mate in mates: + if mate.is_unmapped: + logger.debug(f'ignoring unmapped mate {mate.query_name}') + continue + try: + collect_compatible_flanking_pair( + evidence_bpp, flanking_read, mate, compatible_type + ) + except ValueError: + pass + except KeyError: + pass + + # now collect the half mapped reads + logger.info( + f'collected {len(half_mapped_partners1 | half_mapped_partners2)} putative half mapped reads', + ) + mates_found = 0 + for read in half_mapped_partners1 | half_mapped_partners2: + # try and get the mate from the cache + try: + mates = evidence_bpp.bam_cache.get_mate(read, allow_file_access=False) + mates_found += 1 + for mate in mates: + collect_half_mapped(evidence_bpp, read, mate) + except KeyError: + pass + logger.info(f'{mates_found} half-mapped mates found') diff --git a/src/mavis/validate/main.py b/src/mavis/validate/main.py index 9cbc1383..5336bb4a 100644 --- a/src/mavis/validate/main.py +++ b/src/mavis/validate/main.py @@ -27,6 +27,7 @@ from .call import call_events from .constants import PASS_FILENAME from .evidence import GenomeEvidence, TranscriptomeEvidence +from .gather import load_evidence def main( @@ -171,7 +172,7 @@ def main( logger.info( f'inner window regions: {evidence.break1.chr}:{evidence.inner_window1[0]}-{evidence.inner_window1[1]} {evidence.break2.chr}:{evidence.inner_window2[0]}-{evidence.inner_window2[1]}' ) - evidence.load_evidence() + load_evidence(evidence) logger.info( f'flanking pairs: {len(evidence.flanking_pairs)}' + '; split reads: {}, {}'.format(*[len(a) for a in evidence.split_reads]) diff --git a/tests/test_mavis/validate/test_evidence.py b/tests/test_mavis/validate/test_evidence.py index 0c6c1420..df2d06e4 100644 --- a/tests/test_mavis/validate/test_evidence.py +++ b/tests/test_mavis/validate/test_evidence.py @@ -12,6 +12,7 @@ from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence from mavis_config import DEFAULTS +from mavis.validate.gather import collect_flanking_pair, collect_split_read from ..mock import MockBamFileHandle, MockObject, MockRead, mock_read_pair @@ -618,7 +619,7 @@ def test_collect_flanking_pair_error_unmapped_read(self, flanking_ge): ) read.is_unmapped = True with pytest.raises(ValueError): - flanking_ge.collect_flanking_pair(read, mate) + collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_error_mate_unmapped(self, 
flanking_ge): read, mate = mock_read_pair( @@ -627,7 +628,7 @@ def test_collect_flanking_pair_error_mate_unmapped(self, flanking_ge): ) mate.is_unmapped = True with pytest.raises(ValueError): - flanking_ge.collect_flanking_pair(read, mate) + collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_error_query_names_dont_match(self, flanking_ge): read, mate = mock_read_pair( @@ -635,7 +636,7 @@ def test_collect_flanking_pair_error_query_names_dont_match(self, flanking_ge): MockRead('test', 0, 6000, 6099, is_reverse=True), ) with pytest.raises(ValueError): - flanking_ge.collect_flanking_pair(read, mate) + collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_error_template_lengths_dont_match(self, flanking_ge): read, mate = mock_read_pair( @@ -644,7 +645,7 @@ def test_collect_flanking_pair_error_template_lengths_dont_match(self, flanking_ ) mate.template_length = 55 with pytest.raises(ValueError): - flanking_ge.collect_flanking_pair(read, mate) + collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_read_low_mq(self, flanking_ge): read, mate = mock_read_pair( @@ -652,7 +653,7 @@ def test_collect_flanking_pair_read_low_mq(self, flanking_ge): MockRead('test', 0, 6000, 6099, is_reverse=True), ) read.mapping_quality = 0 - assert not flanking_ge.collect_flanking_pair(read, mate) + assert not collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_mate_low_mq(self, flanking_ge): read, mate = mock_read_pair( @@ -660,11 +661,11 @@ def test_collect_flanking_pair_mate_low_mq(self, flanking_ge): MockRead('test', 0, 6000, 6099, is_reverse=True), ) mate.mapping_quality = 0 - assert not flanking_ge.collect_flanking_pair(read, mate) + assert not collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_interchromosomal(self, flanking_ge): read, mate = mock_read_pair( MockRead('test', 1, 900, 1000, is_reverse=False), MockRead('test', 0, 6000, 6099, is_reverse=True), ) - assert not flanking_ge.collect_flanking_pair(read, mate) + assert not collect_flanking_pair(flanking_ge, read, mate) diff --git a/tests/test_mavis/validate/test_validate2.py b/tests/test_mavis/validate/test_validate2.py index 0ec4278d..c0a3262f 100644 --- a/tests/test_mavis/validate/test_validate2.py +++ b/tests/test_mavis/validate/test_validate2.py @@ -7,6 +7,7 @@ from mavis.constants import NA_MAPPING_QUALITY, ORIENT, PYSAM_READ_FLAGS from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence +from mavis.validate.gather import collect_split_read, collect_flanking_pair, load_evidence from mavis_config import DEFAULTS from ...util import get_data, long_running_test @@ -125,7 +126,7 @@ def test_load_evidence_translocation(self): Breakpoint('reference19', 964, orient=ORIENT.LEFT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 14 assert self.count_original_reads(ev1.split_reads[1]) == 20 @@ -137,7 +138,7 @@ def test_load_evidence_translocation(self): Breakpoint('reference4', 2000, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 21 # one of the reads that appears to look good in the bam is too low quality % match @@ -152,7 +153,7 @@ def test_load_evidence_inversion(self): opposing_strands=True, ) - ev1.load_evidence() + 
load_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 54 assert self.count_original_reads(ev1.split_reads[1]) == 20 @@ -164,7 +165,7 @@ def test_load_evidence_inversion(self): Breakpoint('reference7', 19000, orient=ORIENT.RIGHT), opposing_strands=True, ) - ev1.load_evidence() + load_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[1]) == 15 assert self.count_original_reads(ev1.split_reads[0]) == 27 @@ -176,7 +177,7 @@ def test_load_evidence_duplication(self): Breakpoint('reference7', 11000, orient=ORIENT.LEFT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 35 @@ -190,7 +191,7 @@ def test_load_evidence_deletion1(self): Breakpoint('reference20', 6000, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert len(ev1.flanking_pairs) == 49 @@ -204,7 +205,7 @@ def test_load_evidence_deletion2(self): Breakpoint('referenceX', 6000, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 4 @@ -218,7 +219,7 @@ def test_load_evidence_deletion3(self): Breakpoint('referenceX', 14000, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 8 @@ -232,7 +233,7 @@ def test_load_evidence_deletion4(self): Breakpoint('reference10', 3818, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 20 @@ -246,7 +247,7 @@ def test_load_evidence_small_deletion1(self): Breakpoint('reference11', 6003, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -264,7 +265,7 @@ def test_load_evidence_small_deletion2(self): Breakpoint('reference11', 10030, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -283,7 +284,7 @@ def test_load_evidence_small_deletion_test1(self): Breakpoint('reference12', 2120, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -302,7 +303,7 @@ def test_load_evidence_small_deletion_test2(self): Breakpoint('reference10', 3818, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) assert self.count_original_reads(ev1.split_reads[0]) == 20 assert self.count_original_reads(ev1.split_reads[1]) == 18 @@ -315,7 +316,7 @@ def test_load_evidence_small_deletion_test3(self): Breakpoint('reference10', 8927, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + 
load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -334,7 +335,7 @@ def test_load_evidence_small_deletion_test4(self): Breakpoint('reference10', 13123, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -353,7 +354,7 @@ def test_load_evidence_small_deletion_test5(self): Breakpoint('reference10', 17899, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -372,7 +373,7 @@ def test_load_evidence_small_deletion_test6(self): Breakpoint('reference10', 24330, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) print(len(ev1.spanning_reads)) @@ -389,7 +390,7 @@ def test_load_evidence_small_deletion_test7(self): Breakpoint('reference10', 31827, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -407,7 +408,7 @@ def test_load_evidence_small_deletion_test8(self): Breakpoint('reference10', 42159, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -426,7 +427,7 @@ def test_load_evidence_complex_deletion(self): Breakpoint('reference12', 6016, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -449,7 +450,7 @@ def test_load_evidence_small_insertion(self): Breakpoint('reference1', 2001, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -469,7 +470,7 @@ def test_load_evidence_small_insertion_high_coverage(self): Breakpoint('reference9', 2001, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -487,7 +488,7 @@ def test_load_evidence_small_insertion_high_coverage(self): Breakpoint('reference16', 2001, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -506,7 +507,7 @@ def test_load_evidence_small_duplication(self): Breakpoint('reference12', 10021, orient=ORIENT.LEFT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -525,7 +526,7 @@ def test_load_evidence_small_duplication(self): Breakpoint('reference17', 2020, orient=ORIENT.LEFT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -544,7 +545,7 @@ def test_load_evidence_low_qual_deletion(self): Breakpoint('reference19', 5219, 5219, 
orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.spanning_reads)) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) @@ -587,7 +588,7 @@ def test_collect_split_read(self, ev_gathering_setup): next_reference_id=1, next_reference_start=2341, ) - ev_gathering_setup.collect_split_read(ev1_sr, True) + collect_split_read(ev_gathering_setup, ev1_sr, True) assert list(ev_gathering_setup.split_reads[0])[0] == ev1_sr def test_collect_split_read_failure(self, ev_gathering_setup): @@ -605,10 +606,11 @@ def test_collect_split_read_failure(self, ev_gathering_setup): next_reference_id=1, next_reference_start=2550, ) - assert not ev_gathering_setup.collect_split_read(ev1_sr, True) + assert not collect_split_read(ev_gathering_setup, ev1_sr, True) def test_collect_flanking_pair(self, ev_gathering_setup): - ev_gathering_setup.collect_flanking_pair( + collect_flanking_pair( + ev_gathering_setup, MockRead( reference_id=1, reference_start=2214, @@ -638,12 +640,12 @@ def test_collect_flanking_pair_not_overlapping_evidence_window(self, ev_gatherin MockRead(reference_id=1, reference_start=1903, reference_end=2053, is_reverse=True), MockRead(reference_id=1, reference_start=2052, reference_end=2053, is_reverse=True), ) - assert not ev_gathering_setup.collect_flanking_pair(*pair) + assert not collect_flanking_pair(ev_gathering_setup, *pair) assert len(ev_gathering_setup.flanking_pairs) == 0 def test_load_evidence(self, ev_gathering_setup): print(ev_gathering_setup) - ev_gathering_setup.load_evidence() + load_evidence(ev_gathering_setup) print(ev_gathering_setup.spanning_reads) assert ( len( From ad56ca364653f7e6609ece5dc983e9b92c4fd172 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 27 Apr 2022 15:09:59 -0700 Subject: [PATCH 28/56] Fix linting errors --- src/mavis/validate/base.py | 2 -- src/mavis/validate/gather.py | 1 + tests/test_mavis/convert/test_tools_vcf.py | 1 - tests/test_mavis/test_args.py | 1 - tests/test_mavis/validate/test_evidence.py | 2 +- 5 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/mavis/validate/base.py b/src/mavis/validate/base.py index 9df8a978..c595f86a 100644 --- a/src/mavis/validate/base.py +++ b/src/mavis/validate/base.py @@ -11,9 +11,7 @@ from ..bam.cache import BamCache from ..breakpoint import Breakpoint, BreakpointPair from ..constants import ( - CIGAR, COLUMNS, - NA_MAPPING_QUALITY, ORIENT, PYSAM_READ_FLAGS, STRAND, diff --git a/src/mavis/validate/gather.py b/src/mavis/validate/gather.py index 167f203b..21d18105 100644 --- a/src/mavis/validate/gather.py +++ b/src/mavis/validate/gather.py @@ -15,6 +15,7 @@ from ..bam import read as _read from ..interval import Interval from ..util import logger +from ..error import NotSpecifiedError def collect_split_read(evidence_bpp: Evidence, read: pysam.AlignedSegment, first_breakpoint: bool): diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index b1cf72bd..ef4eaded 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -1,5 +1,4 @@ import pytest -from mavis.convert import SUPPORTED_TOOL, _convert_tool_row from mavis.convert.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf from ...util import get_data diff --git a/tests/test_mavis/test_args.py b/tests/test_mavis/test_args.py index 492509f3..4230bd97 100644 --- a/tests/test_mavis/test_args.py +++ b/tests/test_mavis/test_args.py @@ -4,7 +4,6 @@ from unittest.mock import 
Mock, patch import pytest -from mavis import util from mavis.cluster import main as cluster_main from mavis.main import main as mavis_main from mavis.validate import main as validate_main diff --git a/tests/test_mavis/validate/test_evidence.py b/tests/test_mavis/validate/test_evidence.py index df2d06e4..a0c43f92 100644 --- a/tests/test_mavis/validate/test_evidence.py +++ b/tests/test_mavis/validate/test_evidence.py @@ -12,7 +12,7 @@ from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence from mavis_config import DEFAULTS -from mavis.validate.gather import collect_flanking_pair, collect_split_read +from mavis.validate.gather import collect_flanking_pair from ..mock import MockBamFileHandle, MockObject, MockRead, mock_read_pair From 119653bc085036d0418f85e8b3f2d28736d152b0 Mon Sep 17 00:00:00 2001 From: Caleb Choo Date: Tue, 3 May 2022 14:11:07 -0700 Subject: [PATCH 29/56] adding arriba to the docs --- docs/background/citations.md | 5 +++++ docs/glossary.md | 5 +++++ docs/inputs/support.md | 1 + 3 files changed, 11 insertions(+) diff --git a/docs/background/citations.md b/docs/background/citations.md index 742344f8..e0c2593a 100644 --- a/docs/background/citations.md +++ b/docs/background/citations.md @@ -62,6 +62,11 @@ Saunders,C.T. et al. (2012) Strelka: accurate somatic small-variant calling from sequenced tumor--normal sample pairs. Bioinformatics, 28, 1811--1817. +## Uhrig-2021 + +Uhrig,S. et al. (2021) Accurate and efficient detection of gene + fusions from RNA sequencing data. Genome Res., 31, 448--460. + ## Yates-2016 Yates,A. et al. (2016) Ensembl 2016. Nucleic Acids Res., 44, diff --git a/docs/glossary.md b/docs/glossary.md index d6147e92..e6ca9e41 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -127,6 +127,11 @@ install instructions. Community based standard of reccommendations for variant notation. See [http://varnomen.hgvs.org/](http://varnomen.hgvs.org/) +## Arriba + +Arriba is an SV caller. Source for Arriba can be found +[here](https://github.com/suhrig/arriba) [Uhrig-2021](../background/citations#uhrig-2021) + ## BreakDancer BreakDancer is an SV caller. 
Source for BreakDancer can be found diff --git a/docs/inputs/support.md b/docs/inputs/support.md index 89c6c482..ef01dae0 100644 --- a/docs/inputs/support.md +++ b/docs/inputs/support.md @@ -62,6 +62,7 @@ It should be noted however that the tool tracked will only be listed as | Name | Version(s) | MAVIS input | Publication | | ------------------------------------------ | ---------------- | --------------------------------------------- | ----------------------------------------------------------- | +| [Arriba](../../glossary/#arriba) | `2.2.1` | `fusions.tsv` | [Uhrig-2021](../../background/citations#uhrig-2021) | | [BreakDancer](../../glossary/#breakdancer) | `1.4.5` | `Tools main output file(s)` | [Chen-2009](../../background/citations#chen-2009) | | [BreakSeq](../../glossary/#breakseq) | `2.2` | `work/breakseq.vcf.gz` | [Abyzov-2015](../../background/citations#abyzov-2015) | | [Chimerascan](../../glossary/#chimerascan) | `0.4.5` | `*.bedpe` | [Iyer-2011](../../background/citations#Iyer-2011) | From f008dfccbfe58d266f1b4480e70a2b596a71069b Mon Sep 17 00:00:00 2001 From: Caleb Choo Date: Tue, 3 May 2022 15:45:52 -0700 Subject: [PATCH 30/56] fix for github tests --- .github/workflows/build.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3421fd4b..9d7429ff 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,9 @@ jobs: steps: - uses: actions/checkout@v2 - name: install machine dependencies - run: sudo apt-get install -y libcurl4-openssl-dev + run: | + sudo apt-get update + sudo apt-get install -y libcurl4-openssl-dev - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: From dc67258eaebfff081dfb8c9aa1094973cc218706 Mon Sep 17 00:00:00 2001 From: Caleb Choo Date: Wed, 4 May 2022 11:02:58 -0700 Subject: [PATCH 31/56] adding more arriba test cases --- tests/unit/test_tool.py | 53 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_tool.py b/tests/unit/test_tool.py index 2b43614d..556d9589 100644 --- a/tests/unit/test_tool.py +++ b/tests/unit/test_tool.py @@ -127,7 +127,7 @@ def test_convert_standard_event(self): self.assertEqual(114751269, bpp.break2.start) self.assertEqual(SVTYPE.INV, bpp.event_type) self.assertEqual('L', bpp.break1.orient) - self.assertEqual('L', bpp.break2.orient) + self.assertEqual('L', bpp.break2.orient) self.assertEqual(True, bpp.opposing_strands) def test_convert_translocation(self): @@ -139,7 +139,7 @@ def test_convert_translocation(self): 'strand2(gene/fusion)': '-/-', 'direction1': 'upstream', 'direction2': 'downstream', - } + } bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) self.assertEqual(1, len(bpp_list)) @@ -150,9 +150,54 @@ def test_convert_translocation(self): self.assertEqual(58272875, bpp.break2.start) self.assertEqual(SVTYPE.TRANS, bpp.event_type) self.assertEqual('R', bpp.break1.orient) - self.assertEqual('L', bpp.break2.orient) + self.assertEqual('L', bpp.break2.orient) self.assertEqual(False, bpp.opposing_strands) + def test_convert_translocation(self): + row = { + 'breakpoint1': '20:57265705', + 'breakpoint2': '20:47786405', + 'type': 'inversion/5\'-5\'', + 'strand1(gene/fusion)': '-/-', + 'strand2(gene/fusion)': '-/+', + 'direction1': 'upstream', + 'direction2': 'upstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + self.assertEqual(1, len(bpp_list)) + bpp = bpp_list[0] + 
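The Arriba cases added above all encode the same convention: a `direction` value of `downstream` is expected to produce a left (`'L'`) breakpoint orientation and `upstream` a right (`'R'`) orientation, with `opposing_strands` following from the resulting orientation pair. A minimal sketch of that mapping, written as a hypothetical helper for illustration only (the real conversion lives in `mavis.convert` and is not reproduced here):

```python
# Hypothetical helper illustrating the direction -> orientation convention that
# the Arriba tests above assert on; this is NOT the actual MAVIS Arriba converter.
ARRIBA_ORIENT = {'downstream': 'L', 'upstream': 'R'}


def arriba_direction_to_orient(direction: str) -> str:
    """Map an Arriba 'direction' field onto a MAVIS orientation character."""
    if direction not in ARRIBA_ORIENT:
        raise AssertionError(f'unrecognized Arriba direction: {direction!r}')
    return ARRIBA_ORIENT[direction]


assert arriba_direction_to_orient('downstream') == 'L'
assert arriba_direction_to_orient('upstream') == 'R'
```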
self.assertEqual('chr20', bpp.break1.chr) + self.assertEqual('chr20', bpp.break2.chr) + self.assertEqual(57265705, bpp.break1.start) + self.assertEqual(47786405, bpp.break2.start) + self.assertEqual(SVTYPE.INV, bpp.event_type) + self.assertEqual('R', bpp.break1.orient) + self.assertEqual('R', bpp.break2.orient) + self.assertEqual(True, bpp.opposing_strands) + + def test_convert_translocation(self): + row = { + 'breakpoint1': '14:102877322', + 'breakpoint2': '14:102994672', + 'type': 'deletion/read-through/5\'-5\'', + 'strand1(gene/fusion)': '+/+', + 'strand2(gene/fusion)': '-/+', + 'direction1': 'downstream', + 'direction2': 'upstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + self.assertEqual(1, len(bpp_list)) + bpp = bpp_list[0] + self.assertEqual('chr14', bpp.break1.chr) + self.assertEqual('chr14', bpp.break2.chr) + self.assertEqual(102877322, bpp.break1.start) + self.assertEqual(102994672, bpp.break2.start) + self.assertEqual(SVTYPE.DEL, bpp.event_type) + self.assertEqual('L', bpp.break1.orient) + self.assertEqual('R', bpp.break2.orient) + self.assertEqual(False, bpp.opposing_strands) def test_malformed(self): row = { @@ -163,7 +208,7 @@ def test_malformed(self): 'strand2(gene/fusion)': '-/-', 'direction1': 'upstream', 'direction2': 'downstream', - } + } with self.assertRaises(AssertionError): _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, False) From 3ec27b051c3c7370ecae94e3a2dbea7e4e31875c Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 9 May 2022 14:06:59 -0700 Subject: [PATCH 32/56] Group validate related submodules under validate subpackage --- src/mavis/main.py | 2 +- src/mavis/{ => validate}/align.py | 26 ++++++++++++++------------ src/mavis/{ => validate}/assemble.py | 10 +++++----- src/mavis/validate/base.py | 13 +++---------- src/mavis/{ => validate}/blat.py | 18 +++++++++--------- src/mavis/validate/call.py | 19 ++++++++++++------- src/mavis/validate/main.py | 2 +- tests/test_mavis/mock.py | 2 +- tests/test_mavis/test_align.py | 6 +++--- tests/test_mavis/test_assemble.py | 2 +- tests/test_mavis/test_assemble2.py | 2 +- tests/test_mavis/test_blat.py | 2 +- tests/test_mavis/test_blat2.py | 4 ++-- tests/test_mavis/validate/test_call.py | 2 +- 14 files changed, 55 insertions(+), 55 deletions(-) rename src/mavis/{ => validate}/align.py (97%) rename src/mavis/{ => validate}/assemble.py (98%) rename src/mavis/{ => validate}/blat.py (98%) diff --git a/src/mavis/main.py b/src/mavis/main.py index fac18049..f00f72fc 100644 --- a/src/mavis/main.py +++ b/src/mavis/main.py @@ -14,7 +14,6 @@ from . import __version__ from . import config as _config from . 
import util as _util -from .align import get_aligner_version from .annotate import main as annotate_main from .cluster import main as cluster_main from .convert import SUPPORTED_TOOL, convert_tool_output @@ -24,6 +23,7 @@ from .summary import main as summary_main from .util import filepath from .validate import main as validate_main +from .validate.align import get_aligner_version def convert_main(inputs, outputfile, file_type, strand_specific=False, assume_no_untemplated=True): diff --git a/src/mavis/align.py b/src/mavis/validate/align.py similarity index 97% rename from src/mavis/align.py rename to src/mavis/validate/align.py index 223afddb..3fbfad3c 100644 --- a/src/mavis/align.py +++ b/src/mavis/validate/align.py @@ -9,16 +9,16 @@ import pysam -from .bam import cigar as _cigar -from .bam import read as _read -from .breakpoint import Breakpoint, BreakpointPair -from .constants import CIGAR, ORIENT, STRAND, SVTYPE, MavisNamespace, reverse_complement -from .interval import Interval -from .types import ReferenceGenome -from .util import logger +from ..bam import cigar as _cigar +from ..bam import read as _read +from ..breakpoint import Breakpoint, BreakpointPair +from ..constants import CIGAR, ORIENT, STRAND, SVTYPE, MavisNamespace, reverse_complement +from ..interval import Interval +from ..types import ReferenceGenome +from ..util import logger if TYPE_CHECKING: - from .bam.cache import BamCache + from ..bam.cache import BamCache class SUPPORTED_ALIGNER(MavisNamespace): @@ -34,7 +34,7 @@ class SUPPORTED_ALIGNER(MavisNamespace): BLAT = 'blat' -class SplitAlignment(BreakpointPair): +class DiscontinuousAlignment(BreakpointPair): def __init__(self, *pos, **kwargs): self.read1 = kwargs.pop('read1') self.read2 = kwargs.pop('read2', None) @@ -210,7 +210,7 @@ def convert_to_duplication(alignment, reference_genome: ReferenceGenome): if refseq != alignment.untemplated_seq[:dup_len]: continue - result = SplitAlignment( + result = DiscontinuousAlignment( Breakpoint( alignment.break2.chr, alignment.break2.start - dup_len, @@ -280,7 +280,7 @@ def call_read_events(read, secondary_read=None, is_stranded=False): result = [] strand = STRAND.NEG if read.is_reverse else STRAND.POS for ref_start, delsize, insseq in events: - bpp = SplitAlignment( + bpp = DiscontinuousAlignment( Breakpoint( read.reference_name, ref_start, @@ -383,7 +383,9 @@ def call_paired_read_event(read1, read2, is_stranded=False): if not is_stranded: break1.strand = STRAND.NS break2.strand = STRAND.NS - return SplitAlignment(break1, break2, untemplated_seq=untemplated_seq, read1=read1, read2=read2) + return DiscontinuousAlignment( + break1, break2, untemplated_seq=untemplated_seq, read1=read1, read2=read2 + ) def align_sequences( diff --git a/src/mavis/assemble.py b/src/mavis/validate/assemble.py similarity index 98% rename from src/mavis/assemble.py rename to src/mavis/validate/assemble.py index 235078fa..4efbe874 100644 --- a/src/mavis/assemble.py +++ b/src/mavis/validate/assemble.py @@ -4,11 +4,11 @@ import distance import networkx as nx -from .bam import cigar as _cigar -from .bam.read import calculate_alignment_score, nsb_align, sequence_complexity -from .constants import reverse_complement -from .interval import Interval -from .util import logger +from ..bam import cigar as _cigar +from ..bam.read import calculate_alignment_score, nsb_align, sequence_complexity +from ..constants import reverse_complement +from ..interval import Interval +from ..util import logger class Contig: diff --git a/src/mavis/validate/base.py 
b/src/mavis/validate/base.py index c595f86a..0a3ddd20 100644 --- a/src/mavis/validate/base.py +++ b/src/mavis/validate/base.py @@ -5,22 +5,15 @@ import pysam from mavis_config import DEFAULTS -from ..assemble import assemble from ..bam import cigar as _cigar from ..bam import read as _read from ..bam.cache import BamCache from ..breakpoint import Breakpoint, BreakpointPair -from ..constants import ( - COLUMNS, - ORIENT, - PYSAM_READ_FLAGS, - STRAND, - SVTYPE, - reverse_complement, -) +from ..constants import COLUMNS, ORIENT, PYSAM_READ_FLAGS, STRAND, SVTYPE, reverse_complement from ..error import NotSpecifiedError from ..interval import Interval from ..util import logger +from .assemble import Contig, assemble class Evidence(BreakpointPair): @@ -31,7 +24,7 @@ class Evidence(BreakpointPair): compatible_window1: Optional[Interval] compatible_window2: Optional[Interval] config: Dict - contigs: List + contigs: List[Contig] counts: List[int] flanking_pairs: Set half_mapped: Tuple[Set, Set] diff --git a/src/mavis/blat.py b/src/mavis/validate/blat.py similarity index 98% rename from src/mavis/blat.py rename to src/mavis/validate/blat.py index cbfeaace..939b940f 100644 --- a/src/mavis/blat.py +++ b/src/mavis/validate/blat.py @@ -15,12 +15,11 @@ import pandas as pd -from .align import query_coverage_interval -from .bam import cigar as _cigar -from .bam.cache import BamCache -from .bam.cigar import QUERY_ALIGNED_STATES -from .bam.read import SamRead -from .constants import ( +from ..bam import cigar as _cigar +from ..bam.cache import BamCache +from ..bam.cigar import QUERY_ALIGNED_STATES +from ..bam.read import SamRead +from ..constants import ( CIGAR, DNA_ALPHABET, NA_MAPPING_QUALITY, @@ -28,9 +27,10 @@ STRAND, reverse_complement, ) -from .interval import Interval -from .types import ReferenceGenome -from .util import logger +from ..interval import Interval +from ..types import ReferenceGenome +from ..util import logger +from .align import query_coverage_interval class Blat: diff --git a/src/mavis/validate/call.py b/src/mavis/validate/call.py index 002534a3..e0e04c1c 100644 --- a/src/mavis/validate/call.py +++ b/src/mavis/validate/call.py @@ -3,8 +3,6 @@ import statistics from typing import List, Optional, Set -from ..align import SplitAlignment, call_paired_read_event, call_read_events, convert_to_duplication -from ..assemble import Contig from ..bam import read as _read from ..breakpoint import Breakpoint, BreakpointPair from ..constants import ( @@ -18,6 +16,13 @@ ) from ..interval import Interval from ..validate.base import Evidence +from .align import ( + DiscontinuousAlignment, + call_paired_read_event, + call_read_events, + convert_to_duplication, +) +from .assemble import Contig class EventCall(BreakpointPair): @@ -34,7 +39,7 @@ class for holding evidence and the related calls since we can't freeze the evide compatible_flanking_pairs: Set compatible_type: Optional[str] contig: Optional[Contig] - contig_alignment: Optional[SplitAlignment] + contig_alignment: Optional[DiscontinuousAlignment] @property def has_compatible(self): @@ -48,7 +53,7 @@ def __init__( event_type: str, call_method: str, contig: Optional[Contig] = None, - contig_alignment: Optional[SplitAlignment] = None, + contig_alignment: Optional[DiscontinuousAlignment] = None, untemplated_seq: Optional[str] = None, ): """ @@ -515,13 +520,13 @@ def flatten(self): if self.contig: cseq = self.contig_alignment.query_sequence try: - break1_read_depth = SplitAlignment.breakpoint_contig_remapped_depth( + break1_read_depth = 
DiscontinuousAlignment.breakpoint_contig_remapped_depth( self.break1, self.contig, self.contig_alignment.read1 ) except AssertionError: break1_read_depth = None try: - break2_read_depth = SplitAlignment.breakpoint_contig_remapped_depth( + break2_read_depth = DiscontinuousAlignment.breakpoint_contig_remapped_depth( self.break2, self.contig, self.contig_alignment.read1 @@ -1076,7 +1081,7 @@ def _call_by_split_reads(evidence, event_type, consumed_evidence=None): event_type, call_method=CALL_METHOD.SPLIT, untemplated_seq=call.untemplated_seq, - contig_alignment=None if not isinstance(call, SplitAlignment) else call, + contig_alignment=None if not isinstance(call, DiscontinuousAlignment) else call, ) except ValueError: # incompatible types continue diff --git a/src/mavis/validate/main.py b/src/mavis/validate/main.py index 5336bb4a..4f78aa87 100644 --- a/src/mavis/validate/main.py +++ b/src/mavis/validate/main.py @@ -8,7 +8,6 @@ import pysam from shortuuid import uuid -from ..align import SUPPORTED_ALIGNER, align_sequences, select_contig_alignments from ..annotate.base import BioInterval from ..annotate.file_io import ReferenceFile from ..bam import cigar as _cigar @@ -24,6 +23,7 @@ read_inputs, write_bed_file, ) +from .align import SUPPORTED_ALIGNER, align_sequences, select_contig_alignments from .call import call_events from .constants import PASS_FILENAME from .evidence import GenomeEvidence, TranscriptomeEvidence diff --git a/tests/test_mavis/mock.py b/tests/test_mavis/mock.py index 6e91f96d..d05051b7 100644 --- a/tests/test_mavis/mock.py +++ b/tests/test_mavis/mock.py @@ -1,11 +1,11 @@ import os import types -from mavis.align import query_coverage_interval from mavis.annotate.file_io import load_annotations, load_reference_genome from mavis.annotate.genomic import PreTranscript, Transcript from mavis.annotate.protein import Translation from mavis.constants import CIGAR, NA_MAPPING_QUALITY +from mavis.validate.align import query_coverage_interval from ..util import get_data diff --git a/tests/test_mavis/test_align.py b/tests/test_mavis/test_align.py index 790a3d09..512fc580 100644 --- a/tests/test_mavis/test_align.py +++ b/tests/test_mavis/test_align.py @@ -1,14 +1,14 @@ from unittest import mock import mavis.bam.cigar as _cigar -from mavis import align from mavis.annotate.file_io import load_reference_genome -from mavis.assemble import Contig from mavis.bam.cache import BamCache from mavis.bam.read import SamRead from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import CIGAR, ORIENT, STRAND, reverse_complement from mavis.interval import Interval +from mavis.validate import align +from mavis.validate.assemble import Contig from mavis.validate.evidence import GenomeEvidence from mavis_config import DEFAULTS @@ -242,7 +242,7 @@ def test_break_left_deletion(self): reference_start=999, reference_name='10', ) - align.SplitAlignment.breakpoint_contig_remapped_depth(b, contig, read) + align.DiscontinuousAlignment.breakpoint_contig_remapped_depth(b, contig, read) class TestSplitEvents: diff --git a/tests/test_mavis/test_assemble.py b/tests/test_mavis/test_assemble.py index 52711a6c..bc7f2bea 100644 --- a/tests/test_mavis/test_assemble.py +++ b/tests/test_mavis/test_assemble.py @@ -2,8 +2,8 @@ import random import pytest -from mavis.assemble import Contig, DeBruijnGraph, assemble, filter_contigs, kmers from mavis.constants import DNA_ALPHABET +from mavis.validate.assemble import Contig, DeBruijnGraph, assemble, filter_contigs, kmers from ..util import get_data, 
long_running_test diff --git a/tests/test_mavis/test_assemble2.py b/tests/test_mavis/test_assemble2.py index 8baa718c..600d6b91 100644 --- a/tests/test_mavis/test_assemble2.py +++ b/tests/test_mavis/test_assemble2.py @@ -2,9 +2,9 @@ import pytest import timeout_decorator -from mavis.assemble import Contig, assemble, filter_contigs from mavis.constants import reverse_complement from mavis.interval import Interval +from mavis.validate.assemble import Contig, assemble, filter_contigs from mavis_config import DEFAULTS from ..util import get_data, long_running_test diff --git a/tests/test_mavis/test_blat.py b/tests/test_mavis/test_blat.py index 488f14d7..f1c09d73 100644 --- a/tests/test_mavis/test_blat.py +++ b/tests/test_mavis/test_blat.py @@ -1,6 +1,6 @@ import pytest -from mavis.blat import Blat from mavis.constants import CIGAR, reverse_complement +from mavis.validate.blat import Blat from .mock import Mock, MockFunction, MockLongString diff --git a/tests/test_mavis/test_blat2.py b/tests/test_mavis/test_blat2.py index 254f6a5d..030fca61 100644 --- a/tests/test_mavis/test_blat2.py +++ b/tests/test_mavis/test_blat2.py @@ -1,12 +1,12 @@ import mavis.bam.cigar as _cigar import pytest from Bio import SeqIO -from mavis.align import query_coverage_interval from mavis.annotate.file_io import load_reference_genome from mavis.bam.cache import BamCache -from mavis.blat import Blat from mavis.constants import CIGAR, reverse_complement from mavis.interval import Interval +from mavis.validate.align import query_coverage_interval +from mavis.validate.blat import Blat from ..util import get_data from .mock import MockBamFileHandle, MockLongString, MockObject diff --git a/tests/test_mavis/validate/test_call.py b/tests/test_mavis/validate/test_call.py index b959b6d5..cf2a78fc 100644 --- a/tests/test_mavis/validate/test_call.py +++ b/tests/test_mavis/validate/test_call.py @@ -1,7 +1,6 @@ from unittest import mock import pytest -from mavis.align import call_paired_read_event, select_contig_alignments from mavis.annotate.file_io import load_reference_genome from mavis.annotate.genomic import PreTranscript, Transcript from mavis.bam import cigar as _cigar @@ -12,6 +11,7 @@ from mavis.constants import CALL_METHOD, CIGAR, ORIENT, PYSAM_READ_FLAGS, STRAND, SVTYPE from mavis.interval import Interval from mavis.validate import call +from mavis.validate.align import call_paired_read_event, select_contig_alignments from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence From 878b83369a71f36d95344952941d7cd885ca9e0c Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 10 May 2022 14:24:34 -0700 Subject: [PATCH 33/56] Clean up tests to use parametrize --- tests/test_mavis/bam/test_bam_cigar.py | 259 +++++++++++++------------ 1 file changed, 130 insertions(+), 129 deletions(-) diff --git a/tests/test_mavis/bam/test_bam_cigar.py b/tests/test_mavis/bam/test_bam_cigar.py index 11fd7b6a..cddcfae3 100644 --- a/tests/test_mavis/bam/test_bam_cigar.py +++ b/tests/test_mavis/bam/test_bam_cigar.py @@ -24,80 +24,54 @@ from ...util import get_data from ..mock import MockObject, MockRead -REFERENCE_GENOME = None +# get_mapped_cigar_segments -def setUpModule(): +@pytest.fixture(scope='module') +def mock_reference_genome(): warnings.simplefilter('ignore') - global REFERENCE_GENOME - REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa')) - if ( - 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT' - != REFERENCE_GENOME['fake'].seq[0:50].upper() - ): + 
ref = load_reference_genome(get_data('mock_reference_genome.fa')) + if 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT' != ref['fake'].seq[0:50].upper(): raise AssertionError('fake genome file does not have the expected contents') - - -class TestRecomputeCigarMismatch: - def test_simple(self): - r = MockRead( - reference_start=1456, - query_sequence='CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', - cigar=[(CIGAR.M, 45)], - ) - assert recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) == [(CIGAR.EQ, 45)] - - def test_hardclipping(self): - r = MockRead( - reference_start=1456, - query_sequence='CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', - cigar=[(CIGAR.H, 20), (CIGAR.M, 45)], - ) - assert [(CIGAR.H, 20), (CIGAR.EQ, 45)] == recompute_cigar_mismatch( - r, REFERENCE_GENOME['fake'] - ) - - def test_with_events(self): - r = MockRead( - reference_start=1456, - query_sequence='TATA' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', - cigar=[(CIGAR.S, 4), (CIGAR.M, 10), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.M, 25)], - ) - assert [ - (CIGAR.S, 4), - (CIGAR.EQ, 10), - (CIGAR.D, 10), - (CIGAR.I, 10), - (CIGAR.EQ, 25), - ] == recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) - - def test_mismatch_to_mismatch(self): - r = MockRead( - reference_start=1452, - query_sequence='CAGC' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', - cigar=[(CIGAR.X, 4), (CIGAR.M, 10), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.M, 25)], - ) - assert [ - (CIGAR.X, 4), - (CIGAR.EQ, 10), - (CIGAR.D, 10), - (CIGAR.I, 10), - (CIGAR.EQ, 25), - ] == recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) - - def test_m_to_mismatch(self): - r = MockRead( - reference_start=1452, - query_sequence='CAGC' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', - cigar=[(CIGAR.M, 14), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.M, 25)], - ) - assert [ - (CIGAR.X, 4), - (CIGAR.EQ, 10), - (CIGAR.D, 10), - (CIGAR.I, 10), - (CIGAR.EQ, 25), - ] == recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) + return ref + + +@pytest.mark.parametrize( + 'reference_start,query_sequence,cigar_in,cigar_out', + [ + [1456, 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', '45M', '45='], + [1456, 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', '20H45M', '20H45='], + [ + 1456, + 'TATA' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', + '4S10M10D10I25M', + '4S10=10D10I25=', + ], + [ + 1452, + 'CAGC' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', + '4X10M10D10I25M', + '4X10=10D10I25=', + ], + [ + 1452, + 'CAGC' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', + '14M10D10I25M', + '4X10=10D10I25=', + ], + ], +) +def test_recompute_cigar_mismatch( + reference_start, query_sequence, cigar_in, cigar_out, mock_reference_genome +): + r = MockRead( + reference_start=reference_start, + query_sequence=query_sequence, + cigar=convert_string_to_cigar(cigar_in), + ) + assert recompute_cigar_mismatch(r, mock_reference_genome['fake']) == convert_string_to_cigar( + cigar_out + ) class TestExtendSoftclipping: @@ -108,23 +82,65 @@ def test_softclipped_right(self): assert cnew == convert_string_to_cigar('70=80S') +@pytest.mark.parametrize( + 'input_cigars,output_cigar', + [ + [['10M10X10X'], '10M20X'], + [['10X10M10X'], '10X10M10X'], + [['10M10X10X', '10X10M10X'], '10M30X10M10X'], + [['1S2S5=7X2=5X28=1X99='], '3S5=7X2=5X28=1X99='], + [['10H10M10X10X'], '10H10M20X'], + ], +) +def test_join(input_cigars, output_cigar): + assert join(*[convert_string_to_cigar(c) for c 
in input_cigars]) == convert_string_to_cigar( + output_cigar + ) + + +@pytest.mark.parametrize( + 'seq1,seq2,opt,cigar_string,ref_start', + [ + ['GTGAGTAAATTCAACATCGTTTTT', 'AACTTAGAATTCAAC---------', {}, '7S8=', 7], + ['GTGAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------', {}, '5S8=', 7], + [ + 'GTGAGTAAATTCAACATCGTTTTT', + '--CTTAGAATTCAAC---------', + dict(force_softclipping=False), + '5S8=', + 7, + ], + [ + 'GTGAGTAAATTC--CATCGTTTTT', + '--CTTAGAATTCAAC---------', + dict(force_softclipping=False), + '5S5=2I1=', + 7, + ], + ['CCTG', 'CCGT', dict(min_exact_to_stop_softclipping=10), '2=2X', 0], + [ + '--GAGTAAATTCAACATCGTTTTT', + '--CTTAGAATTCAAC---------', + dict(force_softclipping=False), + '5S8=', + 5, + ], + ], +) +def test_compute(seq1, seq2, opt, cigar_string, ref_start): + assert compute(seq1, seq2, **opt) == (convert_string_to_cigar(cigar_string), ref_start) + + +def test_compute_error(): + with pytest.raises(AttributeError): + compute('CCTG', 'CCG') + + class TestCigarTools: def test_alignment_matches(self): c = [(CIGAR.M, 10), (CIGAR.EQ, 10), (CIGAR.X, 10)] assert alignment_matches(c) == 30 - def test_join(self): - c = [(CIGAR.M, 10), (CIGAR.X, 10), (CIGAR.X, 10)] - assert join(c) == [(CIGAR.M, 10), (CIGAR.X, 20)] - k = [(CIGAR.X, 10), (CIGAR.M, 10), (CIGAR.X, 10)] - assert join(c, k) == [(CIGAR.M, 10), (CIGAR.X, 30), (CIGAR.M, 10), (CIGAR.X, 10)] - k = [(4, 1), (4, 2), (7, 5), (8, 7), (7, 2), (8, 5), (7, 28), (8, 1), (7, 99)] - assert [(4, 3), (7, 5), (8, 7), (7, 2), (8, 5), (7, 28), (8, 1), (7, 99)] == join(k) - - def test_join_hardclipping(self): - c = [(CIGAR.H, 10), (CIGAR.M, 10), (CIGAR.X, 10), (CIGAR.X, 10)] - assert join(c) == [(CIGAR.H, 10), (CIGAR.M, 10), (CIGAR.X, 20)] - def test_longest_fuzzy_match(self): c = [ (CIGAR.S, 10), @@ -170,34 +186,6 @@ def test_match_percent(self): with pytest.raises(AttributeError): match_percent([(CIGAR.S, 100)]) - def test_compute(self): - # GTGAGTAAATTCAACATCGTTTTT - # aacttagAATTCAAC--------- - assert ([(CIGAR.S, 7), (CIGAR.EQ, 8)], 7) == compute( - 'GTGAGTAAATTCAACATCGTTTTT', 'AACTTAGAATTCAAC---------' - ) - assert ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 7) == compute( - 'GTGAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------' - ) - assert ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 7) == compute( - 'GTGAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------', False - ) - - assert ([(CIGAR.S, 5), (CIGAR.EQ, 5), (CIGAR.I, 2), (CIGAR.EQ, 1)], 7) == compute( - 'GTGAGTAAATTC--CATCGTTTTT', '--CTTAGAATTCAAC---------', False - ) - - with pytest.raises(AttributeError): - compute('CCTG', 'CCG') - - assert ([(CIGAR.EQ, 2), (CIGAR.X, 2)], 0) == compute( - 'CCTG', 'CCGT', min_exact_to_stop_softclipping=10 - ) - - assert ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 5) == compute( - '--GAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------', False - ) - def test_convert_for_igv(self): c = [(CIGAR.M, 10), (CIGAR.EQ, 10), (CIGAR.X, 10)] assert convert_for_igv(c) == [(CIGAR.M, 30)] @@ -524,7 +512,7 @@ def test_even_insertion_in_repeat(self): read.cigar = new_cigar assert new_cigar == exp - def test_deletion_repeat(self): + def test_deletion_repeat(self, mock_reference_genome): qseq = ( 'GAGT' 'GAGACTCTGT' @@ -568,10 +556,10 @@ def test_deletion_repeat(self): (CIGAR.D, 27), (CIGAR.EQ, 74 - 30), ] - std_cigar = hgvs_standardize_cigar(read, REFERENCE_GENOME[read.reference_name].seq) - print(SamRead.deletion_sequences(read, REFERENCE_GENOME)) + std_cigar = hgvs_standardize_cigar(read, mock_reference_genome[read.reference_name].seq) + print(SamRead.deletion_sequences(read, mock_reference_genome)) 
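The parametrized cases in this module lean on `convert_string_to_cigar` so expected values can be written as compact CIGAR strings instead of hand-built tuple lists. A quick sanity check of that correspondence, assuming the same `mavis.bam.cigar` and `mavis.constants` imports this test module already relies on:

```python
from mavis.bam.cigar import convert_string_to_cigar
from mavis.constants import CIGAR

# '4S10M10D10I25M' is shorthand for the explicit (operation, length) tuples
# that the pre-parametrize versions of these tests spelled out by hand.
assert convert_string_to_cigar('4S10M10D10I25M') == [
    (CIGAR.S, 4),
    (CIGAR.M, 10),
    (CIGAR.D, 10),
    (CIGAR.I, 10),
    (CIGAR.M, 25),
]
```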
read.cigar = std_cigar - print(SamRead.deletion_sequences(read, REFERENCE_GENOME)) + print(SamRead.deletion_sequences(read, mock_reference_genome)) assert std_cigar == expected_cigar @timeout_decorator.timeout(1) @@ -705,21 +693,34 @@ def test_small_exact_match(self): assert new_cigar == exp -class TestConvertStringToCigar: - def test(self): - string = '283M' '17506D' '5M' '21275D' '596M' '17506D' '5M' '21275D' '313M' - exp = [ - (CIGAR.M, 283), - (CIGAR.D, 17506), - (CIGAR.M, 5), - (CIGAR.D, 21275), - (CIGAR.M, 596), - (CIGAR.D, 17506), - (CIGAR.M, 5), - (CIGAR.D, 21275), - (CIGAR.M, 313), - ] - assert convert_string_to_cigar(string) == exp +@pytest.mark.parametrize( + 'string,cigartuples', + [ + [ + '283M' '17506D' '5M' '21275D' '596M' '17506D' '5M' '21275D' '313M', + [ + (CIGAR.M, 283), + (CIGAR.D, 17506), + (CIGAR.M, 5), + (CIGAR.D, 21275), + (CIGAR.M, 596), + (CIGAR.D, 17506), + (CIGAR.M, 5), + (CIGAR.D, 21275), + (CIGAR.M, 313), + ], + ], + [ + '42H25M', + [ + (CIGAR.H, 42), + (CIGAR.M, 25), + ], + ], + ], +) +def test_convert_string_to_cigar(string, cigartuples): + assert convert_string_to_cigar(string) == cigartuples class TestGetSequences: From 9027aa98096aeef511027a2e1ac0dd3c1aa78932 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Tue, 10 May 2022 14:29:36 -0700 Subject: [PATCH 34/56] Rename file to match pattern of other test files --- tests/test_mavis/bam/{test_bam_cigar.py => test_cigar.py} | 2 -- 1 file changed, 2 deletions(-) rename tests/test_mavis/bam/{test_bam_cigar.py => test_cigar.py} (99%) diff --git a/tests/test_mavis/bam/test_bam_cigar.py b/tests/test_mavis/bam/test_cigar.py similarity index 99% rename from tests/test_mavis/bam/test_bam_cigar.py rename to tests/test_mavis/bam/test_cigar.py index cddcfae3..112f9393 100644 --- a/tests/test_mavis/bam/test_bam_cigar.py +++ b/tests/test_mavis/bam/test_cigar.py @@ -24,8 +24,6 @@ from ...util import get_data from ..mock import MockObject, MockRead -# get_mapped_cigar_segments - @pytest.fixture(scope='module') def mock_reference_genome(): From 6e3cf9dbfd29b0fd32a325ab7a675799f5c39b69 Mon Sep 17 00:00:00 2001 From: Caleb Choo Date: Mon, 16 May 2022 12:49:35 -0700 Subject: [PATCH 35/56] readding tests lost during merge --- src/mavis/convert/constants.py | 1 + tests/test_mavis/convert/test_tool.py | 105 ++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/src/mavis/convert/constants.py b/src/mavis/convert/constants.py index fe59ad68..aa18fb89 100644 --- a/src/mavis/convert/constants.py +++ b/src/mavis/convert/constants.py @@ -28,6 +28,7 @@ class SUPPORTED_TOOL(MavisNamespace): STRELKA = 'strelka' STARFUSION = 'starfusion' STRAGLR = 'straglr' + ARRIBA = 'arriba' TOOL_SVTYPE_MAPPING = {v: [v] for v in SVTYPE.values()} # type: ignore diff --git a/tests/test_mavis/convert/test_tool.py b/tests/test_mavis/convert/test_tool.py index a6371452..6603572d 100644 --- a/tests/test_mavis/convert/test_tool.py +++ b/tests/test_mavis/convert/test_tool.py @@ -102,6 +102,111 @@ def test_convert_duplication(self): assert bpp.break1.chr == '1' assert bpp.break2.chr == '1' +class TestArriba: + def test_convert_standard_event(self): + row = { + 'breakpoint1': '13:114529969', + 'breakpoint2': '13:114751269', + 'type': 'inversion', + 'strand1(gene/fusion)': '+/+', + 'strand2(gene/fusion)': '-/-', + 'direction1': 'downstream', + 'direction2': 'downstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == 'chr13' + assert 
bpp.break2.chr == 'chr13' + assert bpp.break1.start == 114529969 + assert bpp.break2.start == 114751269 + assert bpp.event_type == SVTYPE.INV + assert bpp.break1.orient == 'L' + assert bpp.break2.orient == 'L' + assert bpp.opposing_strands == True + + def test_convert_translocation(self): + row = { + 'breakpoint1': '17:69313092', + 'breakpoint2': '20:58272875', + 'type': 'translocation', + 'strand1(gene/fusion)': '-/-', + 'strand2(gene/fusion)': '-/-', + 'direction1': 'upstream', + 'direction2': 'downstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == 'chr17' + assert bpp.break2.chr == 'chr20' + assert bpp.break1.start == 69313092 + assert bpp.break2.start == 58272875 + assert bpp.event_type == SVTYPE.TRANS + assert bpp.break1.orient == 'R' + assert bpp.break2.orient == 'L' + assert bpp.opposing_strands == False + + def test_convert_translocation(self): + row = { + 'breakpoint1': '20:57265705', + 'breakpoint2': '20:47786405', + 'type': 'inversion/5\'-5\'', + 'strand1(gene/fusion)': '-/-', + 'strand2(gene/fusion)': '-/+', + 'direction1': 'upstream', + 'direction2': 'upstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == 'chr20' + assert bpp.break2.chr == 'chr20' + assert bpp.break1.start == 57265705 + assert bpp.break2.start == 47786405 + assert bpp.event_type == SVTYPE.INV + assert bpp.break1.orient == 'R' + assert bpp.break2.orient == 'R' + assert bpp.opposing_strands == True + + def test_convert_translocation(self): + row = { + 'breakpoint1': '14:102877322', + 'breakpoint2': '14:102994672', + 'type': 'deletion/read-through/5\'-5\'', + 'strand1(gene/fusion)': '+/+', + 'strand2(gene/fusion)': '-/+', + 'direction1': 'downstream', + 'direction2': 'upstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == 'chr14' + assert bpp.break2.chr == 'chr14' + assert bpp.break1.start == 102877322 + assert bpp.break2.start == 102994672 + assert bpp.event_type == SVTYPE.DEL + assert bpp.break1.orient == 'L' + assert bpp.break2.orient == 'R' + assert bpp.opposing_strands == False + + def test_malformed(self): + row = { + 'breakpoint1': '', + 'breakpoint2': None, + 'type': 'translocation', + 'strand1(gene/fusion)': '-/-', + 'strand2(gene/fusion)': '-/-', + 'direction1': 'upstream', + 'direction2': 'downstream', + } + with pytest.raises(AssertionError): + _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, False) class TestStarFusion: def test_convert_standard_event(self): From 8b59a5101b4b840ee628294ad216903539bcdcc4 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 18 May 2022 12:56:40 -0700 Subject: [PATCH 36/56] Move class method off of class definition --- src/mavis/breakpoint.py | 130 ++++++++++++++-------------- src/mavis/convert/__init__.py | 4 +- src/mavis/util.py | 8 +- src/mavis/validate/align.py | 6 +- src/mavis/validate/base.py | 8 +- src/mavis/validate/call.py | 18 ++-- src/mavis/validate/main.py | 4 +- tests/test_mavis/test_breakpoint.py | 54 ++++++------ 8 files changed, 117 insertions(+), 115 deletions(-) diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index 6d29fa4b..e6003627 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -304,7 +304,7 @@ def __init__( ) # try classifying to make sure it's a valid combination - BreakpointPair.classify(self) + 
classify_breakpoint_pair(self) def column(self, colname: str): return self.data.get(COLUMNS[colname]) @@ -347,70 +347,6 @@ def flatten(self): row.update(temp) return row - @classmethod - def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: - """ - uses the chr, orientations and strands to determine the - possible structural_variant types that this pair could support - - Args: - pair (BreakpointPair): the pair to classify - distance: if defined, will be passed to net size to use in narrowing the list of putative types (del vs ins) - Returns: - a list of possible SVTYPE - - Example: - >>> bpp = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) - >>> BreakpointPair.classify(bpp) - ['inversion'] - >>> bpp = BreakpointPair(Breakpoint('1', 1, orient='L'), Breakpoint('1', 9999, orient='R'), opposing_strands=False) - >>> BreakpointPair.classify(bpp) - {'deletion', 'insertion'} - - Note: - see [related theory documentation](/background/theory/#classifying-events) - """ - if not pair.interchromosomal: # intrachromosomal - if pair.opposing_strands: - if pair.LR or pair.RL: - raise InvalidRearrangement(pair) - return {SVTYPE.INV} - else: - if pair.LL or pair.RR: - raise InvalidRearrangement(pair) - elif pair.break1.orient == ORIENT.LEFT or pair.break2.orient == ORIENT.RIGHT: - if ( - len(pair.break1) == 1 - and len(pair.break2) == 1 - and abs(pair.break1.start - pair.break2.start) < 2 - ): - if pair.untemplated_seq == '': - return set() - return {SVTYPE.INS} - elif pair.untemplated_seq == '': - return {SVTYPE.DEL} - elif distance: - try: - if pair.net_size(distance).start > 0: - return {SVTYPE.INS} - elif pair.net_size(distance).end < 0: - return {SVTYPE.DEL} - except ValueError: - pass - return {SVTYPE.DEL, SVTYPE.INS} - elif pair.break1.orient == ORIENT.RIGHT or pair.break2.orient == ORIENT.LEFT: - return {SVTYPE.DUP} - return {SVTYPE.DEL, SVTYPE.INS, SVTYPE.DUP} - else: # interchromosomal - if pair.opposing_strands: - if pair.LR or pair.RL: - raise InvalidRearrangement(pair) - return {SVTYPE.ITRANS} - else: - if pair.LL or pair.RR: - raise InvalidRearrangement(pair) - return {SVTYPE.TRANS} - def net_size(self, distance=lambda x, y: Interval(abs(x - y))) -> Interval: """ Returns the size of the event for a given pair. 
Mainly applicable to indels @@ -624,3 +560,67 @@ def get_bed_repesentation(self) -> List[Tuple[str, int, int, Optional[str]]]: ) ) return bed + + +def classify_breakpoint_pair(pair: BreakpointPair, distance: Optional[Callable] = None) -> Set[str]: + """ + uses the chr, orientations and strands to determine the + possible structural_variant types that this pair could support + + Args: + pair (BreakpointPair): the pair to classify + distance: if defined, will be passed to net size to use in narrowing the list of putative types (del vs ins) + Returns: + a list of possible SVTYPE + + Example: + >>> bpp = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) + >>> classify_breakpoint_pair(bpp) + ['inversion'] + >>> bpp = BreakpointPair(Breakpoint('1', 1, orient='L'), Breakpoint('1', 9999, orient='R'), opposing_strands=False) + >>> classify_breakpoint_pair(bpp) + {'deletion', 'insertion'} + + Note: + see [related theory documentation](/background/theory/#classifying-events) + """ + if not pair.interchromosomal: # intrachromosomal + if pair.opposing_strands: + if pair.LR or pair.RL: + raise InvalidRearrangement(pair) + return {SVTYPE.INV} + else: + if pair.LL or pair.RR: + raise InvalidRearrangement(pair) + elif pair.break1.orient == ORIENT.LEFT or pair.break2.orient == ORIENT.RIGHT: + if ( + len(pair.break1) == 1 + and len(pair.break2) == 1 + and abs(pair.break1.start - pair.break2.start) < 2 + ): + if pair.untemplated_seq == '': + return set() + return {SVTYPE.INS} + elif pair.untemplated_seq == '': + return {SVTYPE.DEL} + elif distance: + try: + if pair.net_size(distance).start > 0: + return {SVTYPE.INS} + elif pair.net_size(distance).end < 0: + return {SVTYPE.DEL} + except ValueError: + pass + return {SVTYPE.DEL, SVTYPE.INS} + elif pair.break1.orient == ORIENT.RIGHT or pair.break2.orient == ORIENT.LEFT: + return {SVTYPE.DUP} + return {SVTYPE.DEL, SVTYPE.INS, SVTYPE.DUP} + else: # interchromosomal + if pair.opposing_strands: + if pair.LR or pair.RL: + raise InvalidRearrangement(pair) + return {SVTYPE.ITRANS} + else: + if pair.LL or pair.RR: + raise InvalidRearrangement(pair) + return {SVTYPE.TRANS} diff --git a/src/mavis/convert/__init__.py b/src/mavis/convert/__init__.py index 3d102d33..1e7f3888 100644 --- a/src/mavis/convert/__init__.py +++ b/src/mavis/convert/__init__.py @@ -4,7 +4,7 @@ import pandas as pd from shortuuid import uuid -from ..breakpoint import Breakpoint, BreakpointPair +from ..breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair from ..constants import COLUMNS, ORIENT, STRAND, SVTYPE from ..error import InvalidRearrangement from ..util import logger, read_bpp_from_input_file @@ -232,7 +232,7 @@ def _convert_tool_row( for col, value in std_row.items(): if col not in COLUMNS and col not in bpp.data: bpp.data[col] = value - if not event_type or event_type in BreakpointPair.classify(bpp): + if not event_type or event_type in classify_breakpoint_pair(bpp): result.append(bpp) except (InvalidRearrangement, AssertionError): diff --git a/src/mavis/util.py b/src/mavis/util.py index d4d34508..2e9d58a8 100644 --- a/src/mavis/util.py +++ b/src/mavis/util.py @@ -10,7 +10,7 @@ from mavis_config import bash_expands from shortuuid import uuid -from .breakpoint import Breakpoint, BreakpointPair +from .breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair from .constants import ( COLUMNS, FLOAT_COLUMNS, @@ -511,17 +511,17 @@ def validate_pipeline_id(value): bpp.data.update(data) if putative_event_type: bpp.data[COLUMNS.event_type] = 
putative_event_type - if putative_event_type not in BreakpointPair.classify(bpp): + if putative_event_type not in classify_breakpoint_pair(bpp): raise InvalidRearrangement( 'error: expected one of', - BreakpointPair.classify(bpp), + classify_breakpoint_pair(bpp), 'but found', putative_event_type, str(bpp), row, ) if expand_svtype and not putative_event_type: - for svtype in BreakpointPair.classify( + for svtype in classify_breakpoint_pair( bpp, distance=lambda x, y: Interval(y - x) ): new_bpp = bpp.copy() diff --git a/src/mavis/validate/align.py b/src/mavis/validate/align.py index 3fbfad3c..bbcbe446 100644 --- a/src/mavis/validate/align.py +++ b/src/mavis/validate/align.py @@ -11,7 +11,7 @@ from ..bam import cigar as _cigar from ..bam import read as _read -from ..breakpoint import Breakpoint, BreakpointPair +from ..breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair from ..constants import CIGAR, ORIENT, STRAND, SVTYPE, MavisNamespace, reverse_complement from ..interval import Interval from ..types import ReferenceGenome @@ -523,7 +523,7 @@ def select_contig_alignments(evidence, reads_by_query): standardize/simplify reads and filter bad/irrelevant alignments adds the contig alignments to the contigs """ - putative_types = BreakpointPair.classify(evidence) + putative_types = classify_breakpoint_pair(evidence) if {SVTYPE.DUP, SVTYPE.INS} & putative_types: putative_types.update({SVTYPE.DUP, SVTYPE.INS}) @@ -543,7 +543,7 @@ def filter_pass(alignment): def supports_primary_event(alignment): return all( [ - BreakpointPair.classify(alignment) & putative_types, + classify_breakpoint_pair(alignment) & putative_types, alignment.break1.chr == evidence.break1.chr, alignment.break2.chr == evidence.break2.chr, alignment.break1 & evidence.outer_window1, diff --git a/src/mavis/validate/base.py b/src/mavis/validate/base.py index 0a3ddd20..68bb28d7 100644 --- a/src/mavis/validate/base.py +++ b/src/mavis/validate/base.py @@ -8,7 +8,7 @@ from ..bam import cigar as _cigar from ..bam import read as _read from ..bam.cache import BamCache -from ..breakpoint import Breakpoint, BreakpointPair +from ..breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair from ..constants import COLUMNS, ORIENT, PYSAM_READ_FLAGS, STRAND, SVTYPE, reverse_complement from ..error import NotSpecifiedError from ..interval import Interval @@ -151,12 +151,12 @@ def __init__( self.stdev_fragment_size = stdev_fragment_size self.strand_determining_read = strand_determining_read - if self.classification is not None and self.classification not in BreakpointPair.classify( + if self.classification is not None and self.classification not in classify_breakpoint_pair( self ): raise AttributeError( 'breakpoint pair improper classification', - BreakpointPair.classify(self), + classify_breakpoint_pair(self), self.classification, ) @@ -243,7 +243,7 @@ def putative_event_types(self): """ if self.classification: return {self.classification} - return BreakpointPair.classify(self) + return classify_breakpoint_pair(self) @property def compatible_type(self): diff --git a/src/mavis/validate/call.py b/src/mavis/validate/call.py index e0e04c1c..106006f1 100644 --- a/src/mavis/validate/call.py +++ b/src/mavis/validate/call.py @@ -4,7 +4,7 @@ from typing import List, Optional, Set from ..bam import read as _read -from ..breakpoint import Breakpoint, BreakpointPair +from ..breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair from ..constants import ( CALL_METHOD, COLUMNS, @@ -89,10 +89,10 @@ def __init__( else: 
self.compatible_type = None # use the distance function from the source evidence to narrow the possible types - putative_types = BreakpointPair.classify(self, source_evidence.distance) + putative_types = classify_breakpoint_pair(self, source_evidence.distance) if event_type not in putative_types and self.compatible_type in putative_types: event_type, self.compatible_type = self.compatible_type, event_type - putative_types = BreakpointPair.classify(self, source_evidence.distance) + putative_types = classify_breakpoint_pair(self, source_evidence.distance) self.data[COLUMNS.event_type] = SVTYPE.enforce(event_type) if event_type not in putative_types | {self.compatible_type}: @@ -177,7 +177,7 @@ def is_supplementary(self): return not all( [ {self.event_type, self.compatible_type} - & BreakpointPair.classify(self.source_evidence), + & classify_breakpoint_pair(self.source_evidence), self.break1 & self.source_evidence.outer_window1, self.break2 & self.source_evidence.outer_window2, self.break1.chr == self.source_evidence.break1.chr, @@ -305,7 +305,7 @@ def add_spanning_read(self, read): read (pysam.AlignedSegment): putative spanning read """ for event in call_read_events(read, is_stranded=self.source_evidence.bam_cache.stranded): - if event == self and self.event_type in BreakpointPair.classify( + if event == self and self.event_type in classify_breakpoint_pair( event, distance=self.source_evidence.distance ): self.spanning_reads.add(read) @@ -567,7 +567,7 @@ def _call_by_contigs(source_evidence): for aln in ctg.alignments: if aln.is_putative_indel and aln.net_size(source_evidence.distance) == Interval(0): continue - for event_type in BreakpointPair.classify(aln, distance=source_evidence.distance): + for event_type in classify_breakpoint_pair(aln, distance=source_evidence.distance): try: new_event = EventCall( aln.break1, @@ -660,8 +660,10 @@ def _call_by_spanning_reads(source_evidence: Evidence, consumed_evidence): None # unless we are collecting a consensus we shouldn't assign sequences to the breaks ) event.break2.seq = None - types = BreakpointPair.classify(source_evidence) | {source_evidence.compatible_type} - for event_type in types & BreakpointPair.classify(event, distance=source_evidence.distance): + types = classify_breakpoint_pair(source_evidence) | {source_evidence.compatible_type} + for event_type in types & classify_breakpoint_pair( + event, distance=source_evidence.distance + ): try: new_event = EventCall( event.break1, diff --git a/src/mavis/validate/main.py b/src/mavis/validate/main.py index 4f78aa87..f47e0e2b 100644 --- a/src/mavis/validate/main.py +++ b/src/mavis/validate/main.py @@ -12,7 +12,7 @@ from ..annotate.file_io import ReferenceFile from ..bam import cigar as _cigar from ..bam.cache import BamCache -from ..breakpoint import BreakpointPair +from ..breakpoint import classify_breakpoint_pair from ..constants import CALL_METHOD, COLUMNS, PROTOCOL from ..util import ( filter_on_overlap, @@ -165,7 +165,7 @@ def main( ), ) logger.info(str(evidence)) - logger.info(f'possible event type(s): {BreakpointPair.classify(evidence)}') + logger.info(f'possible event type(s): {classify_breakpoint_pair(evidence)}') logger.info( f'outer window regions: {evidence.break1.chr}:{evidence.outer_window1[0]}-{evidence.outer_window1[1]} {evidence.break2.chr}:{evidence.outer_window2[0]}-{evidence.outer_window2[1]}' ) diff --git a/tests/test_mavis/test_breakpoint.py b/tests/test_mavis/test_breakpoint.py index 9eeb8347..5515cce2 100644 --- a/tests/test_mavis/test_breakpoint.py +++ 
b/tests/test_mavis/test_breakpoint.py @@ -1,7 +1,7 @@ from unittest.mock import Mock import pytest -from mavis.breakpoint import Breakpoint, BreakpointPair +from mavis.breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair from mavis.constants import ORIENT, STRAND, SVTYPE from mavis.error import InvalidRearrangement, NotSpecifiedError from mavis.interval import Interval @@ -206,7 +206,7 @@ def test_inverted_translocation(self): Breakpoint(2, 1, 2, ORIENT.LEFT), opposing_strands=True, ) - BreakpointPair.classify(b) + classify_breakpoint_pair(b) def test_translocation(self): b = BreakpointPair( @@ -214,123 +214,123 @@ def test_translocation(self): Breakpoint(2, 1, 2, ORIENT.LEFT), opposing_strands=False, ) - BreakpointPair.classify(b) + classify_breakpoint_pair(b) def test_inversion(self): b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.RIGHT), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.RIGHT), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.NS), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.NS), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.LEFT), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.LEFT), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} def test_duplication(self): b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.LEFT), ) - assert BreakpointPair.classify(b) == {SVTYPE.DUP} + assert classify_breakpoint_pair(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.LEFT), ) - assert BreakpointPair.classify(b) == {SVTYPE.DUP} + assert classify_breakpoint_pair(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.LEFT), ) - assert BreakpointPair.classify(b) == {SVTYPE.DUP} + assert classify_breakpoint_pair(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.NS), ) - assert BreakpointPair.classify(b) == {SVTYPE.DUP} + assert classify_breakpoint_pair(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.NS), ) - assert BreakpointPair.classify(b) == {SVTYPE.DUP} + assert 
classify_breakpoint_pair(b) == {SVTYPE.DUP} def test_deletion_or_insertion(self): b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.RIGHT), ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.RIGHT), ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) def test_insertion(self): b = BreakpointPair( @@ -338,7 +338,7 @@ def test_insertion(self): Breakpoint(1, 2, 2, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.INS]) def test_no_type(self): b = BreakpointPair( @@ -347,7 +347,7 @@ def test_no_type(self): opposing_strands=False, untemplated_seq='', ) - assert BreakpointPair.classify(b) == set() + assert classify_breakpoint_pair(b) == set() def test_deletion(self): b = BreakpointPair( @@ -356,7 +356,7 @@ def test_deletion(self): opposing_strands=False, untemplated_seq='', ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL]) def test_deletion_with_useq(self): bpp = BreakpointPair( @@ -365,20 +365,20 @@ def test_deletion_with_useq(self): opposing=False, untemplated_seq='CCCT', ) - assert sorted(BreakpointPair.classify(bpp)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(bpp)) == sorted([SVTYPE.DEL, SVTYPE.INS]) def distance(x, y): return Interval(abs(x - y)) net_size = 
BreakpointPair.net_size(bpp, distance) assert net_size == Interval(-71) - assert sorted(BreakpointPair.classify(bpp, distance)) == sorted([SVTYPE.DEL]) + assert sorted(classify_breakpoint_pair(bpp, distance)) == sorted([SVTYPE.DEL]) def test_deletion_no_distance_error(self): bpp = BreakpointPair( Breakpoint('1', 7039, orient='L'), Breakpoint('1', 7040, orient='R'), opposing=False ) - assert sorted(BreakpointPair.classify(bpp)) == sorted([SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(bpp)) == sorted([SVTYPE.INS]) class TestNetSize: From dc9ce86fe28bdff47fee44e2cd1abfb6f37f8771 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 8 Jun 2022 12:35:02 -0700 Subject: [PATCH 37/56] Simplify to use read1 flag only instead of is_read2 --- src/mavis/bam/read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mavis/bam/read.py b/src/mavis/bam/read.py index 5a986178..e27be0ae 100644 --- a/src/mavis/bam/read.py +++ b/src/mavis/bam/read.py @@ -424,7 +424,7 @@ def sequenced_strand(read: pysam.AlignedSegment, strand_determining_read: int = else: strand = STRAND.NEG if not read.is_reverse else STRAND.POS elif strand_determining_read == 2: - if read.is_read2: + if not read.is_read1: strand = STRAND.NEG if read.is_reverse else STRAND.POS else: strand = STRAND.NEG if not read.is_reverse else STRAND.POS From b3b6e8e74358a0fec07bb31e0d506343961b1dd3 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 8 Jun 2022 12:35:31 -0700 Subject: [PATCH 38/56] Use query_length flag on read to simplify mocking proper reads --- src/mavis/validate/align.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mavis/validate/align.py b/src/mavis/validate/align.py index bbcbe446..62302ad1 100644 --- a/src/mavis/validate/align.py +++ b/src/mavis/validate/align.py @@ -177,9 +177,8 @@ def query_coverage_interval(read: pysam.AlignedSegment) -> Interval: Returns: The portion of the original query sequence that is aligned by this read """ - seq = read.query_sequence st = 0 - end = len(seq) - 1 + end = read.query_length - 1 if read.cigar[0][0] == CIGAR.S: st += read.cigar[0][1] if read.cigar[-1][0] == CIGAR.S: From 249698f26dda165f2e98d9cfcfe6c0b2140ede8b Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 8 Jun 2022 12:35:46 -0700 Subject: [PATCH 39/56] Improve docstring --- src/mavis/validate/align.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mavis/validate/align.py b/src/mavis/validate/align.py index 62302ad1..b3abb9f0 100644 --- a/src/mavis/validate/align.py +++ b/src/mavis/validate/align.py @@ -332,7 +332,8 @@ def read_breakpoint(read): def call_paired_read_event(read1, read2, is_stranded=False): """ For a given pair of reads call all applicable events. Assume there is a major - event from both reads and then call indels from the individual reads + event from both reads and then call indels from the individual reads. 
Should be alternate alignments of + the same read """ # sort the reads so that we are calling consistently break1 = read_breakpoint(read1) From 742c0448f2809083674603ef219920361038a227 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 8 Jun 2022 12:36:16 -0700 Subject: [PATCH 40/56] fix: logic error on checking flanking read positions relative to the evidence window --- src/mavis/validate/call.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mavis/validate/call.py b/src/mavis/validate/call.py index 106006f1..a69bf62a 100644 --- a/src/mavis/validate/call.py +++ b/src/mavis/validate/call.py @@ -241,8 +241,8 @@ def add_flanking_support(self, flanking_pairs, is_compatible=False): else: # L R if not all( [ - read.reference_start + 1 <= self.break1.end, - mate.reference_end >= self.break2.start, + read.reference_end <= self.break1.end, + mate.reference_start >= self.break2.start, ] ): continue From 071bb34d901dabd1bd9884a25e99f3a410cedc6d Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 8 Jun 2022 12:36:43 -0700 Subject: [PATCH 41/56] use consistent logic to calculate fragment length for genome and transcriptome --- src/mavis/validate/evidence.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mavis/validate/evidence.py b/src/mavis/validate/evidence.py index db013e46..77a60a64 100644 --- a/src/mavis/validate/evidence.py +++ b/src/mavis/validate/evidence.py @@ -210,11 +210,14 @@ def traverse( # type: ignore return Interval.from_iterable(genomic_end_positions) def compute_fragment_size(self, read, mate): + """ + Template_length to match genome template length (aka. fragment size - 1 read length if all mapped perfectly) + """ if read.reference_start > mate.reference_start: read, mate = mate, read if read.reference_name == mate.reference_name: start, end = self.distance( - read.reference_start + 1, mate.reference_end, chrom=read.reference_name + read.reference_start + 1, mate.reference_start + 1, chrom=read.reference_name ) return Interval(start + 1, end + 1) return Interval(0) From f20f62ebce9fdd49f4337caa143fcdf096795110 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 8 Jun 2022 12:36:56 -0700 Subject: [PATCH 42/56] Sort imports --- src/mavis/validate/gather.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/mavis/validate/gather.py b/src/mavis/validate/gather.py index 21d18105..b5999460 100644 --- a/src/mavis/validate/gather.py +++ b/src/mavis/validate/gather.py @@ -1,21 +1,22 @@ """ Module which contains functions related to picking reads from a BAM file which support a putative event """ -from .base import Evidence import pysam + +from ..bam import cigar as _cigar +from ..bam import read as _read from ..constants import ( CIGAR, + NA_MAPPING_QUALITY, ORIENT, PYSAM_READ_FLAGS, - reverse_complement, - NA_MAPPING_QUALITY, SVTYPE, + reverse_complement, ) -from ..bam import cigar as _cigar -from ..bam import read as _read +from ..error import NotSpecifiedError from ..interval import Interval from ..util import logger -from ..error import NotSpecifiedError +from .base import Evidence def collect_split_read(evidence_bpp: Evidence, read: pysam.AlignedSegment, first_breakpoint: bool): From 8697dca18a41b295ae52facf0ef5b9e511bb0198 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 8 Jun 2022 12:37:08 -0700 Subject: [PATCH 43/56] Fix typo in docstring --- src/mavis/validate/gather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mavis/validate/gather.py 
b/src/mavis/validate/gather.py index b5999460..41f03914 100644 --- a/src/mavis/validate/gather.py +++ b/src/mavis/validate/gather.py @@ -124,7 +124,7 @@ def collect_split_read(evidence_bpp: Evidence, read: pysam.AlignedSegment, first w[0] - 1 : w[1] ] - # figure out how much of the read must match when remaped + # figure out how much of the read must match when remapped min_match_tgt = read.cigar[-1][1] if breakpoint.orient == ORIENT.LEFT else read.cigar[0][1] min_match_tgt = min( min_match_tgt * evidence_bpp.config['validate.min_anchor_match'], min_match_tgt - 1 From 0236fea21d2e10f93a24a50c6da9ef819b36096f Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 8 Jun 2022 12:41:33 -0700 Subject: [PATCH 44/56] use improved read mock object - use a read mock which requires a CIGAR string and calculates other fields so as to never have an impossible read configuration --- tests/test_mavis/bam/test_bam.py | 105 ++- tests/test_mavis/bam/test_cigar.py | 183 ++--- tests/test_mavis/mock.py | 359 +++++----- tests/test_mavis/test_align.py | 83 +-- tests/test_mavis/validate/test_call.py | 742 ++++++++++++-------- tests/test_mavis/validate/test_evidence.py | 143 +++- tests/test_mavis/validate/test_validate2.py | 106 +-- 7 files changed, 989 insertions(+), 732 deletions(-) diff --git a/tests/test_mavis/bam/test_bam.py b/tests/test_mavis/bam/test_bam.py index 4f2132e7..822cad52 100644 --- a/tests/test_mavis/bam/test_bam.py +++ b/tests/test_mavis/bam/test_bam.py @@ -5,7 +5,6 @@ import pytest import timeout_decorator from mavis.annotate.file_io import load_annotations, load_reference_genome -from mavis.bam import cigar as _cigar from mavis.bam import read as _read from mavis.bam.cache import BamCache from mavis.bam.read import ( @@ -15,7 +14,7 @@ sequenced_strand, ) from mavis.bam.stats import Histogram, compute_genome_bam_stats, compute_transcriptome_bam_stats -from mavis.constants import CIGAR, DNA_ALPHABET, ORIENT, READ_PAIR_TYPE, STRAND, SVTYPE +from mavis.constants import DNA_ALPHABET, ORIENT, READ_PAIR_TYPE, STRAND, SVTYPE from mavis.interval import Interval from ...util import get_data @@ -148,7 +147,7 @@ def test_alphabet_matching(self): def test_breakpoint_pos(self): # ==========+++++++++> - r = MockRead(reference_start=10, cigar=[(CIGAR.M, 10), (CIGAR.S, 10)]) + r = MockRead(reference_start=10, cigarstring='10M10S') assert _read.breakpoint_pos(r) == 19 with pytest.raises(AttributeError): @@ -157,7 +156,7 @@ def test_breakpoint_pos(self): assert _read.breakpoint_pos(r, ORIENT.LEFT) == 19 # ++++++++++=========> - r = MockRead(reference_start=10, cigar=[(CIGAR.S, 10), (CIGAR.M, 10)]) + r = MockRead(reference_start=10, cigarstring='10S10M') assert _read.breakpoint_pos(r) == 10 with pytest.raises(AttributeError): @@ -166,7 +165,7 @@ def test_breakpoint_pos(self): assert _read.breakpoint_pos(r, ORIENT.RIGHT) == 10 with pytest.raises(AttributeError): - r = MockRead(reference_start=10, cigar=[(CIGAR.X, 10), (CIGAR.M, 10)]) + r = MockRead(reference_start=10, cigarstring='10X10M') _read.breakpoint_pos(r, ORIENT.LEFT) def test_nsb_align(self): @@ -240,20 +239,66 @@ def test_min_overlap(self): @pytest.fixture def stranded_reads(): n = argparse.Namespace() - n.read1_pos_neg = MockRead(is_reverse=False, is_read1=True, mate_is_reverse=True) - assert not n.read1_pos_neg.is_read2 - n.read1_neg_pos = MockRead(is_reverse=True, is_read1=True, mate_is_reverse=False) - n.read1_pos_pos = MockRead(is_reverse=False, is_read1=True, mate_is_reverse=False) - n.read1_neg_neg = MockRead(is_reverse=True, is_read1=True, 
mate_is_reverse=True) - - n.read2_pos_neg = MockRead(is_reverse=True, is_read1=False, mate_is_reverse=True) - assert n.read2_pos_neg.is_read2 - n.read2_neg_pos = MockRead(is_reverse=False, is_read1=False, mate_is_reverse=False) - n.read2_pos_pos = MockRead(is_reverse=False, is_read1=False, mate_is_reverse=False) - n.read2_neg_neg = MockRead(is_reverse=True, is_read1=False, mate_is_reverse=True) - - n.unpaired_pos = MockRead(is_reverse=False, is_paired=False) - n.unpaired_neg = MockRead(is_reverse=True, is_paired=False) + n.read1_pos_neg = MockRead( + is_reverse=False, + is_read1=True, + mate_is_reverse=True, + cigarstring='10=', + reference_start=100, + ) + n.read1_neg_pos = MockRead( + is_reverse=True, + is_read1=True, + mate_is_reverse=False, + cigarstring='10=', + reference_start=100, + ) + n.read1_pos_pos = MockRead( + is_reverse=False, + is_read1=True, + mate_is_reverse=False, + cigarstring='10=', + reference_start=100, + ) + n.read1_neg_neg = MockRead( + is_reverse=True, is_read1=True, mate_is_reverse=True, cigarstring='10=', reference_start=100 + ) + + n.read2_pos_neg = MockRead( + is_reverse=True, + is_read1=False, + mate_is_reverse=True, + cigarstring='10=', + reference_start=100, + ) + n.read2_neg_pos = MockRead( + is_reverse=False, + is_read1=False, + mate_is_reverse=False, + cigarstring='10=', + reference_start=100, + ) + n.read2_pos_pos = MockRead( + is_reverse=False, + is_read1=False, + mate_is_reverse=False, + cigarstring='10=', + reference_start=100, + ) + n.read2_neg_neg = MockRead( + is_reverse=True, + is_read1=False, + mate_is_reverse=True, + cigarstring='10=', + reference_start=100, + ) + + n.unpaired_pos = MockRead( + is_reverse=False, is_paired=False, cigarstring='10=', reference_start=100 + ) + n.unpaired_neg = MockRead( + is_reverse=True, is_paired=False, cigarstring='10=', reference_start=100 + ) return n @@ -329,32 +374,36 @@ def test_read_pair_strand_det_error(self, stranded_reads): def read_pairs(): n = argparse.Namespace() n.LR = MockRead( - reference_id=0, - next_reference_id=0, + reference_name='1', + next_reference_name='1', + cigarstring='50=', reference_start=1, next_reference_start=2, is_reverse=False, mate_is_reverse=True, ) n.LL = MockRead( - reference_id=0, - next_reference_id=0, + reference_name='1', + next_reference_name='1', + cigarstring='50=', reference_start=1, next_reference_start=2, is_reverse=False, mate_is_reverse=False, ) n.RR = MockRead( - reference_id=0, - next_reference_id=0, + reference_name='1', + next_reference_name='1', + cigarstring='50=', reference_start=1, next_reference_start=2, is_reverse=True, mate_is_reverse=True, ) n.RL = MockRead( - reference_id=0, - next_reference_id=0, + reference_name='1', + next_reference_name='1', + cigarstring='50=', reference_start=1, next_reference_start=2, is_reverse=True, @@ -483,7 +532,7 @@ def test_trans_bam_stats(self): @pytest.fixture def contig_read(): return MockRead( - cigar=_cigar.convert_string_to_cigar('275M18I12041D278M'), + cigarstring='275M18I12041D278M', reference_start=89700025, reference_name='10', ) diff --git a/tests/test_mavis/bam/test_cigar.py b/tests/test_mavis/bam/test_cigar.py index 112f9393..862084c6 100644 --- a/tests/test_mavis/bam/test_cigar.py +++ b/tests/test_mavis/bam/test_cigar.py @@ -65,7 +65,7 @@ def test_recompute_cigar_mismatch( r = MockRead( reference_start=reference_start, query_sequence=query_sequence, - cigar=convert_string_to_cigar(cigar_in), + cigarstring=cigar_in, ) assert recompute_cigar_mismatch(r, mock_reference_genome['fake']) == convert_string_to_cigar( 
cigar_out @@ -192,17 +192,22 @@ def test_convert_for_igv(self): class TestHgvsStandardizeCigars: def no_change_aligned(self): ref = 'AAATTTGGGCCCAATT' - read = MockRead('name', '1', 1, cigar=[(CIGAR.M, 10)], query_sequence='AAATTTGGGC') + read = MockRead( + query_name='name', + reference_name='1', + reference_start=1, + cigarstring='10M', + query_sequence='AAATTTGGGC', + ) assert hgvs_standardize_cigar(read, ref) == [(CIGAR.M, 10)] def no_change_proper_indel(self): - ref = 'ATAGGC' 'ATCTACGAG' 'ATCGCTACG' + ref = 'ATAGGC' 'ATCTAC' 'GAG' 'ATCG' 'CTACG' read = MockRead( - 'name', - 1, - 6, + query_name='name', + reference_start=6, query_sequence='ATCTAC' 'CCC' 'ATCG', - cigar=[(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)], + cigarstring='6=3I3D4=', ) assert [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)] == hgvs_standardize_cigar( read, ref @@ -211,11 +216,10 @@ def no_change_proper_indel(self): def ins_after_deletion(self): ref = 'ATAGGC' 'ATCTACGAG' 'ATCGCTACG' read = MockRead( - 'name', - 1, - 6, + query_name='name', + reference_start=6, query_sequence='ATCTAC' 'CCC' 'ATCG', - cigar=[(CIGAR.EQ, 6), (CIGAR.D, 3), (CIGAR.I, 3), (CIGAR.EQ, 4)], + cigarstring='6=3D3I4=', ) assert [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)] == hgvs_standardize_cigar( read, ref @@ -224,22 +228,20 @@ def ins_after_deletion(self): def test_insertion_in_repeat(self): ref = 'ATAGGC' 'ATCT' 'ACGA' 'GATCGCTACG' read = MockRead( - 'name', - 1, - 6, + query_name='name', + reference_start=6, query_sequence='ATCT' 'ACGA' 'ACGA' 'GATC', - cigar=[(CIGAR.EQ, 4), (CIGAR.I, 4), (CIGAR.EQ, 8)], + cigarstring='4=4I8=', ) assert [(CIGAR.EQ, 8), (CIGAR.I, 4), (CIGAR.EQ, 4)] == hgvs_standardize_cigar(read, ref) def test_deletion_in_repeat(self): ref = 'ATAGGC' 'ATCT' 'ACGA' 'ACGA' 'ACGA' 'GATCGCTACG' read = MockRead( - 'name', - 1, - 6, + query_name='name', + reference_start=6, query_sequence='ATCT' 'ACGA' 'ACGA' 'GATC', - cigar=[(CIGAR.EQ, 4), (CIGAR.D, 4), (CIGAR.EQ, 12)], + cigarstring='4=4D12=', ) assert [(CIGAR.EQ, 12), (CIGAR.D, 4), (CIGAR.EQ, 4)] == hgvs_standardize_cigar(read, ref) @@ -247,18 +249,7 @@ def test_bubble_sort_indel_sections(self): rseq = 'ATAGGC' 'ATCT' 'GG' 'GA' 'GCGA' 'GATCGCTACG' qseq = 'ATCT' 'TTT' 'TT' 'GCGA' 'GATC' read = MockRead( - 'name', - 1, - 6, - query_sequence=qseq, - cigar=[ - (CIGAR.EQ, 4), - (CIGAR.D, 2), - (CIGAR.I, 3), - (CIGAR.D, 2), - (CIGAR.I, 2), - (CIGAR.EQ, 8), - ], + query_name='name', reference_start=6, query_sequence=qseq, cigarstring='4=2D3I2D2I8=' ) assert [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 4), (CIGAR.EQ, 8)] == hgvs_standardize_cigar( read, rseq @@ -269,19 +260,7 @@ def test_bubble_sort_indel_sections_drop_mismatch(self): # ATCT CTTTT TACGA qseq = 'ATCT' 'C' 'TT' 'TTT' 'ACGA' 'GATC' read = MockRead( - 'name', - 1, - 6, - query_sequence=qseq, - cigar=[ - (CIGAR.EQ, 4), - (CIGAR.X, 1), - (CIGAR.D, 3), - (CIGAR.I, 2), - (CIGAR.D, 5), - (CIGAR.I, 3), - (CIGAR.EQ, 8), - ], + query_name='name', reference_start=6, query_sequence=qseq, cigarstring='4=1X3D2I5D3I8=' ) assert [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 8), (CIGAR.EQ, 9)] == hgvs_standardize_cigar( read, rseq @@ -296,21 +275,11 @@ def test_bubble_sort_indel_sections_drop_mismatch_with_hardclipping(self): # ATAGGCATCT ACGAACGAACGAGATCGCTACG # ATCTCTTTTT CGAACG read = MockRead( - 'name', - 1, - 6, + query_name='name', + reference_start=6, reference_name='1', query_sequence='ATCTCTTTTTCGAACG', - cigar=[ - (CIGAR.H, 10), - (CIGAR.EQ, 4), - (CIGAR.X, 1), - (CIGAR.D, 2), - (CIGAR.I, 3), - 
(CIGAR.D, 2), - (CIGAR.I, 2), - (CIGAR.EQ, 6), - ], + cigarstring='10H4=1X2D3I2D2I6=', ) print(SamRead.deletion_sequences(read, {'1': MockObject(seq=ref)})) print(SamRead.insertion_sequences(read)) @@ -326,11 +295,10 @@ def test_bubble_sort_indel_sections_drop_mismatch_with_hardclipping(self): def test_homopolymer_even_odd(self): ref = 'ATCGAGAT' + 'A' * 15 + 'TCGAGAT' read = MockRead( - 'name', - 1, - 1, + query_name='name', + reference_start=1, query_sequence='ATCGAGATA' + 'A' * 12 + 'TCGAGAT', - cigar=[(CIGAR.EQ, 8), (CIGAR.D, 2), (CIGAR.EQ, 20)], + cigarstring='8=2D20=', ) assert [(CIGAR.EQ, 9 + 12), (CIGAR.D, 2), (CIGAR.EQ, 7)] == hgvs_standardize_cigar( read, ref @@ -341,15 +309,14 @@ def test_homopolymer_even_odd(self): ) read = MockRead( - 'name', - '1', - 0, - 149, + query_name='name', + reference_name='1', + reference_start=0, query_sequence=( 'CCCCGGCTCATGTCTGGTTTTGTTTTCCGGGGGCGGGGGGGCTCCCTGGGGATGATGGTGATTTTTTTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAA' 'TGAGGCAGAGACAATGTGGGGAGCGAGAGAGGGGAAAAGGACGGGGGAGG' ), - cigar=[(CIGAR.EQ, 61), (CIGAR.I, 2), (CIGAR.EQ, 87)], + cigarstring='61=2I87=', ) assert [(CIGAR.EQ, 61 + 15), (CIGAR.I, 2), (CIGAR.EQ, 87 - 15)] == hgvs_standardize_cigar( read, ref @@ -361,15 +328,14 @@ def test_homopolymer_even_odd(self): ) read = MockRead( - 'name', - '1', - 0, - 149, + query_name='name', + reference_name='1', + reference_start=0, query_sequence=( 'CCCCTCCTCGGTCGGGCAGATCTTTCAGAAGCAGGAGCCCAGGATCATGTCTGGTTTTGTTTTCCGAGGGCGAGGGGGCTCCCTGAGGATGATGGTGATTTT' 'TTTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAATGAGGCAGAGACA' ), - cigar=[(CIGAR.S, 2), (CIGAR.EQ, 96), (CIGAR.I, 2), (CIGAR.EQ, 50)], + cigarstring='2S96=2I50=', ) assert [ (CIGAR.S, 2), @@ -391,10 +357,10 @@ def test_even_deletion_in_repeat(self): ) print(len(qseq) - 28) read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=4, - cigar=convert_string_to_cigar('4S13=2D64='), + cigarstring='4S13=2D64=', query_sequence=qseq, ) reference_genome = {'1': MockObject(seq=rseq)} @@ -416,10 +382,10 @@ def test_odd_deletion_in_repeat(self): ) print(len(qseq) - 28) read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=4, - cigar=convert_string_to_cigar('4S13=3D63='), + cigarstring='4S13=3D63=', query_sequence=qseq, ) reference_genome = {'1': MockObject(seq=rseq)} @@ -434,10 +400,10 @@ def test_unecessary_indel(self): rseq = 'qwertyuiopasdfghjklzxcvbnm' qseq = 'qwertyuiopasdfghjklzxcvbnm' read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=0, - cigar=convert_string_to_cigar('13=1I1D12='), + cigarstring='13=1I1D12=', query_sequence=qseq, ) exp = convert_string_to_cigar('26=') @@ -448,10 +414,10 @@ def test_unecessary_indel2(self): rseq = 'qwertyuiopasdfghjklzxcvbnm' qseq = 'qwertyuiopasdfkghjklzxcvbnm' read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=0, - cigar=convert_string_to_cigar('13=2I1D12='), + cigarstring='13=2I1D12=', query_sequence=qseq, ) exp = convert_string_to_cigar('14=1I12=') @@ -462,10 +428,10 @@ def test_unecessary_indel_end_match(self): rseq = 'qwertyuiopasdfghjklzxcvbnm' qseq = 'qwertyuiopasdfkmkghjklzxcvbnm' read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=0, - cigar=convert_string_to_cigar('14=5I2D10='), + cigarstring='14=5I2D10=', query_sequence=qseq, ) exp = convert_string_to_cigar('14=3I12=') @@ -476,10 +442,10 @@ def test_unecessary_indel_end_match2(self): rseq = 'GGGTGCAGTGGCTTACACCT' 'GTAATCCAAACACCTTGGGAGCCGCCCCCTGAG' 'CCTCCAGGCCCGGGACAGA' qseq = 
'GGGTGCAGTGGCTTACACCT' 'CCAGG' 'CCTCCAGGCCCGGGACAGA' read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=0, - cigar=convert_string_to_cigar('20=5I33D19='), + cigarstring='20=5I33D19=', query_sequence=qseq, ) exp = convert_string_to_cigar('20=4I32D20=') @@ -499,10 +465,10 @@ def test_even_insertion_in_repeat(self): ) print(len(qseq) - 13 - 4) read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=4, - cigar=convert_string_to_cigar('4S13=2I66='), + cigarstring='4S13=2I66=', query_sequence=qseq, ) exp = convert_string_to_cigar('4S26=2I53=') @@ -525,21 +491,10 @@ def test_deletion_repeat(self, mock_reference_genome): # deleted reference: TATGTGTAATATACATCATGTATCAAA print(qseq[:76], qseq[76:]) read = MockRead( - 'name', + query_name='name', reference_name='11_86018001-86018500', reference_start=28, - cigar=[ - (CIGAR.S, 4), - (CIGAR.EQ, 10), - (CIGAR.X, 3), - (CIGAR.EQ, 14), - (CIGAR.X, 1), - (CIGAR.EQ, 21), - (CIGAR.X, 1), - (CIGAR.EQ, 22), - (CIGAR.D, 27), - (CIGAR.EQ, 74), - ], + cigarstring='4S10=3X14=1X21=1X22=27D74=', query_sequence=qseq, ) expected_cigar = [ @@ -586,23 +541,11 @@ def test_complex(self): 'ACACACACACACACACAC' ) read = MockRead( - 'name', + query_name='name', reference_name='mock', reference_start=0, query_sequence=qseq, - cigar=[ - (CIGAR.EQ, 33), - (CIGAR.X, 1), - (CIGAR.EQ, 49), - (CIGAR.I, 26), - (CIGAR.EQ, 18), - (CIGAR.X, 1), - (CIGAR.EQ, 1), - (CIGAR.I, 1), - (CIGAR.EQ, 1), - (CIGAR.I, 1), - (CIGAR.EQ, 18), - ], + cigarstring='33=1X49=26I18=1X1=1I1=1I18=', ) print(rseq) print( @@ -634,11 +577,11 @@ def test_deletion_partial_repeat(self): qseq = 'ATCTTAGCCAGGT' 'AGTTACATACATATC' rseq = 'ATCTTAGCCAGGT' 'AGCTAT' 'AGTTACATACATATC' read = MockRead( - 'name', + query_name='name', reference_name='mock', reference_start=0, query_sequence=qseq, - cigar=convert_string_to_cigar('13=6D15='), + cigarstring='13=6D15=', ) assert convert_string_to_cigar('15=6D13=') == hgvs_standardize_cigar(read, rseq) @@ -648,22 +591,22 @@ def test_indel_repeat(self): print(qseq) print(rseq) read = MockRead( - 'name', + query_name='name', reference_name='mock', reference_start=0, query_sequence=qseq, - cigar=convert_string_to_cigar('13=1I6D15='), + cigarstring='13=1I6D15=', ) assert convert_string_to_cigar('13=1I6D15=') == hgvs_standardize_cigar(read, rseq) def test_shift_complex_indel(self): refseq = 'ATATATCTATTTTTTTCTTTCTTTTTTTTACTTTCATTAAGTGCCACTAAAAAATTAGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGATTGAGTATATATATATATATACCCAGTTTCAAGCAGGTATCTGCCTTTAAAGATAAGAGACCTCCTAAATGCTTTCTTTTATTAGTTGCCCTGTTTCAGATTCAGCTTTGTATCTATATCACCTGTTAATATGTGTGGACTCACAGAAATGATCATTGAGGGAATGCACCCTGTTTGGGTGTAAGTAGCTCAGGGAAAAAATCCTAG' read = MockRead( - 'name', + query_name='name', reference_name='18', reference_start=40237946 - 40237890, query_sequence='AGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGATTGAGTGTATATATATATATATATATATATATATATATACCCAGTTTCAAGCAGGTATCTGCCTTTAAAGATAAGAGACCTCCTAAGTGCTTTCTTTTATTAGTGGCCCTG', - cigar=convert_string_to_cigar('44M18I88M'), + cigarstring='44M18I88M', ) print(_read.convert_cigar_to_string(read.cigar)) read.cigar = recompute_cigar_mismatch(read, refseq) @@ -728,7 +671,7 @@ def test_deletions(self): reference_start=0, reference_name='1', query_sequence='', - cigar=convert_string_to_cigar('2=3D8=4D9='), + cigarstring='2=3D8=4D9=', ) assert ( SamRead.deletion_sequences(read, {'1': MockObject(seq='abcdefghijklmnopqrstuvwxyz')}) @@ -741,6 +684,6 @@ def test_insertions(self): reference_start=0, reference_name='1', 
query_sequence='abcdekkkfghijklmnopqkkkkrstuvwxyz', - cigar=convert_string_to_cigar('5=3I12=4I9='), + cigarstring='5=3I12=4I9=', ) assert SamRead.insertion_sequences(read) == exp diff --git a/tests/test_mavis/mock.py b/tests/test_mavis/mock.py index d05051b7..badc574a 100644 --- a/tests/test_mavis/mock.py +++ b/tests/test_mavis/mock.py @@ -1,11 +1,19 @@ import os import types +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple from mavis.annotate.file_io import load_annotations, load_reference_genome from mavis.annotate.genomic import PreTranscript, Transcript from mavis.annotate.protein import Translation -from mavis.constants import CIGAR, NA_MAPPING_QUALITY -from mavis.validate.align import query_coverage_interval +from mavis.bam.cigar import ( + QUERY_ALIGNED_STATES, + REFERENCE_ALIGNED_STATES, + convert_cigar_to_string, + convert_string_to_cigar, +) +from mavis.bam.read import SamRead +from mavis.constants import CIGAR, NA_MAPPING_QUALITY, PYSAM_READ_FLAGS from ..util import get_data @@ -89,151 +97,186 @@ def __init__(self, **kwargs): setattr(self, arg, val) -class MockRead: - def __init__( - self, - query_name=None, - reference_id=0, - reference_start=None, - reference_end=None, - cigar=None, - is_reverse=False, - mate_is_reverse=True, - next_reference_start=None, - next_reference_id=None, - reference_name=None, - query_sequence=None, - template_length=None, - query_alignment_sequence=None, - query_alignment_start=None, - query_alignment_end=None, - query_alignment_length=None, - flag=None, - tags=[], - is_read1=True, - is_paired=True, - is_unmapped=False, - is_supplementary=False, - mate_is_unmapped=False, - mapping_quality=NA_MAPPING_QUALITY, - **kwargs - ): - for attr, val in kwargs.items(): - setattr(self, attr, val) - self.mapping_quality = mapping_quality - self.query_name = query_name - self.reference_id = reference_id - self.reference_start = reference_start - self.reference_end = reference_end - self.cigar = cigar - self.query_alignment_length = query_alignment_length - self.query_sequence = query_sequence - if self.reference_end is None and cigar and reference_start is not None: - self.reference_end = reference_start + sum( - [f for v, f in cigar if v not in [CIGAR.S, CIGAR.I]] - ) - if not self.query_alignment_length: - if cigar: - self.query_alignment_length = sum( - [v for s, v in cigar if s not in [CIGAR.S, CIGAR.H]] - ) - elif self.query_sequence: - self.query_alignment_length = len(self.query_sequence) - else: - self.query_alignment_length = 0 - self.is_reverse = is_reverse - self.mate_is_reverse = mate_is_reverse - self.next_reference_start = next_reference_start - self.next_reference_id = next_reference_id - self.reference_name = reference_name - - self.query_alignment_sequence = query_alignment_sequence - self.query_alignment_start = query_alignment_start - self.query_alignment_end = query_alignment_end - self.flag = flag - self.tags = tags - if query_alignment_sequence is None and cigar and query_sequence: - s = 0 if cigar[0][0] != CIGAR.S else cigar[0][1] - t = len(query_sequence) - if cigar[-1][0] == CIGAR.S: - t -= cigar[-1][1] - self.query_alignment_sequence = query_sequence[s:t] - if cigar and query_sequence: - if len(query_sequence) != sum( - [f for v, f in cigar if v not in [CIGAR.H, CIGAR.N, CIGAR.D]] - ): - raise AssertionError( - 'length of sequence does not match cigar', - len(query_sequence), - sum([f for v, f in cigar if v not in [CIGAR.H, CIGAR.N, CIGAR.D]]), - ) - if template_length is None and reference_end and 
next_reference_start: - self.template_length = next_reference_start - reference_end - else: - self.template_length = template_length - self.is_read1 = is_read1 - self.is_read2 = not is_read1 - self.is_paired = is_paired - self.is_unmapped = is_unmapped - self.mate_is_unmapped = mate_is_unmapped - self.is_supplementary = is_supplementary - if self.reference_start and self.reference_end: - if not cigar: - self.cigar = [(CIGAR.M, self.reference_end - self.reference_start)] - if not self.query_sequence: - self.query_sequence = 'N' * (self.reference_end - self.reference_start) - if flag: - self.is_unmapped = bool(self.flag & int(0x4)) - self.mate_is_unmapped = bool(self.flag & int(0x8)) - self.is_reverse = bool(self.flag & int(0x10)) - self.mate_is_reverse = bool(self.flag & int(0x20)) - self.is_read1 = bool(self.flag & int(0x40)) - self.is_read2 = bool(self.flag & int(0x80)) - self.is_secondary = bool(self.flag & int(0x100)) - self.is_qcfail = bool(self.flag & int(0x200)) - self.is_supplementary = bool(self.flag & int(0x400)) - - def query_coverage_interval(self): - return query_coverage_interval(self) - - def set_tag(self, tag, value, value_type=None, replace=True): - new_tag = (tag, value) - if not replace and new_tag in self.tags: - self.tags.append(new_tag) - else: - self.tags.append(new_tag) +def flags_from_number(flag: int) -> Dict: + return dict( + is_unmapped=bool(flag & int(0x4)), + mate_is_unmapped=bool(flag & int(0x8)), + is_reverse=bool(flag & int(0x10)), + mate_is_reverse=bool(flag & int(0x20)), + is_read1=bool(flag & int(0x40)), + is_secondary=bool(flag & int(0x100)), + is_supplementary=bool(flag & int(0x400)), + ) + - def has_tag(self, tag): - return tag in dict(self.tags).keys() +class MockString(str): + def __getitem__(self, key): + if isinstance(key, slice): + size = key.stop - key.start + else: + size = 1 + return 'N' * size - def get_tag(self, tag): - return dict(self.tags)[tag] if tag in dict(self.tags).keys() else False - def __str__(self): - return '{}(ref_id={}, start={}, end={}, seq={})'.format( - self.__class__.__name__, - self.reference_id, - self.reference_start, - self.reference_end, - self.query_sequence, +@dataclass +class MockRead: + cigarstring: str + reference_start: int + reference_name: str = '' + query_sequence: str = MockString() + is_reverse: bool = False + query_name: str = '' + is_read1: bool = False + is_supplementary: bool = False + is_secondary: bool = False + is_paired: bool = True + template_length: Optional[int] = None + is_unmapped: bool = False + mapping_quality: int = NA_MAPPING_QUALITY + + # mate flags + next_reference_name: str = '' + mate_is_reverse: bool = False + next_reference_start: Optional[int] = None + mate_is_unmapped: bool = False + + # custom flags for assembly reads + alignment_rank: Optional[int] = None + + def __hash__(self): + return hash(SamRead.__hash__(self)) + + @property + def query_length(self): + return sum( + [ + size + for (cigar_state, size) in self.cigar + if cigar_state in QUERY_ALIGNED_STATES - {CIGAR.H} + ] ) + @property + def is_proper_pair(self): + return self.is_paired + + @property + def reference_id(self): + try: + return int(self.reference_name) - 1 + except ValueError: + return hash(str(self.reference_name)) + + @property + def next_reference_id(self): + try: + return int(self.next_reference_name) - 1 + except ValueError: + return hash(str(self.next_reference_name)) + + @property + def seq(self): + return self.query_sequence + + @property + def cigar(self) -> List[Tuple[int, int]]: + return 
convert_string_to_cigar(self.cigarstring) + + @cigar.setter + def cigar(self, cigartuples: List[Tuple[int, int]]): + self.cigarstring = convert_cigar_to_string(cigartuples) + + @property + def reference_end(self): + reference_pos = self.reference_start + for state, size in self.cigar: + if state in REFERENCE_ALIGNED_STATES: + reference_pos += size + return reference_pos + + @property + def query_alignment_start(self): + query_pos = 0 + for state, size in self.cigar: + if state == CIGAR.H: + continue + elif state in QUERY_ALIGNED_STATES - REFERENCE_ALIGNED_STATES: + query_pos += size + elif state in QUERY_ALIGNED_STATES: + return query_pos + return None + + @property + def query_alignment_end(self): + query_pos = 0 + for state, size in self.cigar[::-1]: + if state == CIGAR.H: + continue + elif state in QUERY_ALIGNED_STATES - REFERENCE_ALIGNED_STATES: + query_pos += size + elif state in QUERY_ALIGNED_STATES: + return self.query_length - query_pos + return None + + @property + def query_alignment_length(self): + return self.query_alignment_end - self.query_alignment_start + + @property + def query_alignment_sequence(self): + return self.query_sequence[self.query_alignment_start : self.query_alignment_end] + + def has_tag(self, tag_name: str) -> bool: + return hasattr(self, tag_name) + + def get_tag(self, tag_name: str) -> bool: + if self.has_tag(tag_name): + return getattr(self, tag_name) + return False + + def set_tag(self, tag_name: str, value, value_type='') -> bool: + setattr(self, tag_name, value) + + # SamRead methods def key(self): - """ - uses a stored _key attribute, if available. This is to avoid the hash changing if the reference start (for example) - is changed but also allow this attribute to be used and calculated for non SamRead objects - - This way to change the hash behaviour the user must be explicit and use the set_key method - """ - if hasattr(self, '_key') and self._key is not None: - return self._key - return ( - self.query_name, - self.query_sequence, - self.reference_id, - self.reference_start, - self.is_supplementary, - ) + return SamRead.key(self) + + @property + def flag(self): + flag = 0 + for flag_value, attr_name in [ + ('REVERSE', 'is_reverse'), + ('MATE_REVERSE', 'mate_is_reverse'), + ('MATE_UNMAPPED', 'mate_is_unmapped'), + ('UNMAPPED', 'is_unmapped'), + ('SECONDARY', 'is_secondary'), + ('SUPPLEMENTARY', 'is_supplementary'), + ]: + if getattr(self, attr_name): + flag |= PYSAM_READ_FLAGS[flag_value] + + if self.is_paired: + if self.is_read1: + flag |= PYSAM_READ_FLAGS.FIRST_IN_PAIR + else: + flag |= PYSAM_READ_FLAGS.LAST_IN_PAIR + return flag + + @flag.setter + def flag(self, value): + for flag_value, attr_name in [ + ('REVERSE', 'is_reverse'), + ('MATE_REVERSE', 'mate_is_reverse'), + ('MATE_UNMAPPED', 'mate_is_unmapped'), + ('UNMAPPED', 'is_unmapped'), + ('FIRST_IN_PAIR', 'is_read1'), + ('SECONDARY', 'is_secondary'), + ('SUPPLEMENTARY', 'is_supplementary'), + ]: + if value & PYSAM_READ_FLAGS[flag_value]: + setattr(self, attr_name, True) + if value & PYSAM_READ_FLAGS.LAST_IN_PAIR: + self.is_read1 = False class MockBamFileHandle: @@ -256,35 +299,33 @@ def get_reference_name(self, input_tid): raise KeyError('invalid id') -class MockString: - def __init__(self, char=' '): - self.char = char +def mock_read_pair(mock1: MockRead, mock2: MockRead, proper_pair=True, reverse_order=False): + # make sure pair flags are set + mock1.is_paired = True + mock2.is_paired = True - def __getitem__(self, index): - if isinstance(index, slice): - return self.char * (index.stop - 
index.start) - else: - return self.char + mock1.is_read1 = not reverse_order + mock2.is_read1 = reverse_order + if not mock1.is_reverse and proper_pair: + mock2.is_reverse = True -def mock_read_pair(mock1, mock2): - if mock1.reference_id != mock2.reference_id: + if mock1.reference_name != mock2.reference_name: mock1.template_length = 0 mock2.template_length = 0 - mock1.next_reference_id = mock2.reference_id + mock1.next_reference_name = mock2.reference_name mock1.next_reference_start = mock2.reference_start mock1.mate_is_reverse = mock2.is_reverse - mock1.is_paired = True - mock1.is_read2 = not mock1.is_read1 + if mock1.template_length is None: - mock1.template_length = mock2.reference_end - mock1.reference_start + # mock1.template_length = abs(mock1.reference_start - mock1.next_reference_start) + 1 + # if reverse_order: + # mock1.template_length *= -1 + mock1.template_length = mock1.next_reference_start - mock1.reference_start + 1 - mock2.next_reference_id = mock1.reference_id + mock2.next_reference_name = mock1.reference_name mock2.next_reference_start = mock1.reference_start mock2.mate_is_reverse = mock1.is_reverse - mock2.is_paired = True - mock2.is_read1 = not mock1.is_read1 - mock2.is_read2 = not mock1.is_read2 if mock2.query_name is None: mock2.query_name = mock1.query_name mock2.template_length = -1 * mock1.template_length diff --git a/tests/test_mavis/test_align.py b/tests/test_mavis/test_align.py index 512fc580..a4d0179e 100644 --- a/tests/test_mavis/test_align.py +++ b/tests/test_mavis/test_align.py @@ -238,7 +238,7 @@ def test_break_left_deletion(self): b = Breakpoint('10', 1030, 1030, orient=ORIENT.LEFT) read = MockRead( - cigar=_cigar.convert_string_to_cigar('35M10D5I20M'), + cigarstring='35M10D5I20M', reference_start=999, reference_name='10', ) @@ -249,11 +249,8 @@ class TestSplitEvents: def test_read_with_exons(self): contig = MockRead( query_sequence='CTTGAAGGAAACTGAATTCAAAAAGATCAAAGTGCTGGGCTCCGGTGCGTTCGGCACGGTGTATAAGGGACTCTGGATCCCAGAAGGTGAGAAAGTTAAAATTCCCGTCGCTATCAAGACATCTCCGAAAGCCAACAAGGAAATCCTCGATGAAGCCTACGTGATGGCCAGCGTGGACAACCCCCACGTGTGCCGCCTGCTGGGCATCTGCCTCACCTCCACCGTGCAGCTCATCATGCAGCTCATGCCCTTCGGCTGCCTCCTGGACTATGTCCGGGAACACAAAGACAATATTGGCTCCCAGTACCTGCTCAACTGGTGTGTGCAGATCGCAAAGGGCATGAACTACTTGGAGGACCGTCGCTTGGTGCACCGCGACCTGGCAGCCAGGAACGTACTGGTGAAAACACCGCAGCATGTCAAGATCACAGATTTTGGGCTGGCCAAACTGCTGGGTGCGGAAGAGAAAGAATACCATGCAGAAGGAGGCAAAGTGCCTATCAAGTGGATGGCATTGGAATCAATTTTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGGGTGACCGTTTGGGAGTTGATGACCTTTGGATCCAA', - cigar=_cigar.convert_string_to_cigar( - '68M678D50M15D34M6472D185M10240D158M891D74M8I5883D29M' - ), + cigarstring='68M678D50M15D34M6472D185M10240D158M891D74M8I5883D29M', reference_name='7', - reference_id=6, reference_start=55241669, ) assert len(align.call_read_events(contig)) == 6 @@ -262,10 +259,9 @@ def test_read_with_exons(self): class TestCallBreakpointPair: def test_single_one_event(self): r = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 10), (CIGAR.I, 3), (CIGAR.D, 7), (CIGAR.M, 10)], + cigarstring='10M3I7D10M', query_sequence='ACTGAATCGTGGGTAGCTGCTAG', ) bpps = align.call_read_events(r) @@ -280,10 +276,9 @@ def test_single_one_event(self): def test_ins_and_del(self): r = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 10), (CIGAR.I, 3), (CIGAR.M, 5), (CIGAR.D, 7), (CIGAR.M, 5)], + cigarstring='10M3I5M7D5M', query_sequence='ACTGAATCGTGGGTAGCTGCTAG', ) # only report the major del event for now @@ -305,10 +300,9 @@ def test_ins_and_del(self): 
def test_single_insertion(self): r = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 10), (CIGAR.I, 8), (CIGAR.M, 5)], + cigarstring='10M8I5M', query_sequence='ACTGAATCGTGGGTAGCTGCTAG', ) bpp = align.call_read_events(r)[0] @@ -321,10 +315,10 @@ def test_single_insertion(self): def test_single_duplication(self): r = MockRead( - name='seq1', + query_name='seq1', reference_name='gene3', reference_start=27155, - cigar=[(CIGAR.M, 65), (CIGAR.I, 6), (CIGAR.D, 95), (CIGAR.M, 21), (CIGAR.S, 17)], + cigarstring='65M6I95D21M17S', query_sequence='TAGTTGGATCTCTGTGCTGACTGACTGACAGACAGACTTTAGTGTCTGTGTGCTGACTGACAGACAGACTTTAGTGTCTGTGTGCTGACT' 'GACAGACTCTAGTAGTGTC', ) @@ -341,10 +335,9 @@ def test_single_duplication_with_leading_untemp(self): 'CTGTGGGCTCGGGCCCGACGCGCACGGAGGACTGGAGGACTGGGGCGTGTGTCTGCGGTGCAGGCGAGGCGGGGCGGGC' ), query_name='duplication_with_untemp', - reference_id=16, reference_name='reference17', reference_start=1882, - cigar=[(CIGAR.EQ, 126), (CIGAR.I, 54), (CIGAR.EQ, 93)], + cigarstring='126=54I93=', is_reverse=False, ) bpp = align.call_read_events(r)[0] @@ -359,10 +352,9 @@ def test_single_duplication_with_no_untemp(self): 'CCGGTTTAGCATTGCCATTGGTAA' ), query_name='duplication_with_untemp', - reference_id=2, reference_name='reference3', reference_start=1497, - cigar=[(CIGAR.EQ, 51), (CIGAR.I, 22), (CIGAR.EQ, 52)], + cigarstring='51=22I52=', is_reverse=False, ) # repeat: GATTTTGCTGTTGTTTTTGTTC @@ -383,10 +375,9 @@ def test_single_duplication_with_trailing_untemp(self): 'CAAAGTGTTTTATACTGATAAAGCAACCCCGGTTTAGCATTGCCATTGGTAA' ), query_name='duplication_with_untemp', - reference_id=2, reference_name='reference3', reference_start=1497, - cigar=[(CIGAR.EQ, 51), (CIGAR.I, 27), (CIGAR.EQ, 52)], + cigarstring='51=27I52=', is_reverse=False, ) # repeat: GATTTTGCTGTTGTTTTTGTTC @@ -410,19 +401,17 @@ def test_read_pair_indel(self): # i ---------GGGAATTCCGGA--------- 10-21 n/a seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 9), (CIGAR.S, 21)], + cigarstring='9M21S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.S, 21), (CIGAR.M, 9)], + cigarstring='21S9M', query_sequence=seq, is_reverse=False, ) @@ -443,19 +432,17 @@ def test_read_pair_deletion(self): # r2 aaatttcccgggaattccggaTCGATCGAT seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 21), (CIGAR.S, 9)], + cigarstring='21M9S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.S, 21), (CIGAR.M, 9)], + cigarstring='21S9M', query_sequence=seq, is_reverse=False, ) @@ -474,19 +461,17 @@ def test_read_pair_translocation(self): # r2 aaatttcccgggaattccggaTCGATCGAT seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='2', reference_start=0, - cigar=[(CIGAR.M, 21), (CIGAR.S, 9)], + cigarstring='21M9S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.S, 21), (CIGAR.M, 9)], + cigarstring='21S9M', query_sequence=seq, is_reverse=False, ) @@ -506,19 +491,17 @@ def test_read_pair_deletion_overlapping_query_coverage(self): seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 21), (CIGAR.S, 9)], + 
cigarstring='21M9S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.S, 18), (CIGAR.M, 12)], + cigarstring='18S12M', query_sequence=seq, is_reverse=False, ) @@ -542,19 +525,17 @@ def test_read_pair_inversion_overlapping_query_coverage(self): # r2 ATCTATCGATCCggaattcccgggaaattt 100+12 = 111 - 3 = 108 seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 21), (CIGAR.S, 9)], + cigarstring='21M9S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.M, 12), (CIGAR.S, 18)], + cigarstring='12M18S', query_sequence=reverse_complement(seq), is_reverse=True, ) @@ -573,16 +554,16 @@ def test_read_pair_large_inversion_overlapping_query_coverage(self): s = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT' read1 = MockRead( - reference_id=3, + reference_name='3', reference_start=1114, - cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)], + cigarstring='125S120=', query_sequence=s, is_reverse=False, ) read2 = MockRead( - reference_id=3, + reference_name='3', reference_start=2187, - cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1), (CIGAR.M, 120)], + cigarstring='117S8=1D120M', query_sequence=reverse_complement(s), is_reverse=True, ) @@ -613,19 +594,17 @@ def test_read_pair_inversion_gap_in_query_coverage(self): # r2 ATCTATCGATCCggaattcccgggaaattt 100+12 = 111 - 3 = 108 seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 16), (CIGAR.S, 14)], + cigarstring='16M14S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.M, 12), (CIGAR.S, 18)], + cigarstring='12M18S', query_sequence=reverse_complement(seq), is_reverse=True, ) @@ -719,19 +698,17 @@ def test_inversion_and_deletion(self): reference_genome=None, bam_cache=mock.Mock(stranded=False), ) - read1 = SamRead( - reference_id=3, + read1 = MockRead( reference_start=1114, - cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)], + cigarstring='125S120=', query_sequence=s, is_reverse=False, reference_name='3', alignment_rank=0, ) - read2 = SamRead( - reference_id=3, + read2 = MockRead( reference_start=2187, - cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1), (CIGAR.EQ, 120)], + cigarstring='117S7=1X120=', query_sequence=reverse_complement(s), is_reverse=True, reference_name='3', diff --git a/tests/test_mavis/validate/test_call.py b/tests/test_mavis/validate/test_call.py index cf2a78fc..88974fd7 100644 --- a/tests/test_mavis/validate/test_call.py +++ b/tests/test_mavis/validate/test_call.py @@ -66,7 +66,6 @@ def test_EGFR_small_del_transcriptome(self): '68M678D50M15D34M6472D185M10240D158M891D74M' '5875D' '6M' '1X' '29M' ), reference_name='7', - reference_id=6, reference_start=55241669, alignment_rank=0, ) @@ -167,26 +166,38 @@ def test_flanking_support(self): mock_read_pair( MockRead( query_name='test1', - reference_id=3, + reference_name='reference3', template_length=500, reference_start=1150, - reference_end=1200, is_reverse=True, + cigarstring='50M', ), - MockRead(reference_id=3, reference_start=2200, reference_end=2250, is_reverse=True), + MockRead( + reference_name='reference3', + 
reference_start=2200, + is_reverse=True, + cigarstring='50M', + ), + proper_pair=False, ) ) ev.flanking_pairs.add( mock_read_pair( MockRead( query_name='test2', - reference_id=3, + reference_name='reference3', template_length=560, reference_start=1150, - reference_end=1200, + cigarstring='50M', is_reverse=True, ), - MockRead(reference_id=3, reference_start=2200, reference_end=2250, is_reverse=True), + MockRead( + reference_name='reference3', + reference_start=2200, + is_reverse=True, + cigarstring='50M', + ), + proper_pair=False, ) ) median, stdev = ev.flanking_metrics() @@ -250,8 +261,18 @@ def test_deletion(self): ) flanking_pairs = [ mock_read_pair( - MockRead('r1', 0, 400, 450, is_reverse=False), - MockRead('r1', 0, 1200, 1260, is_reverse=True), + MockRead( + query_name='r1', + reference_name='1', + reference_start=400, + cigarstring='50M', + ), + MockRead( + query_name='r1', + reference_name='1', + reference_start=1200, + cigarstring='60M', + ), ) ] event = call.EventCall( @@ -303,8 +324,12 @@ def test_insertion(self): ) flanking_pairs = [ mock_read_pair( - MockRead('r1', 0, 700, 750, is_reverse=False), - MockRead('r1', 0, 950, 1049, is_reverse=True), + MockRead( + query_name='r1', reference_name='1', reference_start=700, cigarstring='50M' + ), + MockRead( + query_name='r1', reference_name='1', reference_start=950, cigarstring='50M' + ), ) ] print(evidence.min_expected_fragment_size) @@ -326,8 +351,21 @@ def test_inversion(self): ) flanking_pairs = [ mock_read_pair( - MockRead('r1', 0, 400, 450, is_reverse=False), - MockRead('r1', 0, 900, 950, is_reverse=False), + MockRead( + query_name='r1', + reference_name='1', + reference_start=400, + cigarstring='50M', + is_reverse=False, + ), + MockRead( + query_name='r1', + reference_name='1', + reference_start=900, + cigarstring='50M', + is_reverse=False, + ), + proper_pair=False, ) ] event = call.EventCall( @@ -380,28 +418,88 @@ def test_translocation_rl(self): event = call.EventCall(b1, b2, evidence, SVTYPE.TRANS, CALL_METHOD.CONTIG) flanking_pairs = [ mock_read_pair( - MockRead('x', '11', 128675264, 128677087, is_reverse=False), - MockRead('x', '22', 29683030, 29683105, is_reverse=True), + MockRead( + query_name='x1', + reference_name='11', + reference_start=128675264, + cigarstring=str(128677087 - 128675264 - 100) + 'M', + ), + MockRead( + query_name='x1', + reference_name='22', + reference_start=29683030, + cigarstring=str(29683105 - 29683030) + 'M', + ), ), mock_read_pair( - MockRead('x', '11', 128675286, 128677109, is_reverse=False), - MockRead('x', '22', 29683016, 29683091, is_reverse=True), + MockRead( + query_name='x2', + reference_name='11', + reference_start=128675286, + cigarstring=str(128677109 - 128675286) + 'M', + ), + MockRead( + query_name='x2', + reference_name='22', + reference_start=29683016, + cigarstring=str(29683091 - 29683016) + 'M', + ), ), mock_read_pair( - MockRead('x', '11', 128675260, 128677083, is_reverse=False), - MockRead('x', '22', 29683049, 29683123, is_reverse=True), + MockRead( + query_name='x3', + reference_name='11', + reference_start=128675260, + cigarstring=str(128677083 - 128675260) + 'M', + ), + MockRead( + query_name='x3', + reference_name='22', + reference_start=29683049, + cigarstring=str(29683123 - 29683049) + 'M', + ), ), mock_read_pair( - MockRead('x', '11', 128675289, 128677110, is_reverse=False), - MockRead('x', '22', 29683047, 29683122, is_reverse=True), + MockRead( + query_name='x4', + reference_name='11', + reference_start=128675289, + cigarstring=str(128677110 - 128675289) + 'M', + ), + 
MockRead( + query_name='x4', + reference_name='22', + reference_start=29683047, + cigarstring=str(29683122 - 29683047) + 'M', + ), ), mock_read_pair( - MockRead('x', '11', 128675306, 128677129, is_reverse=False), - MockRead('x', '22', 29683039, 29683114, is_reverse=True), + MockRead( + query_name='x5', + reference_name='11', + reference_start=128675306, + cigarstring=str(128677129 - 128675306) + 'M', + ), + MockRead( + query_name='x5', + reference_name='22', + reference_start=29683039, + cigarstring=str(29683114 - 29683039) + 'M', + ), ), mock_read_pair( - MockRead('x', '11', 128675289, 128677110, is_reverse=False), - MockRead('x', '22', 29683047, 29683122, is_reverse=True), + MockRead( + query_name='x6', + reference_name='11', + reference_start=128675289, + cigarstring=str(128677110 - 128675289) + 'M', + ), + MockRead( + query_name='x6', + reference_name='22', + reference_start=29683047, + cigarstring=str(29683122 - 29683047) + 'M', + ), ), ] event.add_flanking_support(flanking_pairs) @@ -413,8 +511,20 @@ def test_translocation_rl_filter_nonsupporting(self): ) flanking_pairs = [ mock_read_pair( - MockRead('r1', 0, 1201, 1249, is_reverse=True), - MockRead('r1', 1, 1201, 1249, is_reverse=False), + MockRead( + query_name='r1', + reference_name='1', + reference_start=1201, + cigarstring='48M', + is_reverse=True, + ), + MockRead( + query_name='r1', + reference_name='2', + reference_start=1201, + cigarstring='48M', + is_reverse=False, + ), ) ] event = call.EventCall( @@ -447,8 +557,20 @@ def test_duplication(self): ) flanking_pairs = [ mock_read_pair( - MockRead('r1', 0, 1205, 1250, is_reverse=True), - MockRead('r1', 0, 1260, 1295, is_reverse=False), + MockRead( + query_name='r1', + reference_name='1', + reference_start=1205, + cigarstring='45M', + is_reverse=True, + ), + MockRead( + query_name='r1', + reference_name='1', + reference_start=1260, + cigarstring='35M', + is_reverse=False, + ), ) ] event = call.EventCall( @@ -501,59 +623,64 @@ def test_call_all_methods(self): Breakpoint('1', 450, 500, orient=ORIENT.RIGHT), opposing_strands=False, ) - r1, r2 = mock_read_pair( - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=40, - cigar=[(CIGAR.EQ, 60), (CIGAR.S, 40)], - query_sequence='A' * 100, - ), - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=460, - cigar=[(CIGAR.S, 40), (CIGAR.EQ, 60)], - query_sequence='A' * 100, - ), - ) + contig = mock.Mock( **{ 'seq': '', 'complexity.return_value': 1, - 'alignments': [call_paired_read_event(r1, r2)], + 'alignments': [ + call_paired_read_event( + MockRead( + query_name='assembly1', + reference_name='1', + reference_start=40, + cigarstring='60=40S', + query_sequence='A' * 100, + ), + MockRead( + query_name='assembly1', + reference_name='1', + reference_start=460, + cigarstring='40S60=', + query_sequence='A' * 100, + ), + ) + ], } ) contig.input_reads = { - MockRead(query_name='t1', reference_start=100, cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)]) + MockRead(query_name='t0', reference_name='1', reference_start=100, cigarstring='20=80S') } evidence.contigs.append(contig) - evidence.split_reads[0].add( - MockRead(query_name='t1', reference_start=100, cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)]) - ) - evidence.split_reads[1].add( - MockRead(query_name='t1', reference_start=500, cigar=[(CIGAR.S, 50), (CIGAR.EQ, 50)]) + split_read1, split_read2 = mock_read_pair( + MockRead( + query_name='t1', reference_name='1', reference_start=100, cigarstring='20=80S' + ), + MockRead( + query_name='t1', + 
reference_name='1', + reference_start=500, + cigarstring='50S50=', + query_sequence='A' * 100, + ), ) + + evidence.split_reads[0].add(split_read1) + evidence.split_reads[1].add(split_read2) evidence.flanking_pairs.add( mock_read_pair( MockRead( query_name='t4', - reference_id=0, + reference_name='1', reference_start=10, - reference_end=40, - is_reverse=False, - query_alignment_length=30, + cigarstring='30M', ), MockRead( query_name='t4', - reference_id=0, + reference_name='1', + cigarstring='35M', reference_start=505, - reference_end=540, - is_reverse=True, - query_alignment_length=35, ), ) ) @@ -561,45 +688,48 @@ def test_call_all_methods(self): mock_read_pair( MockRead( query_name='t3', - reference_id=0, + reference_name='1', reference_start=49, - reference_end=90, - is_reverse=False, - query_alignment_length=41, + cigarstring='41M', ), MockRead( query_name='t3', - reference_id=0, + reference_name='1', reference_start=805, - reference_end=840, - is_reverse=True, - query_alignment_length=35, + cigarstring='35M', ), ) ) events = call.call_events(evidence) - for ev in events: - print(ev, ev.event_type, ev.call_method) assert len(events) == 4 + assert events[0].call_method == 'contig' assert events[0].break1.start == 100 assert events[0].break2.start == 481 assert events[0].event_type == 'deletion' + assert 't0' in {r.query_name for r in events[0].support()} + assert 't4' in {r.query_name for r in events[0].support()} + assert events[1].call_method == 'split reads' assert events[1].break1.start == 120 assert events[1].break2.start == 501 assert events[1].event_type == 'deletion' + assert 't1' in {r.query_name for r in events[1].support()} + assert events[2].call_method == 'flanking reads' assert events[2].break1.start == 90 assert events[2].break1.end == 299 assert events[2].break2.start == 591 assert events[2].break2.end == 806 assert events[2].event_type == 'deletion' + assert 't3' in {r.query_name for r in events[2].support()} + assert events[3].call_method == 'split reads' assert events[3].break1.start == 120 assert events[3].break2.start == 501 assert events[3].event_type == 'insertion' + assert 't1' in {r.query_name for r in events[3].support()} def test_call_contig_only(self): # event should only be 100L+, 501R+ deletion @@ -608,34 +738,37 @@ def test_call_contig_only(self): Breakpoint('1', 450, 500, orient=ORIENT.RIGHT), opposing_strands=False, ) - r1, r2 = mock_read_pair( - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=40, - cigar=[(CIGAR.EQ, 60), (CIGAR.S, 40)], - query_sequence='A' * 100, - query_alignment_length=100, - ), - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=480, - cigar=[(CIGAR.S, 40), (CIGAR.EQ, 60)], - query_sequence='A' * 100, - query_alignment_length=100, - ), + + contig = mock.Mock( + **{ + 'seq': '', + 'complexity.return_value': 1, + 'alignments': [ + call_paired_read_event( + MockRead( + query_name='t1', + reference_name='1', + reference_start=40, + cigarstring='60=40S', + query_sequence='A' * 100, + ), + MockRead( + query_name='t1', + reference_name='1', + reference_start=480, + cigarstring='40S60=', + query_sequence='A' * 100, + ), + ) + ], + } ) - bpp = call_paired_read_event(r1, r2) - contig = mock.Mock(**{'seq': '', 'complexity.return_value': 1, 'alignments': [bpp]}) contig.input_reads = { MockRead( query_name='t1', reference_start=100, reference_name='1', - cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)], + cigarstring='20=80S', ) } evidence.contigs.append(contig) @@ -645,7 +778,7 @@ 
def test_call_contig_only(self): query_name='t1', reference_name='1', reference_start=80, - cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)], + cigarstring='20=80S', ) ) evidence.split_reads[1].add( @@ -653,7 +786,7 @@ def test_call_contig_only(self): query_name='t1', reference_name='1', reference_start=500, - cigar=[(CIGAR.S, 50), (CIGAR.EQ, 50)], + cigarstring='50S50=', ) ) evidence.split_reads[0].add( @@ -661,7 +794,7 @@ def test_call_contig_only(self): query_name='t2', reference_name='1', reference_start=40, - cigar=[(CIGAR.EQ, 50), (CIGAR.S, 50)], + cigarstring='50=50S', ) ) evidence.flanking_pairs.add( @@ -669,20 +802,16 @@ def test_call_contig_only(self): MockRead( query_name='t3', reference_name='1', - reference_id=0, reference_start=49, - reference_end=90, is_reverse=False, - query_alignment_length=100, + cigarstring='41M', ), MockRead( query_name='t3', reference_name='1', - reference_id=0, reference_start=505, - reference_end=550, is_reverse=True, - query_alignment_length=100, + cigarstring='100M', ), ) ) @@ -702,29 +831,28 @@ def test_call_contig_and_split(self): Breakpoint('1', 450, 500, orient=ORIENT.RIGHT), opposing_strands=False, ) - r1, r2 = mock_read_pair( - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=40, - cigar=[(CIGAR.EQ, 60), (CIGAR.S, 40)], - query_sequence='A' * 100, - ), - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=480, - cigar=[(CIGAR.S, 40), (CIGAR.EQ, 60)], - query_sequence='A' * 100, - ), - ) contig = mock.Mock( **{ 'seq': '', 'complexity.return_value': 1, - 'alignments': [call_paired_read_event(r1, r2)], + 'alignments': [ + call_paired_read_event( + MockRead( + query_name='t1', + reference_name='1', + reference_start=40, + cigarstring='60=40S', + query_sequence='A' * 100, + ), + MockRead( + query_name='t1', + reference_name='1', + reference_start=480, + cigarstring='40S60=', + query_sequence='A' * 100, + ), + ) + ], } ) contig.input_reads = { @@ -732,46 +860,45 @@ def test_call_contig_and_split(self): query_name='t1', reference_name='1', reference_start=100, - cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)], + cigarstring='20=80S', ) } evidence.contigs.append(contig) - evidence.split_reads[0].add( + split_read1, split_read2 = mock_read_pair( MockRead( - query_name='t1', + query_name='t2', reference_start=100, reference_name='1', - cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)], - ) - ) - evidence.split_reads[1].add( + cigarstring='20=80S', + query_sequence='C' * 100, + ), MockRead( - query_name='t1', + query_name='t2', reference_start=520, reference_name='1', - cigar=[(CIGAR.S, 50), (CIGAR.EQ, 50)], - ) + cigarstring='50S50=', + query_sequence='A' * 100, + ), ) + + evidence.split_reads[0].add(split_read1) + evidence.split_reads[1].add(split_read2) evidence.flanking_pairs.add( mock_read_pair( MockRead( query_name='t3', - reference_id=0, reference_start=49, reference_name='1', - reference_end=90, is_reverse=False, - query_alignment_length=100, + cigarstring='41M', ), MockRead( query_name='t3', - reference_id=0, reference_start=505, reference_name='1', - reference_end=550, is_reverse=True, - query_alignment_length=100, + cigarstring='100M', ), ) ) @@ -782,9 +909,11 @@ def test_call_contig_and_split(self): assert events[0].break1.start == 100 assert events[0].break2.start == 501 assert events[0].call_method == 'contig' + assert events[1].call_method == 'split reads' assert events[1].break1.start == 120 assert events[1].break2.start == 521 + assert events[2].event_type == 'insertion' assert 
events[2].call_method == 'split reads' assert events[2].break1.start == 120 @@ -797,28 +926,22 @@ def test_call_split_only(self): opposing_strands=False, ) evidence.split_reads[0].add( - MockRead(query_name='t1', reference_start=140, cigar=[(CIGAR.EQ, 30), (CIGAR.S, 70)]) + MockRead(query_name='t1', reference_start=140, cigarstring='30=70S') ) evidence.split_reads[1].add( - MockRead(query_name='t1', reference_start=870, cigar=[(CIGAR.S, 50), (CIGAR.EQ, 50)]) + MockRead(query_name='t1', reference_start=870, cigarstring='50S50=') ) evidence.flanking_pairs.add( mock_read_pair( MockRead( query_name='t3', - reference_id=0, reference_start=42, - reference_end=140, - is_reverse=False, - query_alignment_length=100, + cigarstring='100M', ), MockRead( query_name='t3', - reference_id=0, reference_start=885, - reference_end=905, - is_reverse=True, - query_alignment_length=100, + cigarstring='100M', ), ) ) @@ -844,19 +967,15 @@ def test_call_flanking_only(self): mock_read_pair( MockRead( query_name='t1', - reference_id=0, + reference_name='1', reference_start=42, - reference_end=140, - is_reverse=False, - query_alignment_length=98, + cigarstring='98M', ), MockRead( query_name='t1', - reference_id=0, + reference_name='1', reference_start=885, - reference_end=905, - is_reverse=True, - query_alignment_length=20, + cigarstring='20M', ), ) ) @@ -910,7 +1029,7 @@ def inversion_evidence(): 'validate.min_flanking_pairs_resolution': 1, 'validate.min_linking_split_reads': 1, 'validate.min_spanning_reads_resolution': 3, - 'validate. min_call_complexity': 0, + 'validate.min_call_complexity': 0, }, ) @@ -922,10 +1041,10 @@ def test_empty(self, inversion_evidence): def test_call_no_duplication_by_split_reads(self, duplication_ev, inversion_evidence): duplication_ev.split_reads[0].add( - MockRead(query_name='t1', reference_start=30, cigar=[(CIGAR.EQ, 20), (CIGAR.S, 20)]) + MockRead(query_name='t1', reference_start=30, cigarstring='20=20S') ) duplication_ev.split_reads[1].add( - MockRead(query_name='t1', reference_start=90, cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)]) + MockRead(query_name='t1', reference_start=90, cigarstring='20S20=') ) bpps = call._call_by_split_reads(inversion_evidence, SVTYPE.DUP) @@ -936,7 +1055,7 @@ def test_by_split_read(self, inversion_evidence): MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='A' * 40, ) ) @@ -944,7 +1063,7 @@ def test_by_split_read(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='G' * 40, ) ) @@ -952,7 +1071,7 @@ def test_by_split_read(self, inversion_evidence): MockRead( query_name='t2', reference_start=100, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='C' * 40, ) ) @@ -960,7 +1079,7 @@ def test_by_split_read(self, inversion_evidence): MockRead( query_name='t2', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='A' * 40, ) ) @@ -979,7 +1098,7 @@ def test_call_by_split_read_low_resolution(self, inversion_evidence): MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='A' * 40, ) ) @@ -987,7 +1106,7 @@ def test_call_by_split_read_low_resolution(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='N' * 40, ) ) @@ -1006,7 +1125,7 @@ def 
test_call_by_split_read_resolve_untemp(self, inversion_evidence): MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='TCGGCTCCCGTACTTGTGTATAAGGGGCTTCTGATGTTAT', ) ) @@ -1014,7 +1133,7 @@ def test_call_by_split_read_resolve_untemp(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='ATAACATCAGAAGCCCCTTATACACAAGTACGGGAGCCGA', is_reverse=True, ) @@ -1033,7 +1152,7 @@ def test_call_by_split_read_resolve_untemp_exists(self, inversion_evidence): MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 22), (CIGAR.EQ, 18)], + cigarstring='22S18=', query_sequence='TCGGCTCCCGTACTTGTGTATAAGGGGCTTCTGATGTTAT', ) ) @@ -1041,7 +1160,7 @@ def test_call_by_split_read_resolve_untemp_exists(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='ATAACATCAGAAGCCCCTTATACACAAGTACGGGAGCCGA', is_reverse=True, ) @@ -1060,7 +1179,7 @@ def test_call_by_split_read_shift_overlap(self, inversion_evidence): MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 18), (CIGAR.EQ, 22)], + cigarstring='18S22=', query_sequence='TCGGCTCCCGTACTTGTGTATAAGGGGCTTCTGATGTTAT', ) ) @@ -1068,7 +1187,7 @@ def test_call_by_split_read_shift_overlap(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='ATAACATCAGAAGCCCCTTATACACAAGTACGGGAGCCGA', is_reverse=True, ) @@ -1085,30 +1204,57 @@ def test_call_by_split_read_shift_overlap(self, inversion_evidence): def test_both_by_flanking_pairs(self, inversion_evidence): inversion_evidence.flanking_pairs.add( mock_read_pair( - MockRead(query_name='t1', reference_id=0, reference_start=150, reference_end=150), - MockRead(query_name='t1', reference_id=0, reference_start=500, reference_end=520), + MockRead( + query_name='t1', + reference_name='1', + reference_start=150, + cigarstring='0M', + is_reverse=False, + ), + MockRead( + query_name='t1', + reference_name='1', + reference_start=500, + cigarstring='20M', + is_reverse=False, + ), + proper_pair=False, ) ) inversion_evidence.flanking_pairs.add( mock_read_pair( - MockRead(query_name='t2', reference_id=0, reference_start=120, reference_end=140), - MockRead(query_name='t2', reference_id=0, reference_start=520, reference_end=520), + MockRead( + query_name='t2', + reference_name='1', + reference_start=120, + cigarstring='20M', + is_reverse=False, + ), + MockRead( + query_name='t2', + reference_name='1', + reference_start=520, + cigarstring='0M', + is_reverse=False, + ), + proper_pair=False, ) ) bpp = call._call_by_flanking_pairs(inversion_evidence, SVTYPE.INV) # 120-149 ..... 
500-519 # max frag = 150 - 80 = 70 - assert bpp.break1.start == 42 - assert bpp.break1.end == 120 - assert bpp.break2.start == 412 # 70 - 21 = 49 - assert bpp.break2.end == 500 + print(bpp) + assert bpp.break1.orient == ORIENT.RIGHT + assert bpp.break2.orient == ORIENT.RIGHT + assert (bpp.break1.start, bpp.break1.end) == (42, 121) + assert (bpp.break2.start, bpp.break2.end) == (412, 501) def test_by_split_reads_multiple_calls(self, inversion_evidence): inversion_evidence.split_reads[0].add( MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='A' * 40, ) ) @@ -1116,7 +1262,7 @@ def test_by_split_reads_multiple_calls(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='T' * 40, ) ) @@ -1124,7 +1270,7 @@ def test_by_split_reads_multiple_calls(self, inversion_evidence): MockRead( query_name='t2', reference_start=110, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='T' * 40, ) ) @@ -1132,7 +1278,7 @@ def test_by_split_reads_multiple_calls(self, inversion_evidence): MockRead( query_name='t2', reference_start=520, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='A' * 40, ) ) @@ -1160,64 +1306,73 @@ def test_call_by_split_reads_consume_flanking(self): evidence.split_reads[0].add( MockRead( query_name='test1', - cigar=[(CIGAR.S, 110), (CIGAR.EQ, 40)], + cigarstring='110S40=', reference_start=1114, - reference_end=1150, + reference_name='reference3', ) ) evidence.split_reads[0].add( MockRead( + reference_name='reference3', query_name='test2', - cigar=[(CIGAR.EQ, 30), (CIGAR.S, 120)], + cigarstring='30=120S', reference_start=1108, - reference_end=1115, ) ) - evidence.split_reads[0].add( - MockRead( - query_name='test3', - cigar=[(CIGAR.S, 30), (CIGAR.EQ, 120)], - reference_start=1114, - reference_end=1154, - tags=[(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1)], - ) + read = MockRead( + reference_name='reference3', + query_name='test3', + cigarstring='30S120=', + reference_start=1114, ) + read.set_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1) + evidence.split_reads[0].add(read) evidence.split_reads[1].add( MockRead( - query_name='test4', cigar=[(CIGAR.EQ, 30), (CIGAR.S, 120)], reference_start=2187 - ) - ) - evidence.split_reads[1].add( - MockRead( - query_name='test5', cigar=[(CIGAR.S, 30), (CIGAR.EQ, 120)], reference_start=2187 + reference_name='reference3', + query_name='test4', + cigarstring='30=120S', + reference_start=2187, ) ) evidence.split_reads[1].add( MockRead( - query_name='test1', - cigar=[(CIGAR.S, 30), (CIGAR.EQ, 120)], + reference_name='reference3', + query_name='test5', + cigarstring='30S120=', reference_start=2187, - reference_end=2307, - tags=[(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1)], ) ) + read = MockRead( + reference_name='reference3', + query_name='test1', + cigarstring='30S120=', + reference_start=2187, + ) + read.set_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1) + evidence.split_reads[1].add(read) evidence.flanking_pairs.add( mock_read_pair( MockRead( query_name='t1', - reference_id=3, + reference_name='reference3', reference_start=1200, - reference_end=1250, + is_reverse=True, + cigarstring='50M', + ), + MockRead( + reference_name='reference3', + reference_start=2250, + cigarstring='50M', is_reverse=True, ), - MockRead(reference_id=3, reference_start=2250, reference_end=2300, is_reverse=True), ) ) events = call._call_by_split_reads(evidence, 
event_type=SVTYPE.INV) for ev in events: - print(ev, ev.event_type, ev.call_method) + print(ev, ev.event_type, ev.call_method, [r.query_name for r in ev.support()]) assert len(events) == 1 event = events[0] assert len(event.flanking_pairs) == 1 @@ -1268,34 +1423,24 @@ def test_intrachromosomal_lr(self, left_right_ev): left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=19, - reference_end=60, - next_reference_start=599, - query_alignment_length=25, + reference_start=60 - 25, + cigarstring='25M', ), MockRead( reference_start=599, - reference_end=650, - next_reference_start=19, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=39, - reference_end=80, - next_reference_start=649, - query_alignment_length=25, + reference_start=80 - 25, + cigarstring='25M', ), MockRead( reference_start=649, - reference_end=675, - next_reference_start=39, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) @@ -1303,62 +1448,48 @@ def test_intrachromosomal_lr(self, left_right_ev): left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=39, - reference_end=50, - next_reference_start=91, - query_alignment_length=25, + reference_start=25, + cigarstring='25M', ), MockRead( reference_start=91, - reference_end=110, - next_reference_start=39, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) bpp = call._call_by_flanking_pairs(left_right_ev, SVTYPE.DEL) - print(bpp, bpp.flanking_pairs) + print(bpp, len(bpp.flanking_pairs)) assert bpp.break1.start == 80 - assert bpp.break1.end == 80 + 125 - 45 - assert bpp.break2.start == 600 - 125 + 75 + assert bpp.break1.end == 80 + 125 - 45 # 160 + assert bpp.break2.start == 600 - 125 + 75 # 550 assert bpp.break2.end == 600 def test_intrachromosomal_lr_coverage_overlaps_range(self, left_right_ev): # this test is for ensuring that if a theoretical window calculated for the # first breakpoint overlaps the actual coverage for the second breakpoint (or the reverse) # that we adjust the theoretical window accordingly + left_right_ev.median_fragment_size = 150 left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=21, - reference_end=60, - next_reference_start=80, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=80, - reference_end=120, - next_reference_start=21, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=41, - reference_end=80, - next_reference_start=110, - query_alignment_length=25, + reference_start=80 - 25, + cigarstring='25M', ), MockRead( reference_start=110, - reference_end=140, - next_reference_start=41, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) @@ -1367,16 +1498,11 @@ def test_intrachromosomal_lr_coverage_overlaps_range(self, left_right_ev): mock_read_pair( MockRead( reference_start=39, - reference_end=80, - next_reference_start=649, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=649, - reference_end=675, - next_reference_start=39, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) @@ -1387,19 +1513,19 @@ def test_intrachromosomal_lr_coverage_overlaps_range(self, left_right_ev): assert break2.end == 81 def test_intrachromosomal_flanking_coverage_overlap_error(self, left_right_ev): + """ + Cannot call with the flanking reads together 
since their coverage intervals overlap at the wrong ends + """ + left_right_ev.config['validate.min_flanking_pairs_resolution'] = 2 left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=19, - reference_end=60, - next_reference_start=599, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=599, - reference_end=650, - next_reference_start=19, - query_alignment_length=25, + cigarstring='25M', ), ) ) @@ -1407,35 +1533,35 @@ def test_intrachromosomal_flanking_coverage_overlap_error(self, left_right_ev): mock_read_pair( MockRead( reference_start=620, - reference_end=80, - next_reference_start=780, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=780, - reference_end=820, - next_reference_start=620, - query_alignment_length=25, + cigarstring='25M', ), ) ) - with pytest.raises(AssertionError): + with pytest.raises(AssertionError) as e_info: call._call_by_flanking_pairs(left_right_ev, SVTYPE.DEL) + assert ( + e_info.value.args[0] + == 'insufficient flanking pairs (1) to call deletion by flanking reads' + ) def test_coverage_larger_than_max_expected_variance_error(self, left_right_ev): + """ + Cannot group the reads to make a single call so the flanking call is below the acceptable resolution + """ + left_right_ev.config['validate.min_flanking_pairs_resolution'] = 2 left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=19, - reference_end=60, - next_reference_start=599, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=599, - reference_end=650, - next_reference_start=19, - query_alignment_length=25, + cigarstring='25M', ), ) ) @@ -1443,20 +1569,20 @@ def test_coverage_larger_than_max_expected_variance_error(self, left_right_ev): mock_read_pair( MockRead( reference_start=301, - reference_end=350, - next_reference_start=780, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=780, - reference_end=820, - next_reference_start=301, - query_alignment_length=25, + cigarstring='25M', ), ) ) - with pytest.raises(AssertionError): + with pytest.raises(AssertionError) as e_info: call._call_by_flanking_pairs(left_right_ev, SVTYPE.DEL) + assert ( + e_info.value.args[0] + == 'insufficient flanking pairs (1) to call deletion by flanking reads' + ) def test_close_to_zero(self, left_right_ev): # this test is for ensuring that if a theoretical window calculated for the @@ -1479,33 +1605,23 @@ def test_close_to_zero(self, left_right_ev): ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=19, - reference_end=60, - next_reference_start=149, - query_alignment_length=25, + reference_name='fake', reference_start=19, cigarstring='25M', is_reverse=True ), MockRead( - reference_start=149, - reference_end=150, - next_reference_start=19, - query_alignment_length=25, + reference_name='fake', reference_start=149, cigarstring='25M', is_reverse=True ), + proper_pair=False, ) ) ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=39, - reference_end=80, - next_reference_start=199, - query_alignment_length=25, + reference_name='fake', reference_start=39, cigarstring='25M', is_reverse=True ), MockRead( - reference_start=199, - reference_end=200, - next_reference_start=39, - query_alignment_length=25, + reference_name='fake', reference_start=199, cigarstring='25M', is_reverse=True ), + proper_pair=False, ) ) break1, break2 = call._call_by_flanking_pairs(ev, SVTYPE.INV) @@ -1531,15 +1647,11 @@ def test_call_with_overlapping_coverage_intervals(self, 
left_right_ev): mock_read_pair( MockRead( reference_start=76186159, - reference_end=76186309, - next_reference_start=76186000, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=76186000, - reference_end=76186150, - next_reference_start=76186159, - query_alignment_length=25, + cigarstring='25M', ), ) ) @@ -1597,18 +1709,28 @@ def test_call_deletion(self): # now add the flanking pairs pair = mock_read_pair( MockRead( - 'name', '1', 951, 1051, is_reverse=False, query_alignment_length=50, is_read1=False + query_name='name', + reference_name='1', + reference_start=1001, + is_reverse=False, + cigarstring='50M', + ), + MockRead( + query_name='name', + reference_name='1', + reference_start=2299, + is_reverse=True, + cigarstring='50M', ), - MockRead('name', '1', 2299, 2399, is_reverse=True, query_alignment_length=50), + proper_pair=False, + reverse_order=True, ) print(read_pair_type(pair[0])) # following help in debugging the mockup assert not pair[0].is_reverse assert not pair[0].is_read1 - assert pair[0].is_read2 assert pair[1].is_reverse assert pair[1].is_read1 - assert not pair[1].is_read2 assert sequenced_strand(pair[0], 2) == STRAND.POS assert evidence.decide_sequenced_strand([pair[0]]) == STRAND.POS assert sequenced_strand(pair[1], 2) == STRAND.POS diff --git a/tests/test_mavis/validate/test_evidence.py b/tests/test_mavis/validate/test_evidence.py index a0c43f92..56cb7b2a 100644 --- a/tests/test_mavis/validate/test_evidence.py +++ b/tests/test_mavis/validate/test_evidence.py @@ -11,8 +11,8 @@ from mavis.interval import Interval from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence -from mavis_config import DEFAULTS from mavis.validate.gather import collect_flanking_pair +from mavis_config import DEFAULTS from ..mock import MockBamFileHandle, MockObject, MockRead, mock_read_pair @@ -203,8 +203,18 @@ class TestComputeFragmentSizes: def test_genomic_vs_trans_no_annotations(self, genomic_evidence, read_length, trans_evidence): # should be identical read, mate = mock_read_pair( - MockRead('name', '1', 1051 - read_length + 1, 1051, is_reverse=False), - MockRead('name', '1', 2300, 2300 + read_length - 1, is_reverse=True), + MockRead( + query_name='name', + reference_name='1', + reference_start=1051 - read_length + 1, + cigarstring=str(read_length) + 'M', + ), + MockRead( + query_name='name', + reference_name='1', + reference_start=2300, + cigarstring=str(read_length) + 'M', + ), ) assert genomic_evidence.compute_fragment_size( read, mate @@ -212,8 +222,18 @@ def test_genomic_vs_trans_no_annotations(self, genomic_evidence, read_length, tr def test_reverse_reads(self, genomic_evidence, trans_evidence): read, mate = mock_read_pair( - MockRead('name', '1', 1001, 1100, is_reverse=False), - MockRead('name', '1', 2201, 2301, is_reverse=True), + MockRead( + query_name='name', + reference_name='1', + reference_start=1001, + cigarstring='99M', + ), + MockRead( + query_name='name', + reference_name='1', + reference_start=2300, + cigarstring='100M', + ), ) assert genomic_evidence.compute_fragment_size(read, mate) == Interval(1300) assert genomic_evidence.compute_fragment_size(mate, read) == Interval(1300) @@ -614,8 +634,20 @@ def flanking_ge(read_length): class TestGenomeEvidenceAddReads: def test_collect_flanking_pair_error_unmapped_read(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 0, 900, 1000, is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', 
+ reference_name='1', + reference_start=900, + cigarstring='100M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) read.is_unmapped = True with pytest.raises(ValueError): @@ -623,8 +655,20 @@ def test_collect_flanking_pair_error_unmapped_read(self, flanking_ge): def test_collect_flanking_pair_error_mate_unmapped(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 0, 900, 1000, is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', + reference_name='1', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) mate.is_unmapped = True with pytest.raises(ValueError): @@ -632,16 +676,41 @@ def test_collect_flanking_pair_error_mate_unmapped(self, flanking_ge): def test_collect_flanking_pair_error_query_names_dont_match(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test1', 0, 900, 1000, is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test1', + reference_name='1', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) with pytest.raises(ValueError): collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_error_template_lengths_dont_match(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 0, 900, 1000, is_reverse=False, template_length=50), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', + reference_name='1', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + template_length=50, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) mate.template_length = 55 with pytest.raises(ValueError): @@ -649,23 +718,59 @@ def test_collect_flanking_pair_error_template_lengths_dont_match(self, flanking_ def test_collect_flanking_pair_read_low_mq(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 0, 900, 1000, is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', + reference_name='1', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) read.mapping_quality = 0 assert not collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_mate_low_mq(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 0, 900, 1000, is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', + reference_name='1', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) mate.mapping_quality = 0 assert not collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_interchromosomal(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 1, 900, 1000, 
is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', + reference_name='2', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) assert not collect_flanking_pair(flanking_ge, read, mate) diff --git a/tests/test_mavis/validate/test_validate2.py b/tests/test_mavis/validate/test_validate2.py index c0a3262f..53f34078 100644 --- a/tests/test_mavis/validate/test_validate2.py +++ b/tests/test_mavis/validate/test_validate2.py @@ -577,33 +577,28 @@ class TestEvidenceGathering: def test_collect_split_read(self, ev_gathering_setup): ev1_sr = MockRead( query_name='HISEQX1_11:3:1105:15351:25130:split', - reference_id=1, - cigar=[(4, 68), (7, 82)], + reference_name='reference3', + cigarstring='68S82=', reference_start=1114, - reference_end=1154, - query_alignment_start=110, query_sequence='TCGTGAGTGGCAGGTGCCATCGTGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTG', - query_alignment_end=150, - flag=113, - next_reference_id=1, + **flags_from_number(113), + next_reference_name='1', next_reference_start=2341, ) collect_split_read(ev_gathering_setup, ev1_sr, True) + assert len(ev_gathering_setup.split_reads[0]) > 0 assert list(ev_gathering_setup.split_reads[0])[0] == ev1_sr def test_collect_split_read_failure(self, ev_gathering_setup): # wrong cigar string ev1_sr = MockRead( query_name='HISEQX1_11:4:1203:3062:55280:split', - reference_id=1, - cigar=[(7, 110), (7, 40)], + reference_name='reference3', + cigarstring='40=110S', reference_start=1114, - reference_end=1154, - query_alignment_start=110, query_sequence='CTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCGTGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATG', - query_alignment_end=150, - flag=371, - next_reference_id=1, + **flags_from_number(371), + next_reference_name='1', next_reference_start=2550, ) assert not collect_split_read(ev_gathering_setup, ev1_sr, True) @@ -611,24 +606,20 @@ def test_collect_split_read_failure(self, ev_gathering_setup): def test_collect_flanking_pair(self, ev_gathering_setup): collect_flanking_pair( ev_gathering_setup, - MockRead( - reference_id=1, - reference_start=2214, - reference_end=2364, - is_reverse=True, - next_reference_id=1, - next_reference_start=1120, - mate_is_reverse=True, - ), - MockRead( - reference_id=1, - reference_start=1120, - reference_end=2364, - is_reverse=True, - next_reference_id=1, - next_reference_start=1120, - mate_is_reverse=True, - is_read1=False, + *mock_read_pair( + MockRead( + reference_name='reference3', + reference_start=2214, + cigarstring=str(2364 - 2214) + 'M', + is_reverse=True, + ), + MockRead( + reference_name='reference3', + reference_start=1120, + cigarstring=str(2364 - 1120) + 'M', + is_reverse=True, + ), + proper_pair=False, ), ) assert len(ev_gathering_setup.flanking_pairs) == 1 @@ -637,8 +628,19 @@ def test_collect_flanking_pair_not_overlapping_evidence_window(self, ev_gatherin # first read in pair does not overlap the first evidence window # therefore this should return False and not add to the flanking_pairs pair = mock_read_pair( - MockRead(reference_id=1, reference_start=1903, reference_end=2053, is_reverse=True), - MockRead(reference_id=1, reference_start=2052, reference_end=2053, is_reverse=True), + MockRead( + 
reference_name='reference3', + reference_start=1903, + cigarstring=str(2053 - 1903) + 'M', + is_reverse=True, + ), + MockRead( + reference_name='reference3', + reference_start=2052, + cigarstring=str(2053 - 2052) + 'M', + is_reverse=True, + ), + proper_pair=False, ) assert not collect_flanking_pair(ev_gathering_setup, *pair) assert len(ev_gathering_setup.flanking_pairs) == 0 @@ -671,41 +673,59 @@ def test_load_evidence(self, ev_gathering_setup): def test_assemble_split_reads(self, ev_gathering_setup): sr1 = MockRead( + cigarstring='150M', + reference_start=1, query_name='HISEQX1_11:3:1105:15351:25130:split', query_sequence='TCGTGAGTGGCAGGTGCCATCGTGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTG', - flag=113, + **flags_from_number(113), ) sr2 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='GTCGTGAGTGGCAGGTGCCATCGTGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTT', - flag=121, + **flags_from_number(121), ) sr3 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='TCGTGAGTGGCAGGTGCCATCGTGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTG', - flag=113, + **flags_from_number(113), ) sr7 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='TGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATA', - flag=113, + **flags_from_number(113), ) sr9 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='TGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGC', - flag=113, + **flags_from_number(113), ) sr12 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='GATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAG', - flag=113, + **flags_from_number(113), ) sr15 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='GTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACA', - flag=113, + **flags_from_number(113), ) sr19 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='TGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCA', - flag=113, + **flags_from_number(113), ) sr24 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='CTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTT', - flag=113, + **flags_from_number(113), ) ev_gathering_setup.split_reads = ( {sr1}, From 6b70c0add10ed6ef18a14ccccb803347812ede35 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 8 Jun 2022 12:44:45 -0700 Subject: [PATCH 45/56] fix: Set transcript chr --- tests/test_mavis/validate/test_call.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_mavis/validate/test_call.py b/tests/test_mavis/validate/test_call.py index 88974fd7..a3592f60 100644 --- a/tests/test_mavis/validate/test_call.py +++ b/tests/test_mavis/validate/test_call.py @@ -1701,6 +1701,7 @@ def 
test_call_deletion(self): pre_transcript = PreTranscript( [(1001, 1100), (1501, 1700), (2001, 2100), (2201, 2300)], strand='+' ) + pre_transcript.chr = '1' for patt in pre_transcript.generate_splicing_patterns(): pre_transcript.transcripts.append(Transcript(pre_transcript, patt)) evidence = self.build_transcriptome_evidence( From 11a5d1a2e353c9444691acf789fcea06456680f0 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 8 Jun 2022 12:45:37 -0700 Subject: [PATCH 46/56] Bypass the reference name indexing for mocking simplicity --- tests/test_mavis/validate/test_validate2.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_mavis/validate/test_validate2.py b/tests/test_mavis/validate/test_validate2.py index 53f34078..3d0f471a 100644 --- a/tests/test_mavis/validate/test_validate2.py +++ b/tests/test_mavis/validate/test_validate2.py @@ -7,15 +7,23 @@ from mavis.constants import NA_MAPPING_QUALITY, ORIENT, PYSAM_READ_FLAGS from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence -from mavis.validate.gather import collect_split_read, collect_flanking_pair, load_evidence +from mavis.validate.gather import collect_flanking_pair, collect_split_read, load_evidence from mavis_config import DEFAULTS from ...util import get_data, long_running_test -from ..mock import MockLongString, MockObject, MockRead, mock_read_pair +from ..mock import MockLongString, MockObject, MockRead, flags_from_number, mock_read_pair REFERENCE_GENOME = None +class MockBamCache(BamCache): + def get_read_reference_name(self, read): + if isinstance(read, MockRead): + return read.reference_name + else: + return BamCache.get_read_reference_name(self, read) + + def setUpModule(): global REFERENCE_GENOME REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa')) @@ -25,9 +33,9 @@ def setUpModule(): ): raise AssertionError('fake genome file does not have the expected contents') global BAM_CACHE - BAM_CACHE = BamCache(get_data('mini_mock_reads_for_events.sorted.bam')) + BAM_CACHE = MockBamCache(get_data('mini_mock_reads_for_events.sorted.bam')) global FULL_BAM_CACHE - FULL_BAM_CACHE = BamCache(get_data('mock_reads_for_events.sorted.bam')) + FULL_BAM_CACHE = MockBamCache(get_data('mock_reads_for_events.sorted.bam')) global READS READS = {} for read in BAM_CACHE.fetch('reference3', 1, 8000): From 38d07c48f42e58f8ad4ef9b98704ef6bb42c45ac Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Wed, 8 Jun 2022 12:46:08 -0700 Subject: [PATCH 47/56] fix: correct test, should have been expecting a single alignment --- tests/test_mavis/test_align.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_mavis/test_align.py b/tests/test_mavis/test_align.py index a4d0179e..4ae1e4ab 100644 --- a/tests/test_mavis/test_align.py +++ b/tests/test_mavis/test_align.py @@ -717,7 +717,7 @@ def test_inversion_and_deletion(self): raw_alignments = {s: [read1, read2]} align.select_contig_alignments(evidence, raw_alignments) alignments = list(evidence.contigs[0].alignments) - assert len(alignments) == 2 + assert len(alignments) == 1 class TestGetAlignerVersion: From bfeb2a623944cc4502cca2326443d09f0bdc03f3 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 1 Nov 2022 14:04:07 -0700 Subject: [PATCH 48/56] add bugfix changes to summary --- src/mavis/summary/summary.py | 7 +++-- tests/data/mock_dgv_annotation.txt | 2 ++ tests/test_mavis/summary/test_summary.py | 40 ++++++++++++++++++++++-- 3 files changed, 44 insertions(+), 5 deletions(-) 
diff --git a/src/mavis/summary/summary.py b/src/mavis/summary/summary.py index 4c022e97..875b2e37 100644 --- a/src/mavis/summary/summary.py +++ b/src/mavis/summary/summary.py @@ -210,10 +210,11 @@ def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0): for bpp in [ b for b in bpps if not b.interchromosomal and b.break1.chr in dgv_regions_by_reference_name ]: + bpp.data['dgv'] = [] for dgv_region in dgv_regions_by_reference_name[bpp.break1.chr]: dist = abs(Interval.dist(Interval(dgv_region.start), bpp.break1)) if dist > lowest_resolution + distance: - break + continue elif ( dist > distance or abs(Interval.dist(Interval(dgv_region.end), bpp.break2)) > distance @@ -224,8 +225,8 @@ def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0): refname = dgv_region.reference_object.name except AttributeError: pass - bpp.data['dgv'] = '{}({}:{}-{})'.format( - dgv_region.name, refname, dgv_region.start, dgv_region.end + bpp.data['dgv'].append( + '{}({}:{}-{})'.format(dgv_region.name, refname, dgv_region.start, dgv_region.end) ) diff --git a/tests/data/mock_dgv_annotation.txt b/tests/data/mock_dgv_annotation.txt index ee303c99..bda6925c 100644 --- a/tests/data/mock_dgv_annotation.txt +++ b/tests/data/mock_dgv_annotation.txt @@ -1,6 +1,8 @@ chr start end name 1 1 2300000 nsv482937 1 10001 22118 dgv1n82 +1 10001 22120 rgv2n98 +1 10001 22221 rgv2n99 1 10001 127330 nsv7879 1 10191 10281 nsv958854 1 10377 177417 nsv428112 diff --git a/tests/test_mavis/summary/test_summary.py b/tests/test_mavis/summary/test_summary.py index b66a8134..a23d7b26 100644 --- a/tests/test_mavis/summary/test_summary.py +++ b/tests/test_mavis/summary/test_summary.py @@ -1,9 +1,10 @@ import pytest from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import CALL_METHOD, COLUMNS, PROTOCOL, STRAND, SVTYPE -from mavis.summary.summary import filter_by_annotations +from mavis.summary.summary import filter_by_annotations, annotate_dgv +from mavis.annotate.file_io import load_masking_regions -from ...util import todo +from ...util import todo, get_data @pytest.fixture @@ -40,11 +41,34 @@ def genomic_event2(): ) +@pytest.fixture +def genomic_event3(): + return BreakpointPair( + Breakpoint('1', 10001), + Breakpoint('1', 22118), + opposing_strands=True, + **{ + COLUMNS.event_type: SVTYPE.DEL, + COLUMNS.call_method: CALL_METHOD.CONTIG, + COLUMNS.fusion_sequence_fasta_id: None, + COLUMNS.protocol: PROTOCOL.GENOME, + COLUMNS.fusion_cdna_coding_start: None, + COLUMNS.fusion_cdna_coding_end: None, + COLUMNS.tracking_id: "genomic_event3", + } + ) + + @pytest.fixture def best_transcripts(): return {'ABCA': True, 'ABCD': True} +@pytest.fixture +def dgv_event2(): + return load_masking_regions(get_data("mock_dgv_annotation.txt")) + + class TestFilterByAnnotations: def test_filter_by_annotations_two_best_transcripts( self, genomic_event1, genomic_event2, best_transcripts @@ -174,3 +198,15 @@ def test_filtering_events_split(self): @todo def test_get_pairing_state(self): pass + + +class TestFilterByCallMethod: + def test_annotate_dgv_distance_bed(self, genomic_event3, dgv_event2): + bpps = [genomic_event3] + annotate_dgv(bpps, dgv_event2, 103) + print(bpps[0].data) + assert len(bpps[0].data['dgv']) == 3 + assert ( + bpps[0].data + == "['dgv1n82(1:10001-22118)', 'rgv2n98(1:10001-22120)', 'rgv2n99(1:10001-22221)']" + ) From 9714dd932a4cf9e1348a6779c446bd10dd3b9b76 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 1 Nov 2022 14:20:42 -0700 Subject: [PATCH 49/56] fix test syntax --- 
tests/test_mavis/summary/test_summary.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_mavis/summary/test_summary.py b/tests/test_mavis/summary/test_summary.py index a23d7b26..fd75cb60 100644 --- a/tests/test_mavis/summary/test_summary.py +++ b/tests/test_mavis/summary/test_summary.py @@ -207,6 +207,6 @@ def test_annotate_dgv_distance_bed(self, genomic_event3, dgv_event2): print(bpps[0].data) assert len(bpps[0].data['dgv']) == 3 assert ( - bpps[0].data - == "['dgv1n82(1:10001-22118)', 'rgv2n98(1:10001-22120)', 'rgv2n99(1:10001-22221)']" + bpps[0].data['dgv'] + == ['dgv1n82(1:10001-22118)', 'rgv2n98(1:10001-22120)', 'rgv2n99(1:10001-22221)'] ) From f8b80c9bdf0e7eafa6706815dff056eb603e42af Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 1 Nov 2022 14:37:18 -0700 Subject: [PATCH 50/56] freeze tabulate version --- .github/workflows/build.yml | 1 + setup.cfg | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b9263f86..f2d3c665 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -96,6 +96,7 @@ jobs: run: | python -m pip install --upgrade pip setuptools wheel pip install mavis_config pandas snakemake + pip install tabulate==0.8.9 - uses: eWaterCycle/setup-singularity@v6 with: singularity-version: 3.6.4 diff --git a/setup.cfg b/setup.cfg index 4149c69c..3ed5680a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -45,6 +45,7 @@ install_requires = Shapely>=1.6.4.post1 shortuuid>=0.5.0 svgwrite + tabulate==0.8.9 typing_extensions>=4 setup_requires = pip>=9.0.0 From d93caf78d3073859f1ae8544955005a713ba6cf7 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 1 Nov 2022 14:50:23 -0700 Subject: [PATCH 51/56] reformat dgv output to string --- src/mavis/summary/summary.py | 3 ++- tests/test_mavis/summary/test_summary.py | 7 +------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/mavis/summary/summary.py b/src/mavis/summary/summary.py index 875b2e37..cd13de5b 100644 --- a/src/mavis/summary/summary.py +++ b/src/mavis/summary/summary.py @@ -228,7 +228,8 @@ def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0): bpp.data['dgv'].append( '{}({}:{}-{})'.format(dgv_region.name, refname, dgv_region.start, dgv_region.end) ) - + bpp.data['dgv'] = ','.join(bpp.data['dgv']) + print(bpp.data['dgv']) def get_pairing_state( current_protocol, diff --git a/tests/test_mavis/summary/test_summary.py b/tests/test_mavis/summary/test_summary.py index fd75cb60..f60ec1c4 100644 --- a/tests/test_mavis/summary/test_summary.py +++ b/tests/test_mavis/summary/test_summary.py @@ -204,9 +204,4 @@ class TestFilterByCallMethod: def test_annotate_dgv_distance_bed(self, genomic_event3, dgv_event2): bpps = [genomic_event3] annotate_dgv(bpps, dgv_event2, 103) - print(bpps[0].data) - assert len(bpps[0].data['dgv']) == 3 - assert ( - bpps[0].data['dgv'] - == ['dgv1n82(1:10001-22118)', 'rgv2n98(1:10001-22120)', 'rgv2n99(1:10001-22221)'] - ) + assert len(bpps[0].data['dgv'].split(',')) == 3 From 938c243353653912fa09599ee0aa04e0d48bbcd9 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 1 Nov 2022 14:51:37 -0700 Subject: [PATCH 52/56] get rid of print --- src/mavis/summary/summary.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mavis/summary/summary.py b/src/mavis/summary/summary.py index cd13de5b..1494111a 100644 --- a/src/mavis/summary/summary.py +++ b/src/mavis/summary/summary.py @@ -229,7 +229,6 @@ def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0): 
'{}({}:{}-{})'.format(dgv_region.name, refname, dgv_region.start, dgv_region.end) ) bpp.data['dgv'] = ','.join(bpp.data['dgv']) - print(bpp.data['dgv']) def get_pairing_state( current_protocol, From 092f690abb93653d5cbf26c21d0667cfe0008e38 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 1 Nov 2022 15:08:28 -0700 Subject: [PATCH 53/56] reformat --- src/mavis/summary/summary.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mavis/summary/summary.py b/src/mavis/summary/summary.py index 1494111a..6016e517 100644 --- a/src/mavis/summary/summary.py +++ b/src/mavis/summary/summary.py @@ -230,6 +230,7 @@ def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0): ) bpp.data['dgv'] = ','.join(bpp.data['dgv']) + def get_pairing_state( current_protocol, current_disease_state, From 8b932a6dac52d809465cfb5ea9a8a8c5d8dce660 Mon Sep 17 00:00:00 2001 From: Jeremy Fan Date: Tue, 1 Nov 2022 16:26:03 -0700 Subject: [PATCH 54/56] change delimiter --- src/mavis/summary/summary.py | 2 +- tests/test_mavis/summary/test_summary.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mavis/summary/summary.py b/src/mavis/summary/summary.py index 6016e517..a52711cd 100644 --- a/src/mavis/summary/summary.py +++ b/src/mavis/summary/summary.py @@ -228,7 +228,7 @@ def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0): bpp.data['dgv'].append( '{}({}:{}-{})'.format(dgv_region.name, refname, dgv_region.start, dgv_region.end) ) - bpp.data['dgv'] = ','.join(bpp.data['dgv']) + bpp.data['dgv'] = ';'.join(bpp.data['dgv']) def get_pairing_state( diff --git a/tests/test_mavis/summary/test_summary.py b/tests/test_mavis/summary/test_summary.py index f60ec1c4..50b2318a 100644 --- a/tests/test_mavis/summary/test_summary.py +++ b/tests/test_mavis/summary/test_summary.py @@ -204,4 +204,4 @@ class TestFilterByCallMethod: def test_annotate_dgv_distance_bed(self, genomic_event3, dgv_event2): bpps = [genomic_event3] annotate_dgv(bpps, dgv_event2, 103) - assert len(bpps[0].data['dgv'].split(',')) == 3 + assert len(bpps[0].data['dgv'].split(';')) == 3 From f8bbfa2763eaf65b7eca55b3fd85ab368e878168 Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Thu, 3 Nov 2022 13:46:43 -0700 Subject: [PATCH 55/56] Require mavis config 1.2.2+ --- .github/workflows/build.yml | 3 +-- setup.cfg | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f2d3c665..212a4542 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -95,8 +95,7 @@ jobs: - name: Install workflow dependencies run: | python -m pip install --upgrade pip setuptools wheel - pip install mavis_config pandas snakemake - pip install tabulate==0.8.9 + pip install mavis_config pandas - uses: eWaterCycle/setup-singularity@v6 with: singularity-version: 3.6.4 diff --git a/setup.cfg b/setup.cfg index 3ed5680a..62760701 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,7 @@ install_requires = braceexpand==0.1.2 colour Distance>=0.1.3 - mavis_config>=1.1.0, <2.0.0 + mavis_config>=1.2.2, <2.0.0 networkx>=2.5,<3 numpy>=1.13.1 pandas>=1.1, <2 @@ -45,7 +45,6 @@ install_requires = Shapely>=1.6.4.post1 shortuuid>=0.5.0 svgwrite - tabulate==0.8.9 typing_extensions>=4 setup_requires = pip>=9.0.0 From 60817e70cd63eef43a7a9717ddebd3257919cada Mon Sep 17 00:00:00 2001 From: Caralyn Reisle Date: Mon, 7 Nov 2022 12:41:29 -0800 Subject: [PATCH 56/56] Bump version to 3.1.0 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg 
b/setup.cfg index 62760701..850e340b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = mavis -version = 3.0.0 +version = 3.1.0 url = https://github.com/bcgsc/mavis.git download_url = https://github.com/bcgsc/mavis/archive/v2.2.10.tar.gz description = A Structural Variant Post-Processing Package