diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 83ccec48..212a4542 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,9 @@ jobs: steps: - uses: actions/checkout@v2 - name: install machine dependencies - run: sudo apt-get install -y libcurl4-openssl-dev + run: | + sudo apt-get update + sudo apt-get install -y libcurl4-openssl-dev - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: @@ -93,7 +95,7 @@ jobs: - name: Install workflow dependencies run: | python -m pip install --upgrade pip setuptools wheel - pip install mavis_config pandas snakemake + pip install mavis_config pandas - uses: eWaterCycle/setup-singularity@v6 with: singularity-version: 3.6.4 diff --git a/README.md b/README.md index b6e6ef45..bf57b2de 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ The simplest way to use MAVIS is via Singularity. The MAVIS docker container use by singularity will take care of installing the aligner as well. ```bash -pip install -U setuptools pip +pip install -U setuptools pip wheel pip install mavis_config # also installs snakemake ``` diff --git a/docs/background/citations.md b/docs/background/citations.md index 742344f8..f65a219b 100644 --- a/docs/background/citations.md +++ b/docs/background/citations.md @@ -23,6 +23,11 @@ Chen,X. et al. (2016) Manta: rapid detection of structural variants and indels for germline and cancer sequencing applications. Bioinformatics, 32, 1220--1222. +## Chiu-2021 + +Chiu,R. et al. (2021) Straglr: discovering and genotyping tandem repeat + expansions using whole genome long-read sequences. Genome Biol., 22, 224. + ## Haas-2017 Haas,B et al. (2017) STAR-Fusion: Fast and Accurate Fusion @@ -62,6 +67,11 @@ Saunders,C.T. et al. (2012) Strelka: accurate somatic small-variant calling from sequenced tumor--normal sample pairs. Bioinformatics, 28, 1811--1817. +## Uhrig-2021 + +Uhrig,S. et al. (2021) Accurate and efficient detection of gene + fusions from RNA sequencing data. Genome Res., 31, 448--460. + ## Yates-2016 Yates,A. et al. (2016) Ensembl 2016. Nucleic Acids Res., 44, diff --git a/docs/glossary.md b/docs/glossary.md index d6147e92..e6ca9e41 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -127,6 +127,11 @@ install instructions. Community based standard of reccommendations for variant notation. See [http://varnomen.hgvs.org/](http://varnomen.hgvs.org/) +## Arriba + +Arriba is an SV caller. Source for Arriba can be found +[here](https://github.com/suhrig/arriba) [Uhrig-2021](../background/citations#uhrig-2021) + ## BreakDancer BreakDancer is an SV caller. Source for BreakDancer can be found diff --git a/docs/inputs/.pages b/docs/inputs/.pages new file mode 100644 index 00000000..886e3d79 --- /dev/null +++ b/docs/inputs/.pages @@ -0,0 +1,4 @@ +nav: + - reference.md + - standard.md + - ... diff --git a/docs/inputs/non_python_dependencies.md b/docs/inputs/non_python_dependencies.md new file mode 100644 index 00000000..fdfdc4e9 --- /dev/null +++ b/docs/inputs/non_python_dependencies.md @@ -0,0 +1,28 @@ +# Non-python Dependencies + +MAVIS integrates with +[SV callers](./sv_callers.md), +[job schedulers](#job-schedulers), and +[aligners](#aligners). While some of +these dependencies are optional, all currently supported options are +detailed below. The versions column in the tables below list all the +versions which were tested for each tool. Each version listed is known +to be compatible with MAVIS. + +## Job Schedulers + +MAVIS v3 uses [snakemake](https://snakemake.readthedocs.io/en/stable/) to handle job scheduling + +## Aligners + +Two aligners are supported [bwa](../../glossary/#bwa) and +[blat](../../glossary/#blat) (default). These are both included in the docker image by default. + +| Name | Version(s) | Environment Setting | +| ---------------------------------------------- | ----------------------- | ------------------------- | +| [blat](../../glossary/#blat) | `36x2` `36` | `MAVIS_ALIGNER=blat` | +| [bwa mem ](../../glossary/#bwa mem ) | `0.7.15-r1140` `0.7.12` | `MAVIS_ALIGNER='bwa mem'` | + +!!! note + When setting the aligner you will also need to set the + [aligner_reference](../../configuration/settings/#aligner_reference) to match diff --git a/docs/inputs/reference.md b/docs/inputs/reference.md index 8c7f0f36..224091c2 100644 --- a/docs/inputs/reference.md +++ b/docs/inputs/reference.md @@ -10,16 +10,16 @@ To improve the install experience for the users, different configurations of the MAVIS annotations file have been made available. These files can be downloaded below, or if the required configuration is not available, -(instructions on generating the annotations file)[/inputs/reference/#generating-the-annotations-from-ensembl] can be found below. +[instructions on generating the annotations file](/inputs/reference/#generating-the-annotations-from-ensembl) can be found below. -| File Name (Type/Format) | Environment Variable | Download | -| --------------------------------------------------------------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [reference genome](../../inputs/reference/#reference-genome) ([fasta](../../glossary/#fasta)) | `MAVIS_REFERENCE_GENOME` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.tar.gz) | +| File Name (Type/Format) | Environment Variable | Download | +| --------------------------------------------------------------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [reference genome](../../inputs/reference/#reference-genome) ([fasta](../../glossary/#fasta)) | `MAVIS_REFERENCE_GENOME` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.tar.gz) | | [annotations](../../inputs/reference/#annotations) ([JSON](../../glossary/#json)) | `MAVIS_ANNOTATIONS` | [![](../images/get_app-24px.svg) GRCh37/Hg19 + Ensembl69](http://www.bcgsc.ca/downloads/mavis/v3/ensembl69_hg19_annotations.v3.json.gz)
[![](../images/get_app-24px.svg) GRCh38 + Ensembl79](http://www.bcgsc.ca/downloads/mavis/v3/ensembl79_hg38_annotations.v3.json.gz) | -| [masking](../../inputs/reference/#masking-file) (text/tabbed) | `MAVIS_MASKING` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/hg19_masking.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/GRCh38_masking.tab) | -| [template metadata](../../inputs/reference/#template-metadata) (text/tabbed) | `MAVIS_TEMPLATE_METADATA` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz) | -| [DGV annotations](../../inputs/reference/#dgv-database-of-genomic-variants) (text/tabbed) | `MAVIS_DGV_ANNOTATION` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/dgv_hg19_variants.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/dgv_hg38_variants.tab) | -| [aligner reference](../../inputs/reference/#aligner-reference) | `MAVIS_ALIGNER_REFERENCE` | [![](../images/get_app-24px.svg) GRCh37/Hg19 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit)
[![](../images/get_app-24px.svg) GRCh38 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit) | +| [masking](../../inputs/reference/#masking-file) (text/tabbed) | `MAVIS_MASKING` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/hg19_masking.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/GRCh38_masking.tab) | +| [template metadata](../../inputs/reference/#template-metadata) (text/tabbed) | `MAVIS_TEMPLATE_METADATA` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz)
[![](../images/get_app-24px.svg) GRCh38](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz) | +| [DGV annotations](../../inputs/reference/#dgv-database-of-genomic-variants) (text/tabbed) | `MAVIS_DGV_ANNOTATION` | [![](../images/get_app-24px.svg) GRCh37/Hg19](http://www.bcgsc.ca/downloads/mavis/dgv_hg19_variants.tab)
[![](../images/get_app-24px.svg) GRCh38](http://www.bcgsc.ca/downloads/mavis/dgv_hg38_variants.tab) | +| [aligner reference](../../inputs/reference/#aligner-reference) | `MAVIS_ALIGNER_REFERENCE` | [![](../images/get_app-24px.svg) GRCh37/Hg19 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit)
[![](../images/get_app-24px.svg) GRCh38 2bit (blat)](http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit) | If the environment variables above are set they will be used as the default values when any step of the pipeline script is called (including diff --git a/docs/inputs/support.md b/docs/inputs/support.md index af3a5e90..23ec5649 100644 --- a/docs/inputs/support.md +++ b/docs/inputs/support.md @@ -41,6 +41,7 @@ It should be noted however that the tool tracked will only be listed as | Name | Version(s) | MAVIS input | Publication | | ------------------------------------------ | ---------------- | --------------------------------------------- | ----------------------------------------------------------- | +| [Arriba](../../glossary/#arriba) | `2.2.1` | `fusions.tsv` | [Uhrig-2021](../../background/citations#uhrig-2021) | | [BreakDancer](../../glossary/#breakdancer) | `1.4.5` | `Tools main output file(s)` | [Chen-2009](../../background/citations#chen-2009) | | [BreakSeq](../../glossary/#breakseq) | `2.2` | `work/breakseq.vcf.gz` | [Abyzov-2015](../../background/citations#abyzov-2015) | | [Chimerascan](../../glossary/#chimerascan) | `0.4.5` | `*.bedpe` | [Iyer-2011](../../background/citations#Iyer-2011) | diff --git a/docs/inputs/sv_callers.md b/docs/inputs/sv_callers.md new file mode 100644 index 00000000..4c5ae23e --- /dev/null +++ b/docs/inputs/sv_callers.md @@ -0,0 +1,158 @@ +# SV Callers + +MAVIS supports output from a wide-variety of SV callers. Assumptions are made for each tool based on interpretation of the output and the publications for each tool. + +## Configuring Conversions + +Adding a conversion step to your MAVIS run is as simple as adding that section to the input JSON config. + +The general structure of this section is as follows + +```jsonc +{ + "convert": { + "": { + "file_type": "", + "name": "", // optional field for supported tools + "inputs": [ + "/path/to/tool/output/file" + ] + } + } +} +``` + +A full version of the input configuration file specification can be found in the [configuration](../configuration/general.md) section. + +## Supported Tools + +The tools and versions currently supported are given below. Versions listed indicate the version of the tool for which output files have been tested as input into MAVIS. MAVIS also supports a [general VCF input](#general-vcf-inputs). + +| SV Caller | Version(s) Tested | Files used as MAVIS input | +| --------------------------------------------------------------------------- | ----------------- | --------------------------------------------- | +| [BreakDancer (Chen, 2009)](../../background/citations#chen-2009) | `1.4.5` | `Tools main output file(s)` | +| [BreakSeq (Abyzov, 2015)](../../background/citations#abyzov-2015) | `2.2` | `work/breakseq.vcf.gz` | +| [Chimerascan (Iyer, 2011)](../../background/citations#iyer-2011) | `0.4.5` | `*.bedpe` | +| [CNVnator (Abyzov, 2011)](../../background/citations#abyzov-2011) | `0.3.3` | `Tools main output file(s)` | +| [CuteSV (Jiang, 2020)](../../background/citations#jiang-2020) | `1.0.10` | `*.vcf` | +| [DeFuse (McPherson. 2011)](../../background/citations#mcpherson-2011) | `0.6.2` | `results/results.classify.tsv` | +| [DELLY (Rausch, 2012)](../../background/citations#rausch-2012) | `0.6.1` `0.7.3` | `combined.vcf` (converted from bcf) | +| [Manta (Chen, 2016)](../../background/citations#chen-2016) | `1.0.0` | `{diploidSV,somaticSV}.vcf` | +| [Pindel (Ye, 2009)](../../background/citations#ye-2009) | `0.2.5b9` | `Tools main output file(s)` | +| [Sniffles (Sedlazeck, 2018)](../../background/citations#sedlazeck-2018) | `1.0.12b` | `*.vcf` | +| [STAR-Fusion (Haas, 2017)](../../background/citations#haas-2017) | `1.4.0` | `star-fusion.fusion_predictions.abridged.tsv` | +| [Straglr (Chiu, 2021)](../../background/citations#chiu-2021) | | | +| [Strelka (Saunders, 2012)](../../background/citations#saunders-2012) | `1.0.6` | `passed.somatic.indels.vcf` | +| [Trans-ABySS (Robertson, 2010)](../../background/citations/#robertson-2010) | `1.4.8 (custom)` | `{indels/events_novel_exons,fusions/*}.tsv` | `.bed` | + +!!! note + [Trans-ABySS](../../glossary/#trans-abyss): The trans-abyss version + used was an in-house dev version. However the output columns are + compatible with 1.4.8 as that was the version branched from. + Additionally, although indels can be used from both genome and + transcriptome outputs of Trans-ABySS, it is recommended to only use the + genome indel calls as the transcriptome indels calls (for versions + tested) introduce a very high number of false positives. This will slow + down validation. It is much faster to simply use the genome indels for + both genome and transcriptome. + +## [DELLY](../../glossary/#delly) Post-processing + +Some post-processing on the delly output files is generally done prior +to input. The output BCF files are converted to a VCF file + +```bash +bcftools concat -f /path/to/file/with/vcf/list --allow-overlaps --output-type v --output combined.vcf +``` + +## General VCF inputs + +Assuming that the tool outputting the VCF file follows standard +conventions, then it is possible to use a +[general VCF conversion](../../package/mavis/tools/vcf) +that is not tool-specific. Given the wide variety in content for VCF files, +MAVIS makes a number of assumptions and the VCF conversion may not work +for all VCFs. In general MAVIS follows the [VCF 4.2 +specification](https://samtools.github.io/hts-specs/VCFv4.2.pdf). If the +input tool you are using differs, it would be better to use a +[custom conversion script](#custom-conversions). + +Using the general VCF tool with a non-standard tool can be done as follows + +```json +{ + "convert": { + "my_tool_alias": { + "file_type": "vcf", + "name": "my_tool", + "inputs": ["/path/to/my_tool/output.vcf"] + } + } +} +``` + +### Assumptions on non-standard INFO fields + +- `PRECISE` if given, Confidence intervals are ignored if given in favour of exact breakpoint calls using pos and END as the breakpoint positions +- `CT` values if given are representative of the breakpoint orientations. +- `CHR2` is given for all interchromosomal events + +### Translating BND type Alt fields + +There are four possible configurations for the alt field of a BND type structural variant +based on the VCF specification. These correspond 1-1 to the orientation types for MAVIS +translocation structural variants. + +```text +r = reference base/seq +u = untemplated sequence/alternate sequence +p = chromosome:position +``` + +| alt format | orients | +| ---------- | ------- | +| `ru[p[` | LR | +| `[p[ur` | RR | +| `]p]ur` | RL | +| `ru]p]` | LL | + +## Custom Conversions + +If there is a tool that is not yet supported by MAVIS and you would like it to be, you can either add a [feature request](https://github.com/bcgsc/mavis/issues) to our GitHub page or tackle writing the conversion script yourself. Either way there are a few things you will need + +- A sample output from the tool in question +- Tool metadata for the citation, version, etc + +### Logic Example - [Chimerascan](../../glossary/#chimerascan) + +The following is a description of how the conversion script for +[Chimerascan](../../background/citations/#iyer-2011) was generated. +While this is a built-in conversion command now, the logic could also +have been put in an external script. As mentioned above, there are a +number of assumptions that had to be made about the tools output to +convert it to the +[standard mavis format](../../inputs/standard/). Assumptions were then verified by reviewing at a series of +called events in [IGV](../../glossary/#igv). In the current +example, [Chimerascan](../../background/citations/#iyer-2011) output +has six columns of interest that were used in the conversion + +- start3p +- end3p +- strand3p +- start5p +- end5p +- strand5p + +The above columns describe two segments which are joined. MAVIS requires +the position of the join. It was assumed that the segments are always +joined as a [sense fusion](../../glossary/#sense-fusion). Using this +assumption there are four logical cases to determine the position of the +breakpoints. + +i.e. the first case would be: If both strands are positive, then the end +of the five-prime segment (end5p) is the first breakpoint and the start +of the three-prime segment is the second breakpoint + +### Calling a Custom Conversion Script + +Since MAVIS v3+ is run using [snakemake](https://snakemake.readthedocs.io/en/stable/) the simplest way to incorporate your custom conversion scripts is to modify the Snakefile and add them as rules. diff --git a/setup.cfg b/setup.cfg index 4149c69c..850e340b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = mavis -version = 3.0.0 +version = 3.1.0 url = https://github.com/bcgsc/mavis.git download_url = https://github.com/bcgsc/mavis/archive/v2.2.10.tar.gz description = A Structural Variant Post-Processing Package @@ -37,7 +37,7 @@ install_requires = braceexpand==0.1.2 colour Distance>=0.1.3 - mavis_config>=1.1.0, <2.0.0 + mavis_config>=1.2.2, <2.0.0 networkx>=2.5,<3 numpy>=1.13.1 pandas>=1.1, <2 diff --git a/src/mavis/bam/read.py b/src/mavis/bam/read.py index 5a986178..e27be0ae 100644 --- a/src/mavis/bam/read.py +++ b/src/mavis/bam/read.py @@ -424,7 +424,7 @@ def sequenced_strand(read: pysam.AlignedSegment, strand_determining_read: int = else: strand = STRAND.NEG if not read.is_reverse else STRAND.POS elif strand_determining_read == 2: - if read.is_read2: + if not read.is_read1: strand = STRAND.NEG if read.is_reverse else STRAND.POS else: strand = STRAND.NEG if not read.is_reverse else STRAND.POS diff --git a/src/mavis/breakpoint.py b/src/mavis/breakpoint.py index ac3b188f..e6003627 100644 --- a/src/mavis/breakpoint.py +++ b/src/mavis/breakpoint.py @@ -304,17 +304,18 @@ def __init__( ) # try classifying to make sure it's a valid combination - BreakpointPair.classify(self) + classify_breakpoint_pair(self) def column(self, colname: str): return self.data.get(COLUMNS[colname]) def __str__(self): - return 'BPP({}, {}{}{})'.format( + return 'BPP({}, {}{}{}{})'.format( str(self.break1), str(self.break2), ', opposing={}'.format(self.opposing_strands) if not self.stranded else '', ', seq=' + repr(self.untemplated_seq) if self.untemplated_seq is not None else '', + ', tracking_id=' + self.tracking_id if self.tracking_id else '', ) def flatten(self): @@ -346,70 +347,6 @@ def flatten(self): row.update(temp) return row - @classmethod - def classify(cls, pair, distance: Optional[Callable] = None) -> Set[str]: - """ - uses the chr, orientations and strands to determine the - possible structural_variant types that this pair could support - - Args: - pair (BreakpointPair): the pair to classify - distance: if defined, will be passed to net size to use in narrowing the list of putative types (del vs ins) - Returns: - a list of possible SVTYPE - - Example: - >>> bpp = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) - >>> BreakpointPair.classify(bpp) - ['inversion'] - >>> bpp = BreakpointPair(Breakpoint('1', 1, orient='L'), Breakpoint('1', 9999, orient='R'), opposing_strands=False) - >>> BreakpointPair.classify(bpp) - {'deletion', 'insertion'} - - Note: - see [related theory documentation](/background/theory/#classifying-events) - """ - if not pair.interchromosomal: # intrachromosomal - if pair.opposing_strands: - if pair.LR or pair.RL: - raise InvalidRearrangement(pair) - return {SVTYPE.INV} - else: - if pair.LL or pair.RR: - raise InvalidRearrangement(pair) - elif pair.break1.orient == ORIENT.LEFT or pair.break2.orient == ORIENT.RIGHT: - if ( - len(pair.break1) == 1 - and len(pair.break2) == 1 - and abs(pair.break1.start - pair.break2.start) < 2 - ): - if pair.untemplated_seq == '': - return set() - return {SVTYPE.INS} - elif pair.untemplated_seq == '': - return {SVTYPE.DEL} - elif distance: - try: - if pair.net_size(distance).start > 0: - return {SVTYPE.INS} - elif pair.net_size(distance).end < 0: - return {SVTYPE.DEL} - except ValueError: - pass - return {SVTYPE.DEL, SVTYPE.INS} - elif pair.break1.orient == ORIENT.RIGHT or pair.break2.orient == ORIENT.LEFT: - return {SVTYPE.DUP} - return {SVTYPE.DEL, SVTYPE.INS, SVTYPE.DUP} - else: # interchromosomal - if pair.opposing_strands: - if pair.LR or pair.RL: - raise InvalidRearrangement(pair) - return {SVTYPE.ITRANS} - else: - if pair.LL or pair.RR: - raise InvalidRearrangement(pair) - return {SVTYPE.TRANS} - def net_size(self, distance=lambda x, y: Interval(abs(x - y))) -> Interval: """ Returns the size of the event for a given pair. Mainly applicable to indels @@ -623,3 +560,67 @@ def get_bed_repesentation(self) -> List[Tuple[str, int, int, Optional[str]]]: ) ) return bed + + +def classify_breakpoint_pair(pair: BreakpointPair, distance: Optional[Callable] = None) -> Set[str]: + """ + uses the chr, orientations and strands to determine the + possible structural_variant types that this pair could support + + Args: + pair (BreakpointPair): the pair to classify + distance: if defined, will be passed to net size to use in narrowing the list of putative types (del vs ins) + Returns: + a list of possible SVTYPE + + Example: + >>> bpp = BreakpointPair(Breakpoint('1', 1), Breakpoint('1', 9999), opposing_strands=True) + >>> classify_breakpoint_pair(bpp) + ['inversion'] + >>> bpp = BreakpointPair(Breakpoint('1', 1, orient='L'), Breakpoint('1', 9999, orient='R'), opposing_strands=False) + >>> classify_breakpoint_pair(bpp) + {'deletion', 'insertion'} + + Note: + see [related theory documentation](/background/theory/#classifying-events) + """ + if not pair.interchromosomal: # intrachromosomal + if pair.opposing_strands: + if pair.LR or pair.RL: + raise InvalidRearrangement(pair) + return {SVTYPE.INV} + else: + if pair.LL or pair.RR: + raise InvalidRearrangement(pair) + elif pair.break1.orient == ORIENT.LEFT or pair.break2.orient == ORIENT.RIGHT: + if ( + len(pair.break1) == 1 + and len(pair.break2) == 1 + and abs(pair.break1.start - pair.break2.start) < 2 + ): + if pair.untemplated_seq == '': + return set() + return {SVTYPE.INS} + elif pair.untemplated_seq == '': + return {SVTYPE.DEL} + elif distance: + try: + if pair.net_size(distance).start > 0: + return {SVTYPE.INS} + elif pair.net_size(distance).end < 0: + return {SVTYPE.DEL} + except ValueError: + pass + return {SVTYPE.DEL, SVTYPE.INS} + elif pair.break1.orient == ORIENT.RIGHT or pair.break2.orient == ORIENT.LEFT: + return {SVTYPE.DUP} + return {SVTYPE.DEL, SVTYPE.INS, SVTYPE.DUP} + else: # interchromosomal + if pair.opposing_strands: + if pair.LR or pair.RL: + raise InvalidRearrangement(pair) + return {SVTYPE.ITRANS} + else: + if pair.LL or pair.RR: + raise InvalidRearrangement(pair) + return {SVTYPE.TRANS} diff --git a/src/mavis/convert/__init__.py b/src/mavis/convert/__init__.py index 2dd0bc72..1e7f3888 100644 --- a/src/mavis/convert/__init__.py +++ b/src/mavis/convert/__init__.py @@ -4,15 +4,17 @@ import pandas as pd from shortuuid import uuid -from ..breakpoint import Breakpoint, BreakpointPair +from ..breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair from ..constants import COLUMNS, ORIENT, STRAND, SVTYPE from ..error import InvalidRearrangement from ..util import logger, read_bpp_from_input_file +from .arriba import convert_row as _parse_arriba from .breakdancer import convert_file as _convert_breakdancer_file from .chimerascan import convert_row as _parse_chimerascan from .cnvnator import convert_row as _parse_cnvnator from .constants import SUPPORTED_TOOL, TOOL_SVTYPE_MAPPING, TRACKING_COLUMN from .starfusion import convert_row as _parse_starfusion +from .straglr import convert_row as _parse_straglr from .transabyss import convert_row as _parse_transabyss from .vcf import convert_file as read_vcf @@ -142,6 +144,14 @@ def _convert_tool_row( {k: v for k, v in row.items() if k not in {'Type', 'Chr1', 'Chr2', 'Pos1', 'Pos2'}} ) + elif file_type == SUPPORTED_TOOL.ARRIBA: + + std_row.update(_parse_arriba(row)) + + elif file_type == SUPPORTED_TOOL.STRAGLR: + + std_row.update(_parse_straglr(row)) + else: raise NotImplementedError('unsupported file type', file_type) @@ -222,7 +232,7 @@ def _convert_tool_row( for col, value in std_row.items(): if col not in COLUMNS and col not in bpp.data: bpp.data[col] = value - if not event_type or event_type in BreakpointPair.classify(bpp): + if not event_type or event_type in classify_breakpoint_pair(bpp): result.append(bpp) except (InvalidRearrangement, AssertionError): diff --git a/src/mavis/convert/arriba.py b/src/mavis/convert/arriba.py new file mode 100644 index 00000000..e523bfd6 --- /dev/null +++ b/src/mavis/convert/arriba.py @@ -0,0 +1,39 @@ +from ..constants import COLUMNS, ORIENT, STRAND + +from .constants import TRACKING_COLUMN, SUPPORTED_TOOL + + +def get_orient(string): + if string == "downstream": + return ORIENT.LEFT + elif string == "upstream": + return ORIENT.RIGHT + return ORIENT.NS + + +def convert_row(row): + """ + transforms the aribba output into the common format for expansion. Maps the input column + names to column names which MAVIS can read + """ + std_row = {} + + try: + std_row[COLUMNS.break1_chromosome], b1_start = row["breakpoint1"].split(":") + std_row[COLUMNS.break2_chromosome], b2_start = row["breakpoint2"].split(":") + + std_row[COLUMNS.break1_strand] = row["strand1(gene/fusion)"].split("/")[1] + std_row[COLUMNS.break2_strand] = row["strand2(gene/fusion)"].split("/")[1] + std_row[COLUMNS.event_type] = row["type"].split("/")[0] + std_row[COLUMNS.break1_orientation] = get_orient(row["direction1"]) + std_row[COLUMNS.break2_orientation] = get_orient(row["direction2"]) + + std_row[COLUMNS.break1_position_start] = std_row[COLUMNS.break1_position_end] = b1_start + std_row[COLUMNS.break2_position_start] = std_row[COLUMNS.break2_position_end] = b2_start + except (ValueError, TypeError): + raise AssertionError( + "Could not parse the breakpoint from the Arriba row: {}, {}".format( + row["breakpoint1"], row["breakpoint2"] + ) + ) + return std_row diff --git a/src/mavis/convert/constants.py b/src/mavis/convert/constants.py index 821d6a6a..aa18fb89 100644 --- a/src/mavis/convert/constants.py +++ b/src/mavis/convert/constants.py @@ -27,6 +27,8 @@ class SUPPORTED_TOOL(MavisNamespace): CNVNATOR = 'cnvnator' STRELKA = 'strelka' STARFUSION = 'starfusion' + STRAGLR = 'straglr' + ARRIBA = 'arriba' TOOL_SVTYPE_MAPPING = {v: [v] for v in SVTYPE.values()} # type: ignore diff --git a/src/mavis/convert/straglr.py b/src/mavis/convert/straglr.py new file mode 100644 index 00000000..aa50db65 --- /dev/null +++ b/src/mavis/convert/straglr.py @@ -0,0 +1,32 @@ +from typing import Dict + +from ..constants import COLUMNS, SVTYPE + + +def convert_row(row: Dict) -> Dict: + """ + Converts the fields from the original STRAGLR BED output into MAVIS definitions of an SV + Since STRAGLR defines regions where short tandem repeats exist we make the definitions here fairly + non-specific + + See their github page for more details: https://github.com/bcgsc/straglr + + BED Columns + - chrom: chromosome name + - start: start coordinate of locus + - end: end coordinate of locus + - repeat_unit: repeat motif + - allele.size: where N={1,2,3...} depending on --max_num_clusters e.g. N={1,2} if --max_num_clusters==2 (default) + - allele.copy_number + - allele.support + """ + return { + COLUMNS.break1_chromosome: row['chrom'], + COLUMNS.break2_chromosome: row['chrom'], + COLUMNS.break1_position_start: row['start'], + COLUMNS.break1_position_end: row['end'], + COLUMNS.break2_position_start: row['start'], + COLUMNS.break2_position_end: row['end'], + COLUMNS.untemplated_seq: None, + COLUMNS.event_type: SVTYPE.INS, + } diff --git a/src/mavis/convert/vcf.py b/src/mavis/convert/vcf.py index c1c07faf..c6447af5 100644 --- a/src/mavis/convert/vcf.py +++ b/src/mavis/convert/vcf.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from typing import Dict, List, Optional, Tuple - +from ..interval import Interval import pandas as pd try: @@ -37,6 +37,7 @@ class VcfInfoType(TypedDict, total=False): CHR2: str CIPOS: Tuple[int, int] CIEND: Tuple[int, int] + CILEN: Tuple[int, int] CT: str END: Optional[int] PRECISE: bool @@ -122,6 +123,98 @@ def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]: raise NotImplementedError('alt specification in unexpected format', alt) +def convert_imprecise_breakend(std_row: Dict, record: List[VcfRecordType], bp_end: int): + """ + Handles IMPRECISE calls, that leveraged uncertainty from the CIPOS/CIEND/CILEN fields. + + bp1_s = breakpoint1 start + bp1_e = breakpoint1 end + bp2_s = breakpoint2 start + bp2_e = breakpoint2 end + + Insertion and deletion edge case - in which bp1_e > bp2_s + E.g bp1_s = 1890, bp1_e = 2000, bp2_s = 1900, bp2_e = 1900. + break1 ------------------------=======================-------------- + break2 ------------------------==========--------------------------- + + Insertion edge case - in which bp1_e > bp1_s + E.g bp1_s = 1890, bp1_e = 1800, bp2_s = 1800, bp2_e = 1800. + break1 ------------------------==----------------------------------- + break2 ------------------------=------------------------------------ + + Insertion edge case - in which bp1_s > bp2_s + E.g bp1_s = 1950, bp1_e = 2000, bp2_s = 1900, bp2_e = 3000. + break1 ------------------------==----------------------------------- + break2 -----------------------========------------------------------ + """ + is_intrachromosomal = std_row["break1_chromosome"] == std_row["break2_chromosome"] + + if record.info.get( + 'CILEN' + ): # as per https://github.com/samtools/hts-specs/issues/615, CILEN takes priority over CIPOS in dictating breakpoint2 + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + record.info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max(1, bp_end + record.info.get('CILEN', (0, 0))[0]), + COLUMNS.break2_position_end: bp_end + record.info.get('CILEN', (0, 0))[1], + } + ) + else: + std_row.update( + { + COLUMNS.break1_position_start: max( + 1, record.pos + record.info.get('CIPOS', (0, 0))[0] + ), + COLUMNS.break1_position_end: record.pos + record.info.get('CIPOS', (0, 0))[1], + COLUMNS.break2_position_start: max(1, bp_end + record.info.get('CIEND', (0, 0))[0]), + COLUMNS.break2_position_end: bp_end + record.info.get('CIEND', (0, 0))[1], + } + ) + + if is_intrachromosomal and ( + Interval.overlaps( + (std_row['break1_position_start'], std_row['break1_position_end']), + (std_row['break2_position_start'], std_row['break2_position_end']), + ) + ): + if std_row.get('event_type') != 'BND': + std_row['break2_position_start'] = max( + std_row['break1_position_start'], std_row['break2_position_start'] + ) + std_row['break1_position_end'] = min( + std_row['break1_position_end'], std_row['break2_position_end'] + ) + std_row['break1_position_start'] = min( + std_row['break1_position_start'], std_row['break2_position_end'] + ) + if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: + # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 + std_row.update({'break1_position_end': 1}) + if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: + std_row.update({'break2_position_end': 1}) + + if is_intrachromosomal and ( + std_row['break2_position_start'] > std_row['break2_position_end'] + or std_row['break1_position_start'] > std_row['break1_position_end'] + ): + if 'event_type' in std_row and std_row['event_type'] != 'BND': + raise ValueError( + f'Improper entry. One of the following breakpoints start is greater than breakpoint end: Breakpoint1_start: {std_row["break1_position_start"]}, Breakpoint1_end: {std_row["break1_position_end"]} Breakpoint2_start: {std_row["break2_position_start"]}, Breakpoint2_end: {std_row["break2_position_end"]} This call has been dropped.' + ) + + if ( + None not in (record.pos, record.info.get('END')) + and record.pos > record.info.get('END') + and is_intrachromosomal + ): + raise ValueError( + f'Improper entry. Starting position ({record.pos}) cannot be greater than ending position ({record.info.get("END")}). This call has been dropped.' + ) + + def convert_record(record: VcfRecordType) -> List[Dict]: """ converts a vcf record @@ -181,6 +274,10 @@ def convert_record(record: VcfRecordType) -> List[Dict]: elif size < 0: std_row[COLUMNS.event_type] = SVTYPE.DEL std_row.update({COLUMNS.break1_chromosome: record.chrom, COLUMNS.break2_chromosome: chr2}) + + if 'SVTYPE' in info: + std_row[COLUMNS.event_type] = info['SVTYPE'] + if info.get( 'PRECISE', False ): # DELLY CI only apply when split reads were not used to refine the breakpoint which is then flagged @@ -193,24 +290,7 @@ def convert_record(record: VcfRecordType) -> List[Dict]: } ) else: - std_row.update( - { - COLUMNS.break1_position_start: max( - 1, record.pos + info.get('CIPOS', (0, 0))[0] - ), - COLUMNS.break1_position_end: record.pos + info.get('CIPOS', (0, 0))[1], - COLUMNS.break2_position_start: max(1, end + info.get('CIEND', (0, 0))[0]), - COLUMNS.break2_position_end: end + info.get('CIEND', (0, 0))[1], - } - ) - if std_row['break1_position_end'] == 0 and std_row['break1_position_start'] == 1: - # addresses cases where pos = 0 and telomeric BND alt syntax https://github.com/bcgsc/mavis/issues/294 - std_row.update({'break1_position_end': 1}) - if std_row['break2_position_end'] == 0 and std_row['break2_position_start'] == 1: - std_row.update({'break2_position_end': 1}) - - if 'SVTYPE' in info: - std_row[COLUMNS.event_type] = info['SVTYPE'] + convert_imprecise_breakend(std_row, record, end) try: orient1, orient2 = info['CT'].split('to') @@ -220,7 +300,11 @@ def convert_record(record: VcfRecordType) -> List[Dict]: except KeyError: pass std_row.update( - {k: v for k, v in info.items() if k not in {'CHR2', 'SVTYPE', 'CIPOS', 'CIEND', 'CT'}} + { + k: v + for k, v in info.items() + if k not in {'CHR2', 'SVTYPE', 'CIPOS', 'CIEND', 'CILEN', 'CT'} + } ) records.append(std_row) return records @@ -238,7 +322,7 @@ def parse_info(info_field): # convert info types for key in info: - if key in {'CIPOS', 'CIEND'}: + if key in {'CIPOS', 'CIEND', 'CILEN'}: ci_start, ci_end = info[key].split(',') info[key] = (int(ci_start), int(ci_end)) elif key == 'END': @@ -327,6 +411,6 @@ def convert_file(input_file: str) -> List[Dict]: for variant_record in convert_pandas_rows_to_variants(data): try: rows.extend(convert_record(variant_record)) - except NotImplementedError as err: + except (NotImplementedError, ValueError) as err: logging.warning(str(err)) return rows diff --git a/src/mavis/main.py b/src/mavis/main.py index fac18049..f00f72fc 100644 --- a/src/mavis/main.py +++ b/src/mavis/main.py @@ -14,7 +14,6 @@ from . import __version__ from . import config as _config from . import util as _util -from .align import get_aligner_version from .annotate import main as annotate_main from .cluster import main as cluster_main from .convert import SUPPORTED_TOOL, convert_tool_output @@ -24,6 +23,7 @@ from .summary import main as summary_main from .util import filepath from .validate import main as validate_main +from .validate.align import get_aligner_version def convert_main(inputs, outputfile, file_type, strand_specific=False, assume_no_untemplated=True): diff --git a/src/mavis/summary/summary.py b/src/mavis/summary/summary.py index f30089b4..a52711cd 100644 --- a/src/mavis/summary/summary.py +++ b/src/mavis/summary/summary.py @@ -129,7 +129,9 @@ def group_events(events): bpp.break2.strand != new_bpp.break2.strand, ] ): - raise AssertionError('cannot group events differing on key elements', bpp, new_bpp) + raise AssertionError( + 'cannot group events differing on key elements', str(bpp), str(new_bpp) + ) # Note: There are some attributes that shouldn't be lost if different, currently appending the information # The evidence could be better off as a max instead of a join @@ -208,10 +210,11 @@ def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0): for bpp in [ b for b in bpps if not b.interchromosomal and b.break1.chr in dgv_regions_by_reference_name ]: + bpp.data['dgv'] = [] for dgv_region in dgv_regions_by_reference_name[bpp.break1.chr]: dist = abs(Interval.dist(Interval(dgv_region.start), bpp.break1)) if dist > lowest_resolution + distance: - break + continue elif ( dist > distance or abs(Interval.dist(Interval(dgv_region.end), bpp.break2)) > distance @@ -222,9 +225,10 @@ def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0): refname = dgv_region.reference_object.name except AttributeError: pass - bpp.data['dgv'] = '{}({}:{}-{})'.format( - dgv_region.name, refname, dgv_region.start, dgv_region.end + bpp.data['dgv'].append( + '{}({}:{}-{})'.format(dgv_region.name, refname, dgv_region.start, dgv_region.end) ) + bpp.data['dgv'] = ';'.join(bpp.data['dgv']) def get_pairing_state( diff --git a/src/mavis/util.py b/src/mavis/util.py index d4d34508..2e9d58a8 100644 --- a/src/mavis/util.py +++ b/src/mavis/util.py @@ -10,7 +10,7 @@ from mavis_config import bash_expands from shortuuid import uuid -from .breakpoint import Breakpoint, BreakpointPair +from .breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair from .constants import ( COLUMNS, FLOAT_COLUMNS, @@ -511,17 +511,17 @@ def validate_pipeline_id(value): bpp.data.update(data) if putative_event_type: bpp.data[COLUMNS.event_type] = putative_event_type - if putative_event_type not in BreakpointPair.classify(bpp): + if putative_event_type not in classify_breakpoint_pair(bpp): raise InvalidRearrangement( 'error: expected one of', - BreakpointPair.classify(bpp), + classify_breakpoint_pair(bpp), 'but found', putative_event_type, str(bpp), row, ) if expand_svtype and not putative_event_type: - for svtype in BreakpointPair.classify( + for svtype in classify_breakpoint_pair( bpp, distance=lambda x, y: Interval(y - x) ): new_bpp = bpp.copy() diff --git a/src/mavis/align.py b/src/mavis/validate/align.py similarity index 96% rename from src/mavis/align.py rename to src/mavis/validate/align.py index 223afddb..b3abb9f0 100644 --- a/src/mavis/align.py +++ b/src/mavis/validate/align.py @@ -9,16 +9,16 @@ import pysam -from .bam import cigar as _cigar -from .bam import read as _read -from .breakpoint import Breakpoint, BreakpointPair -from .constants import CIGAR, ORIENT, STRAND, SVTYPE, MavisNamespace, reverse_complement -from .interval import Interval -from .types import ReferenceGenome -from .util import logger +from ..bam import cigar as _cigar +from ..bam import read as _read +from ..breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair +from ..constants import CIGAR, ORIENT, STRAND, SVTYPE, MavisNamespace, reverse_complement +from ..interval import Interval +from ..types import ReferenceGenome +from ..util import logger if TYPE_CHECKING: - from .bam.cache import BamCache + from ..bam.cache import BamCache class SUPPORTED_ALIGNER(MavisNamespace): @@ -34,7 +34,7 @@ class SUPPORTED_ALIGNER(MavisNamespace): BLAT = 'blat' -class SplitAlignment(BreakpointPair): +class DiscontinuousAlignment(BreakpointPair): def __init__(self, *pos, **kwargs): self.read1 = kwargs.pop('read1') self.read2 = kwargs.pop('read2', None) @@ -177,9 +177,8 @@ def query_coverage_interval(read: pysam.AlignedSegment) -> Interval: Returns: The portion of the original query sequence that is aligned by this read """ - seq = read.query_sequence st = 0 - end = len(seq) - 1 + end = read.query_length - 1 if read.cigar[0][0] == CIGAR.S: st += read.cigar[0][1] if read.cigar[-1][0] == CIGAR.S: @@ -210,7 +209,7 @@ def convert_to_duplication(alignment, reference_genome: ReferenceGenome): if refseq != alignment.untemplated_seq[:dup_len]: continue - result = SplitAlignment( + result = DiscontinuousAlignment( Breakpoint( alignment.break2.chr, alignment.break2.start - dup_len, @@ -280,7 +279,7 @@ def call_read_events(read, secondary_read=None, is_stranded=False): result = [] strand = STRAND.NEG if read.is_reverse else STRAND.POS for ref_start, delsize, insseq in events: - bpp = SplitAlignment( + bpp = DiscontinuousAlignment( Breakpoint( read.reference_name, ref_start, @@ -333,7 +332,8 @@ def read_breakpoint(read): def call_paired_read_event(read1, read2, is_stranded=False): """ For a given pair of reads call all applicable events. Assume there is a major - event from both reads and then call indels from the individual reads + event from both reads and then call indels from the individual reads. Should be alternate alignments of + the same read """ # sort the reads so that we are calling consistently break1 = read_breakpoint(read1) @@ -383,7 +383,9 @@ def call_paired_read_event(read1, read2, is_stranded=False): if not is_stranded: break1.strand = STRAND.NS break2.strand = STRAND.NS - return SplitAlignment(break1, break2, untemplated_seq=untemplated_seq, read1=read1, read2=read2) + return DiscontinuousAlignment( + break1, break2, untemplated_seq=untemplated_seq, read1=read1, read2=read2 + ) def align_sequences( @@ -521,7 +523,7 @@ def select_contig_alignments(evidence, reads_by_query): standardize/simplify reads and filter bad/irrelevant alignments adds the contig alignments to the contigs """ - putative_types = BreakpointPair.classify(evidence) + putative_types = classify_breakpoint_pair(evidence) if {SVTYPE.DUP, SVTYPE.INS} & putative_types: putative_types.update({SVTYPE.DUP, SVTYPE.INS}) @@ -541,7 +543,7 @@ def filter_pass(alignment): def supports_primary_event(alignment): return all( [ - BreakpointPair.classify(alignment) & putative_types, + classify_breakpoint_pair(alignment) & putative_types, alignment.break1.chr == evidence.break1.chr, alignment.break2.chr == evidence.break2.chr, alignment.break1 & evidence.outer_window1, diff --git a/src/mavis/assemble.py b/src/mavis/validate/assemble.py similarity index 98% rename from src/mavis/assemble.py rename to src/mavis/validate/assemble.py index 235078fa..4efbe874 100644 --- a/src/mavis/assemble.py +++ b/src/mavis/validate/assemble.py @@ -4,11 +4,11 @@ import distance import networkx as nx -from .bam import cigar as _cigar -from .bam.read import calculate_alignment_score, nsb_align, sequence_complexity -from .constants import reverse_complement -from .interval import Interval -from .util import logger +from ..bam import cigar as _cigar +from ..bam.read import calculate_alignment_score, nsb_align, sequence_complexity +from ..constants import reverse_complement +from ..interval import Interval +from ..util import logger class Contig: diff --git a/src/mavis/validate/base.py b/src/mavis/validate/base.py index 65a70bc5..68bb28d7 100644 --- a/src/mavis/validate/base.py +++ b/src/mavis/validate/base.py @@ -5,24 +5,15 @@ import pysam from mavis_config import DEFAULTS -from ..assemble import assemble from ..bam import cigar as _cigar from ..bam import read as _read from ..bam.cache import BamCache -from ..breakpoint import Breakpoint, BreakpointPair -from ..constants import ( - CIGAR, - COLUMNS, - NA_MAPPING_QUALITY, - ORIENT, - PYSAM_READ_FLAGS, - STRAND, - SVTYPE, - reverse_complement, -) +from ..breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair +from ..constants import COLUMNS, ORIENT, PYSAM_READ_FLAGS, STRAND, SVTYPE, reverse_complement from ..error import NotSpecifiedError from ..interval import Interval from ..util import logger +from .assemble import Contig, assemble class Evidence(BreakpointPair): @@ -33,7 +24,7 @@ class Evidence(BreakpointPair): compatible_window1: Optional[Interval] compatible_window2: Optional[Interval] config: Dict - contigs: List + contigs: List[Contig] counts: List[int] flanking_pairs: Set half_mapped: Tuple[Set, Set] @@ -160,12 +151,12 @@ def __init__( self.stdev_fragment_size = stdev_fragment_size self.strand_determining_read = strand_determining_read - if self.classification is not None and self.classification not in BreakpointPair.classify( + if self.classification is not None and self.classification not in classify_breakpoint_pair( self ): raise AttributeError( 'breakpoint pair improper classification', - BreakpointPair.classify(self), + classify_breakpoint_pair(self), self.classification, ) @@ -252,7 +243,7 @@ def putative_event_types(self): """ if self.classification: return {self.classification} - return BreakpointPair.classify(self) + return classify_breakpoint_pair(self) @property def compatible_type(self): @@ -281,515 +272,6 @@ def supporting_reads(self): result.update(self.spanning_reads) return result - def collect_spanning_read(self, read: pysam.AlignedSegment): - """ - spanning read: a read covering BOTH breakpoints - - This is only applicable to small events. Do not need to look for soft clipped reads - here since they will be collected already - - Args: - read: the putative spanning read - - Returns: - bool: - - True: the read was collected and stored in the current evidence object - - False: the read was not collected - """ - if self.interchromosomal: - return False - elif not Interval.overlaps(self.inner_window1, self.inner_window2): - # too far apart to call spanning reads - return False - - if self.stranded: - strand = _read.sequenced_strand(read, self.strand_determining_read) - if strand != self.break1.strand and strand != self.break2.strand: - return False - - combined = self.inner_window1 & self.inner_window2 - read_interval = Interval(read.reference_start + 1, read.reference_end) - - if Interval.overlaps(combined, read_interval): - - if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - - read = self.standardize_read(read) - # in the correct position, now determine if it can support the event types - for event_type in self.putative_event_types(): - if event_type in [SVTYPE.DUP, SVTYPE.INS]: - if CIGAR.I in [c[0] for c in read.cigar]: - self.spanning_reads.add(read) - return True - elif event_type == SVTYPE.DEL: - if CIGAR.D in [c[0] for c in read.cigar]: - self.spanning_reads.add(read) - return True - elif event_type == SVTYPE.INV: - if CIGAR.X in [c[0] for c in read.cigar]: - return True - return False - - def collect_compatible_flanking_pair( - self, read: pysam.AlignedSegment, mate: pysam.AlignedSegment, compatible_type: str - ) -> bool: - """ - checks if a given read meets the minimum quality criteria to be counted as evidence as stored as support for - this event - - Args: - read: the read to add - mate: the mate - compatible_type (SVTYPE): the type we are collecting for - - Returns: - bool: - - True: the pair was collected and stored in the current evidence object - - False: the pair was not collected - Raises: - ValueError: if the input reads are not a valid pair - - Note: - see [theory - types of flanking evidence](/background/theory/#compatible-flanking-pairs) - """ - if ( - read.is_unmapped - or mate.is_unmapped - or read.query_name != mate.query_name - or read.is_read1 == mate.is_read1 - ): - raise ValueError('input reads must be a mapped and mated pair') - if not self.compatible_window1: - raise ValueError('compatible windows were not given') - if self.interchromosomal: - raise NotImplementedError( - 'interchromosomal events do not have compatible flanking pairs' - ) - # check that the references are right for the pair - if read.reference_id != read.next_reference_id: - return False - elif ( - read.mapping_quality < self.min_mapping_quality - or mate.mapping_quality < self.min_mapping_quality - ): - return False - if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - read = self.standardize_read(read) - if not mate.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not mate.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - mate = self.standardize_read(mate) - # order the read pairs so that they are in the same order that we expect for the breakpoints - if read.reference_start > mate.reference_start: - read, mate = mate, read - - if ( - self.bam_cache.get_read_reference_name(read) != self.break1.chr - or self.bam_cache.get_read_reference_name(mate) != self.break2.chr - ): - return False - - # check if this read falls in the first breakpoint window - iread = Interval(read.reference_start + 1, read.reference_end) - imate = Interval(mate.reference_start + 1, mate.reference_end) - - if self.stranded: - strand1 = _read.sequenced_strand(read, self.strand_determining_read) - strand2 = _read.sequenced_strand(mate, self.strand_determining_read) - if strand1 != self.break1.strand or strand2 != self.break2.strand: - return False - - # check that the pair orientation is correct - if not _read.orientation_supports_type(read, compatible_type): - return False - - # check that the fragment size is reasonable - fragment_size = self.compute_fragment_size(read, mate) - - if compatible_type == SVTYPE.DEL: - if fragment_size.end <= self.max_expected_fragment_size: - return False - elif compatible_type == SVTYPE.INS: - if fragment_size.start >= self.min_expected_fragment_size: - return False - - # check that the positions of the reads and the strands make sense - if Interval.overlaps(iread, self.compatible_window1) and Interval.overlaps( - imate, self.compatible_window2 - ): - self.compatible_flanking_pairs.add((read, mate)) - return True - - return False - - def collect_flanking_pair(self, read: pysam.AlignedSegment, mate: pysam.AlignedSegment): - """ - checks if a given read meets the minimum quality criteria to be counted as evidence as stored as support for - this event - - Args: - read: the read to add - mate: the mate - - Returns: - bool: - - True: the pair was collected and stored in the current evidence object - - False: the pair was not collected - Raises: - ValueError: if the input reads are not a valid pair - - see [theory - types of flanking evidence](/background/theory/#types-of-flanking-evidence) - """ - if read.is_unmapped or mate.is_unmapped: - raise ValueError( - 'input reads must be a mapped and mated pair. One or both of the reads is unmapped' - ) - elif read.query_name != mate.query_name: - raise ValueError( - 'input reads must be a mapped and mated pair. The query names do not match' - ) - elif abs(read.template_length) != abs(mate.template_length): - raise ValueError( - 'input reads must be a mapped and mated pair. The template lengths (abs value) do not match', - abs(read.template_length), - abs(mate.template_length), - ) - elif ( - read.mapping_quality < self.min_mapping_quality - or mate.mapping_quality < self.min_mapping_quality - ): - return False # do not meet the minimum mapping quality requirements - # check that the references are right for the pair - if self.interchromosomal: - if read.reference_id == read.next_reference_id: - return False - elif read.reference_id != read.next_reference_id: - return False - - if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - read = self.standardize_read(read) - if not mate.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not mate.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - mate = self.standardize_read(mate) - # order the read pairs so that they are in the same order that we expect for the breakpoints - if read.reference_id != mate.reference_id: - if self.bam_cache.get_read_reference_name( - read - ) > self.bam_cache.get_read_reference_name(mate): - read, mate = mate, read - elif read.reference_start > mate.reference_start: - read, mate = mate, read - - if ( - self.bam_cache.get_read_reference_name(read) != self.break1.chr - or self.bam_cache.get_read_reference_name(mate) != self.break2.chr - ): - return False - if any( - [ - self.break1.orient == ORIENT.LEFT and read.is_reverse, - self.break1.orient == ORIENT.RIGHT and not read.is_reverse, - self.break2.orient == ORIENT.LEFT and mate.is_reverse, - self.break2.orient == ORIENT.RIGHT and not mate.is_reverse, - ] - ): - return False - # check if this read falls in the first breakpoint window - iread = Interval(read.reference_start + 1, read.reference_end) - imate = Interval(mate.reference_start + 1, mate.reference_end) - - if self.stranded: - strand1 = _read.sequenced_strand(read, self.strand_determining_read) - strand2 = _read.sequenced_strand(mate, self.strand_determining_read) - - if strand1 != self.break1.strand or strand2 != self.break2.strand: - return False - - for event_type in self.putative_event_types(): - - # check that the pair orientation is correct - if not _read.orientation_supports_type(read, event_type): - continue - - # check that the fragment size is reasonable - fragment_size = self.compute_fragment_size(read, mate) - - if event_type == SVTYPE.DEL: - if fragment_size.end <= self.max_expected_fragment_size: - continue - elif event_type == SVTYPE.INS: - if fragment_size.start >= self.min_expected_fragment_size: - continue - - # check that the positions of the reads and the strands make sense - if Interval.overlaps(iread, self.outer_window1) and Interval.overlaps( - imate, self.outer_window2 - ): - self.flanking_pairs.add((read, mate)) - return True - - return False - - def collect_half_mapped(self, read: pysam.AlignedSegment, mate: pysam.AlignedSegment): - """ - Args: - read: the read to add - mate: the unmapped mate - - Returns: - bool: - - True: the read was collected and stored in the current evidence object - - False: the read was not collected - Raises: - AssertionError: if the mate is not unmapped - """ - if not mate.is_unmapped: - raise AssertionError('expected the mate to be unmapped') - read_itvl = Interval(read.reference_start + 1, read.reference_end) - added = False - if self.break1.chr == read.reference_name and Interval.overlaps( - self.outer_window1, read_itvl - ): - self.half_mapped[0].add(mate) - added = True - if self.break2.chr == read.reference_name and Interval.overlaps( - self.outer_window2, read_itvl - ): - self.half_mapped[1].add(mate) - added = True - return added - - def collect_split_read(self, read: pysam.AlignedSegment, first_breakpoint: bool): - """ - adds a split read if it passes the criteria filters and raises a warning if it does not - - Args: - read: the read to add - first_breakpoint (bool): add to the first breakpoint (or second if false) - Returns: - bool: - - True: the read was collected and stored in the current evidence object - - False: the read was not collected - Raises: - NotSpecifiedError: if the breakpoint orientation is not specified - """ - breakpoint = self.break1 if first_breakpoint else self.break2 - window = self.inner_window1 if first_breakpoint else self.inner_window2 - opposite_breakpoint = self.break2 if first_breakpoint else self.break1 - opposite_window = self.inner_window2 if first_breakpoint else self.inner_window1 - - if read.cigar[0][0] != CIGAR.S and read.cigar[-1][0] != CIGAR.S: - return False # split read is not softclipped - elif breakpoint.orient == ORIENT.LEFT and read.cigar[-1][0] != CIGAR.S: - return False # split read is not softclipped - elif breakpoint.orient == ORIENT.RIGHT and read.cigar[0][0] != CIGAR.S: - return False # split read is not softclipped - - # the first breakpoint of a BreakpointPair is always the lower breakpoint - # if this is being added to the second breakpoint then we'll need to check if the - # read soft-clipping needs to be adjusted - - # check if the read falls within the evidence collection window - # recall that pysam is 0-indexed and the window is 1-indexed - if ( - read.reference_start > window.end - 1 - or read.reference_end < window.start - or self.bam_cache.get_read_reference_name(read) != breakpoint.chr - ): - return False # read not in breakpoint evidence window - # can only enforce strand if both the breakpoint and the bam are stranded - if self.stranded and self.bam_cache.stranded: - strand = _read.sequenced_strand( - read, strand_determining_read=self.strand_determining_read - ) - if strand != breakpoint.strand: - return False # split read not on the appropriate strand - unused = '' - primary = '' - clipped = '' - if breakpoint.orient == ORIENT.LEFT: - unused = read.query_sequence[: read.query_alignment_start] - primary = read.query_sequence[read.query_alignment_start : read.query_alignment_end] - # end is exclusive in pysam - clipped = read.query_sequence[read.query_alignment_end :] - elif breakpoint.orient == ORIENT.RIGHT: - clipped = read.query_sequence[: read.query_alignment_start] - primary = read.query_sequence[read.query_alignment_start : read.query_alignment_end] - unused = read.query_sequence[read.query_alignment_end :] - else: - raise NotSpecifiedError( - 'cannot assign split reads to a breakpoint where the orientation has not been specified' - ) - if len(primary) + len(clipped) + len(unused) != len(read.query_sequence): - raise AssertionError( - 'unused, primary, and clipped sequences should make up the original sequence', - unused, - primary, - clipped, - read.query_sequence, - len(read.query_sequence), - ) - - if ( - len(primary) < self.config['validate.min_anchor_exact'] - or len(clipped) < self.config['validate.min_softclipping'] - ): - # split read does not meet the minimum anchor criteria - return False - if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( - PYSAM_READ_FLAGS.RECOMPUTED_CIGAR - ): - read = self.standardize_read(read) - # data quality filters - if ( - _cigar.alignment_matches(read.cigar) - >= self.config['validate.min_sample_size_to_apply_percentage'] - and _cigar.match_percent(read.cigar) < self.config['validate.min_anchor_match'] - ): - return False # too poor quality of an alignment - if ( - _cigar.longest_exact_match(read.cigar) < self.config['validate.min_anchor_exact'] - and _cigar.longest_fuzzy_match( - read.cigar, self.config['validate.fuzzy_mismatch_number'] - ) - < self.config['validate.min_anchor_fuzzy'] - ): - return False # too poor quality of an alignment - else: - self.split_reads[0 if first_breakpoint else 1].add(read) - - # try mapping the soft-clipped portion to the other breakpoint - w = (opposite_window[0], opposite_window[1]) - opposite_breakpoint_ref = self.reference_genome[opposite_breakpoint.chr].seq[ - w[0] - 1 : w[1] - ] - - # figure out how much of the read must match when remaped - min_match_tgt = read.cigar[-1][1] if breakpoint.orient == ORIENT.LEFT else read.cigar[0][1] - min_match_tgt = min( - min_match_tgt * self.config['validate.min_anchor_match'], min_match_tgt - 1 - ) / len(read.query_sequence) - if not self.opposing_strands: # same strand - sc_align = _read.nsb_align( - opposite_breakpoint_ref, - read.query_sequence, - min_consecutive_match=self.config['validate.min_anchor_exact'], - min_match=min_match_tgt, - min_overlap_percent=min_match_tgt, - ) # split half to this side - - for alignment in sc_align: - alignment.flag = read.flag - putative_alignments = sc_align - else: - # should align opposite the current read - revcomp_sc_align = reverse_complement(read.query_sequence) - revcomp_sc_align = _read.nsb_align( - opposite_breakpoint_ref, - revcomp_sc_align, - min_consecutive_match=self.config['validate.min_anchor_exact'], - min_match=min_match_tgt, - min_overlap_percent=min_match_tgt, - ) - - for alignment in revcomp_sc_align: - alignment.flag = read.flag ^ PYSAM_READ_FLAGS.REVERSE # EXOR - putative_alignments = revcomp_sc_align - - scores = [] - for alignment in putative_alignments: # loop over the alignments - alignment.flag = alignment.flag | PYSAM_READ_FLAGS.SUPPLEMENTARY - # set this flag so we don't recompute the cigar multiple - alignment.set_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR, 1, value_type='i') - # add information from the original read - alignment.reference_start = w[0] - 1 + alignment.reference_start - alignment._reference_name = ( - opposite_breakpoint.chr - ) # must be set since not associated with an alignment file - alignment.reference_id = self.bam_cache.reference_id(opposite_breakpoint.chr) - alignment.query_name = read.query_name - alignment.set_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1, value_type='i') - alignment.next_reference_start = read.next_reference_start - alignment.next_reference_id = read.next_reference_id - alignment.mapping_quality = NA_MAPPING_QUALITY - try: - cigar, offset = _cigar.extend_softclipping( - alignment.cigar, self.config['validate.min_anchor_exact'] - ) - alignment.cigar = cigar - alignment.reference_start = alignment.reference_start + offset - except AttributeError: - # if the matches section is too small you can't extend the - # softclipping - pass - s = _cigar.score(alignment.cigar) - alignment.cigar = _cigar.join(alignment.cigar) - if alignment.reference_id == alignment.next_reference_id: - # https://samtools.github.io/hts-specs/SAMv1.pdf - # unsigned observed template length equals the number of bases from the leftmost - # mapped base to the rightmost mapped base - tlen = abs(alignment.reference_start - alignment.next_reference_start) + 1 - if alignment.reference_start < alignment.next_reference_start: - alignment.template_length = tlen - else: - alignment.template_length = -1 * tlen - else: - alignment.template_length = 0 - if ( - _cigar.alignment_matches(alignment.cigar) - >= self.config['validate.min_sample_size_to_apply_percentage'] - and _cigar.match_percent(alignment.cigar) < self.config['validate.min_anchor_match'] - ): - continue - if ( - _cigar.longest_exact_match(alignment.cigar) - < self.config['validate.min_anchor_exact'] - and _cigar.longest_fuzzy_match( - alignment.cigar, self.config['validate.fuzzy_mismatch_number'] - ) - < self.config['validate.min_anchor_fuzzy'] - ): - continue - if self.config['validate.max_sc_preceeding_anchor'] is not None: - if opposite_breakpoint.orient == ORIENT.LEFT: - if ( - alignment.cigar[0][0] == CIGAR.S - and alignment.cigar[0][1] > self.config['validate.max_sc_preceeding_anchor'] - ): - continue - elif opposite_breakpoint.orient == ORIENT.RIGHT: - if ( - alignment.cigar[-1][0] == CIGAR.S - and alignment.cigar[-1][1] - > self.config['validate.max_sc_preceeding_anchor'] - ): - continue - alignment.set_key() # set the hash key before we add the read as evidence - scores.append((s, _cigar.match_percent(alignment.cigar), alignment)) - - scores = sorted(scores, key=lambda x: (x[0], x[1]), reverse=True) if scores else [] - - if len(scores) > 1: - if scores[0][0] != scores[1][0] and scores[0][1] != scores[1][1]: - # not multimap, pick highest scoring alignment - clipped = scores[0][2] - self.split_reads[1 if first_breakpoint else 0].add( - clipped - ) # add to the opposite breakpoint - elif len(scores) == 1: - clipped = scores[0][2] - self.split_reads[1 if first_breakpoint else 0].add( - clipped - ) # add to the opposite breakpoint - return True - def decide_sequenced_strand(self, reads: Set[pysam.AlignedSegment]): """ given a set of reads, determines the sequenced strand (if possible) and then returns the majority @@ -978,211 +460,6 @@ def assemble_contig(self): list(filtered_contigs.values()), key=lambda x: (x.remap_score() * -1, x.seq) ) - def load_evidence(self): - """ - open the associated bam file and read and store the evidence - does some preliminary read-quality filtering - """ - - def cache_if_true(read): - if read.is_unmapped or read.mate_is_unmapped: - return True - elif any( - [ - self.config['validate.filter_secondary_alignments'] and read.is_secondary, - read.mapping_quality < self.min_mapping_quality, - ] - ): - return False - elif set([x[0] for x in read.cigar]) & {CIGAR.S, CIGAR.H}: - return True - elif not read.is_proper_pair: - if any( - [_read.orientation_supports_type(read, e) for e in self.putative_event_types()] - ): - return True - elif self.compatible_type and _read.orientation_supports_type( - read, self.compatible_type - ): - return True - elif not self.interchromosomal and not self.opposing_strands: - min_frag_est = ( - abs(read.reference_start - read.next_reference_start) - self.read_length - ) - max_frag_est = min_frag_est + 3 * self.read_length - if ( - min_frag_est < self.min_expected_fragment_size - or max_frag_est > self.max_expected_fragment_size - ): - return True - - return False - - def filter_if_true(read): - if not cache_if_true(read): - if any( - [ - self.config['validate.filter_secondary_alignments'] and read.is_secondary, - read.mapping_quality < self.min_mapping_quality, - ] - ): - return True - elif not self.interchromosomal and set([x[0] for x in read.cigar]) & { - CIGAR.I, - CIGAR.D, - }: - return False - return True - return False - - flanking_pairs = set() # collect putative pairs - half_mapped_partners1 = set() - half_mapped_partners2 = set() - - for read in self.bam_cache.fetch_from_bins( - '{0}'.format(self.break1.chr), - self.outer_window1[0], - self.outer_window1[1], - read_limit=self.config['validate.fetch_reads_limit'], - sample_bins=self.config['validate.fetch_reads_bins'], - min_bin_size=self.config['validate.fetch_min_bin_size'], - cache=True, - cache_if=cache_if_true, - filter_if=filter_if_true, - ): - if read.mapping_quality < self.min_mapping_quality: - continue - self.counts[0] += 1 - if read.is_unmapped: - continue - if not self.collect_split_read(read, True): - self.collect_spanning_read(read) - if read.mate_is_unmapped: - half_mapped_partners1.add(read) - elif ( - any( - [ - _read.orientation_supports_type(read, et) - for et in self.putative_event_types() - ] - ) - and (read.reference_id != read.next_reference_id) == self.interchromosomal - ): - flanking_pairs.add(read) - - for read in self.bam_cache.fetch_from_bins( - '{0}'.format(self.break2.chr), - self.outer_window2[0], - self.outer_window2[1], - read_limit=self.config['validate.fetch_reads_limit'], - sample_bins=self.config['validate.fetch_reads_bins'], - min_bin_size=self.config['validate.fetch_min_bin_size'], - cache=True, - cache_if=cache_if_true, - filter_if=filter_if_true, - ): - if read.mapping_quality < self.min_mapping_quality: - continue - - self.counts[1] += 1 - - if read.is_unmapped: - continue - if not self.collect_split_read(read, False): - self.collect_spanning_read(read) - if read.mate_is_unmapped: - half_mapped_partners2.add(read) - elif ( - any( - [ - _read.orientation_supports_type(read, et) - for et in self.putative_event_types() - ] - ) - and (read.reference_id != read.next_reference_id) == self.interchromosomal - ): - flanking_pairs.add(read) - for flanking_read in sorted( - flanking_pairs, key=lambda x: (x.query_name, x.reference_start) - ): - # try and get the mate from the cache - try: - mates = self.bam_cache.get_mate(flanking_read, allow_file_access=False) - for mate in mates: - if mate.is_unmapped: - logger.debug(f'ignoring unmapped mate {mate.query_name}') - continue - self.collect_flanking_pair(flanking_read, mate) - except KeyError: - pass - - if self.compatible_window1: - compatible_type = SVTYPE.DUP - if SVTYPE.DUP in self.putative_event_types(): - compatible_type = SVTYPE.INS - - compt_flanking = set() - for read in self.bam_cache.fetch_from_bins( - '{0}'.format(self.break1.chr), - self.compatible_window1[0], - self.compatible_window1[1], - read_limit=self.config['validate.fetch_reads_limit'], - sample_bins=self.config['validate.fetch_reads_bins'], - min_bin_size=self.config['validate.fetch_min_bin_size'], - cache=True, - cache_if=cache_if_true, - filter_if=filter_if_true, - ): - if _read.orientation_supports_type(read, compatible_type): - compt_flanking.add(read) - - for read in self.bam_cache.fetch_from_bins( - '{0}'.format(self.break2.chr), - self.compatible_window2[0], - self.compatible_window2[1], - read_limit=self.config['validate.fetch_reads_limit'], - sample_bins=self.config['validate.fetch_reads_bins'], - min_bin_size=self.config['validate.fetch_min_bin_size'], - cache=True, - cache_if=cache_if_true, - filter_if=filter_if_true, - ): - if _read.orientation_supports_type(read, compatible_type): - compt_flanking.add(read) - - for flanking_read in compt_flanking: - # try and get the mate from the cache - try: - mates = self.bam_cache.get_mate(flanking_read, allow_file_access=False) - for mate in mates: - if mate.is_unmapped: - logger.debug(f'ignoring unmapped mate {mate.query_name}') - continue - try: - self.collect_compatible_flanking_pair( - flanking_read, mate, compatible_type - ) - except ValueError: - pass - except KeyError: - pass - - # now collect the half mapped reads - logger.info( - f'collected {len(half_mapped_partners1 | half_mapped_partners2)} putative half mapped reads', - ) - mates_found = 0 - for read in half_mapped_partners1 | half_mapped_partners2: - # try and get the mate from the cache - try: - mates = self.bam_cache.get_mate(read, allow_file_access=False) - mates_found += 1 - for mate in mates: - self.collect_half_mapped(read, mate) - except KeyError: - pass - logger.info(f'{mates_found} half-mapped mates found') - def copy(self): raise NotImplementedError('not appropriate for copy of evidence') diff --git a/src/mavis/blat.py b/src/mavis/validate/blat.py similarity index 98% rename from src/mavis/blat.py rename to src/mavis/validate/blat.py index cbfeaace..939b940f 100644 --- a/src/mavis/blat.py +++ b/src/mavis/validate/blat.py @@ -15,12 +15,11 @@ import pandas as pd -from .align import query_coverage_interval -from .bam import cigar as _cigar -from .bam.cache import BamCache -from .bam.cigar import QUERY_ALIGNED_STATES -from .bam.read import SamRead -from .constants import ( +from ..bam import cigar as _cigar +from ..bam.cache import BamCache +from ..bam.cigar import QUERY_ALIGNED_STATES +from ..bam.read import SamRead +from ..constants import ( CIGAR, DNA_ALPHABET, NA_MAPPING_QUALITY, @@ -28,9 +27,10 @@ STRAND, reverse_complement, ) -from .interval import Interval -from .types import ReferenceGenome -from .util import logger +from ..interval import Interval +from ..types import ReferenceGenome +from ..util import logger +from .align import query_coverage_interval class Blat: diff --git a/src/mavis/validate/call.py b/src/mavis/validate/call.py index 002534a3..a69bf62a 100644 --- a/src/mavis/validate/call.py +++ b/src/mavis/validate/call.py @@ -3,10 +3,8 @@ import statistics from typing import List, Optional, Set -from ..align import SplitAlignment, call_paired_read_event, call_read_events, convert_to_duplication -from ..assemble import Contig from ..bam import read as _read -from ..breakpoint import Breakpoint, BreakpointPair +from ..breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair from ..constants import ( CALL_METHOD, COLUMNS, @@ -18,6 +16,13 @@ ) from ..interval import Interval from ..validate.base import Evidence +from .align import ( + DiscontinuousAlignment, + call_paired_read_event, + call_read_events, + convert_to_duplication, +) +from .assemble import Contig class EventCall(BreakpointPair): @@ -34,7 +39,7 @@ class for holding evidence and the related calls since we can't freeze the evide compatible_flanking_pairs: Set compatible_type: Optional[str] contig: Optional[Contig] - contig_alignment: Optional[SplitAlignment] + contig_alignment: Optional[DiscontinuousAlignment] @property def has_compatible(self): @@ -48,7 +53,7 @@ def __init__( event_type: str, call_method: str, contig: Optional[Contig] = None, - contig_alignment: Optional[SplitAlignment] = None, + contig_alignment: Optional[DiscontinuousAlignment] = None, untemplated_seq: Optional[str] = None, ): """ @@ -84,10 +89,10 @@ def __init__( else: self.compatible_type = None # use the distance function from the source evidence to narrow the possible types - putative_types = BreakpointPair.classify(self, source_evidence.distance) + putative_types = classify_breakpoint_pair(self, source_evidence.distance) if event_type not in putative_types and self.compatible_type in putative_types: event_type, self.compatible_type = self.compatible_type, event_type - putative_types = BreakpointPair.classify(self, source_evidence.distance) + putative_types = classify_breakpoint_pair(self, source_evidence.distance) self.data[COLUMNS.event_type] = SVTYPE.enforce(event_type) if event_type not in putative_types | {self.compatible_type}: @@ -172,7 +177,7 @@ def is_supplementary(self): return not all( [ {self.event_type, self.compatible_type} - & BreakpointPair.classify(self.source_evidence), + & classify_breakpoint_pair(self.source_evidence), self.break1 & self.source_evidence.outer_window1, self.break2 & self.source_evidence.outer_window2, self.break1.chr == self.source_evidence.break1.chr, @@ -236,8 +241,8 @@ def add_flanking_support(self, flanking_pairs, is_compatible=False): else: # L R if not all( [ - read.reference_start + 1 <= self.break1.end, - mate.reference_end >= self.break2.start, + read.reference_end <= self.break1.end, + mate.reference_start >= self.break2.start, ] ): continue @@ -300,7 +305,7 @@ def add_spanning_read(self, read): read (pysam.AlignedSegment): putative spanning read """ for event in call_read_events(read, is_stranded=self.source_evidence.bam_cache.stranded): - if event == self and self.event_type in BreakpointPair.classify( + if event == self and self.event_type in classify_breakpoint_pair( event, distance=self.source_evidence.distance ): self.spanning_reads.add(read) @@ -515,13 +520,13 @@ def flatten(self): if self.contig: cseq = self.contig_alignment.query_sequence try: - break1_read_depth = SplitAlignment.breakpoint_contig_remapped_depth( + break1_read_depth = DiscontinuousAlignment.breakpoint_contig_remapped_depth( self.break1, self.contig, self.contig_alignment.read1 ) except AssertionError: break1_read_depth = None try: - break2_read_depth = SplitAlignment.breakpoint_contig_remapped_depth( + break2_read_depth = DiscontinuousAlignment.breakpoint_contig_remapped_depth( self.break2, self.contig, self.contig_alignment.read1 @@ -562,7 +567,7 @@ def _call_by_contigs(source_evidence): for aln in ctg.alignments: if aln.is_putative_indel and aln.net_size(source_evidence.distance) == Interval(0): continue - for event_type in BreakpointPair.classify(aln, distance=source_evidence.distance): + for event_type in classify_breakpoint_pair(aln, distance=source_evidence.distance): try: new_event = EventCall( aln.break1, @@ -655,8 +660,10 @@ def _call_by_spanning_reads(source_evidence: Evidence, consumed_evidence): None # unless we are collecting a consensus we shouldn't assign sequences to the breaks ) event.break2.seq = None - types = BreakpointPair.classify(source_evidence) | {source_evidence.compatible_type} - for event_type in types & BreakpointPair.classify(event, distance=source_evidence.distance): + types = classify_breakpoint_pair(source_evidence) | {source_evidence.compatible_type} + for event_type in types & classify_breakpoint_pair( + event, distance=source_evidence.distance + ): try: new_event = EventCall( event.break1, @@ -1076,7 +1083,7 @@ def _call_by_split_reads(evidence, event_type, consumed_evidence=None): event_type, call_method=CALL_METHOD.SPLIT, untemplated_seq=call.untemplated_seq, - contig_alignment=None if not isinstance(call, SplitAlignment) else call, + contig_alignment=None if not isinstance(call, DiscontinuousAlignment) else call, ) except ValueError: # incompatible types continue diff --git a/src/mavis/validate/evidence.py b/src/mavis/validate/evidence.py index db013e46..77a60a64 100644 --- a/src/mavis/validate/evidence.py +++ b/src/mavis/validate/evidence.py @@ -210,11 +210,14 @@ def traverse( # type: ignore return Interval.from_iterable(genomic_end_positions) def compute_fragment_size(self, read, mate): + """ + Template_length to match genome template length (aka. fragment size - 1 read length if all mapped perfectly) + """ if read.reference_start > mate.reference_start: read, mate = mate, read if read.reference_name == mate.reference_name: start, end = self.distance( - read.reference_start + 1, mate.reference_end, chrom=read.reference_name + read.reference_start + 1, mate.reference_start + 1, chrom=read.reference_name ) return Interval(start + 1, end + 1) return Interval(0) diff --git a/src/mavis/validate/gather.py b/src/mavis/validate/gather.py new file mode 100644 index 00000000..41f03914 --- /dev/null +++ b/src/mavis/validate/gather.py @@ -0,0 +1,748 @@ +""" +Module which contains functions related to picking reads from a BAM file which support a putative event +""" +import pysam + +from ..bam import cigar as _cigar +from ..bam import read as _read +from ..constants import ( + CIGAR, + NA_MAPPING_QUALITY, + ORIENT, + PYSAM_READ_FLAGS, + SVTYPE, + reverse_complement, +) +from ..error import NotSpecifiedError +from ..interval import Interval +from ..util import logger +from .base import Evidence + + +def collect_split_read(evidence_bpp: Evidence, read: pysam.AlignedSegment, first_breakpoint: bool): + """ + adds a split read if it passes the criteria filters and raises a warning if it does not + + Args: + read: the read to add + first_breakpoint: add to the first breakpoint (or second if false) + Returns: + bool: + - True: the read was collected and stored in the current evidence object + - False: the read was not collected + Raises: + NotSpecifiedError: if the breakpoint orientation is not specified + """ + breakpoint = evidence_bpp.break1 if first_breakpoint else evidence_bpp.break2 + window = evidence_bpp.inner_window1 if first_breakpoint else evidence_bpp.inner_window2 + opposite_breakpoint = evidence_bpp.break2 if first_breakpoint else evidence_bpp.break1 + opposite_window = evidence_bpp.inner_window2 if first_breakpoint else evidence_bpp.inner_window1 + + if read.cigar[0][0] != CIGAR.S and read.cigar[-1][0] != CIGAR.S: + return False # split read is not softclipped + elif breakpoint.orient == ORIENT.LEFT and read.cigar[-1][0] != CIGAR.S: + return False # split read is not softclipped + elif breakpoint.orient == ORIENT.RIGHT and read.cigar[0][0] != CIGAR.S: + return False # split read is not softclipped + + # the first breakpoint of a BreakpointPair is always the lower breakpoint + # if this is being added to the second breakpoint then we'll need to check if the + # read soft-clipping needs to be adjusted + + # check if the read falls within the evidence collection window + # recall that pysam is 0-indexed and the window is 1-indexed + if ( + read.reference_start > window.end - 1 + or read.reference_end < window.start + or evidence_bpp.bam_cache.get_read_reference_name(read) != breakpoint.chr + ): + return False # read not in breakpoint evidence window + # can only enforce strand if both the breakpoint and the bam are stranded + if evidence_bpp.stranded and evidence_bpp.bam_cache.stranded: + strand = _read.sequenced_strand( + read, strand_determining_read=evidence_bpp.strand_determining_read + ) + if strand != breakpoint.strand: + return False # split read not on the appropriate strand + unused = '' + primary = '' + clipped = '' + if breakpoint.orient == ORIENT.LEFT: + unused = read.query_sequence[: read.query_alignment_start] + primary = read.query_sequence[read.query_alignment_start : read.query_alignment_end] + # end is exclusive in pysam + clipped = read.query_sequence[read.query_alignment_end :] + elif breakpoint.orient == ORIENT.RIGHT: + clipped = read.query_sequence[: read.query_alignment_start] + primary = read.query_sequence[read.query_alignment_start : read.query_alignment_end] + unused = read.query_sequence[read.query_alignment_end :] + else: + raise NotSpecifiedError( + 'cannot assign split reads to a breakpoint where the orientation has not been specified' + ) + if len(primary) + len(clipped) + len(unused) != len(read.query_sequence): + raise AssertionError( + 'unused, primary, and clipped sequences should make up the original sequence', + unused, + primary, + clipped, + read.query_sequence, + len(read.query_sequence), + ) + + if ( + len(primary) < evidence_bpp.config['validate.min_anchor_exact'] + or len(clipped) < evidence_bpp.config['validate.min_softclipping'] + ): + # split read does not meet the minimum anchor criteria + return False + if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + read = evidence_bpp.standardize_read(read) + # data quality filters + if ( + _cigar.alignment_matches(read.cigar) + >= evidence_bpp.config['validate.min_sample_size_to_apply_percentage'] + and _cigar.match_percent(read.cigar) < evidence_bpp.config['validate.min_anchor_match'] + ): + return False # too poor quality of an alignment + if ( + _cigar.longest_exact_match(read.cigar) < evidence_bpp.config['validate.min_anchor_exact'] + and _cigar.longest_fuzzy_match( + read.cigar, evidence_bpp.config['validate.fuzzy_mismatch_number'] + ) + < evidence_bpp.config['validate.min_anchor_fuzzy'] + ): + return False # too poor quality of an alignment + else: + evidence_bpp.split_reads[0 if first_breakpoint else 1].add(read) + + # try mapping the soft-clipped portion to the other breakpoint + w = (opposite_window[0], opposite_window[1]) + opposite_breakpoint_ref = evidence_bpp.reference_genome[opposite_breakpoint.chr].seq[ + w[0] - 1 : w[1] + ] + + # figure out how much of the read must match when remapped + min_match_tgt = read.cigar[-1][1] if breakpoint.orient == ORIENT.LEFT else read.cigar[0][1] + min_match_tgt = min( + min_match_tgt * evidence_bpp.config['validate.min_anchor_match'], min_match_tgt - 1 + ) / len(read.query_sequence) + if not evidence_bpp.opposing_strands: # same strand + sc_align = _read.nsb_align( + opposite_breakpoint_ref, + read.query_sequence, + min_consecutive_match=evidence_bpp.config['validate.min_anchor_exact'], + min_match=min_match_tgt, + min_overlap_percent=min_match_tgt, + ) # split half to this side + + for alignment in sc_align: + alignment.flag = read.flag + putative_alignments = sc_align + else: + # should align opposite the current read + revcomp_sc_align = reverse_complement(read.query_sequence) + revcomp_sc_align = _read.nsb_align( + opposite_breakpoint_ref, + revcomp_sc_align, + min_consecutive_match=evidence_bpp.config['validate.min_anchor_exact'], + min_match=min_match_tgt, + min_overlap_percent=min_match_tgt, + ) + + for alignment in revcomp_sc_align: + alignment.flag = read.flag ^ PYSAM_READ_FLAGS.REVERSE # EXOR + putative_alignments = revcomp_sc_align + + scores = [] + for alignment in putative_alignments: # loop over the alignments + alignment.flag = alignment.flag | PYSAM_READ_FLAGS.SUPPLEMENTARY + # set this flag so we don't recompute the cigar multiple + alignment.set_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR, 1, value_type='i') + # add information from the original read + alignment.reference_start = w[0] - 1 + alignment.reference_start + alignment._reference_name = ( + opposite_breakpoint.chr + ) # must be set since not associated with an alignment file + alignment.reference_id = evidence_bpp.bam_cache.reference_id(opposite_breakpoint.chr) + alignment.query_name = read.query_name + alignment.set_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1, value_type='i') + alignment.next_reference_start = read.next_reference_start + alignment.next_reference_id = read.next_reference_id + alignment.mapping_quality = NA_MAPPING_QUALITY + try: + cigar, offset = _cigar.extend_softclipping( + alignment.cigar, evidence_bpp.config['validate.min_anchor_exact'] + ) + alignment.cigar = cigar + alignment.reference_start = alignment.reference_start + offset + except AttributeError: + # if the matches section is too small you can't extend the + # softclipping + pass + s = _cigar.score(alignment.cigar) + alignment.cigar = _cigar.join(alignment.cigar) + if alignment.reference_id == alignment.next_reference_id: + # https://samtools.github.io/hts-specs/SAMv1.pdf + # unsigned observed template length equals the number of bases from the leftmost + # mapped base to the rightmost mapped base + tlen = abs(alignment.reference_start - alignment.next_reference_start) + 1 + if alignment.reference_start < alignment.next_reference_start: + alignment.template_length = tlen + else: + alignment.template_length = -1 * tlen + else: + alignment.template_length = 0 + if ( + _cigar.alignment_matches(alignment.cigar) + >= evidence_bpp.config['validate.min_sample_size_to_apply_percentage'] + and _cigar.match_percent(alignment.cigar) + < evidence_bpp.config['validate.min_anchor_match'] + ): + continue + if ( + _cigar.longest_exact_match(alignment.cigar) + < evidence_bpp.config['validate.min_anchor_exact'] + and _cigar.longest_fuzzy_match( + alignment.cigar, evidence_bpp.config['validate.fuzzy_mismatch_number'] + ) + < evidence_bpp.config['validate.min_anchor_fuzzy'] + ): + continue + if evidence_bpp.config['validate.max_sc_preceeding_anchor'] is not None: + if opposite_breakpoint.orient == ORIENT.LEFT: + if ( + alignment.cigar[0][0] == CIGAR.S + and alignment.cigar[0][1] + > evidence_bpp.config['validate.max_sc_preceeding_anchor'] + ): + continue + elif opposite_breakpoint.orient == ORIENT.RIGHT: + if ( + alignment.cigar[-1][0] == CIGAR.S + and alignment.cigar[-1][1] + > evidence_bpp.config['validate.max_sc_preceeding_anchor'] + ): + continue + alignment.set_key() # set the hash key before we add the read as evidence + scores.append((s, _cigar.match_percent(alignment.cigar), alignment)) + + scores = sorted(scores, key=lambda x: (x[0], x[1]), reverse=True) if scores else [] + + if len(scores) > 1: + if scores[0][0] != scores[1][0] and scores[0][1] != scores[1][1]: + # not multimap, pick highest scoring alignment + clipped = scores[0][2] + evidence_bpp.split_reads[1 if first_breakpoint else 0].add( + clipped + ) # add to the opposite breakpoint + elif len(scores) == 1: + clipped = scores[0][2] + evidence_bpp.split_reads[1 if first_breakpoint else 0].add( + clipped + ) # add to the opposite breakpoint + return True + + +def collect_half_mapped( + evidence_bpp: Evidence, read: pysam.AlignedSegment, mate: pysam.AlignedSegment +): + """ + Args: + read: the read to add + mate: the unmapped mate + + Returns: + bool: + - True: the read was collected and stored in the current evidence object + - False: the read was not collected + Raises: + AssertionError: if the mate is not unmapped + """ + if not mate.is_unmapped: + raise AssertionError('expected the mate to be unmapped') + read_itvl = Interval(read.reference_start + 1, read.reference_end) + added = False + if evidence_bpp.break1.chr == read.reference_name and Interval.overlaps( + evidence_bpp.outer_window1, read_itvl + ): + evidence_bpp.half_mapped[0].add(mate) + added = True + if evidence_bpp.break2.chr == read.reference_name and Interval.overlaps( + evidence_bpp.outer_window2, read_itvl + ): + evidence_bpp.half_mapped[1].add(mate) + added = True + return added + + +def collect_flanking_pair( + evidence_bpp: Evidence, read: pysam.AlignedSegment, mate: pysam.AlignedSegment +): + """ + checks if a given read meets the minimum quality criteria to be counted as evidence as stored as support for + this event + + Args: + read: the read to add + mate: the mate + + Returns: + bool: + - True: the pair was collected and stored in the current evidence object + - False: the pair was not collected + Raises: + ValueError: if the input reads are not a valid pair + + see [theory - types of flanking evidence](/background/theory/#types-of-flanking-evidence) + """ + if read.is_unmapped or mate.is_unmapped: + raise ValueError( + 'input reads must be a mapped and mated pair. One or both of the reads is unmapped' + ) + elif read.query_name != mate.query_name: + raise ValueError( + 'input reads must be a mapped and mated pair. The query names do not match' + ) + elif abs(read.template_length) != abs(mate.template_length): + raise ValueError( + 'input reads must be a mapped and mated pair. The template lengths (abs value) do not match', + abs(read.template_length), + abs(mate.template_length), + ) + elif ( + read.mapping_quality < evidence_bpp.min_mapping_quality + or mate.mapping_quality < evidence_bpp.min_mapping_quality + ): + return False # do not meet the minimum mapping quality requirements + # check that the references are right for the pair + if evidence_bpp.interchromosomal: + if read.reference_id == read.next_reference_id: + return False + elif read.reference_id != read.next_reference_id: + return False + + if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + read = evidence_bpp.standardize_read(read) + if not mate.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not mate.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + mate = evidence_bpp.standardize_read(mate) + # order the read pairs so that they are in the same order that we expect for the breakpoints + if read.reference_id != mate.reference_id: + if evidence_bpp.bam_cache.get_read_reference_name( + read + ) > evidence_bpp.bam_cache.get_read_reference_name(mate): + read, mate = mate, read + elif read.reference_start > mate.reference_start: + read, mate = mate, read + + if ( + evidence_bpp.bam_cache.get_read_reference_name(read) != evidence_bpp.break1.chr + or evidence_bpp.bam_cache.get_read_reference_name(mate) != evidence_bpp.break2.chr + ): + return False + if any( + [ + evidence_bpp.break1.orient == ORIENT.LEFT and read.is_reverse, + evidence_bpp.break1.orient == ORIENT.RIGHT and not read.is_reverse, + evidence_bpp.break2.orient == ORIENT.LEFT and mate.is_reverse, + evidence_bpp.break2.orient == ORIENT.RIGHT and not mate.is_reverse, + ] + ): + return False + # check if this read falls in the first breakpoint window + iread = Interval(read.reference_start + 1, read.reference_end) + imate = Interval(mate.reference_start + 1, mate.reference_end) + + if evidence_bpp.stranded: + strand1 = _read.sequenced_strand(read, evidence_bpp.strand_determining_read) + strand2 = _read.sequenced_strand(mate, evidence_bpp.strand_determining_read) + + if strand1 != evidence_bpp.break1.strand or strand2 != evidence_bpp.break2.strand: + return False + + for event_type in evidence_bpp.putative_event_types(): + + # check that the pair orientation is correct + if not _read.orientation_supports_type(read, event_type): + continue + + # check that the fragment size is reasonable + fragment_size = evidence_bpp.compute_fragment_size(read, mate) + + if event_type == SVTYPE.DEL: + if fragment_size.end <= evidence_bpp.max_expected_fragment_size: + continue + elif event_type == SVTYPE.INS: + if fragment_size.start >= evidence_bpp.min_expected_fragment_size: + continue + + # check that the positions of the reads and the strands make sense + if Interval.overlaps(iread, evidence_bpp.outer_window1) and Interval.overlaps( + imate, evidence_bpp.outer_window2 + ): + evidence_bpp.flanking_pairs.add((read, mate)) + return True + + return False + + +def collect_compatible_flanking_pair( + evidence_bpp: Evidence, + read: pysam.AlignedSegment, + mate: pysam.AlignedSegment, + compatible_type: str, +) -> bool: + """ + checks if a given read meets the minimum quality criteria to be counted as evidence as stored as support for + this event + + Args: + read: the read to add + mate: the mate + compatible_type (SVTYPE): the type we are collecting for + + Returns: + bool: + - True: the pair was collected and stored in the current evidence object + - False: the pair was not collected + Raises: + ValueError: if the input reads are not a valid pair + + Note: + see [theory - types of flanking evidence](/background/theory/#compatible-flanking-pairs) + """ + if ( + read.is_unmapped + or mate.is_unmapped + or read.query_name != mate.query_name + or read.is_read1 == mate.is_read1 + ): + raise ValueError('input reads must be a mapped and mated pair') + if not evidence_bpp.compatible_window1: + raise ValueError('compatible windows were not given') + if evidence_bpp.interchromosomal: + raise NotImplementedError('interchromosomal events do not have compatible flanking pairs') + # check that the references are right for the pair + if read.reference_id != read.next_reference_id: + return False + elif ( + read.mapping_quality < evidence_bpp.min_mapping_quality + or mate.mapping_quality < evidence_bpp.min_mapping_quality + ): + return False + if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + read = evidence_bpp.standardize_read(read) + if not mate.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not mate.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + mate = evidence_bpp.standardize_read(mate) + # order the read pairs so that they are in the same order that we expect for the breakpoints + if read.reference_start > mate.reference_start: + read, mate = mate, read + + if ( + evidence_bpp.bam_cache.get_read_reference_name(read) != evidence_bpp.break1.chr + or evidence_bpp.bam_cache.get_read_reference_name(mate) != evidence_bpp.break2.chr + ): + return False + + # check if this read falls in the first breakpoint window + iread = Interval(read.reference_start + 1, read.reference_end) + imate = Interval(mate.reference_start + 1, mate.reference_end) + + if evidence_bpp.stranded: + strand1 = _read.sequenced_strand(read, evidence_bpp.strand_determining_read) + strand2 = _read.sequenced_strand(mate, evidence_bpp.strand_determining_read) + if strand1 != evidence_bpp.break1.strand or strand2 != evidence_bpp.break2.strand: + return False + + # check that the pair orientation is correct + if not _read.orientation_supports_type(read, compatible_type): + return False + + # check that the fragment size is reasonable + fragment_size = evidence_bpp.compute_fragment_size(read, mate) + + if compatible_type == SVTYPE.DEL: + if fragment_size.end <= evidence_bpp.max_expected_fragment_size: + return False + elif compatible_type == SVTYPE.INS: + if fragment_size.start >= evidence_bpp.min_expected_fragment_size: + return False + + # check that the positions of the reads and the strands make sense + if Interval.overlaps(iread, evidence_bpp.compatible_window1) and Interval.overlaps( + imate, evidence_bpp.compatible_window2 + ): + evidence_bpp.compatible_flanking_pairs.add((read, mate)) + return True + + return False + + +def collect_spanning_read(evidence_bpp: Evidence, read: pysam.AlignedSegment): + """ + spanning read: a read covering BOTH breakpoints + + This is only applicable to small events. Do not need to look for soft clipped reads + here since they will be collected already + + Args: + read: the putative spanning read + + Returns: + bool: + - True: the read was collected and stored in the current evidence object + - False: the read was not collected + """ + if evidence_bpp.interchromosomal: + return False + elif not Interval.overlaps(evidence_bpp.inner_window1, evidence_bpp.inner_window2): + # too far apart to call spanning reads + return False + + if evidence_bpp.stranded: + strand = _read.sequenced_strand(read, evidence_bpp.strand_determining_read) + if strand != evidence_bpp.break1.strand and strand != evidence_bpp.break2.strand: + return False + + combined = evidence_bpp.inner_window1 & evidence_bpp.inner_window2 + read_interval = Interval(read.reference_start + 1, read.reference_end) + + if Interval.overlaps(combined, read_interval): + + if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag( + PYSAM_READ_FLAGS.RECOMPUTED_CIGAR + ): + + read = evidence_bpp.standardize_read(read) + # in the correct position, now determine if it can support the event types + for event_type in evidence_bpp.putative_event_types(): + if event_type in [SVTYPE.DUP, SVTYPE.INS]: + if CIGAR.I in [c[0] for c in read.cigar]: + evidence_bpp.spanning_reads.add(read) + return True + elif event_type == SVTYPE.DEL: + if CIGAR.D in [c[0] for c in read.cigar]: + evidence_bpp.spanning_reads.add(read) + return True + elif event_type == SVTYPE.INV: + if CIGAR.X in [c[0] for c in read.cigar]: + return True + return False + + +def load_evidence(evidence_bpp: Evidence): + """ + open the associated bam file and read and store the evidence + does some preliminary read-quality filtering + """ + + def cache_if_true(read): + if read.is_unmapped or read.mate_is_unmapped: + return True + elif any( + [ + evidence_bpp.config['validate.filter_secondary_alignments'] and read.is_secondary, + read.mapping_quality < evidence_bpp.min_mapping_quality, + ] + ): + return False + elif set([x[0] for x in read.cigar]) & {CIGAR.S, CIGAR.H}: + return True + elif not read.is_proper_pair: + if any( + [ + _read.orientation_supports_type(read, e) + for e in evidence_bpp.putative_event_types() + ] + ): + return True + elif evidence_bpp.compatible_type and _read.orientation_supports_type( + read, evidence_bpp.compatible_type + ): + return True + elif not evidence_bpp.interchromosomal and not evidence_bpp.opposing_strands: + min_frag_est = ( + abs(read.reference_start - read.next_reference_start) - evidence_bpp.read_length + ) + max_frag_est = min_frag_est + 3 * evidence_bpp.read_length + if ( + min_frag_est < evidence_bpp.min_expected_fragment_size + or max_frag_est > evidence_bpp.max_expected_fragment_size + ): + return True + + return False + + def filter_if_true(read): + if not cache_if_true(read): + if any( + [ + evidence_bpp.config['validate.filter_secondary_alignments'] + and read.is_secondary, + read.mapping_quality < evidence_bpp.min_mapping_quality, + ] + ): + return True + elif not evidence_bpp.interchromosomal and set([x[0] for x in read.cigar]) & { + CIGAR.I, + CIGAR.D, + }: + return False + return True + return False + + flanking_pairs = set() # collect putative pairs + half_mapped_partners1 = set() + half_mapped_partners2 = set() + + for read in evidence_bpp.bam_cache.fetch_from_bins( + '{0}'.format(evidence_bpp.break1.chr), + evidence_bpp.outer_window1[0], + evidence_bpp.outer_window1[1], + read_limit=evidence_bpp.config['validate.fetch_reads_limit'], + sample_bins=evidence_bpp.config['validate.fetch_reads_bins'], + min_bin_size=evidence_bpp.config['validate.fetch_min_bin_size'], + cache=True, + cache_if=cache_if_true, + filter_if=filter_if_true, + ): + if read.mapping_quality < evidence_bpp.min_mapping_quality: + continue + evidence_bpp.counts[0] += 1 + if read.is_unmapped: + continue + if not collect_split_read(evidence_bpp, read, True): + collect_spanning_read(evidence_bpp, read) + if read.mate_is_unmapped: + half_mapped_partners1.add(read) + elif ( + any( + [ + _read.orientation_supports_type(read, et) + for et in evidence_bpp.putative_event_types() + ] + ) + and (read.reference_id != read.next_reference_id) == evidence_bpp.interchromosomal + ): + flanking_pairs.add(read) + + for read in evidence_bpp.bam_cache.fetch_from_bins( + '{0}'.format(evidence_bpp.break2.chr), + evidence_bpp.outer_window2[0], + evidence_bpp.outer_window2[1], + read_limit=evidence_bpp.config['validate.fetch_reads_limit'], + sample_bins=evidence_bpp.config['validate.fetch_reads_bins'], + min_bin_size=evidence_bpp.config['validate.fetch_min_bin_size'], + cache=True, + cache_if=cache_if_true, + filter_if=filter_if_true, + ): + if read.mapping_quality < evidence_bpp.min_mapping_quality: + continue + + evidence_bpp.counts[1] += 1 + + if read.is_unmapped: + continue + if not collect_split_read(evidence_bpp, read, False): + collect_spanning_read(evidence_bpp, read) + if read.mate_is_unmapped: + half_mapped_partners2.add(read) + elif ( + any( + [ + _read.orientation_supports_type(read, et) + for et in evidence_bpp.putative_event_types() + ] + ) + and (read.reference_id != read.next_reference_id) == evidence_bpp.interchromosomal + ): + flanking_pairs.add(read) + for flanking_read in sorted(flanking_pairs, key=lambda x: (x.query_name, x.reference_start)): + # try and get the mate from the cache + try: + mates = evidence_bpp.bam_cache.get_mate(flanking_read, allow_file_access=False) + for mate in mates: + if mate.is_unmapped: + logger.debug(f'ignoring unmapped mate {mate.query_name}') + continue + collect_flanking_pair(evidence_bpp, flanking_read, mate) + except KeyError: + pass + + if evidence_bpp.compatible_window1: + compatible_type = SVTYPE.DUP + if SVTYPE.DUP in evidence_bpp.putative_event_types(): + compatible_type = SVTYPE.INS + + compt_flanking = set() + for read in evidence_bpp.bam_cache.fetch_from_bins( + '{0}'.format(evidence_bpp.break1.chr), + evidence_bpp.compatible_window1[0], + evidence_bpp.compatible_window1[1], + read_limit=evidence_bpp.config['validate.fetch_reads_limit'], + sample_bins=evidence_bpp.config['validate.fetch_reads_bins'], + min_bin_size=evidence_bpp.config['validate.fetch_min_bin_size'], + cache=True, + cache_if=cache_if_true, + filter_if=filter_if_true, + ): + if _read.orientation_supports_type(read, compatible_type): + compt_flanking.add(read) + + for read in evidence_bpp.bam_cache.fetch_from_bins( + '{0}'.format(evidence_bpp.break2.chr), + evidence_bpp.compatible_window2[0], + evidence_bpp.compatible_window2[1], + read_limit=evidence_bpp.config['validate.fetch_reads_limit'], + sample_bins=evidence_bpp.config['validate.fetch_reads_bins'], + min_bin_size=evidence_bpp.config['validate.fetch_min_bin_size'], + cache=True, + cache_if=cache_if_true, + filter_if=filter_if_true, + ): + if _read.orientation_supports_type(read, compatible_type): + compt_flanking.add(read) + + for flanking_read in compt_flanking: + # try and get the mate from the cache + try: + mates = evidence_bpp.bam_cache.get_mate(flanking_read, allow_file_access=False) + for mate in mates: + if mate.is_unmapped: + logger.debug(f'ignoring unmapped mate {mate.query_name}') + continue + try: + collect_compatible_flanking_pair( + evidence_bpp, flanking_read, mate, compatible_type + ) + except ValueError: + pass + except KeyError: + pass + + # now collect the half mapped reads + logger.info( + f'collected {len(half_mapped_partners1 | half_mapped_partners2)} putative half mapped reads', + ) + mates_found = 0 + for read in half_mapped_partners1 | half_mapped_partners2: + # try and get the mate from the cache + try: + mates = evidence_bpp.bam_cache.get_mate(read, allow_file_access=False) + mates_found += 1 + for mate in mates: + collect_half_mapped(evidence_bpp, read, mate) + except KeyError: + pass + logger.info(f'{mates_found} half-mapped mates found') diff --git a/src/mavis/validate/main.py b/src/mavis/validate/main.py index 9cbc1383..f47e0e2b 100644 --- a/src/mavis/validate/main.py +++ b/src/mavis/validate/main.py @@ -8,12 +8,11 @@ import pysam from shortuuid import uuid -from ..align import SUPPORTED_ALIGNER, align_sequences, select_contig_alignments from ..annotate.base import BioInterval from ..annotate.file_io import ReferenceFile from ..bam import cigar as _cigar from ..bam.cache import BamCache -from ..breakpoint import BreakpointPair +from ..breakpoint import classify_breakpoint_pair from ..constants import CALL_METHOD, COLUMNS, PROTOCOL from ..util import ( filter_on_overlap, @@ -24,9 +23,11 @@ read_inputs, write_bed_file, ) +from .align import SUPPORTED_ALIGNER, align_sequences, select_contig_alignments from .call import call_events from .constants import PASS_FILENAME from .evidence import GenomeEvidence, TranscriptomeEvidence +from .gather import load_evidence def main( @@ -164,14 +165,14 @@ def main( ), ) logger.info(str(evidence)) - logger.info(f'possible event type(s): {BreakpointPair.classify(evidence)}') + logger.info(f'possible event type(s): {classify_breakpoint_pair(evidence)}') logger.info( f'outer window regions: {evidence.break1.chr}:{evidence.outer_window1[0]}-{evidence.outer_window1[1]} {evidence.break2.chr}:{evidence.outer_window2[0]}-{evidence.outer_window2[1]}' ) logger.info( f'inner window regions: {evidence.break1.chr}:{evidence.inner_window1[0]}-{evidence.inner_window1[1]} {evidence.break2.chr}:{evidence.inner_window2[0]}-{evidence.inner_window2[1]}' ) - evidence.load_evidence() + load_evidence(evidence) logger.info( f'flanking pairs: {len(evidence.flanking_pairs)}' + '; split reads: {}, {}'.format(*[len(a) for a in evidence.split_reads]) diff --git a/tests/data/mock_dgv_annotation.txt b/tests/data/mock_dgv_annotation.txt index ee303c99..bda6925c 100644 --- a/tests/data/mock_dgv_annotation.txt +++ b/tests/data/mock_dgv_annotation.txt @@ -1,6 +1,8 @@ chr start end name 1 1 2300000 nsv482937 1 10001 22118 dgv1n82 +1 10001 22120 rgv2n98 +1 10001 22221 rgv2n99 1 10001 127330 nsv7879 1 10191 10281 nsv958854 1 10377 177417 nsv428112 diff --git a/tests/data/straglr.bed b/tests/data/straglr.bed new file mode 100644 index 00000000..bb325e59 --- /dev/null +++ b/tests/data/straglr.bed @@ -0,0 +1,10 @@ +#chrom start end repeat_unit allele1:size allele1:copy_number allele1:support allele2:size allele2:copy_number allele2:support +chr11 776686 778078 CT 100.0 150.0 10 100.0 100.0 1 +chr10 3079216 3079421 AGAGGTCACCACCCCTTCCCAACAATCCAGTAACAATCC 100.0 150.0 10 100.0 100.0 1 +chr9 2080637 2081030 CTCCTTCCCTCCGCCCCCACCTCGGTCCCTGT 100.0 150.0 10 100.0 100.0 1 +chrX 244719 245293 CCCCGGGAACCGCCT 100.0 150.0 10 - - - +chr7 284096 284233 GGT 100.0 150.0 10 - - - +chr8 288173 290242 CCCTGCTCCGT 100.0 150.0 10 100.0 100.0 1 +chr3 2382228 2382908 CCGTGGGGGAGGCTGAGGCTATGGGGACT 100.0 100.0 10 - - - +chr2 2427285 2427528 CCTCC 100.0 150.0 10 - - - +chr2 2427953 2428216 GGAGG 100.0 150.0 10 100.0 100.0 1 diff --git a/tests/test_mavis/bam/test_bam.py b/tests/test_mavis/bam/test_bam.py index 4f2132e7..822cad52 100644 --- a/tests/test_mavis/bam/test_bam.py +++ b/tests/test_mavis/bam/test_bam.py @@ -5,7 +5,6 @@ import pytest import timeout_decorator from mavis.annotate.file_io import load_annotations, load_reference_genome -from mavis.bam import cigar as _cigar from mavis.bam import read as _read from mavis.bam.cache import BamCache from mavis.bam.read import ( @@ -15,7 +14,7 @@ sequenced_strand, ) from mavis.bam.stats import Histogram, compute_genome_bam_stats, compute_transcriptome_bam_stats -from mavis.constants import CIGAR, DNA_ALPHABET, ORIENT, READ_PAIR_TYPE, STRAND, SVTYPE +from mavis.constants import DNA_ALPHABET, ORIENT, READ_PAIR_TYPE, STRAND, SVTYPE from mavis.interval import Interval from ...util import get_data @@ -148,7 +147,7 @@ def test_alphabet_matching(self): def test_breakpoint_pos(self): # ==========+++++++++> - r = MockRead(reference_start=10, cigar=[(CIGAR.M, 10), (CIGAR.S, 10)]) + r = MockRead(reference_start=10, cigarstring='10M10S') assert _read.breakpoint_pos(r) == 19 with pytest.raises(AttributeError): @@ -157,7 +156,7 @@ def test_breakpoint_pos(self): assert _read.breakpoint_pos(r, ORIENT.LEFT) == 19 # ++++++++++=========> - r = MockRead(reference_start=10, cigar=[(CIGAR.S, 10), (CIGAR.M, 10)]) + r = MockRead(reference_start=10, cigarstring='10S10M') assert _read.breakpoint_pos(r) == 10 with pytest.raises(AttributeError): @@ -166,7 +165,7 @@ def test_breakpoint_pos(self): assert _read.breakpoint_pos(r, ORIENT.RIGHT) == 10 with pytest.raises(AttributeError): - r = MockRead(reference_start=10, cigar=[(CIGAR.X, 10), (CIGAR.M, 10)]) + r = MockRead(reference_start=10, cigarstring='10X10M') _read.breakpoint_pos(r, ORIENT.LEFT) def test_nsb_align(self): @@ -240,20 +239,66 @@ def test_min_overlap(self): @pytest.fixture def stranded_reads(): n = argparse.Namespace() - n.read1_pos_neg = MockRead(is_reverse=False, is_read1=True, mate_is_reverse=True) - assert not n.read1_pos_neg.is_read2 - n.read1_neg_pos = MockRead(is_reverse=True, is_read1=True, mate_is_reverse=False) - n.read1_pos_pos = MockRead(is_reverse=False, is_read1=True, mate_is_reverse=False) - n.read1_neg_neg = MockRead(is_reverse=True, is_read1=True, mate_is_reverse=True) - - n.read2_pos_neg = MockRead(is_reverse=True, is_read1=False, mate_is_reverse=True) - assert n.read2_pos_neg.is_read2 - n.read2_neg_pos = MockRead(is_reverse=False, is_read1=False, mate_is_reverse=False) - n.read2_pos_pos = MockRead(is_reverse=False, is_read1=False, mate_is_reverse=False) - n.read2_neg_neg = MockRead(is_reverse=True, is_read1=False, mate_is_reverse=True) - - n.unpaired_pos = MockRead(is_reverse=False, is_paired=False) - n.unpaired_neg = MockRead(is_reverse=True, is_paired=False) + n.read1_pos_neg = MockRead( + is_reverse=False, + is_read1=True, + mate_is_reverse=True, + cigarstring='10=', + reference_start=100, + ) + n.read1_neg_pos = MockRead( + is_reverse=True, + is_read1=True, + mate_is_reverse=False, + cigarstring='10=', + reference_start=100, + ) + n.read1_pos_pos = MockRead( + is_reverse=False, + is_read1=True, + mate_is_reverse=False, + cigarstring='10=', + reference_start=100, + ) + n.read1_neg_neg = MockRead( + is_reverse=True, is_read1=True, mate_is_reverse=True, cigarstring='10=', reference_start=100 + ) + + n.read2_pos_neg = MockRead( + is_reverse=True, + is_read1=False, + mate_is_reverse=True, + cigarstring='10=', + reference_start=100, + ) + n.read2_neg_pos = MockRead( + is_reverse=False, + is_read1=False, + mate_is_reverse=False, + cigarstring='10=', + reference_start=100, + ) + n.read2_pos_pos = MockRead( + is_reverse=False, + is_read1=False, + mate_is_reverse=False, + cigarstring='10=', + reference_start=100, + ) + n.read2_neg_neg = MockRead( + is_reverse=True, + is_read1=False, + mate_is_reverse=True, + cigarstring='10=', + reference_start=100, + ) + + n.unpaired_pos = MockRead( + is_reverse=False, is_paired=False, cigarstring='10=', reference_start=100 + ) + n.unpaired_neg = MockRead( + is_reverse=True, is_paired=False, cigarstring='10=', reference_start=100 + ) return n @@ -329,32 +374,36 @@ def test_read_pair_strand_det_error(self, stranded_reads): def read_pairs(): n = argparse.Namespace() n.LR = MockRead( - reference_id=0, - next_reference_id=0, + reference_name='1', + next_reference_name='1', + cigarstring='50=', reference_start=1, next_reference_start=2, is_reverse=False, mate_is_reverse=True, ) n.LL = MockRead( - reference_id=0, - next_reference_id=0, + reference_name='1', + next_reference_name='1', + cigarstring='50=', reference_start=1, next_reference_start=2, is_reverse=False, mate_is_reverse=False, ) n.RR = MockRead( - reference_id=0, - next_reference_id=0, + reference_name='1', + next_reference_name='1', + cigarstring='50=', reference_start=1, next_reference_start=2, is_reverse=True, mate_is_reverse=True, ) n.RL = MockRead( - reference_id=0, - next_reference_id=0, + reference_name='1', + next_reference_name='1', + cigarstring='50=', reference_start=1, next_reference_start=2, is_reverse=True, @@ -483,7 +532,7 @@ def test_trans_bam_stats(self): @pytest.fixture def contig_read(): return MockRead( - cigar=_cigar.convert_string_to_cigar('275M18I12041D278M'), + cigarstring='275M18I12041D278M', reference_start=89700025, reference_name='10', ) diff --git a/tests/test_mavis/bam/test_bam_cigar.py b/tests/test_mavis/bam/test_cigar.py similarity index 66% rename from tests/test_mavis/bam/test_bam_cigar.py rename to tests/test_mavis/bam/test_cigar.py index 11fd7b6a..862084c6 100644 --- a/tests/test_mavis/bam/test_bam_cigar.py +++ b/tests/test_mavis/bam/test_cigar.py @@ -24,80 +24,52 @@ from ...util import get_data from ..mock import MockObject, MockRead -REFERENCE_GENOME = None - -def setUpModule(): +@pytest.fixture(scope='module') +def mock_reference_genome(): warnings.simplefilter('ignore') - global REFERENCE_GENOME - REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa')) - if ( - 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT' - != REFERENCE_GENOME['fake'].seq[0:50].upper() - ): + ref = load_reference_genome(get_data('mock_reference_genome.fa')) + if 'CTCCAAAGAAATTGTAGTTTTCTTCTGGCTTAGAGGTAGATCATCTTGGT' != ref['fake'].seq[0:50].upper(): raise AssertionError('fake genome file does not have the expected contents') - - -class TestRecomputeCigarMismatch: - def test_simple(self): - r = MockRead( - reference_start=1456, - query_sequence='CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', - cigar=[(CIGAR.M, 45)], - ) - assert recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) == [(CIGAR.EQ, 45)] - - def test_hardclipping(self): - r = MockRead( - reference_start=1456, - query_sequence='CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', - cigar=[(CIGAR.H, 20), (CIGAR.M, 45)], - ) - assert [(CIGAR.H, 20), (CIGAR.EQ, 45)] == recompute_cigar_mismatch( - r, REFERENCE_GENOME['fake'] - ) - - def test_with_events(self): - r = MockRead( - reference_start=1456, - query_sequence='TATA' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', - cigar=[(CIGAR.S, 4), (CIGAR.M, 10), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.M, 25)], - ) - assert [ - (CIGAR.S, 4), - (CIGAR.EQ, 10), - (CIGAR.D, 10), - (CIGAR.I, 10), - (CIGAR.EQ, 25), - ] == recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) - - def test_mismatch_to_mismatch(self): - r = MockRead( - reference_start=1452, - query_sequence='CAGC' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', - cigar=[(CIGAR.X, 4), (CIGAR.M, 10), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.M, 25)], - ) - assert [ - (CIGAR.X, 4), - (CIGAR.EQ, 10), - (CIGAR.D, 10), - (CIGAR.I, 10), - (CIGAR.EQ, 25), - ] == recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) - - def test_m_to_mismatch(self): - r = MockRead( - reference_start=1452, - query_sequence='CAGC' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', - cigar=[(CIGAR.M, 14), (CIGAR.D, 10), (CIGAR.I, 10), (CIGAR.M, 25)], - ) - assert [ - (CIGAR.X, 4), - (CIGAR.EQ, 10), - (CIGAR.D, 10), - (CIGAR.I, 10), - (CIGAR.EQ, 25), - ] == recompute_cigar_mismatch(r, REFERENCE_GENOME['fake']) + return ref + + +@pytest.mark.parametrize( + 'reference_start,query_sequence,cigar_in,cigar_out', + [ + [1456, 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', '45M', '45='], + [1456, 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', '20H45M', '20H45='], + [ + 1456, + 'TATA' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', + '4S10M10D10I25M', + '4S10=10D10I25=', + ], + [ + 1452, + 'CAGC' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', + '4X10M10D10I25M', + '4X10=10D10I25=', + ], + [ + 1452, + 'CAGC' 'CCCAAACAAC' 'TATAAATTTT' 'GTAATACCTA' 'GAACAATATA' 'AATAT', + '14M10D10I25M', + '4X10=10D10I25=', + ], + ], +) +def test_recompute_cigar_mismatch( + reference_start, query_sequence, cigar_in, cigar_out, mock_reference_genome +): + r = MockRead( + reference_start=reference_start, + query_sequence=query_sequence, + cigarstring=cigar_in, + ) + assert recompute_cigar_mismatch(r, mock_reference_genome['fake']) == convert_string_to_cigar( + cigar_out + ) class TestExtendSoftclipping: @@ -108,23 +80,65 @@ def test_softclipped_right(self): assert cnew == convert_string_to_cigar('70=80S') +@pytest.mark.parametrize( + 'input_cigars,output_cigar', + [ + [['10M10X10X'], '10M20X'], + [['10X10M10X'], '10X10M10X'], + [['10M10X10X', '10X10M10X'], '10M30X10M10X'], + [['1S2S5=7X2=5X28=1X99='], '3S5=7X2=5X28=1X99='], + [['10H10M10X10X'], '10H10M20X'], + ], +) +def test_join(input_cigars, output_cigar): + assert join(*[convert_string_to_cigar(c) for c in input_cigars]) == convert_string_to_cigar( + output_cigar + ) + + +@pytest.mark.parametrize( + 'seq1,seq2,opt,cigar_string,ref_start', + [ + ['GTGAGTAAATTCAACATCGTTTTT', 'AACTTAGAATTCAAC---------', {}, '7S8=', 7], + ['GTGAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------', {}, '5S8=', 7], + [ + 'GTGAGTAAATTCAACATCGTTTTT', + '--CTTAGAATTCAAC---------', + dict(force_softclipping=False), + '5S8=', + 7, + ], + [ + 'GTGAGTAAATTC--CATCGTTTTT', + '--CTTAGAATTCAAC---------', + dict(force_softclipping=False), + '5S5=2I1=', + 7, + ], + ['CCTG', 'CCGT', dict(min_exact_to_stop_softclipping=10), '2=2X', 0], + [ + '--GAGTAAATTCAACATCGTTTTT', + '--CTTAGAATTCAAC---------', + dict(force_softclipping=False), + '5S8=', + 5, + ], + ], +) +def test_compute(seq1, seq2, opt, cigar_string, ref_start): + assert compute(seq1, seq2, **opt) == (convert_string_to_cigar(cigar_string), ref_start) + + +def test_compute_error(): + with pytest.raises(AttributeError): + compute('CCTG', 'CCG') + + class TestCigarTools: def test_alignment_matches(self): c = [(CIGAR.M, 10), (CIGAR.EQ, 10), (CIGAR.X, 10)] assert alignment_matches(c) == 30 - def test_join(self): - c = [(CIGAR.M, 10), (CIGAR.X, 10), (CIGAR.X, 10)] - assert join(c) == [(CIGAR.M, 10), (CIGAR.X, 20)] - k = [(CIGAR.X, 10), (CIGAR.M, 10), (CIGAR.X, 10)] - assert join(c, k) == [(CIGAR.M, 10), (CIGAR.X, 30), (CIGAR.M, 10), (CIGAR.X, 10)] - k = [(4, 1), (4, 2), (7, 5), (8, 7), (7, 2), (8, 5), (7, 28), (8, 1), (7, 99)] - assert [(4, 3), (7, 5), (8, 7), (7, 2), (8, 5), (7, 28), (8, 1), (7, 99)] == join(k) - - def test_join_hardclipping(self): - c = [(CIGAR.H, 10), (CIGAR.M, 10), (CIGAR.X, 10), (CIGAR.X, 10)] - assert join(c) == [(CIGAR.H, 10), (CIGAR.M, 10), (CIGAR.X, 20)] - def test_longest_fuzzy_match(self): c = [ (CIGAR.S, 10), @@ -170,34 +184,6 @@ def test_match_percent(self): with pytest.raises(AttributeError): match_percent([(CIGAR.S, 100)]) - def test_compute(self): - # GTGAGTAAATTCAACATCGTTTTT - # aacttagAATTCAAC--------- - assert ([(CIGAR.S, 7), (CIGAR.EQ, 8)], 7) == compute( - 'GTGAGTAAATTCAACATCGTTTTT', 'AACTTAGAATTCAAC---------' - ) - assert ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 7) == compute( - 'GTGAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------' - ) - assert ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 7) == compute( - 'GTGAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------', False - ) - - assert ([(CIGAR.S, 5), (CIGAR.EQ, 5), (CIGAR.I, 2), (CIGAR.EQ, 1)], 7) == compute( - 'GTGAGTAAATTC--CATCGTTTTT', '--CTTAGAATTCAAC---------', False - ) - - with pytest.raises(AttributeError): - compute('CCTG', 'CCG') - - assert ([(CIGAR.EQ, 2), (CIGAR.X, 2)], 0) == compute( - 'CCTG', 'CCGT', min_exact_to_stop_softclipping=10 - ) - - assert ([(CIGAR.S, 5), (CIGAR.EQ, 8)], 5) == compute( - '--GAGTAAATTCAACATCGTTTTT', '--CTTAGAATTCAAC---------', False - ) - def test_convert_for_igv(self): c = [(CIGAR.M, 10), (CIGAR.EQ, 10), (CIGAR.X, 10)] assert convert_for_igv(c) == [(CIGAR.M, 30)] @@ -206,17 +192,22 @@ def test_convert_for_igv(self): class TestHgvsStandardizeCigars: def no_change_aligned(self): ref = 'AAATTTGGGCCCAATT' - read = MockRead('name', '1', 1, cigar=[(CIGAR.M, 10)], query_sequence='AAATTTGGGC') + read = MockRead( + query_name='name', + reference_name='1', + reference_start=1, + cigarstring='10M', + query_sequence='AAATTTGGGC', + ) assert hgvs_standardize_cigar(read, ref) == [(CIGAR.M, 10)] def no_change_proper_indel(self): - ref = 'ATAGGC' 'ATCTACGAG' 'ATCGCTACG' + ref = 'ATAGGC' 'ATCTAC' 'GAG' 'ATCG' 'CTACG' read = MockRead( - 'name', - 1, - 6, + query_name='name', + reference_start=6, query_sequence='ATCTAC' 'CCC' 'ATCG', - cigar=[(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)], + cigarstring='6=3I3D4=', ) assert [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)] == hgvs_standardize_cigar( read, ref @@ -225,11 +216,10 @@ def no_change_proper_indel(self): def ins_after_deletion(self): ref = 'ATAGGC' 'ATCTACGAG' 'ATCGCTACG' read = MockRead( - 'name', - 1, - 6, + query_name='name', + reference_start=6, query_sequence='ATCTAC' 'CCC' 'ATCG', - cigar=[(CIGAR.EQ, 6), (CIGAR.D, 3), (CIGAR.I, 3), (CIGAR.EQ, 4)], + cigarstring='6=3D3I4=', ) assert [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)] == hgvs_standardize_cigar( read, ref @@ -238,22 +228,20 @@ def ins_after_deletion(self): def test_insertion_in_repeat(self): ref = 'ATAGGC' 'ATCT' 'ACGA' 'GATCGCTACG' read = MockRead( - 'name', - 1, - 6, + query_name='name', + reference_start=6, query_sequence='ATCT' 'ACGA' 'ACGA' 'GATC', - cigar=[(CIGAR.EQ, 4), (CIGAR.I, 4), (CIGAR.EQ, 8)], + cigarstring='4=4I8=', ) assert [(CIGAR.EQ, 8), (CIGAR.I, 4), (CIGAR.EQ, 4)] == hgvs_standardize_cigar(read, ref) def test_deletion_in_repeat(self): ref = 'ATAGGC' 'ATCT' 'ACGA' 'ACGA' 'ACGA' 'GATCGCTACG' read = MockRead( - 'name', - 1, - 6, + query_name='name', + reference_start=6, query_sequence='ATCT' 'ACGA' 'ACGA' 'GATC', - cigar=[(CIGAR.EQ, 4), (CIGAR.D, 4), (CIGAR.EQ, 12)], + cigarstring='4=4D12=', ) assert [(CIGAR.EQ, 12), (CIGAR.D, 4), (CIGAR.EQ, 4)] == hgvs_standardize_cigar(read, ref) @@ -261,18 +249,7 @@ def test_bubble_sort_indel_sections(self): rseq = 'ATAGGC' 'ATCT' 'GG' 'GA' 'GCGA' 'GATCGCTACG' qseq = 'ATCT' 'TTT' 'TT' 'GCGA' 'GATC' read = MockRead( - 'name', - 1, - 6, - query_sequence=qseq, - cigar=[ - (CIGAR.EQ, 4), - (CIGAR.D, 2), - (CIGAR.I, 3), - (CIGAR.D, 2), - (CIGAR.I, 2), - (CIGAR.EQ, 8), - ], + query_name='name', reference_start=6, query_sequence=qseq, cigarstring='4=2D3I2D2I8=' ) assert [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 4), (CIGAR.EQ, 8)] == hgvs_standardize_cigar( read, rseq @@ -283,19 +260,7 @@ def test_bubble_sort_indel_sections_drop_mismatch(self): # ATCT CTTTT TACGA qseq = 'ATCT' 'C' 'TT' 'TTT' 'ACGA' 'GATC' read = MockRead( - 'name', - 1, - 6, - query_sequence=qseq, - cigar=[ - (CIGAR.EQ, 4), - (CIGAR.X, 1), - (CIGAR.D, 3), - (CIGAR.I, 2), - (CIGAR.D, 5), - (CIGAR.I, 3), - (CIGAR.EQ, 8), - ], + query_name='name', reference_start=6, query_sequence=qseq, cigarstring='4=1X3D2I5D3I8=' ) assert [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 8), (CIGAR.EQ, 9)] == hgvs_standardize_cigar( read, rseq @@ -310,21 +275,11 @@ def test_bubble_sort_indel_sections_drop_mismatch_with_hardclipping(self): # ATAGGCATCT ACGAACGAACGAGATCGCTACG # ATCTCTTTTT CGAACG read = MockRead( - 'name', - 1, - 6, + query_name='name', + reference_start=6, reference_name='1', query_sequence='ATCTCTTTTTCGAACG', - cigar=[ - (CIGAR.H, 10), - (CIGAR.EQ, 4), - (CIGAR.X, 1), - (CIGAR.D, 2), - (CIGAR.I, 3), - (CIGAR.D, 2), - (CIGAR.I, 2), - (CIGAR.EQ, 6), - ], + cigarstring='10H4=1X2D3I2D2I6=', ) print(SamRead.deletion_sequences(read, {'1': MockObject(seq=ref)})) print(SamRead.insertion_sequences(read)) @@ -340,11 +295,10 @@ def test_bubble_sort_indel_sections_drop_mismatch_with_hardclipping(self): def test_homopolymer_even_odd(self): ref = 'ATCGAGAT' + 'A' * 15 + 'TCGAGAT' read = MockRead( - 'name', - 1, - 1, + query_name='name', + reference_start=1, query_sequence='ATCGAGATA' + 'A' * 12 + 'TCGAGAT', - cigar=[(CIGAR.EQ, 8), (CIGAR.D, 2), (CIGAR.EQ, 20)], + cigarstring='8=2D20=', ) assert [(CIGAR.EQ, 9 + 12), (CIGAR.D, 2), (CIGAR.EQ, 7)] == hgvs_standardize_cigar( read, ref @@ -355,15 +309,14 @@ def test_homopolymer_even_odd(self): ) read = MockRead( - 'name', - '1', - 0, - 149, + query_name='name', + reference_name='1', + reference_start=0, query_sequence=( 'CCCCGGCTCATGTCTGGTTTTGTTTTCCGGGGGCGGGGGGGCTCCCTGGGGATGATGGTGATTTTTTTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAA' 'TGAGGCAGAGACAATGTGGGGAGCGAGAGAGGGGAAAAGGACGGGGGAGG' ), - cigar=[(CIGAR.EQ, 61), (CIGAR.I, 2), (CIGAR.EQ, 87)], + cigarstring='61=2I87=', ) assert [(CIGAR.EQ, 61 + 15), (CIGAR.I, 2), (CIGAR.EQ, 87 - 15)] == hgvs_standardize_cigar( read, ref @@ -375,15 +328,14 @@ def test_homopolymer_even_odd(self): ) read = MockRead( - 'name', - '1', - 0, - 149, + query_name='name', + reference_name='1', + reference_start=0, query_sequence=( 'CCCCTCCTCGGTCGGGCAGATCTTTCAGAAGCAGGAGCCCAGGATCATGTCTGGTTTTGTTTTCCGAGGGCGAGGGGGCTCCCTGAGGATGATGGTGATTTT' 'TTTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAATGAGGCAGAGACA' ), - cigar=[(CIGAR.S, 2), (CIGAR.EQ, 96), (CIGAR.I, 2), (CIGAR.EQ, 50)], + cigarstring='2S96=2I50=', ) assert [ (CIGAR.S, 2), @@ -405,10 +357,10 @@ def test_even_deletion_in_repeat(self): ) print(len(qseq) - 28) read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=4, - cigar=convert_string_to_cigar('4S13=2D64='), + cigarstring='4S13=2D64=', query_sequence=qseq, ) reference_genome = {'1': MockObject(seq=rseq)} @@ -430,10 +382,10 @@ def test_odd_deletion_in_repeat(self): ) print(len(qseq) - 28) read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=4, - cigar=convert_string_to_cigar('4S13=3D63='), + cigarstring='4S13=3D63=', query_sequence=qseq, ) reference_genome = {'1': MockObject(seq=rseq)} @@ -448,10 +400,10 @@ def test_unecessary_indel(self): rseq = 'qwertyuiopasdfghjklzxcvbnm' qseq = 'qwertyuiopasdfghjklzxcvbnm' read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=0, - cigar=convert_string_to_cigar('13=1I1D12='), + cigarstring='13=1I1D12=', query_sequence=qseq, ) exp = convert_string_to_cigar('26=') @@ -462,10 +414,10 @@ def test_unecessary_indel2(self): rseq = 'qwertyuiopasdfghjklzxcvbnm' qseq = 'qwertyuiopasdfkghjklzxcvbnm' read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=0, - cigar=convert_string_to_cigar('13=2I1D12='), + cigarstring='13=2I1D12=', query_sequence=qseq, ) exp = convert_string_to_cigar('14=1I12=') @@ -476,10 +428,10 @@ def test_unecessary_indel_end_match(self): rseq = 'qwertyuiopasdfghjklzxcvbnm' qseq = 'qwertyuiopasdfkmkghjklzxcvbnm' read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=0, - cigar=convert_string_to_cigar('14=5I2D10='), + cigarstring='14=5I2D10=', query_sequence=qseq, ) exp = convert_string_to_cigar('14=3I12=') @@ -490,10 +442,10 @@ def test_unecessary_indel_end_match2(self): rseq = 'GGGTGCAGTGGCTTACACCT' 'GTAATCCAAACACCTTGGGAGCCGCCCCCTGAG' 'CCTCCAGGCCCGGGACAGA' qseq = 'GGGTGCAGTGGCTTACACCT' 'CCAGG' 'CCTCCAGGCCCGGGACAGA' read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=0, - cigar=convert_string_to_cigar('20=5I33D19='), + cigarstring='20=5I33D19=', query_sequence=qseq, ) exp = convert_string_to_cigar('20=4I32D20=') @@ -513,10 +465,10 @@ def test_even_insertion_in_repeat(self): ) print(len(qseq) - 13 - 4) read = MockRead( - 'name', + query_name='name', reference_name='1', reference_start=4, - cigar=convert_string_to_cigar('4S13=2I66='), + cigarstring='4S13=2I66=', query_sequence=qseq, ) exp = convert_string_to_cigar('4S26=2I53=') @@ -524,7 +476,7 @@ def test_even_insertion_in_repeat(self): read.cigar = new_cigar assert new_cigar == exp - def test_deletion_repeat(self): + def test_deletion_repeat(self, mock_reference_genome): qseq = ( 'GAGT' 'GAGACTCTGT' @@ -539,21 +491,10 @@ def test_deletion_repeat(self): # deleted reference: TATGTGTAATATACATCATGTATCAAA print(qseq[:76], qseq[76:]) read = MockRead( - 'name', + query_name='name', reference_name='11_86018001-86018500', reference_start=28, - cigar=[ - (CIGAR.S, 4), - (CIGAR.EQ, 10), - (CIGAR.X, 3), - (CIGAR.EQ, 14), - (CIGAR.X, 1), - (CIGAR.EQ, 21), - (CIGAR.X, 1), - (CIGAR.EQ, 22), - (CIGAR.D, 27), - (CIGAR.EQ, 74), - ], + cigarstring='4S10=3X14=1X21=1X22=27D74=', query_sequence=qseq, ) expected_cigar = [ @@ -568,10 +509,10 @@ def test_deletion_repeat(self): (CIGAR.D, 27), (CIGAR.EQ, 74 - 30), ] - std_cigar = hgvs_standardize_cigar(read, REFERENCE_GENOME[read.reference_name].seq) - print(SamRead.deletion_sequences(read, REFERENCE_GENOME)) + std_cigar = hgvs_standardize_cigar(read, mock_reference_genome[read.reference_name].seq) + print(SamRead.deletion_sequences(read, mock_reference_genome)) read.cigar = std_cigar - print(SamRead.deletion_sequences(read, REFERENCE_GENOME)) + print(SamRead.deletion_sequences(read, mock_reference_genome)) assert std_cigar == expected_cigar @timeout_decorator.timeout(1) @@ -600,23 +541,11 @@ def test_complex(self): 'ACACACACACACACACAC' ) read = MockRead( - 'name', + query_name='name', reference_name='mock', reference_start=0, query_sequence=qseq, - cigar=[ - (CIGAR.EQ, 33), - (CIGAR.X, 1), - (CIGAR.EQ, 49), - (CIGAR.I, 26), - (CIGAR.EQ, 18), - (CIGAR.X, 1), - (CIGAR.EQ, 1), - (CIGAR.I, 1), - (CIGAR.EQ, 1), - (CIGAR.I, 1), - (CIGAR.EQ, 18), - ], + cigarstring='33=1X49=26I18=1X1=1I1=1I18=', ) print(rseq) print( @@ -648,11 +577,11 @@ def test_deletion_partial_repeat(self): qseq = 'ATCTTAGCCAGGT' 'AGTTACATACATATC' rseq = 'ATCTTAGCCAGGT' 'AGCTAT' 'AGTTACATACATATC' read = MockRead( - 'name', + query_name='name', reference_name='mock', reference_start=0, query_sequence=qseq, - cigar=convert_string_to_cigar('13=6D15='), + cigarstring='13=6D15=', ) assert convert_string_to_cigar('15=6D13=') == hgvs_standardize_cigar(read, rseq) @@ -662,22 +591,22 @@ def test_indel_repeat(self): print(qseq) print(rseq) read = MockRead( - 'name', + query_name='name', reference_name='mock', reference_start=0, query_sequence=qseq, - cigar=convert_string_to_cigar('13=1I6D15='), + cigarstring='13=1I6D15=', ) assert convert_string_to_cigar('13=1I6D15=') == hgvs_standardize_cigar(read, rseq) def test_shift_complex_indel(self): refseq = 'ATATATCTATTTTTTTCTTTCTTTTTTTTACTTTCATTAAGTGCCACTAAAAAATTAGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGATTGAGTATATATATATATATACCCAGTTTCAAGCAGGTATCTGCCTTTAAAGATAAGAGACCTCCTAAATGCTTTCTTTTATTAGTTGCCCTGTTTCAGATTCAGCTTTGTATCTATATCACCTGTTAATATGTGTGGACTCACAGAAATGATCATTGAGGGAATGCACCCTGTTTGGGTGTAAGTAGCTCAGGGAAAAAATCCTAG' read = MockRead( - 'name', + query_name='name', reference_name='18', reference_start=40237946 - 40237890, query_sequence='AGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGATTGAGTGTATATATATATATATATATATATATATATATACCCAGTTTCAAGCAGGTATCTGCCTTTAAAGATAAGAGACCTCCTAAGTGCTTTCTTTTATTAGTGGCCCTG', - cigar=convert_string_to_cigar('44M18I88M'), + cigarstring='44M18I88M', ) print(_read.convert_cigar_to_string(read.cigar)) read.cigar = recompute_cigar_mismatch(read, refseq) @@ -705,21 +634,34 @@ def test_small_exact_match(self): assert new_cigar == exp -class TestConvertStringToCigar: - def test(self): - string = '283M' '17506D' '5M' '21275D' '596M' '17506D' '5M' '21275D' '313M' - exp = [ - (CIGAR.M, 283), - (CIGAR.D, 17506), - (CIGAR.M, 5), - (CIGAR.D, 21275), - (CIGAR.M, 596), - (CIGAR.D, 17506), - (CIGAR.M, 5), - (CIGAR.D, 21275), - (CIGAR.M, 313), - ] - assert convert_string_to_cigar(string) == exp +@pytest.mark.parametrize( + 'string,cigartuples', + [ + [ + '283M' '17506D' '5M' '21275D' '596M' '17506D' '5M' '21275D' '313M', + [ + (CIGAR.M, 283), + (CIGAR.D, 17506), + (CIGAR.M, 5), + (CIGAR.D, 21275), + (CIGAR.M, 596), + (CIGAR.D, 17506), + (CIGAR.M, 5), + (CIGAR.D, 21275), + (CIGAR.M, 313), + ], + ], + [ + '42H25M', + [ + (CIGAR.H, 42), + (CIGAR.M, 25), + ], + ], + ], +) +def test_convert_string_to_cigar(string, cigartuples): + assert convert_string_to_cigar(string) == cigartuples class TestGetSequences: @@ -729,7 +671,7 @@ def test_deletions(self): reference_start=0, reference_name='1', query_sequence='', - cigar=convert_string_to_cigar('2=3D8=4D9='), + cigarstring='2=3D8=4D9=', ) assert ( SamRead.deletion_sequences(read, {'1': MockObject(seq='abcdefghijklmnopqrstuvwxyz')}) @@ -742,6 +684,6 @@ def test_insertions(self): reference_start=0, reference_name='1', query_sequence='abcdekkkfghijklmnopqkkkkrstuvwxyz', - cigar=convert_string_to_cigar('5=3I12=4I9='), + cigarstring='5=3I12=4I9=', ) assert SamRead.insertion_sequences(read) == exp diff --git a/tests/test_mavis/convert/test_convert.py b/tests/test_mavis/convert/test_convert.py index 4470bc66..44b8aa24 100644 --- a/tests/test_mavis/convert/test_convert.py +++ b/tests/test_mavis/convert/test_convert.py @@ -1,3 +1,4 @@ +import itertools import os import shutil import sys @@ -47,6 +48,16 @@ def run_main(self, inputfile, file_type, strand_specific=False): result.setdefault(pair.data['tracking_id'], []).append(pair) return result + def test_straglr(self): + result = self.run_main(get_data('straglr.bed'), SUPPORTED_TOOL.STRAGLR, False) + assert len(result) == 9 + for bpp in itertools.chain(*result.values()): + assert bpp.break1.chr == bpp.break2.chr + assert bpp.break1.start == bpp.break2.start + assert bpp.break1.end == bpp.break2.end + assert bpp.event_type == SVTYPE.INS + assert bpp.untemplated_seq is None + def test_chimerascan(self): self.run_main(get_data('chimerascan_output.bedpe'), SUPPORTED_TOOL.CHIMERASCAN, False) diff --git a/tests/test_mavis/convert/test_tool.py b/tests/test_mavis/convert/test_tool.py index a6371452..6603572d 100644 --- a/tests/test_mavis/convert/test_tool.py +++ b/tests/test_mavis/convert/test_tool.py @@ -102,6 +102,111 @@ def test_convert_duplication(self): assert bpp.break1.chr == '1' assert bpp.break2.chr == '1' +class TestArriba: + def test_convert_standard_event(self): + row = { + 'breakpoint1': '13:114529969', + 'breakpoint2': '13:114751269', + 'type': 'inversion', + 'strand1(gene/fusion)': '+/+', + 'strand2(gene/fusion)': '-/-', + 'direction1': 'downstream', + 'direction2': 'downstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == 'chr13' + assert bpp.break2.chr == 'chr13' + assert bpp.break1.start == 114529969 + assert bpp.break2.start == 114751269 + assert bpp.event_type == SVTYPE.INV + assert bpp.break1.orient == 'L' + assert bpp.break2.orient == 'L' + assert bpp.opposing_strands == True + + def test_convert_translocation(self): + row = { + 'breakpoint1': '17:69313092', + 'breakpoint2': '20:58272875', + 'type': 'translocation', + 'strand1(gene/fusion)': '-/-', + 'strand2(gene/fusion)': '-/-', + 'direction1': 'upstream', + 'direction2': 'downstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == 'chr17' + assert bpp.break2.chr == 'chr20' + assert bpp.break1.start == 69313092 + assert bpp.break2.start == 58272875 + assert bpp.event_type == SVTYPE.TRANS + assert bpp.break1.orient == 'R' + assert bpp.break2.orient == 'L' + assert bpp.opposing_strands == False + + def test_convert_translocation(self): + row = { + 'breakpoint1': '20:57265705', + 'breakpoint2': '20:47786405', + 'type': 'inversion/5\'-5\'', + 'strand1(gene/fusion)': '-/-', + 'strand2(gene/fusion)': '-/+', + 'direction1': 'upstream', + 'direction2': 'upstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == 'chr20' + assert bpp.break2.chr == 'chr20' + assert bpp.break1.start == 57265705 + assert bpp.break2.start == 47786405 + assert bpp.event_type == SVTYPE.INV + assert bpp.break1.orient == 'R' + assert bpp.break2.orient == 'R' + assert bpp.opposing_strands == True + + def test_convert_translocation(self): + row = { + 'breakpoint1': '14:102877322', + 'breakpoint2': '14:102994672', + 'type': 'deletion/read-through/5\'-5\'', + 'strand1(gene/fusion)': '+/+', + 'strand2(gene/fusion)': '-/+', + 'direction1': 'downstream', + 'direction2': 'upstream', + } + bpp_list = _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, True) + + assert len(bpp_list) == 1 + bpp = bpp_list[0] + assert bpp.break1.chr == 'chr14' + assert bpp.break2.chr == 'chr14' + assert bpp.break1.start == 102877322 + assert bpp.break2.start == 102994672 + assert bpp.event_type == SVTYPE.DEL + assert bpp.break1.orient == 'L' + assert bpp.break2.orient == 'R' + assert bpp.opposing_strands == False + + def test_malformed(self): + row = { + 'breakpoint1': '', + 'breakpoint2': None, + 'type': 'translocation', + 'strand1(gene/fusion)': '-/-', + 'strand2(gene/fusion)': '-/-', + 'direction1': 'upstream', + 'direction2': 'downstream', + } + with pytest.raises(AssertionError): + _convert_tool_row(row, SUPPORTED_TOOL.ARRIBA, False) class TestStarFusion: def test_convert_standard_event(self): diff --git a/tests/test_mavis/convert/test_tools_vcf.py b/tests/test_mavis/convert/test_tools_vcf.py index f846d41a..ef4eaded 100644 --- a/tests/test_mavis/convert/test_tools_vcf.py +++ b/tests/test_mavis/convert/test_tools_vcf.py @@ -1,4 +1,4 @@ -from mavis.convert import SUPPORTED_TOOL, _convert_tool_row +import pytest from mavis.convert.vcf import VcfInfoType, VcfRecordType, convert_record, pandas_vcf from ...util import get_data @@ -10,7 +10,7 @@ def test_read_vcf(): assert df.shape[0] == 106 -def test_convert_record(): +def test_convert_telomeric_region(): variant_imprecise = VcfRecordType( id='mock-BND-imprecise', pos=0, @@ -59,3 +59,115 @@ def test_convert_record(): assert precise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' assert imprecise_records.get('break1_chromosome') == 'chr14_KI270722v1_random' + + +TEST_POS = 1853407 + + +@pytest.mark.parametrize( + 'pos,break1_ci,break2_ci,break1,break2,ids', + [ + [ + TEST_POS, + (-30, 30), + (-65, 65), + (TEST_POS - 30, TEST_POS + 30), + (TEST_POS - 30, TEST_POS + 65), + 'vcf-cuteSV.INS.breakpoint_2_start < breakpoint_1_start', + ], + [ + TEST_POS, + (-30, 99999), + (-10, 65), + (TEST_POS - 30, TEST_POS + 65), + (TEST_POS - 10, TEST_POS + 65), + 'vcf-cuteSV.INS.breakpoint_1_end > breakpoint_2_end', + ], + ], + ids=[ + 'breakpoint_2_start < breakpoint_1_start', + 'breakpoint_1_end > breakpoint_2_end', + ], +) +def test_convert_intrachromosomal_imprecise_breakend( + pos, break1_ci, break2_ci, break1, break2, ids +): + variant_vcf = VcfRecordType( + id=ids, + pos=pos, + chrom='chr5', + alts=['AGG'], + ref='A', + info=VcfInfoType( + CHR2="chr5", + IMPRECISE=True, + SVMETHOD="cuteSV-1.0.12", + SVTYPE="INS", + CIPOS=break1_ci, + CILEN=break2_ci, + ), + ) + result = convert_record(variant_vcf) + assert len(result) == 1 + variant = result[0] + assert variant.get('break1_position_start') == break1[0] + assert variant.get('break1_position_end') == break1[1] + assert variant.get('break2_position_start') == break2[0] + assert variant.get('break2_position_end') == break2[1] + + +@pytest.mark.parametrize( + 'pos,break1_ci,break2_ci,break1,break2,ids', + [ + [ + TEST_POS, + (-30, 99999), + (70, 65), + (TEST_POS - 30, TEST_POS + 65), + (TEST_POS + 65, TEST_POS + 65), + 'vcf-cuteSV.INS.breakpoint_2_start > breakpoint_2_end', + ], + ], + ids=[ + 'breakpoint_2_start > breakpoint_2_end', + ], +) +def test_error_on_convert_intrachromosomal_imprecise_breakend( + pos, break1_ci, break2_ci, break1, break2, ids +): + variant_vcf = VcfRecordType( + id=ids, + pos=pos, + chrom='chr5', + alts=['AGG'], + ref='A', + info=VcfInfoType( + CHR2="chr5", + IMPRECISE=True, + SVMETHOD="cuteSV-1.0.12", + SVTYPE="INS", + CIPOS=break1_ci, + CILEN=break2_ci, + ), + ) + with pytest.raises(ValueError): + convert_record(variant_vcf) + + +def test_convert_intrachromosomal_imprecise_breakend_no_ci(): + # breakpoint_1_start > breakpoint_1_end + variant_cilen4 = VcfRecordType( + id='Sniffle.INS', + pos=11184, + chrom='chr2', + alts=['AGG'], + ref='N', + info=VcfInfoType( + CHR2="chr2", + IMPRECISE=True, + SVTYPE="INS", + END=11183, + ), + ) + with pytest.raises(ValueError): + convert_record(variant_cilen4) diff --git a/tests/test_mavis/mock.py b/tests/test_mavis/mock.py index 6e91f96d..badc574a 100644 --- a/tests/test_mavis/mock.py +++ b/tests/test_mavis/mock.py @@ -1,11 +1,19 @@ import os import types +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple -from mavis.align import query_coverage_interval from mavis.annotate.file_io import load_annotations, load_reference_genome from mavis.annotate.genomic import PreTranscript, Transcript from mavis.annotate.protein import Translation -from mavis.constants import CIGAR, NA_MAPPING_QUALITY +from mavis.bam.cigar import ( + QUERY_ALIGNED_STATES, + REFERENCE_ALIGNED_STATES, + convert_cigar_to_string, + convert_string_to_cigar, +) +from mavis.bam.read import SamRead +from mavis.constants import CIGAR, NA_MAPPING_QUALITY, PYSAM_READ_FLAGS from ..util import get_data @@ -89,151 +97,186 @@ def __init__(self, **kwargs): setattr(self, arg, val) -class MockRead: - def __init__( - self, - query_name=None, - reference_id=0, - reference_start=None, - reference_end=None, - cigar=None, - is_reverse=False, - mate_is_reverse=True, - next_reference_start=None, - next_reference_id=None, - reference_name=None, - query_sequence=None, - template_length=None, - query_alignment_sequence=None, - query_alignment_start=None, - query_alignment_end=None, - query_alignment_length=None, - flag=None, - tags=[], - is_read1=True, - is_paired=True, - is_unmapped=False, - is_supplementary=False, - mate_is_unmapped=False, - mapping_quality=NA_MAPPING_QUALITY, - **kwargs - ): - for attr, val in kwargs.items(): - setattr(self, attr, val) - self.mapping_quality = mapping_quality - self.query_name = query_name - self.reference_id = reference_id - self.reference_start = reference_start - self.reference_end = reference_end - self.cigar = cigar - self.query_alignment_length = query_alignment_length - self.query_sequence = query_sequence - if self.reference_end is None and cigar and reference_start is not None: - self.reference_end = reference_start + sum( - [f for v, f in cigar if v not in [CIGAR.S, CIGAR.I]] - ) - if not self.query_alignment_length: - if cigar: - self.query_alignment_length = sum( - [v for s, v in cigar if s not in [CIGAR.S, CIGAR.H]] - ) - elif self.query_sequence: - self.query_alignment_length = len(self.query_sequence) - else: - self.query_alignment_length = 0 - self.is_reverse = is_reverse - self.mate_is_reverse = mate_is_reverse - self.next_reference_start = next_reference_start - self.next_reference_id = next_reference_id - self.reference_name = reference_name - - self.query_alignment_sequence = query_alignment_sequence - self.query_alignment_start = query_alignment_start - self.query_alignment_end = query_alignment_end - self.flag = flag - self.tags = tags - if query_alignment_sequence is None and cigar and query_sequence: - s = 0 if cigar[0][0] != CIGAR.S else cigar[0][1] - t = len(query_sequence) - if cigar[-1][0] == CIGAR.S: - t -= cigar[-1][1] - self.query_alignment_sequence = query_sequence[s:t] - if cigar and query_sequence: - if len(query_sequence) != sum( - [f for v, f in cigar if v not in [CIGAR.H, CIGAR.N, CIGAR.D]] - ): - raise AssertionError( - 'length of sequence does not match cigar', - len(query_sequence), - sum([f for v, f in cigar if v not in [CIGAR.H, CIGAR.N, CIGAR.D]]), - ) - if template_length is None and reference_end and next_reference_start: - self.template_length = next_reference_start - reference_end - else: - self.template_length = template_length - self.is_read1 = is_read1 - self.is_read2 = not is_read1 - self.is_paired = is_paired - self.is_unmapped = is_unmapped - self.mate_is_unmapped = mate_is_unmapped - self.is_supplementary = is_supplementary - if self.reference_start and self.reference_end: - if not cigar: - self.cigar = [(CIGAR.M, self.reference_end - self.reference_start)] - if not self.query_sequence: - self.query_sequence = 'N' * (self.reference_end - self.reference_start) - if flag: - self.is_unmapped = bool(self.flag & int(0x4)) - self.mate_is_unmapped = bool(self.flag & int(0x8)) - self.is_reverse = bool(self.flag & int(0x10)) - self.mate_is_reverse = bool(self.flag & int(0x20)) - self.is_read1 = bool(self.flag & int(0x40)) - self.is_read2 = bool(self.flag & int(0x80)) - self.is_secondary = bool(self.flag & int(0x100)) - self.is_qcfail = bool(self.flag & int(0x200)) - self.is_supplementary = bool(self.flag & int(0x400)) - - def query_coverage_interval(self): - return query_coverage_interval(self) - - def set_tag(self, tag, value, value_type=None, replace=True): - new_tag = (tag, value) - if not replace and new_tag in self.tags: - self.tags.append(new_tag) - else: - self.tags.append(new_tag) +def flags_from_number(flag: int) -> Dict: + return dict( + is_unmapped=bool(flag & int(0x4)), + mate_is_unmapped=bool(flag & int(0x8)), + is_reverse=bool(flag & int(0x10)), + mate_is_reverse=bool(flag & int(0x20)), + is_read1=bool(flag & int(0x40)), + is_secondary=bool(flag & int(0x100)), + is_supplementary=bool(flag & int(0x400)), + ) + - def has_tag(self, tag): - return tag in dict(self.tags).keys() +class MockString(str): + def __getitem__(self, key): + if isinstance(key, slice): + size = key.stop - key.start + else: + size = 1 + return 'N' * size - def get_tag(self, tag): - return dict(self.tags)[tag] if tag in dict(self.tags).keys() else False - def __str__(self): - return '{}(ref_id={}, start={}, end={}, seq={})'.format( - self.__class__.__name__, - self.reference_id, - self.reference_start, - self.reference_end, - self.query_sequence, +@dataclass +class MockRead: + cigarstring: str + reference_start: int + reference_name: str = '' + query_sequence: str = MockString() + is_reverse: bool = False + query_name: str = '' + is_read1: bool = False + is_supplementary: bool = False + is_secondary: bool = False + is_paired: bool = True + template_length: Optional[int] = None + is_unmapped: bool = False + mapping_quality: int = NA_MAPPING_QUALITY + + # mate flags + next_reference_name: str = '' + mate_is_reverse: bool = False + next_reference_start: Optional[int] = None + mate_is_unmapped: bool = False + + # custom flags for assembly reads + alignment_rank: Optional[int] = None + + def __hash__(self): + return hash(SamRead.__hash__(self)) + + @property + def query_length(self): + return sum( + [ + size + for (cigar_state, size) in self.cigar + if cigar_state in QUERY_ALIGNED_STATES - {CIGAR.H} + ] ) + @property + def is_proper_pair(self): + return self.is_paired + + @property + def reference_id(self): + try: + return int(self.reference_name) - 1 + except ValueError: + return hash(str(self.reference_name)) + + @property + def next_reference_id(self): + try: + return int(self.next_reference_name) - 1 + except ValueError: + return hash(str(self.next_reference_name)) + + @property + def seq(self): + return self.query_sequence + + @property + def cigar(self) -> List[Tuple[int, int]]: + return convert_string_to_cigar(self.cigarstring) + + @cigar.setter + def cigar(self, cigartuples: List[Tuple[int, int]]): + self.cigarstring = convert_cigar_to_string(cigartuples) + + @property + def reference_end(self): + reference_pos = self.reference_start + for state, size in self.cigar: + if state in REFERENCE_ALIGNED_STATES: + reference_pos += size + return reference_pos + + @property + def query_alignment_start(self): + query_pos = 0 + for state, size in self.cigar: + if state == CIGAR.H: + continue + elif state in QUERY_ALIGNED_STATES - REFERENCE_ALIGNED_STATES: + query_pos += size + elif state in QUERY_ALIGNED_STATES: + return query_pos + return None + + @property + def query_alignment_end(self): + query_pos = 0 + for state, size in self.cigar[::-1]: + if state == CIGAR.H: + continue + elif state in QUERY_ALIGNED_STATES - REFERENCE_ALIGNED_STATES: + query_pos += size + elif state in QUERY_ALIGNED_STATES: + return self.query_length - query_pos + return None + + @property + def query_alignment_length(self): + return self.query_alignment_end - self.query_alignment_start + + @property + def query_alignment_sequence(self): + return self.query_sequence[self.query_alignment_start : self.query_alignment_end] + + def has_tag(self, tag_name: str) -> bool: + return hasattr(self, tag_name) + + def get_tag(self, tag_name: str) -> bool: + if self.has_tag(tag_name): + return getattr(self, tag_name) + return False + + def set_tag(self, tag_name: str, value, value_type='') -> bool: + setattr(self, tag_name, value) + + # SamRead methods def key(self): - """ - uses a stored _key attribute, if available. This is to avoid the hash changing if the reference start (for example) - is changed but also allow this attribute to be used and calculated for non SamRead objects - - This way to change the hash behaviour the user must be explicit and use the set_key method - """ - if hasattr(self, '_key') and self._key is not None: - return self._key - return ( - self.query_name, - self.query_sequence, - self.reference_id, - self.reference_start, - self.is_supplementary, - ) + return SamRead.key(self) + + @property + def flag(self): + flag = 0 + for flag_value, attr_name in [ + ('REVERSE', 'is_reverse'), + ('MATE_REVERSE', 'mate_is_reverse'), + ('MATE_UNMAPPED', 'mate_is_unmapped'), + ('UNMAPPED', 'is_unmapped'), + ('SECONDARY', 'is_secondary'), + ('SUPPLEMENTARY', 'is_supplementary'), + ]: + if getattr(self, attr_name): + flag |= PYSAM_READ_FLAGS[flag_value] + + if self.is_paired: + if self.is_read1: + flag |= PYSAM_READ_FLAGS.FIRST_IN_PAIR + else: + flag |= PYSAM_READ_FLAGS.LAST_IN_PAIR + return flag + + @flag.setter + def flag(self, value): + for flag_value, attr_name in [ + ('REVERSE', 'is_reverse'), + ('MATE_REVERSE', 'mate_is_reverse'), + ('MATE_UNMAPPED', 'mate_is_unmapped'), + ('UNMAPPED', 'is_unmapped'), + ('FIRST_IN_PAIR', 'is_read1'), + ('SECONDARY', 'is_secondary'), + ('SUPPLEMENTARY', 'is_supplementary'), + ]: + if value & PYSAM_READ_FLAGS[flag_value]: + setattr(self, attr_name, True) + if value & PYSAM_READ_FLAGS.LAST_IN_PAIR: + self.is_read1 = False class MockBamFileHandle: @@ -256,35 +299,33 @@ def get_reference_name(self, input_tid): raise KeyError('invalid id') -class MockString: - def __init__(self, char=' '): - self.char = char +def mock_read_pair(mock1: MockRead, mock2: MockRead, proper_pair=True, reverse_order=False): + # make sure pair flags are set + mock1.is_paired = True + mock2.is_paired = True - def __getitem__(self, index): - if isinstance(index, slice): - return self.char * (index.stop - index.start) - else: - return self.char + mock1.is_read1 = not reverse_order + mock2.is_read1 = reverse_order + if not mock1.is_reverse and proper_pair: + mock2.is_reverse = True -def mock_read_pair(mock1, mock2): - if mock1.reference_id != mock2.reference_id: + if mock1.reference_name != mock2.reference_name: mock1.template_length = 0 mock2.template_length = 0 - mock1.next_reference_id = mock2.reference_id + mock1.next_reference_name = mock2.reference_name mock1.next_reference_start = mock2.reference_start mock1.mate_is_reverse = mock2.is_reverse - mock1.is_paired = True - mock1.is_read2 = not mock1.is_read1 + if mock1.template_length is None: - mock1.template_length = mock2.reference_end - mock1.reference_start + # mock1.template_length = abs(mock1.reference_start - mock1.next_reference_start) + 1 + # if reverse_order: + # mock1.template_length *= -1 + mock1.template_length = mock1.next_reference_start - mock1.reference_start + 1 - mock2.next_reference_id = mock1.reference_id + mock2.next_reference_name = mock1.reference_name mock2.next_reference_start = mock1.reference_start mock2.mate_is_reverse = mock1.is_reverse - mock2.is_paired = True - mock2.is_read1 = not mock1.is_read1 - mock2.is_read2 = not mock1.is_read2 if mock2.query_name is None: mock2.query_name = mock1.query_name mock2.template_length = -1 * mock1.template_length diff --git a/tests/test_mavis/summary/test_summary.py b/tests/test_mavis/summary/test_summary.py index b66a8134..50b2318a 100644 --- a/tests/test_mavis/summary/test_summary.py +++ b/tests/test_mavis/summary/test_summary.py @@ -1,9 +1,10 @@ import pytest from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import CALL_METHOD, COLUMNS, PROTOCOL, STRAND, SVTYPE -from mavis.summary.summary import filter_by_annotations +from mavis.summary.summary import filter_by_annotations, annotate_dgv +from mavis.annotate.file_io import load_masking_regions -from ...util import todo +from ...util import todo, get_data @pytest.fixture @@ -40,11 +41,34 @@ def genomic_event2(): ) +@pytest.fixture +def genomic_event3(): + return BreakpointPair( + Breakpoint('1', 10001), + Breakpoint('1', 22118), + opposing_strands=True, + **{ + COLUMNS.event_type: SVTYPE.DEL, + COLUMNS.call_method: CALL_METHOD.CONTIG, + COLUMNS.fusion_sequence_fasta_id: None, + COLUMNS.protocol: PROTOCOL.GENOME, + COLUMNS.fusion_cdna_coding_start: None, + COLUMNS.fusion_cdna_coding_end: None, + COLUMNS.tracking_id: "genomic_event3", + } + ) + + @pytest.fixture def best_transcripts(): return {'ABCA': True, 'ABCD': True} +@pytest.fixture +def dgv_event2(): + return load_masking_regions(get_data("mock_dgv_annotation.txt")) + + class TestFilterByAnnotations: def test_filter_by_annotations_two_best_transcripts( self, genomic_event1, genomic_event2, best_transcripts @@ -174,3 +198,10 @@ def test_filtering_events_split(self): @todo def test_get_pairing_state(self): pass + + +class TestFilterByCallMethod: + def test_annotate_dgv_distance_bed(self, genomic_event3, dgv_event2): + bpps = [genomic_event3] + annotate_dgv(bpps, dgv_event2, 103) + assert len(bpps[0].data['dgv'].split(';')) == 3 diff --git a/tests/test_mavis/test_align.py b/tests/test_mavis/test_align.py index 790a3d09..4ae1e4ab 100644 --- a/tests/test_mavis/test_align.py +++ b/tests/test_mavis/test_align.py @@ -1,14 +1,14 @@ from unittest import mock import mavis.bam.cigar as _cigar -from mavis import align from mavis.annotate.file_io import load_reference_genome -from mavis.assemble import Contig from mavis.bam.cache import BamCache from mavis.bam.read import SamRead from mavis.breakpoint import Breakpoint, BreakpointPair from mavis.constants import CIGAR, ORIENT, STRAND, reverse_complement from mavis.interval import Interval +from mavis.validate import align +from mavis.validate.assemble import Contig from mavis.validate.evidence import GenomeEvidence from mavis_config import DEFAULTS @@ -238,22 +238,19 @@ def test_break_left_deletion(self): b = Breakpoint('10', 1030, 1030, orient=ORIENT.LEFT) read = MockRead( - cigar=_cigar.convert_string_to_cigar('35M10D5I20M'), + cigarstring='35M10D5I20M', reference_start=999, reference_name='10', ) - align.SplitAlignment.breakpoint_contig_remapped_depth(b, contig, read) + align.DiscontinuousAlignment.breakpoint_contig_remapped_depth(b, contig, read) class TestSplitEvents: def test_read_with_exons(self): contig = MockRead( query_sequence='CTTGAAGGAAACTGAATTCAAAAAGATCAAAGTGCTGGGCTCCGGTGCGTTCGGCACGGTGTATAAGGGACTCTGGATCCCAGAAGGTGAGAAAGTTAAAATTCCCGTCGCTATCAAGACATCTCCGAAAGCCAACAAGGAAATCCTCGATGAAGCCTACGTGATGGCCAGCGTGGACAACCCCCACGTGTGCCGCCTGCTGGGCATCTGCCTCACCTCCACCGTGCAGCTCATCATGCAGCTCATGCCCTTCGGCTGCCTCCTGGACTATGTCCGGGAACACAAAGACAATATTGGCTCCCAGTACCTGCTCAACTGGTGTGTGCAGATCGCAAAGGGCATGAACTACTTGGAGGACCGTCGCTTGGTGCACCGCGACCTGGCAGCCAGGAACGTACTGGTGAAAACACCGCAGCATGTCAAGATCACAGATTTTGGGCTGGCCAAACTGCTGGGTGCGGAAGAGAAAGAATACCATGCAGAAGGAGGCAAAGTGCCTATCAAGTGGATGGCATTGGAATCAATTTTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGGGTGACCGTTTGGGAGTTGATGACCTTTGGATCCAA', - cigar=_cigar.convert_string_to_cigar( - '68M678D50M15D34M6472D185M10240D158M891D74M8I5883D29M' - ), + cigarstring='68M678D50M15D34M6472D185M10240D158M891D74M8I5883D29M', reference_name='7', - reference_id=6, reference_start=55241669, ) assert len(align.call_read_events(contig)) == 6 @@ -262,10 +259,9 @@ def test_read_with_exons(self): class TestCallBreakpointPair: def test_single_one_event(self): r = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 10), (CIGAR.I, 3), (CIGAR.D, 7), (CIGAR.M, 10)], + cigarstring='10M3I7D10M', query_sequence='ACTGAATCGTGGGTAGCTGCTAG', ) bpps = align.call_read_events(r) @@ -280,10 +276,9 @@ def test_single_one_event(self): def test_ins_and_del(self): r = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 10), (CIGAR.I, 3), (CIGAR.M, 5), (CIGAR.D, 7), (CIGAR.M, 5)], + cigarstring='10M3I5M7D5M', query_sequence='ACTGAATCGTGGGTAGCTGCTAG', ) # only report the major del event for now @@ -305,10 +300,9 @@ def test_ins_and_del(self): def test_single_insertion(self): r = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 10), (CIGAR.I, 8), (CIGAR.M, 5)], + cigarstring='10M8I5M', query_sequence='ACTGAATCGTGGGTAGCTGCTAG', ) bpp = align.call_read_events(r)[0] @@ -321,10 +315,10 @@ def test_single_insertion(self): def test_single_duplication(self): r = MockRead( - name='seq1', + query_name='seq1', reference_name='gene3', reference_start=27155, - cigar=[(CIGAR.M, 65), (CIGAR.I, 6), (CIGAR.D, 95), (CIGAR.M, 21), (CIGAR.S, 17)], + cigarstring='65M6I95D21M17S', query_sequence='TAGTTGGATCTCTGTGCTGACTGACTGACAGACAGACTTTAGTGTCTGTGTGCTGACTGACAGACAGACTTTAGTGTCTGTGTGCTGACT' 'GACAGACTCTAGTAGTGTC', ) @@ -341,10 +335,9 @@ def test_single_duplication_with_leading_untemp(self): 'CTGTGGGCTCGGGCCCGACGCGCACGGAGGACTGGAGGACTGGGGCGTGTGTCTGCGGTGCAGGCGAGGCGGGGCGGGC' ), query_name='duplication_with_untemp', - reference_id=16, reference_name='reference17', reference_start=1882, - cigar=[(CIGAR.EQ, 126), (CIGAR.I, 54), (CIGAR.EQ, 93)], + cigarstring='126=54I93=', is_reverse=False, ) bpp = align.call_read_events(r)[0] @@ -359,10 +352,9 @@ def test_single_duplication_with_no_untemp(self): 'CCGGTTTAGCATTGCCATTGGTAA' ), query_name='duplication_with_untemp', - reference_id=2, reference_name='reference3', reference_start=1497, - cigar=[(CIGAR.EQ, 51), (CIGAR.I, 22), (CIGAR.EQ, 52)], + cigarstring='51=22I52=', is_reverse=False, ) # repeat: GATTTTGCTGTTGTTTTTGTTC @@ -383,10 +375,9 @@ def test_single_duplication_with_trailing_untemp(self): 'CAAAGTGTTTTATACTGATAAAGCAACCCCGGTTTAGCATTGCCATTGGTAA' ), query_name='duplication_with_untemp', - reference_id=2, reference_name='reference3', reference_start=1497, - cigar=[(CIGAR.EQ, 51), (CIGAR.I, 27), (CIGAR.EQ, 52)], + cigarstring='51=27I52=', is_reverse=False, ) # repeat: GATTTTGCTGTTGTTTTTGTTC @@ -410,19 +401,17 @@ def test_read_pair_indel(self): # i ---------GGGAATTCCGGA--------- 10-21 n/a seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 9), (CIGAR.S, 21)], + cigarstring='9M21S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.S, 21), (CIGAR.M, 9)], + cigarstring='21S9M', query_sequence=seq, is_reverse=False, ) @@ -443,19 +432,17 @@ def test_read_pair_deletion(self): # r2 aaatttcccgggaattccggaTCGATCGAT seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 21), (CIGAR.S, 9)], + cigarstring='21M9S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.S, 21), (CIGAR.M, 9)], + cigarstring='21S9M', query_sequence=seq, is_reverse=False, ) @@ -474,19 +461,17 @@ def test_read_pair_translocation(self): # r2 aaatttcccgggaattccggaTCGATCGAT seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='2', reference_start=0, - cigar=[(CIGAR.M, 21), (CIGAR.S, 9)], + cigarstring='21M9S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.S, 21), (CIGAR.M, 9)], + cigarstring='21S9M', query_sequence=seq, is_reverse=False, ) @@ -506,19 +491,17 @@ def test_read_pair_deletion_overlapping_query_coverage(self): seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 21), (CIGAR.S, 9)], + cigarstring='21M9S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.S, 18), (CIGAR.M, 12)], + cigarstring='18S12M', query_sequence=seq, is_reverse=False, ) @@ -542,19 +525,17 @@ def test_read_pair_inversion_overlapping_query_coverage(self): # r2 ATCTATCGATCCggaattcccgggaaattt 100+12 = 111 - 3 = 108 seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 21), (CIGAR.S, 9)], + cigarstring='21M9S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.M, 12), (CIGAR.S, 18)], + cigarstring='12M18S', query_sequence=reverse_complement(seq), is_reverse=True, ) @@ -573,16 +554,16 @@ def test_read_pair_large_inversion_overlapping_query_coverage(self): s = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT' read1 = MockRead( - reference_id=3, + reference_name='3', reference_start=1114, - cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)], + cigarstring='125S120=', query_sequence=s, is_reverse=False, ) read2 = MockRead( - reference_id=3, + reference_name='3', reference_start=2187, - cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1), (CIGAR.M, 120)], + cigarstring='117S8=1D120M', query_sequence=reverse_complement(s), is_reverse=True, ) @@ -613,19 +594,17 @@ def test_read_pair_inversion_gap_in_query_coverage(self): # r2 ATCTATCGATCCggaattcccgggaaattt 100+12 = 111 - 3 = 108 seq = 'AAATTTCCCGGGAATTCCGGATCGATCGAT' # 30 r1 = MockRead( - reference_id=0, reference_name='1', reference_start=0, - cigar=[(CIGAR.M, 16), (CIGAR.S, 14)], + cigarstring='16M14S', query_sequence=seq, is_reverse=False, ) r2 = MockRead( - reference_id=0, reference_name='1', reference_start=99, - cigar=[(CIGAR.M, 12), (CIGAR.S, 18)], + cigarstring='12M18S', query_sequence=reverse_complement(seq), is_reverse=True, ) @@ -719,19 +698,17 @@ def test_inversion_and_deletion(self): reference_genome=None, bam_cache=mock.Mock(stranded=False), ) - read1 = SamRead( - reference_id=3, + read1 = MockRead( reference_start=1114, - cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)], + cigarstring='125S120=', query_sequence=s, is_reverse=False, reference_name='3', alignment_rank=0, ) - read2 = SamRead( - reference_id=3, + read2 = MockRead( reference_start=2187, - cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1), (CIGAR.EQ, 120)], + cigarstring='117S7=1X120=', query_sequence=reverse_complement(s), is_reverse=True, reference_name='3', @@ -740,7 +717,7 @@ def test_inversion_and_deletion(self): raw_alignments = {s: [read1, read2]} align.select_contig_alignments(evidence, raw_alignments) alignments = list(evidence.contigs[0].alignments) - assert len(alignments) == 2 + assert len(alignments) == 1 class TestGetAlignerVersion: diff --git a/tests/test_mavis/test_args.py b/tests/test_mavis/test_args.py index 492509f3..4230bd97 100644 --- a/tests/test_mavis/test_args.py +++ b/tests/test_mavis/test_args.py @@ -4,7 +4,6 @@ from unittest.mock import Mock, patch import pytest -from mavis import util from mavis.cluster import main as cluster_main from mavis.main import main as mavis_main from mavis.validate import main as validate_main diff --git a/tests/test_mavis/test_assemble.py b/tests/test_mavis/test_assemble.py index 52711a6c..bc7f2bea 100644 --- a/tests/test_mavis/test_assemble.py +++ b/tests/test_mavis/test_assemble.py @@ -2,8 +2,8 @@ import random import pytest -from mavis.assemble import Contig, DeBruijnGraph, assemble, filter_contigs, kmers from mavis.constants import DNA_ALPHABET +from mavis.validate.assemble import Contig, DeBruijnGraph, assemble, filter_contigs, kmers from ..util import get_data, long_running_test diff --git a/tests/test_mavis/test_assemble2.py b/tests/test_mavis/test_assemble2.py index 8baa718c..600d6b91 100644 --- a/tests/test_mavis/test_assemble2.py +++ b/tests/test_mavis/test_assemble2.py @@ -2,9 +2,9 @@ import pytest import timeout_decorator -from mavis.assemble import Contig, assemble, filter_contigs from mavis.constants import reverse_complement from mavis.interval import Interval +from mavis.validate.assemble import Contig, assemble, filter_contigs from mavis_config import DEFAULTS from ..util import get_data, long_running_test diff --git a/tests/test_mavis/test_blat.py b/tests/test_mavis/test_blat.py index 488f14d7..f1c09d73 100644 --- a/tests/test_mavis/test_blat.py +++ b/tests/test_mavis/test_blat.py @@ -1,6 +1,6 @@ import pytest -from mavis.blat import Blat from mavis.constants import CIGAR, reverse_complement +from mavis.validate.blat import Blat from .mock import Mock, MockFunction, MockLongString diff --git a/tests/test_mavis/test_blat2.py b/tests/test_mavis/test_blat2.py index 254f6a5d..030fca61 100644 --- a/tests/test_mavis/test_blat2.py +++ b/tests/test_mavis/test_blat2.py @@ -1,12 +1,12 @@ import mavis.bam.cigar as _cigar import pytest from Bio import SeqIO -from mavis.align import query_coverage_interval from mavis.annotate.file_io import load_reference_genome from mavis.bam.cache import BamCache -from mavis.blat import Blat from mavis.constants import CIGAR, reverse_complement from mavis.interval import Interval +from mavis.validate.align import query_coverage_interval +from mavis.validate.blat import Blat from ..util import get_data from .mock import MockBamFileHandle, MockLongString, MockObject diff --git a/tests/test_mavis/test_breakpoint.py b/tests/test_mavis/test_breakpoint.py index 9eeb8347..5515cce2 100644 --- a/tests/test_mavis/test_breakpoint.py +++ b/tests/test_mavis/test_breakpoint.py @@ -1,7 +1,7 @@ from unittest.mock import Mock import pytest -from mavis.breakpoint import Breakpoint, BreakpointPair +from mavis.breakpoint import Breakpoint, BreakpointPair, classify_breakpoint_pair from mavis.constants import ORIENT, STRAND, SVTYPE from mavis.error import InvalidRearrangement, NotSpecifiedError from mavis.interval import Interval @@ -206,7 +206,7 @@ def test_inverted_translocation(self): Breakpoint(2, 1, 2, ORIENT.LEFT), opposing_strands=True, ) - BreakpointPair.classify(b) + classify_breakpoint_pair(b) def test_translocation(self): b = BreakpointPair( @@ -214,123 +214,123 @@ def test_translocation(self): Breakpoint(2, 1, 2, ORIENT.LEFT), opposing_strands=False, ) - BreakpointPair.classify(b) + classify_breakpoint_pair(b) def test_inversion(self): b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.RIGHT), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.RIGHT), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.NS), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.NS), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.LEFT), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.LEFT), ) - assert BreakpointPair.classify(b) == {SVTYPE.INV} + assert classify_breakpoint_pair(b) == {SVTYPE.INV} def test_duplication(self): b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.LEFT), ) - assert BreakpointPair.classify(b) == {SVTYPE.DUP} + assert classify_breakpoint_pair(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.LEFT), ) - assert BreakpointPair.classify(b) == {SVTYPE.DUP} + assert classify_breakpoint_pair(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.LEFT), ) - assert BreakpointPair.classify(b) == {SVTYPE.DUP} + assert classify_breakpoint_pair(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.NS), ) - assert BreakpointPair.classify(b) == {SVTYPE.DUP} + assert classify_breakpoint_pair(b) == {SVTYPE.DUP} b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.RIGHT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.NS), ) - assert BreakpointPair.classify(b) == {SVTYPE.DUP} + assert classify_breakpoint_pair(b) == {SVTYPE.DUP} def test_deletion_or_insertion(self): b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.RIGHT), ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.POS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.RIGHT), ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NEG, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.POS, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NEG, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) b = BreakpointPair( Breakpoint(1, 1, 2, strand=STRAND.NS, orient=ORIENT.LEFT), Breakpoint(1, 10, 11, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL, SVTYPE.INS]) def test_insertion(self): b = BreakpointPair( @@ -338,7 +338,7 @@ def test_insertion(self): Breakpoint(1, 2, 2, strand=STRAND.NS, orient=ORIENT.RIGHT), opposing_strands=False, ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.INS]) def test_no_type(self): b = BreakpointPair( @@ -347,7 +347,7 @@ def test_no_type(self): opposing_strands=False, untemplated_seq='', ) - assert BreakpointPair.classify(b) == set() + assert classify_breakpoint_pair(b) == set() def test_deletion(self): b = BreakpointPair( @@ -356,7 +356,7 @@ def test_deletion(self): opposing_strands=False, untemplated_seq='', ) - assert sorted(BreakpointPair.classify(b)) == sorted([SVTYPE.DEL]) + assert sorted(classify_breakpoint_pair(b)) == sorted([SVTYPE.DEL]) def test_deletion_with_useq(self): bpp = BreakpointPair( @@ -365,20 +365,20 @@ def test_deletion_with_useq(self): opposing=False, untemplated_seq='CCCT', ) - assert sorted(BreakpointPair.classify(bpp)) == sorted([SVTYPE.DEL, SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(bpp)) == sorted([SVTYPE.DEL, SVTYPE.INS]) def distance(x, y): return Interval(abs(x - y)) net_size = BreakpointPair.net_size(bpp, distance) assert net_size == Interval(-71) - assert sorted(BreakpointPair.classify(bpp, distance)) == sorted([SVTYPE.DEL]) + assert sorted(classify_breakpoint_pair(bpp, distance)) == sorted([SVTYPE.DEL]) def test_deletion_no_distance_error(self): bpp = BreakpointPair( Breakpoint('1', 7039, orient='L'), Breakpoint('1', 7040, orient='R'), opposing=False ) - assert sorted(BreakpointPair.classify(bpp)) == sorted([SVTYPE.INS]) + assert sorted(classify_breakpoint_pair(bpp)) == sorted([SVTYPE.INS]) class TestNetSize: diff --git a/tests/test_mavis/validate/test_call.py b/tests/test_mavis/validate/test_call.py index b959b6d5..a3592f60 100644 --- a/tests/test_mavis/validate/test_call.py +++ b/tests/test_mavis/validate/test_call.py @@ -1,7 +1,6 @@ from unittest import mock import pytest -from mavis.align import call_paired_read_event, select_contig_alignments from mavis.annotate.file_io import load_reference_genome from mavis.annotate.genomic import PreTranscript, Transcript from mavis.bam import cigar as _cigar @@ -12,6 +11,7 @@ from mavis.constants import CALL_METHOD, CIGAR, ORIENT, PYSAM_READ_FLAGS, STRAND, SVTYPE from mavis.interval import Interval from mavis.validate import call +from mavis.validate.align import call_paired_read_event, select_contig_alignments from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence @@ -66,7 +66,6 @@ def test_EGFR_small_del_transcriptome(self): '68M678D50M15D34M6472D185M10240D158M891D74M' '5875D' '6M' '1X' '29M' ), reference_name='7', - reference_id=6, reference_start=55241669, alignment_rank=0, ) @@ -167,26 +166,38 @@ def test_flanking_support(self): mock_read_pair( MockRead( query_name='test1', - reference_id=3, + reference_name='reference3', template_length=500, reference_start=1150, - reference_end=1200, is_reverse=True, + cigarstring='50M', ), - MockRead(reference_id=3, reference_start=2200, reference_end=2250, is_reverse=True), + MockRead( + reference_name='reference3', + reference_start=2200, + is_reverse=True, + cigarstring='50M', + ), + proper_pair=False, ) ) ev.flanking_pairs.add( mock_read_pair( MockRead( query_name='test2', - reference_id=3, + reference_name='reference3', template_length=560, reference_start=1150, - reference_end=1200, + cigarstring='50M', is_reverse=True, ), - MockRead(reference_id=3, reference_start=2200, reference_end=2250, is_reverse=True), + MockRead( + reference_name='reference3', + reference_start=2200, + is_reverse=True, + cigarstring='50M', + ), + proper_pair=False, ) ) median, stdev = ev.flanking_metrics() @@ -250,8 +261,18 @@ def test_deletion(self): ) flanking_pairs = [ mock_read_pair( - MockRead('r1', 0, 400, 450, is_reverse=False), - MockRead('r1', 0, 1200, 1260, is_reverse=True), + MockRead( + query_name='r1', + reference_name='1', + reference_start=400, + cigarstring='50M', + ), + MockRead( + query_name='r1', + reference_name='1', + reference_start=1200, + cigarstring='60M', + ), ) ] event = call.EventCall( @@ -303,8 +324,12 @@ def test_insertion(self): ) flanking_pairs = [ mock_read_pair( - MockRead('r1', 0, 700, 750, is_reverse=False), - MockRead('r1', 0, 950, 1049, is_reverse=True), + MockRead( + query_name='r1', reference_name='1', reference_start=700, cigarstring='50M' + ), + MockRead( + query_name='r1', reference_name='1', reference_start=950, cigarstring='50M' + ), ) ] print(evidence.min_expected_fragment_size) @@ -326,8 +351,21 @@ def test_inversion(self): ) flanking_pairs = [ mock_read_pair( - MockRead('r1', 0, 400, 450, is_reverse=False), - MockRead('r1', 0, 900, 950, is_reverse=False), + MockRead( + query_name='r1', + reference_name='1', + reference_start=400, + cigarstring='50M', + is_reverse=False, + ), + MockRead( + query_name='r1', + reference_name='1', + reference_start=900, + cigarstring='50M', + is_reverse=False, + ), + proper_pair=False, ) ] event = call.EventCall( @@ -380,28 +418,88 @@ def test_translocation_rl(self): event = call.EventCall(b1, b2, evidence, SVTYPE.TRANS, CALL_METHOD.CONTIG) flanking_pairs = [ mock_read_pair( - MockRead('x', '11', 128675264, 128677087, is_reverse=False), - MockRead('x', '22', 29683030, 29683105, is_reverse=True), + MockRead( + query_name='x1', + reference_name='11', + reference_start=128675264, + cigarstring=str(128677087 - 128675264 - 100) + 'M', + ), + MockRead( + query_name='x1', + reference_name='22', + reference_start=29683030, + cigarstring=str(29683105 - 29683030) + 'M', + ), ), mock_read_pair( - MockRead('x', '11', 128675286, 128677109, is_reverse=False), - MockRead('x', '22', 29683016, 29683091, is_reverse=True), + MockRead( + query_name='x2', + reference_name='11', + reference_start=128675286, + cigarstring=str(128677109 - 128675286) + 'M', + ), + MockRead( + query_name='x2', + reference_name='22', + reference_start=29683016, + cigarstring=str(29683091 - 29683016) + 'M', + ), ), mock_read_pair( - MockRead('x', '11', 128675260, 128677083, is_reverse=False), - MockRead('x', '22', 29683049, 29683123, is_reverse=True), + MockRead( + query_name='x3', + reference_name='11', + reference_start=128675260, + cigarstring=str(128677083 - 128675260) + 'M', + ), + MockRead( + query_name='x3', + reference_name='22', + reference_start=29683049, + cigarstring=str(29683123 - 29683049) + 'M', + ), ), mock_read_pair( - MockRead('x', '11', 128675289, 128677110, is_reverse=False), - MockRead('x', '22', 29683047, 29683122, is_reverse=True), + MockRead( + query_name='x4', + reference_name='11', + reference_start=128675289, + cigarstring=str(128677110 - 128675289) + 'M', + ), + MockRead( + query_name='x4', + reference_name='22', + reference_start=29683047, + cigarstring=str(29683122 - 29683047) + 'M', + ), ), mock_read_pair( - MockRead('x', '11', 128675306, 128677129, is_reverse=False), - MockRead('x', '22', 29683039, 29683114, is_reverse=True), + MockRead( + query_name='x5', + reference_name='11', + reference_start=128675306, + cigarstring=str(128677129 - 128675306) + 'M', + ), + MockRead( + query_name='x5', + reference_name='22', + reference_start=29683039, + cigarstring=str(29683114 - 29683039) + 'M', + ), ), mock_read_pair( - MockRead('x', '11', 128675289, 128677110, is_reverse=False), - MockRead('x', '22', 29683047, 29683122, is_reverse=True), + MockRead( + query_name='x6', + reference_name='11', + reference_start=128675289, + cigarstring=str(128677110 - 128675289) + 'M', + ), + MockRead( + query_name='x6', + reference_name='22', + reference_start=29683047, + cigarstring=str(29683122 - 29683047) + 'M', + ), ), ] event.add_flanking_support(flanking_pairs) @@ -413,8 +511,20 @@ def test_translocation_rl_filter_nonsupporting(self): ) flanking_pairs = [ mock_read_pair( - MockRead('r1', 0, 1201, 1249, is_reverse=True), - MockRead('r1', 1, 1201, 1249, is_reverse=False), + MockRead( + query_name='r1', + reference_name='1', + reference_start=1201, + cigarstring='48M', + is_reverse=True, + ), + MockRead( + query_name='r1', + reference_name='2', + reference_start=1201, + cigarstring='48M', + is_reverse=False, + ), ) ] event = call.EventCall( @@ -447,8 +557,20 @@ def test_duplication(self): ) flanking_pairs = [ mock_read_pair( - MockRead('r1', 0, 1205, 1250, is_reverse=True), - MockRead('r1', 0, 1260, 1295, is_reverse=False), + MockRead( + query_name='r1', + reference_name='1', + reference_start=1205, + cigarstring='45M', + is_reverse=True, + ), + MockRead( + query_name='r1', + reference_name='1', + reference_start=1260, + cigarstring='35M', + is_reverse=False, + ), ) ] event = call.EventCall( @@ -501,59 +623,64 @@ def test_call_all_methods(self): Breakpoint('1', 450, 500, orient=ORIENT.RIGHT), opposing_strands=False, ) - r1, r2 = mock_read_pair( - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=40, - cigar=[(CIGAR.EQ, 60), (CIGAR.S, 40)], - query_sequence='A' * 100, - ), - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=460, - cigar=[(CIGAR.S, 40), (CIGAR.EQ, 60)], - query_sequence='A' * 100, - ), - ) + contig = mock.Mock( **{ 'seq': '', 'complexity.return_value': 1, - 'alignments': [call_paired_read_event(r1, r2)], + 'alignments': [ + call_paired_read_event( + MockRead( + query_name='assembly1', + reference_name='1', + reference_start=40, + cigarstring='60=40S', + query_sequence='A' * 100, + ), + MockRead( + query_name='assembly1', + reference_name='1', + reference_start=460, + cigarstring='40S60=', + query_sequence='A' * 100, + ), + ) + ], } ) contig.input_reads = { - MockRead(query_name='t1', reference_start=100, cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)]) + MockRead(query_name='t0', reference_name='1', reference_start=100, cigarstring='20=80S') } evidence.contigs.append(contig) - evidence.split_reads[0].add( - MockRead(query_name='t1', reference_start=100, cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)]) - ) - evidence.split_reads[1].add( - MockRead(query_name='t1', reference_start=500, cigar=[(CIGAR.S, 50), (CIGAR.EQ, 50)]) + split_read1, split_read2 = mock_read_pair( + MockRead( + query_name='t1', reference_name='1', reference_start=100, cigarstring='20=80S' + ), + MockRead( + query_name='t1', + reference_name='1', + reference_start=500, + cigarstring='50S50=', + query_sequence='A' * 100, + ), ) + + evidence.split_reads[0].add(split_read1) + evidence.split_reads[1].add(split_read2) evidence.flanking_pairs.add( mock_read_pair( MockRead( query_name='t4', - reference_id=0, + reference_name='1', reference_start=10, - reference_end=40, - is_reverse=False, - query_alignment_length=30, + cigarstring='30M', ), MockRead( query_name='t4', - reference_id=0, + reference_name='1', + cigarstring='35M', reference_start=505, - reference_end=540, - is_reverse=True, - query_alignment_length=35, ), ) ) @@ -561,45 +688,48 @@ def test_call_all_methods(self): mock_read_pair( MockRead( query_name='t3', - reference_id=0, + reference_name='1', reference_start=49, - reference_end=90, - is_reverse=False, - query_alignment_length=41, + cigarstring='41M', ), MockRead( query_name='t3', - reference_id=0, + reference_name='1', reference_start=805, - reference_end=840, - is_reverse=True, - query_alignment_length=35, + cigarstring='35M', ), ) ) events = call.call_events(evidence) - for ev in events: - print(ev, ev.event_type, ev.call_method) assert len(events) == 4 + assert events[0].call_method == 'contig' assert events[0].break1.start == 100 assert events[0].break2.start == 481 assert events[0].event_type == 'deletion' + assert 't0' in {r.query_name for r in events[0].support()} + assert 't4' in {r.query_name for r in events[0].support()} + assert events[1].call_method == 'split reads' assert events[1].break1.start == 120 assert events[1].break2.start == 501 assert events[1].event_type == 'deletion' + assert 't1' in {r.query_name for r in events[1].support()} + assert events[2].call_method == 'flanking reads' assert events[2].break1.start == 90 assert events[2].break1.end == 299 assert events[2].break2.start == 591 assert events[2].break2.end == 806 assert events[2].event_type == 'deletion' + assert 't3' in {r.query_name for r in events[2].support()} + assert events[3].call_method == 'split reads' assert events[3].break1.start == 120 assert events[3].break2.start == 501 assert events[3].event_type == 'insertion' + assert 't1' in {r.query_name for r in events[3].support()} def test_call_contig_only(self): # event should only be 100L+, 501R+ deletion @@ -608,34 +738,37 @@ def test_call_contig_only(self): Breakpoint('1', 450, 500, orient=ORIENT.RIGHT), opposing_strands=False, ) - r1, r2 = mock_read_pair( - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=40, - cigar=[(CIGAR.EQ, 60), (CIGAR.S, 40)], - query_sequence='A' * 100, - query_alignment_length=100, - ), - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=480, - cigar=[(CIGAR.S, 40), (CIGAR.EQ, 60)], - query_sequence='A' * 100, - query_alignment_length=100, - ), + + contig = mock.Mock( + **{ + 'seq': '', + 'complexity.return_value': 1, + 'alignments': [ + call_paired_read_event( + MockRead( + query_name='t1', + reference_name='1', + reference_start=40, + cigarstring='60=40S', + query_sequence='A' * 100, + ), + MockRead( + query_name='t1', + reference_name='1', + reference_start=480, + cigarstring='40S60=', + query_sequence='A' * 100, + ), + ) + ], + } ) - bpp = call_paired_read_event(r1, r2) - contig = mock.Mock(**{'seq': '', 'complexity.return_value': 1, 'alignments': [bpp]}) contig.input_reads = { MockRead( query_name='t1', reference_start=100, reference_name='1', - cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)], + cigarstring='20=80S', ) } evidence.contigs.append(contig) @@ -645,7 +778,7 @@ def test_call_contig_only(self): query_name='t1', reference_name='1', reference_start=80, - cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)], + cigarstring='20=80S', ) ) evidence.split_reads[1].add( @@ -653,7 +786,7 @@ def test_call_contig_only(self): query_name='t1', reference_name='1', reference_start=500, - cigar=[(CIGAR.S, 50), (CIGAR.EQ, 50)], + cigarstring='50S50=', ) ) evidence.split_reads[0].add( @@ -661,7 +794,7 @@ def test_call_contig_only(self): query_name='t2', reference_name='1', reference_start=40, - cigar=[(CIGAR.EQ, 50), (CIGAR.S, 50)], + cigarstring='50=50S', ) ) evidence.flanking_pairs.add( @@ -669,20 +802,16 @@ def test_call_contig_only(self): MockRead( query_name='t3', reference_name='1', - reference_id=0, reference_start=49, - reference_end=90, is_reverse=False, - query_alignment_length=100, + cigarstring='41M', ), MockRead( query_name='t3', reference_name='1', - reference_id=0, reference_start=505, - reference_end=550, is_reverse=True, - query_alignment_length=100, + cigarstring='100M', ), ) ) @@ -702,29 +831,28 @@ def test_call_contig_and_split(self): Breakpoint('1', 450, 500, orient=ORIENT.RIGHT), opposing_strands=False, ) - r1, r2 = mock_read_pair( - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=40, - cigar=[(CIGAR.EQ, 60), (CIGAR.S, 40)], - query_sequence='A' * 100, - ), - MockRead( - query_name='t1', - reference_id=0, - reference_name='1', - reference_start=480, - cigar=[(CIGAR.S, 40), (CIGAR.EQ, 60)], - query_sequence='A' * 100, - ), - ) contig = mock.Mock( **{ 'seq': '', 'complexity.return_value': 1, - 'alignments': [call_paired_read_event(r1, r2)], + 'alignments': [ + call_paired_read_event( + MockRead( + query_name='t1', + reference_name='1', + reference_start=40, + cigarstring='60=40S', + query_sequence='A' * 100, + ), + MockRead( + query_name='t1', + reference_name='1', + reference_start=480, + cigarstring='40S60=', + query_sequence='A' * 100, + ), + ) + ], } ) contig.input_reads = { @@ -732,46 +860,45 @@ def test_call_contig_and_split(self): query_name='t1', reference_name='1', reference_start=100, - cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)], + cigarstring='20=80S', ) } evidence.contigs.append(contig) - evidence.split_reads[0].add( + split_read1, split_read2 = mock_read_pair( MockRead( - query_name='t1', + query_name='t2', reference_start=100, reference_name='1', - cigar=[(CIGAR.EQ, 20), (CIGAR.S, 80)], - ) - ) - evidence.split_reads[1].add( + cigarstring='20=80S', + query_sequence='C' * 100, + ), MockRead( - query_name='t1', + query_name='t2', reference_start=520, reference_name='1', - cigar=[(CIGAR.S, 50), (CIGAR.EQ, 50)], - ) + cigarstring='50S50=', + query_sequence='A' * 100, + ), ) + + evidence.split_reads[0].add(split_read1) + evidence.split_reads[1].add(split_read2) evidence.flanking_pairs.add( mock_read_pair( MockRead( query_name='t3', - reference_id=0, reference_start=49, reference_name='1', - reference_end=90, is_reverse=False, - query_alignment_length=100, + cigarstring='41M', ), MockRead( query_name='t3', - reference_id=0, reference_start=505, reference_name='1', - reference_end=550, is_reverse=True, - query_alignment_length=100, + cigarstring='100M', ), ) ) @@ -782,9 +909,11 @@ def test_call_contig_and_split(self): assert events[0].break1.start == 100 assert events[0].break2.start == 501 assert events[0].call_method == 'contig' + assert events[1].call_method == 'split reads' assert events[1].break1.start == 120 assert events[1].break2.start == 521 + assert events[2].event_type == 'insertion' assert events[2].call_method == 'split reads' assert events[2].break1.start == 120 @@ -797,28 +926,22 @@ def test_call_split_only(self): opposing_strands=False, ) evidence.split_reads[0].add( - MockRead(query_name='t1', reference_start=140, cigar=[(CIGAR.EQ, 30), (CIGAR.S, 70)]) + MockRead(query_name='t1', reference_start=140, cigarstring='30=70S') ) evidence.split_reads[1].add( - MockRead(query_name='t1', reference_start=870, cigar=[(CIGAR.S, 50), (CIGAR.EQ, 50)]) + MockRead(query_name='t1', reference_start=870, cigarstring='50S50=') ) evidence.flanking_pairs.add( mock_read_pair( MockRead( query_name='t3', - reference_id=0, reference_start=42, - reference_end=140, - is_reverse=False, - query_alignment_length=100, + cigarstring='100M', ), MockRead( query_name='t3', - reference_id=0, reference_start=885, - reference_end=905, - is_reverse=True, - query_alignment_length=100, + cigarstring='100M', ), ) ) @@ -844,19 +967,15 @@ def test_call_flanking_only(self): mock_read_pair( MockRead( query_name='t1', - reference_id=0, + reference_name='1', reference_start=42, - reference_end=140, - is_reverse=False, - query_alignment_length=98, + cigarstring='98M', ), MockRead( query_name='t1', - reference_id=0, + reference_name='1', reference_start=885, - reference_end=905, - is_reverse=True, - query_alignment_length=20, + cigarstring='20M', ), ) ) @@ -910,7 +1029,7 @@ def inversion_evidence(): 'validate.min_flanking_pairs_resolution': 1, 'validate.min_linking_split_reads': 1, 'validate.min_spanning_reads_resolution': 3, - 'validate. min_call_complexity': 0, + 'validate.min_call_complexity': 0, }, ) @@ -922,10 +1041,10 @@ def test_empty(self, inversion_evidence): def test_call_no_duplication_by_split_reads(self, duplication_ev, inversion_evidence): duplication_ev.split_reads[0].add( - MockRead(query_name='t1', reference_start=30, cigar=[(CIGAR.EQ, 20), (CIGAR.S, 20)]) + MockRead(query_name='t1', reference_start=30, cigarstring='20=20S') ) duplication_ev.split_reads[1].add( - MockRead(query_name='t1', reference_start=90, cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)]) + MockRead(query_name='t1', reference_start=90, cigarstring='20S20=') ) bpps = call._call_by_split_reads(inversion_evidence, SVTYPE.DUP) @@ -936,7 +1055,7 @@ def test_by_split_read(self, inversion_evidence): MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='A' * 40, ) ) @@ -944,7 +1063,7 @@ def test_by_split_read(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='G' * 40, ) ) @@ -952,7 +1071,7 @@ def test_by_split_read(self, inversion_evidence): MockRead( query_name='t2', reference_start=100, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='C' * 40, ) ) @@ -960,7 +1079,7 @@ def test_by_split_read(self, inversion_evidence): MockRead( query_name='t2', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='A' * 40, ) ) @@ -979,7 +1098,7 @@ def test_call_by_split_read_low_resolution(self, inversion_evidence): MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='A' * 40, ) ) @@ -987,7 +1106,7 @@ def test_call_by_split_read_low_resolution(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='N' * 40, ) ) @@ -1006,7 +1125,7 @@ def test_call_by_split_read_resolve_untemp(self, inversion_evidence): MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='TCGGCTCCCGTACTTGTGTATAAGGGGCTTCTGATGTTAT', ) ) @@ -1014,7 +1133,7 @@ def test_call_by_split_read_resolve_untemp(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='ATAACATCAGAAGCCCCTTATACACAAGTACGGGAGCCGA', is_reverse=True, ) @@ -1033,7 +1152,7 @@ def test_call_by_split_read_resolve_untemp_exists(self, inversion_evidence): MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 22), (CIGAR.EQ, 18)], + cigarstring='22S18=', query_sequence='TCGGCTCCCGTACTTGTGTATAAGGGGCTTCTGATGTTAT', ) ) @@ -1041,7 +1160,7 @@ def test_call_by_split_read_resolve_untemp_exists(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='ATAACATCAGAAGCCCCTTATACACAAGTACGGGAGCCGA', is_reverse=True, ) @@ -1060,7 +1179,7 @@ def test_call_by_split_read_shift_overlap(self, inversion_evidence): MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 18), (CIGAR.EQ, 22)], + cigarstring='18S22=', query_sequence='TCGGCTCCCGTACTTGTGTATAAGGGGCTTCTGATGTTAT', ) ) @@ -1068,7 +1187,7 @@ def test_call_by_split_read_shift_overlap(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='ATAACATCAGAAGCCCCTTATACACAAGTACGGGAGCCGA', is_reverse=True, ) @@ -1085,30 +1204,57 @@ def test_call_by_split_read_shift_overlap(self, inversion_evidence): def test_both_by_flanking_pairs(self, inversion_evidence): inversion_evidence.flanking_pairs.add( mock_read_pair( - MockRead(query_name='t1', reference_id=0, reference_start=150, reference_end=150), - MockRead(query_name='t1', reference_id=0, reference_start=500, reference_end=520), + MockRead( + query_name='t1', + reference_name='1', + reference_start=150, + cigarstring='0M', + is_reverse=False, + ), + MockRead( + query_name='t1', + reference_name='1', + reference_start=500, + cigarstring='20M', + is_reverse=False, + ), + proper_pair=False, ) ) inversion_evidence.flanking_pairs.add( mock_read_pair( - MockRead(query_name='t2', reference_id=0, reference_start=120, reference_end=140), - MockRead(query_name='t2', reference_id=0, reference_start=520, reference_end=520), + MockRead( + query_name='t2', + reference_name='1', + reference_start=120, + cigarstring='20M', + is_reverse=False, + ), + MockRead( + query_name='t2', + reference_name='1', + reference_start=520, + cigarstring='0M', + is_reverse=False, + ), + proper_pair=False, ) ) bpp = call._call_by_flanking_pairs(inversion_evidence, SVTYPE.INV) # 120-149 ..... 500-519 # max frag = 150 - 80 = 70 - assert bpp.break1.start == 42 - assert bpp.break1.end == 120 - assert bpp.break2.start == 412 # 70 - 21 = 49 - assert bpp.break2.end == 500 + print(bpp) + assert bpp.break1.orient == ORIENT.RIGHT + assert bpp.break2.orient == ORIENT.RIGHT + assert (bpp.break1.start, bpp.break1.end) == (42, 121) + assert (bpp.break2.start, bpp.break2.end) == (412, 501) def test_by_split_reads_multiple_calls(self, inversion_evidence): inversion_evidence.split_reads[0].add( MockRead( query_name='t1', reference_start=100, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='A' * 40, ) ) @@ -1116,7 +1262,7 @@ def test_by_split_reads_multiple_calls(self, inversion_evidence): MockRead( query_name='t1', reference_start=500, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='T' * 40, ) ) @@ -1124,7 +1270,7 @@ def test_by_split_reads_multiple_calls(self, inversion_evidence): MockRead( query_name='t2', reference_start=110, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='T' * 40, ) ) @@ -1132,7 +1278,7 @@ def test_by_split_reads_multiple_calls(self, inversion_evidence): MockRead( query_name='t2', reference_start=520, - cigar=[(CIGAR.S, 20), (CIGAR.EQ, 20)], + cigarstring='20S20=', query_sequence='A' * 40, ) ) @@ -1160,64 +1306,73 @@ def test_call_by_split_reads_consume_flanking(self): evidence.split_reads[0].add( MockRead( query_name='test1', - cigar=[(CIGAR.S, 110), (CIGAR.EQ, 40)], + cigarstring='110S40=', reference_start=1114, - reference_end=1150, + reference_name='reference3', ) ) evidence.split_reads[0].add( MockRead( + reference_name='reference3', query_name='test2', - cigar=[(CIGAR.EQ, 30), (CIGAR.S, 120)], + cigarstring='30=120S', reference_start=1108, - reference_end=1115, ) ) - evidence.split_reads[0].add( - MockRead( - query_name='test3', - cigar=[(CIGAR.S, 30), (CIGAR.EQ, 120)], - reference_start=1114, - reference_end=1154, - tags=[(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1)], - ) + read = MockRead( + reference_name='reference3', + query_name='test3', + cigarstring='30S120=', + reference_start=1114, ) + read.set_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1) + evidence.split_reads[0].add(read) evidence.split_reads[1].add( MockRead( - query_name='test4', cigar=[(CIGAR.EQ, 30), (CIGAR.S, 120)], reference_start=2187 - ) - ) - evidence.split_reads[1].add( - MockRead( - query_name='test5', cigar=[(CIGAR.S, 30), (CIGAR.EQ, 120)], reference_start=2187 + reference_name='reference3', + query_name='test4', + cigarstring='30=120S', + reference_start=2187, ) ) evidence.split_reads[1].add( MockRead( - query_name='test1', - cigar=[(CIGAR.S, 30), (CIGAR.EQ, 120)], + reference_name='reference3', + query_name='test5', + cigarstring='30S120=', reference_start=2187, - reference_end=2307, - tags=[(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1)], ) ) + read = MockRead( + reference_name='reference3', + query_name='test1', + cigarstring='30S120=', + reference_start=2187, + ) + read.set_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1) + evidence.split_reads[1].add(read) evidence.flanking_pairs.add( mock_read_pair( MockRead( query_name='t1', - reference_id=3, + reference_name='reference3', reference_start=1200, - reference_end=1250, + is_reverse=True, + cigarstring='50M', + ), + MockRead( + reference_name='reference3', + reference_start=2250, + cigarstring='50M', is_reverse=True, ), - MockRead(reference_id=3, reference_start=2250, reference_end=2300, is_reverse=True), ) ) events = call._call_by_split_reads(evidence, event_type=SVTYPE.INV) for ev in events: - print(ev, ev.event_type, ev.call_method) + print(ev, ev.event_type, ev.call_method, [r.query_name for r in ev.support()]) assert len(events) == 1 event = events[0] assert len(event.flanking_pairs) == 1 @@ -1268,34 +1423,24 @@ def test_intrachromosomal_lr(self, left_right_ev): left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=19, - reference_end=60, - next_reference_start=599, - query_alignment_length=25, + reference_start=60 - 25, + cigarstring='25M', ), MockRead( reference_start=599, - reference_end=650, - next_reference_start=19, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=39, - reference_end=80, - next_reference_start=649, - query_alignment_length=25, + reference_start=80 - 25, + cigarstring='25M', ), MockRead( reference_start=649, - reference_end=675, - next_reference_start=39, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) @@ -1303,62 +1448,48 @@ def test_intrachromosomal_lr(self, left_right_ev): left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=39, - reference_end=50, - next_reference_start=91, - query_alignment_length=25, + reference_start=25, + cigarstring='25M', ), MockRead( reference_start=91, - reference_end=110, - next_reference_start=39, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) bpp = call._call_by_flanking_pairs(left_right_ev, SVTYPE.DEL) - print(bpp, bpp.flanking_pairs) + print(bpp, len(bpp.flanking_pairs)) assert bpp.break1.start == 80 - assert bpp.break1.end == 80 + 125 - 45 - assert bpp.break2.start == 600 - 125 + 75 + assert bpp.break1.end == 80 + 125 - 45 # 160 + assert bpp.break2.start == 600 - 125 + 75 # 550 assert bpp.break2.end == 600 def test_intrachromosomal_lr_coverage_overlaps_range(self, left_right_ev): # this test is for ensuring that if a theoretical window calculated for the # first breakpoint overlaps the actual coverage for the second breakpoint (or the reverse) # that we adjust the theoretical window accordingly + left_right_ev.median_fragment_size = 150 left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=21, - reference_end=60, - next_reference_start=80, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=80, - reference_end=120, - next_reference_start=21, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=41, - reference_end=80, - next_reference_start=110, - query_alignment_length=25, + reference_start=80 - 25, + cigarstring='25M', ), MockRead( reference_start=110, - reference_end=140, - next_reference_start=41, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) @@ -1367,16 +1498,11 @@ def test_intrachromosomal_lr_coverage_overlaps_range(self, left_right_ev): mock_read_pair( MockRead( reference_start=39, - reference_end=80, - next_reference_start=649, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=649, - reference_end=675, - next_reference_start=39, - query_alignment_length=25, - is_reverse=True, + cigarstring='25M', ), ) ) @@ -1387,19 +1513,19 @@ def test_intrachromosomal_lr_coverage_overlaps_range(self, left_right_ev): assert break2.end == 81 def test_intrachromosomal_flanking_coverage_overlap_error(self, left_right_ev): + """ + Cannot call with the flanking reads together since their coverage intervals overlap at the wrong ends + """ + left_right_ev.config['validate.min_flanking_pairs_resolution'] = 2 left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=19, - reference_end=60, - next_reference_start=599, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=599, - reference_end=650, - next_reference_start=19, - query_alignment_length=25, + cigarstring='25M', ), ) ) @@ -1407,35 +1533,35 @@ def test_intrachromosomal_flanking_coverage_overlap_error(self, left_right_ev): mock_read_pair( MockRead( reference_start=620, - reference_end=80, - next_reference_start=780, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=780, - reference_end=820, - next_reference_start=620, - query_alignment_length=25, + cigarstring='25M', ), ) ) - with pytest.raises(AssertionError): + with pytest.raises(AssertionError) as e_info: call._call_by_flanking_pairs(left_right_ev, SVTYPE.DEL) + assert ( + e_info.value.args[0] + == 'insufficient flanking pairs (1) to call deletion by flanking reads' + ) def test_coverage_larger_than_max_expected_variance_error(self, left_right_ev): + """ + Cannot group the reads to make a single call so the flanking call is below the acceptable resolution + """ + left_right_ev.config['validate.min_flanking_pairs_resolution'] = 2 left_right_ev.flanking_pairs.add( mock_read_pair( MockRead( reference_start=19, - reference_end=60, - next_reference_start=599, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=599, - reference_end=650, - next_reference_start=19, - query_alignment_length=25, + cigarstring='25M', ), ) ) @@ -1443,20 +1569,20 @@ def test_coverage_larger_than_max_expected_variance_error(self, left_right_ev): mock_read_pair( MockRead( reference_start=301, - reference_end=350, - next_reference_start=780, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=780, - reference_end=820, - next_reference_start=301, - query_alignment_length=25, + cigarstring='25M', ), ) ) - with pytest.raises(AssertionError): + with pytest.raises(AssertionError) as e_info: call._call_by_flanking_pairs(left_right_ev, SVTYPE.DEL) + assert ( + e_info.value.args[0] + == 'insufficient flanking pairs (1) to call deletion by flanking reads' + ) def test_close_to_zero(self, left_right_ev): # this test is for ensuring that if a theoretical window calculated for the @@ -1479,33 +1605,23 @@ def test_close_to_zero(self, left_right_ev): ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=19, - reference_end=60, - next_reference_start=149, - query_alignment_length=25, + reference_name='fake', reference_start=19, cigarstring='25M', is_reverse=True ), MockRead( - reference_start=149, - reference_end=150, - next_reference_start=19, - query_alignment_length=25, + reference_name='fake', reference_start=149, cigarstring='25M', is_reverse=True ), + proper_pair=False, ) ) ev.flanking_pairs.add( mock_read_pair( MockRead( - reference_start=39, - reference_end=80, - next_reference_start=199, - query_alignment_length=25, + reference_name='fake', reference_start=39, cigarstring='25M', is_reverse=True ), MockRead( - reference_start=199, - reference_end=200, - next_reference_start=39, - query_alignment_length=25, + reference_name='fake', reference_start=199, cigarstring='25M', is_reverse=True ), + proper_pair=False, ) ) break1, break2 = call._call_by_flanking_pairs(ev, SVTYPE.INV) @@ -1531,15 +1647,11 @@ def test_call_with_overlapping_coverage_intervals(self, left_right_ev): mock_read_pair( MockRead( reference_start=76186159, - reference_end=76186309, - next_reference_start=76186000, - query_alignment_length=25, + cigarstring='25M', ), MockRead( reference_start=76186000, - reference_end=76186150, - next_reference_start=76186159, - query_alignment_length=25, + cigarstring='25M', ), ) ) @@ -1589,6 +1701,7 @@ def test_call_deletion(self): pre_transcript = PreTranscript( [(1001, 1100), (1501, 1700), (2001, 2100), (2201, 2300)], strand='+' ) + pre_transcript.chr = '1' for patt in pre_transcript.generate_splicing_patterns(): pre_transcript.transcripts.append(Transcript(pre_transcript, patt)) evidence = self.build_transcriptome_evidence( @@ -1597,18 +1710,28 @@ def test_call_deletion(self): # now add the flanking pairs pair = mock_read_pair( MockRead( - 'name', '1', 951, 1051, is_reverse=False, query_alignment_length=50, is_read1=False + query_name='name', + reference_name='1', + reference_start=1001, + is_reverse=False, + cigarstring='50M', + ), + MockRead( + query_name='name', + reference_name='1', + reference_start=2299, + is_reverse=True, + cigarstring='50M', ), - MockRead('name', '1', 2299, 2399, is_reverse=True, query_alignment_length=50), + proper_pair=False, + reverse_order=True, ) print(read_pair_type(pair[0])) # following help in debugging the mockup assert not pair[0].is_reverse assert not pair[0].is_read1 - assert pair[0].is_read2 assert pair[1].is_reverse assert pair[1].is_read1 - assert not pair[1].is_read2 assert sequenced_strand(pair[0], 2) == STRAND.POS assert evidence.decide_sequenced_strand([pair[0]]) == STRAND.POS assert sequenced_strand(pair[1], 2) == STRAND.POS diff --git a/tests/test_mavis/validate/test_evidence.py b/tests/test_mavis/validate/test_evidence.py index 0c6c1420..56cb7b2a 100644 --- a/tests/test_mavis/validate/test_evidence.py +++ b/tests/test_mavis/validate/test_evidence.py @@ -11,6 +11,7 @@ from mavis.interval import Interval from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence, TranscriptomeEvidence +from mavis.validate.gather import collect_flanking_pair from mavis_config import DEFAULTS from ..mock import MockBamFileHandle, MockObject, MockRead, mock_read_pair @@ -202,8 +203,18 @@ class TestComputeFragmentSizes: def test_genomic_vs_trans_no_annotations(self, genomic_evidence, read_length, trans_evidence): # should be identical read, mate = mock_read_pair( - MockRead('name', '1', 1051 - read_length + 1, 1051, is_reverse=False), - MockRead('name', '1', 2300, 2300 + read_length - 1, is_reverse=True), + MockRead( + query_name='name', + reference_name='1', + reference_start=1051 - read_length + 1, + cigarstring=str(read_length) + 'M', + ), + MockRead( + query_name='name', + reference_name='1', + reference_start=2300, + cigarstring=str(read_length) + 'M', + ), ) assert genomic_evidence.compute_fragment_size( read, mate @@ -211,8 +222,18 @@ def test_genomic_vs_trans_no_annotations(self, genomic_evidence, read_length, tr def test_reverse_reads(self, genomic_evidence, trans_evidence): read, mate = mock_read_pair( - MockRead('name', '1', 1001, 1100, is_reverse=False), - MockRead('name', '1', 2201, 2301, is_reverse=True), + MockRead( + query_name='name', + reference_name='1', + reference_start=1001, + cigarstring='99M', + ), + MockRead( + query_name='name', + reference_name='1', + reference_start=2300, + cigarstring='100M', + ), ) assert genomic_evidence.compute_fragment_size(read, mate) == Interval(1300) assert genomic_evidence.compute_fragment_size(mate, read) == Interval(1300) @@ -613,58 +634,143 @@ def flanking_ge(read_length): class TestGenomeEvidenceAddReads: def test_collect_flanking_pair_error_unmapped_read(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 0, 900, 1000, is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', + reference_name='1', + reference_start=900, + cigarstring='100M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) read.is_unmapped = True with pytest.raises(ValueError): - flanking_ge.collect_flanking_pair(read, mate) + collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_error_mate_unmapped(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 0, 900, 1000, is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', + reference_name='1', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) mate.is_unmapped = True with pytest.raises(ValueError): - flanking_ge.collect_flanking_pair(read, mate) + collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_error_query_names_dont_match(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test1', 0, 900, 1000, is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test1', + reference_name='1', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) with pytest.raises(ValueError): - flanking_ge.collect_flanking_pair(read, mate) + collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_error_template_lengths_dont_match(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 0, 900, 1000, is_reverse=False, template_length=50), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', + reference_name='1', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + template_length=50, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) mate.template_length = 55 with pytest.raises(ValueError): - flanking_ge.collect_flanking_pair(read, mate) + collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_read_low_mq(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 0, 900, 1000, is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', + reference_name='1', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) read.mapping_quality = 0 - assert not flanking_ge.collect_flanking_pair(read, mate) + assert not collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_mate_low_mq(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 0, 900, 1000, is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', + reference_name='1', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) mate.mapping_quality = 0 - assert not flanking_ge.collect_flanking_pair(read, mate) + assert not collect_flanking_pair(flanking_ge, read, mate) def test_collect_flanking_pair_interchromosomal(self, flanking_ge): read, mate = mock_read_pair( - MockRead('test', 1, 900, 1000, is_reverse=False), - MockRead('test', 0, 6000, 6099, is_reverse=True), + MockRead( + query_name='test', + reference_name='2', + reference_start=900, + cigarstring=str(1000 - 900) + 'M', + is_reverse=False, + ), + MockRead( + query_name='test', + reference_name='1', + reference_start=6000, + cigarstring=str(6099 - 6000) + 'M', + is_reverse=True, + ), ) - assert not flanking_ge.collect_flanking_pair(read, mate) + assert not collect_flanking_pair(flanking_ge, read, mate) diff --git a/tests/test_mavis/validate/test_validate2.py b/tests/test_mavis/validate/test_validate2.py index 0ec4278d..3d0f471a 100644 --- a/tests/test_mavis/validate/test_validate2.py +++ b/tests/test_mavis/validate/test_validate2.py @@ -7,14 +7,23 @@ from mavis.constants import NA_MAPPING_QUALITY, ORIENT, PYSAM_READ_FLAGS from mavis.validate.base import Evidence from mavis.validate.evidence import GenomeEvidence +from mavis.validate.gather import collect_flanking_pair, collect_split_read, load_evidence from mavis_config import DEFAULTS from ...util import get_data, long_running_test -from ..mock import MockLongString, MockObject, MockRead, mock_read_pair +from ..mock import MockLongString, MockObject, MockRead, flags_from_number, mock_read_pair REFERENCE_GENOME = None +class MockBamCache(BamCache): + def get_read_reference_name(self, read): + if isinstance(read, MockRead): + return read.reference_name + else: + return BamCache.get_read_reference_name(self, read) + + def setUpModule(): global REFERENCE_GENOME REFERENCE_GENOME = load_reference_genome(get_data('mock_reference_genome.fa')) @@ -24,9 +33,9 @@ def setUpModule(): ): raise AssertionError('fake genome file does not have the expected contents') global BAM_CACHE - BAM_CACHE = BamCache(get_data('mini_mock_reads_for_events.sorted.bam')) + BAM_CACHE = MockBamCache(get_data('mini_mock_reads_for_events.sorted.bam')) global FULL_BAM_CACHE - FULL_BAM_CACHE = BamCache(get_data('mock_reads_for_events.sorted.bam')) + FULL_BAM_CACHE = MockBamCache(get_data('mock_reads_for_events.sorted.bam')) global READS READS = {} for read in BAM_CACHE.fetch('reference3', 1, 8000): @@ -125,7 +134,7 @@ def test_load_evidence_translocation(self): Breakpoint('reference19', 964, orient=ORIENT.LEFT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 14 assert self.count_original_reads(ev1.split_reads[1]) == 20 @@ -137,7 +146,7 @@ def test_load_evidence_translocation(self): Breakpoint('reference4', 2000, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 21 # one of the reads that appears to look good in the bam is too low quality % match @@ -152,7 +161,7 @@ def test_load_evidence_inversion(self): opposing_strands=True, ) - ev1.load_evidence() + load_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 54 assert self.count_original_reads(ev1.split_reads[1]) == 20 @@ -164,7 +173,7 @@ def test_load_evidence_inversion(self): Breakpoint('reference7', 19000, orient=ORIENT.RIGHT), opposing_strands=True, ) - ev1.load_evidence() + load_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[1]) == 15 assert self.count_original_reads(ev1.split_reads[0]) == 27 @@ -176,7 +185,7 @@ def test_load_evidence_duplication(self): Breakpoint('reference7', 11000, orient=ORIENT.LEFT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 35 @@ -190,7 +199,7 @@ def test_load_evidence_deletion1(self): Breakpoint('reference20', 6000, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert len(ev1.flanking_pairs) == 49 @@ -204,7 +213,7 @@ def test_load_evidence_deletion2(self): Breakpoint('referenceX', 6000, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 4 @@ -218,7 +227,7 @@ def test_load_evidence_deletion3(self): Breakpoint('referenceX', 14000, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 8 @@ -232,7 +241,7 @@ def test_load_evidence_deletion4(self): Breakpoint('reference10', 3818, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) assert self.count_original_reads(ev1.split_reads[0]) == 20 @@ -246,7 +255,7 @@ def test_load_evidence_small_deletion1(self): Breakpoint('reference11', 6003, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -264,7 +273,7 @@ def test_load_evidence_small_deletion2(self): Breakpoint('reference11', 10030, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -283,7 +292,7 @@ def test_load_evidence_small_deletion_test1(self): Breakpoint('reference12', 2120, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -302,7 +311,7 @@ def test_load_evidence_small_deletion_test2(self): Breakpoint('reference10', 3818, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) assert self.count_original_reads(ev1.split_reads[0]) == 20 assert self.count_original_reads(ev1.split_reads[1]) == 18 @@ -315,7 +324,7 @@ def test_load_evidence_small_deletion_test3(self): Breakpoint('reference10', 8927, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -334,7 +343,7 @@ def test_load_evidence_small_deletion_test4(self): Breakpoint('reference10', 13123, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -353,7 +362,7 @@ def test_load_evidence_small_deletion_test5(self): Breakpoint('reference10', 17899, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -372,7 +381,7 @@ def test_load_evidence_small_deletion_test6(self): Breakpoint('reference10', 24330, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) print(len(ev1.spanning_reads)) @@ -389,7 +398,7 @@ def test_load_evidence_small_deletion_test7(self): Breakpoint('reference10', 31827, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -407,7 +416,7 @@ def test_load_evidence_small_deletion_test8(self): Breakpoint('reference10', 42159, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -426,7 +435,7 @@ def test_load_evidence_complex_deletion(self): Breakpoint('reference12', 6016, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -449,7 +458,7 @@ def test_load_evidence_small_insertion(self): Breakpoint('reference1', 2001, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -469,7 +478,7 @@ def test_load_evidence_small_insertion_high_coverage(self): Breakpoint('reference9', 2001, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -487,7 +496,7 @@ def test_load_evidence_small_insertion_high_coverage(self): Breakpoint('reference16', 2001, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -506,7 +515,7 @@ def test_load_evidence_small_duplication(self): Breakpoint('reference12', 10021, orient=ORIENT.LEFT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -525,7 +534,7 @@ def test_load_evidence_small_duplication(self): Breakpoint('reference17', 2020, orient=ORIENT.LEFT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs), len(ev1.spanning_reads)) @@ -544,7 +553,7 @@ def test_load_evidence_low_qual_deletion(self): Breakpoint('reference19', 5219, 5219, orient=ORIENT.RIGHT), opposing_strands=False, ) - ev1.load_evidence() + load_evidence(ev1) self.print_evidence(ev1) print(len(ev1.spanning_reads)) print(len(ev1.split_reads[0]), len(ev1.flanking_pairs)) @@ -576,57 +585,49 @@ class TestEvidenceGathering: def test_collect_split_read(self, ev_gathering_setup): ev1_sr = MockRead( query_name='HISEQX1_11:3:1105:15351:25130:split', - reference_id=1, - cigar=[(4, 68), (7, 82)], + reference_name='reference3', + cigarstring='68S82=', reference_start=1114, - reference_end=1154, - query_alignment_start=110, query_sequence='TCGTGAGTGGCAGGTGCCATCGTGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTG', - query_alignment_end=150, - flag=113, - next_reference_id=1, + **flags_from_number(113), + next_reference_name='1', next_reference_start=2341, ) - ev_gathering_setup.collect_split_read(ev1_sr, True) + collect_split_read(ev_gathering_setup, ev1_sr, True) + assert len(ev_gathering_setup.split_reads[0]) > 0 assert list(ev_gathering_setup.split_reads[0])[0] == ev1_sr def test_collect_split_read_failure(self, ev_gathering_setup): # wrong cigar string ev1_sr = MockRead( query_name='HISEQX1_11:4:1203:3062:55280:split', - reference_id=1, - cigar=[(7, 110), (7, 40)], + reference_name='reference3', + cigarstring='40=110S', reference_start=1114, - reference_end=1154, - query_alignment_start=110, query_sequence='CTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCGTGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATG', - query_alignment_end=150, - flag=371, - next_reference_id=1, + **flags_from_number(371), + next_reference_name='1', next_reference_start=2550, ) - assert not ev_gathering_setup.collect_split_read(ev1_sr, True) + assert not collect_split_read(ev_gathering_setup, ev1_sr, True) def test_collect_flanking_pair(self, ev_gathering_setup): - ev_gathering_setup.collect_flanking_pair( - MockRead( - reference_id=1, - reference_start=2214, - reference_end=2364, - is_reverse=True, - next_reference_id=1, - next_reference_start=1120, - mate_is_reverse=True, - ), - MockRead( - reference_id=1, - reference_start=1120, - reference_end=2364, - is_reverse=True, - next_reference_id=1, - next_reference_start=1120, - mate_is_reverse=True, - is_read1=False, + collect_flanking_pair( + ev_gathering_setup, + *mock_read_pair( + MockRead( + reference_name='reference3', + reference_start=2214, + cigarstring=str(2364 - 2214) + 'M', + is_reverse=True, + ), + MockRead( + reference_name='reference3', + reference_start=1120, + cigarstring=str(2364 - 1120) + 'M', + is_reverse=True, + ), + proper_pair=False, ), ) assert len(ev_gathering_setup.flanking_pairs) == 1 @@ -635,15 +636,26 @@ def test_collect_flanking_pair_not_overlapping_evidence_window(self, ev_gatherin # first read in pair does not overlap the first evidence window # therefore this should return False and not add to the flanking_pairs pair = mock_read_pair( - MockRead(reference_id=1, reference_start=1903, reference_end=2053, is_reverse=True), - MockRead(reference_id=1, reference_start=2052, reference_end=2053, is_reverse=True), + MockRead( + reference_name='reference3', + reference_start=1903, + cigarstring=str(2053 - 1903) + 'M', + is_reverse=True, + ), + MockRead( + reference_name='reference3', + reference_start=2052, + cigarstring=str(2053 - 2052) + 'M', + is_reverse=True, + ), + proper_pair=False, ) - assert not ev_gathering_setup.collect_flanking_pair(*pair) + assert not collect_flanking_pair(ev_gathering_setup, *pair) assert len(ev_gathering_setup.flanking_pairs) == 0 def test_load_evidence(self, ev_gathering_setup): print(ev_gathering_setup) - ev_gathering_setup.load_evidence() + load_evidence(ev_gathering_setup) print(ev_gathering_setup.spanning_reads) assert ( len( @@ -669,41 +681,59 @@ def test_load_evidence(self, ev_gathering_setup): def test_assemble_split_reads(self, ev_gathering_setup): sr1 = MockRead( + cigarstring='150M', + reference_start=1, query_name='HISEQX1_11:3:1105:15351:25130:split', query_sequence='TCGTGAGTGGCAGGTGCCATCGTGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTG', - flag=113, + **flags_from_number(113), ) sr2 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='GTCGTGAGTGGCAGGTGCCATCGTGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTT', - flag=121, + **flags_from_number(121), ) sr3 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='TCGTGAGTGGCAGGTGCCATCGTGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTG', - flag=113, + **flags_from_number(113), ) sr7 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='TGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATA', - flag=113, + **flags_from_number(113), ) sr9 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='TGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGC', - flag=113, + **flags_from_number(113), ) sr12 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='GATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAG', - flag=113, + **flags_from_number(113), ) sr15 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='GTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACA', - flag=113, + **flags_from_number(113), ) sr19 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='TGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCA', - flag=113, + **flags_from_number(113), ) sr24 = MockRead( + cigarstring='150M', + reference_start=1, query_sequence='CTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTT', - flag=113, + **flags_from_number(113), ) ev_gathering_setup.split_reads = ( {sr1},