diff --git a/notebooks/getting_started/1_Quick_Start.ipynb b/notebooks/getting_started/1_Quick_Start.ipynb index 98fe1e23..e4c22d55 100644 --- a/notebooks/getting_started/1_Quick_Start.ipynb +++ b/notebooks/getting_started/1_Quick_Start.ipynb @@ -2,8 +2,15 @@ "cells": [ { "cell_type": "markdown", + "id": "7f11c55b5ce145ee", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ - "# 1 Quick Start\n", + "# 1. Quick Start\n", "This notebook provides instructions on how to get your vrs-python environment up and running with as few\n", "steps as possible, and to provide some rudimentary examples to prove it is working properly.\n", "\n", @@ -18,17 +25,16 @@ " UTA_DB_URL=\"postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20210129b\"\n", "\n", "**NOTE** The external sources for the SeqRepo and UTA repositories are **ONLY** to be used as part of this notebook series and are not meant for use in production code. Please refer to the links above and follow the directions provided on how to setup local instances." 
- ], - "metadata": { - "collapsed": false - }, - "id": "7f11c55b5ce145ee" + ] }, { "cell_type": "markdown", "id": "91f92a3e35bd48a1", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "#### Step 1 - Setup Data Proxy Access\n", @@ -40,10 +46,13 @@ "execution_count": 1, "id": "37130d69b9dbd9d1", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:21:17.340604Z", "start_time": "2024-04-18T20:21:17.207979Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, "outputs": [], @@ -57,7 +66,10 @@ "cell_type": "markdown", "id": "a1d5bbc3a77ff03f", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "Assert that the UTA URL is defined in the environment" @@ -68,10 +80,13 @@ "execution_count": 2, "id": "b11653c9aae4ecba", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:21:17.343195Z", "start_time": "2024-04-18T20:21:17.341690Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, "outputs": [], @@ -84,7 +99,10 @@ "cell_type": "markdown", "id": "98ab29e5ac01cc3a", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "#### Step 2 - Setup an Allele Translator\n", @@ -96,10 +114,13 @@ "execution_count": 3, "id": "42bd6d6f09916724", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:21:17.393606Z", "start_time": "2024-04-18T20:21:17.343980Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, "outputs": [], @@ -112,7 +133,10 @@ "cell_type": "markdown", "id": "88d9aba51e44ae0", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "#### Step 3 - Translate variation representations to VRS\n", @@ -125,16 +149,31 @@ "execution_count": 4, "id": "af7a8f1509acf4ed", "metadata": { - 
"collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:21:17.525029Z", "start_time": "2024-04-18T20:21:17.394248Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:VA.LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n 'type': 'Allele',\n 'digest': 'LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n 'location': {'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n 'type': 'SequenceLocation',\n 'digest': 'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n 'start': 80656509,\n 'end': 80656510},\n 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'TT'}}" + "text/plain": [ + "{'id': 'ga4gh:VA.LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'type': 'Allele',\n", + " 'digest': 'LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'location': {'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'type': 'SequenceLocation',\n", + " 'digest': 'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'sequenceReference': {'type': 'SequenceReference',\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656509,\n", + " 'end': 80656510},\n", + " 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'TT'}}" + ] }, "execution_count": 4, "metadata": {}, @@ -150,7 +189,10 @@ "cell_type": "markdown", "id": "6fa7cb2ac050547f", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "The output above is the JSON structure of an *Allele* in VRS form. You should be able to recognize the *Allele*, *SequenceLocation*, *SequenceReference* and *LiteralSequenceLocation* classes. 
\n", @@ -163,16 +205,31 @@ "execution_count": 5, "id": "42430c680fe262c1", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:21:18.708118Z", "start_time": "2024-04-18T20:21:17.526585Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:VA.LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n 'type': 'Allele',\n 'digest': 'LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n 'location': {'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n 'type': 'SequenceLocation',\n 'digest': 'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n 'start': 80656509,\n 'end': 80656510},\n 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'TT'}}" + "text/plain": [ + "{'id': 'ga4gh:VA.LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'type': 'Allele',\n", + " 'digest': 'LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'location': {'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'type': 'SequenceLocation',\n", + " 'digest': 'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'sequenceReference': {'type': 'SequenceReference',\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656509,\n", + " 'end': 80656510},\n", + " 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'TT'}}" + ] }, "execution_count": 5, "metadata": {}, @@ -186,29 +243,35 @@ }, { "cell_type": "markdown", - "source": [ - "The VRS variant representations should be the same." - ], + "id": "2c6d2303b4bda87c", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "2c6d2303b4bda87c" + "source": [ + "The VRS variant representations should be the same." 
+ ] }, { "cell_type": "code", - "outputs": [], - "source": [ - "assert(vrs_from_hgvs == vrs_from_spdi)" - ], + "execution_count": 6, + "id": "3c91be2bdec6b4be", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:21:18.713821Z", "start_time": "2024-04-18T20:21:18.710176Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "3c91be2bdec6b4be", - "execution_count": 6 + "outputs": [], + "source": [ + "assert(vrs_from_hgvs == vrs_from_spdi)" + ] } ], "metadata": { @@ -227,7 +290,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/getting_started/2_Exploring_the_SeqRepo_DataProxy.ipynb b/notebooks/getting_started/2_Exploring_the_SeqRepo_DataProxy.ipynb index 93dbf1d3..f5e81a09 100644 --- a/notebooks/getting_started/2_Exploring_the_SeqRepo_DataProxy.ipynb +++ b/notebooks/getting_started/2_Exploring_the_SeqRepo_DataProxy.ipynb @@ -2,70 +2,99 @@ "cells": [ { "cell_type": "markdown", - "source": [ - "# 2 Exploring the SeqRepo DataProxy\n", - "The SeqRepo DataProxy has sequence related functionality that may be of use." - ], + "id": "4107043be5d9af0d", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "4107043be5d9af0d" + "source": [ + "# 2. Exploring the SeqRepo DataProxy\n", + "The SeqRepo DataProxy has sequence related functionality that may be of use." + ] }, { "cell_type": "markdown", + "id": "2cd1dae76c042895", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "#### Step 1 - Setup Data Proxy Access\n", "The *DataProxy* provides access to sequence references." 
- ], - "metadata": { - "collapsed": false - }, - "id": "2cd1dae76c042895" + ] }, { "cell_type": "code", - "outputs": [], - "source": [ - "from ga4gh.vrs.dataproxy import create_dataproxy\n", - "seqrepo_rest_service_url = \"seqrepo+https://services.genomicmedlab.org/seqrepo\"\n", - "seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)" - ], + "execution_count": 1, + "id": "f5057501e0ff48aa", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:22:19.380351Z", "start_time": "2024-04-18T20:22:19.248562Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "f5057501e0ff48aa", - "execution_count": 1 + "outputs": [], + "source": [ + "from ga4gh.vrs.dataproxy import create_dataproxy\n", + "seqrepo_rest_service_url = \"seqrepo+https://services.genomicmedlab.org/seqrepo\"\n", + "seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)" + ] }, { "cell_type": "markdown", - "source": [ - "#### Step 2 - Information on refseq accessions" - ], + "id": "80b8074e25870aee", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "80b8074e25870aee" + "source": [ + "#### Step 2 - Information on refseq accessions" + ] }, { "cell_type": "markdown", - "source": [ - "It is often necessary when building *SequenceLocation* objects, to obtain the refget accession from a public accession identifier. The *DataProxy* method *derive_refget_accession* can do this for you." - ], + "id": "efdf07f650059a11", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "efdf07f650059a11" + "source": [ + "It is often necessary when building *SequenceLocation* objects, to obtain the refget accession from a public accession identifier. The *DataProxy* method *derive_refget_accession* can do this for you." 
+ ] }, { "cell_type": "code", + "execution_count": 2, + "id": "77fa2f312e39d4a3", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:22:19.515713Z", + "start_time": "2024-04-18T20:22:19.381606Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "'SQ.Pw3Ch0x3XWD6ljsnIfmk_NERcZCI9sNM'" + "text/plain": [ + "'SQ.Pw3Ch0x3XWD6ljsnIfmk_NERcZCI9sNM'" + ] }, "execution_count": 2, "metadata": {}, @@ -74,33 +103,51 @@ ], "source": [ "seqrepo_dataproxy.derive_refget_accession('refseq:NM_002439.5')" - ], + ] + }, + { + "cell_type": "markdown", + "id": "203aada74390820e", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:22:19.515713Z", - "start_time": "2024-04-18T20:22:19.381606Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "77fa2f312e39d4a3", - "execution_count": 2 - }, - { - "cell_type": "markdown", "source": [ "The *DataProxy* *get_metadata* method provides metadata information on the accession including: the date the accession was added, aliases for the accession and reference length." 
- ], - "metadata": { - "collapsed": false - }, - "id": "203aada74390820e" + ] }, { "cell_type": "code", + "execution_count": 3, + "id": "bdb9122059add31c", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:22:19.694430Z", + "start_time": "2024-04-18T20:22:19.516491Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'added': '2016-08-24T05:03:11Z',\n 'aliases': ['MD5:215137b1973c1a5afcf86be7d999574a',\n 'NCBI:NM_000551.3',\n 'refseq:NM_000551.3',\n 'SEGUID:T12L0p2X5E8DbnL0+SwI4Wc1S6g',\n 'SHA1:4f5d8bd29d97e44f036e72f4f92c08e167354ba8',\n 'VMC:GS_v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_',\n 'sha512t24u:v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_',\n 'ga4gh:SQ.v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_'],\n 'alphabet': 'ACGT',\n 'length': 4560}" + "text/plain": [ + "{'added': '2016-08-24T05:03:11Z',\n", + " 'aliases': ['MD5:215137b1973c1a5afcf86be7d999574a',\n", + " 'NCBI:NM_000551.3',\n", + " 'refseq:NM_000551.3',\n", + " 'SEGUID:T12L0p2X5E8DbnL0+SwI4Wc1S6g',\n", + " 'SHA1:4f5d8bd29d97e44f036e72f4f92c08e167354ba8',\n", + " 'VMC:GS_v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_',\n", + " 'sha512t24u:v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_',\n", + " 'ga4gh:SQ.v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_'],\n", + " 'alphabet': 'ACGT',\n", + " 'length': 4560}" + ] }, "execution_count": 3, "metadata": {}, @@ -109,33 +156,41 @@ ], "source": [ "seqrepo_dataproxy.get_metadata(\"refseq:NM_000551.3\")" - ], + ] + }, + { + "cell_type": "markdown", + "id": "e73504bc095b1ff1", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:22:19.694430Z", - "start_time": "2024-04-18T20:22:19.516491Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "bdb9122059add31c", - "execution_count": 3 - }, - { - "cell_type": "markdown", "source": [ "*DataProxy* *get_sequence* returns actual sequence for given identifier, optionally limited to interbase intervals." 
- ], - "metadata": { - "collapsed": false - }, - "id": "e73504bc095b1ff1" + ] }, { "cell_type": "code", + "execution_count": 4, + "id": "7d5563eae9dd9e58", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:22:19.833640Z", + "start_time": "2024-04-18T20:22:19.695452Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "'CCTCGCCTCCGTTACAACGGCCTACGGTGCTGGAGGATCCTTCTGCGCACG'" + "text/plain": [ + "'CCTCGCCTCCGTTACAACGGCCTACGGTGCTGGAGGATCCTTCTGCGCACG'" + ] }, "execution_count": 4, "metadata": {}, @@ -145,33 +200,41 @@ "source": [ "identifier = \"ga4gh:SQ.v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_\"\n", "seqrepo_dataproxy.get_sequence(identifier, start=0, end=51)" - ], + ] + }, + { + "cell_type": "markdown", + "id": "7c1e8515d46f0fac", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:22:19.833640Z", - "start_time": "2024-04-18T20:22:19.695452Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "7d5563eae9dd9e58", - "execution_count": 4 - }, - { - "cell_type": "markdown", "source": [ "*DataProxy* *translate_sequence_identifier* returns a list of equivalent identifiers in the given namespace." 
- ], - "metadata": { - "collapsed": false - }, - "id": "7c1e8515d46f0fac" + ] }, { "cell_type": "code", + "execution_count": 5, + "id": "9ad294ec1b92bd86", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:22:19.940602Z", + "start_time": "2024-04-18T20:22:19.836067Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "['ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl']" + "text/plain": [ + "['ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl']" + ] }, "execution_count": 5, "metadata": {}, @@ -180,23 +243,28 @@ ], "source": [ "seqrepo_dataproxy.translate_sequence_identifier(\"GRCh38:19\", \"ga4gh\")" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a16ba639ccca0323", "metadata": { - "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-18T20:22:19.940602Z", - "start_time": "2024-04-18T20:22:19.836067Z" + "end_time": "2024-04-18T20:22:20.069679Z", + "start_time": "2024-04-18T20:22:19.941670Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "9ad294ec1b92bd86", - "execution_count": 5 - }, - { - "cell_type": "code", "outputs": [ { "data": { - "text/plain": "['GRCh38:19', 'GRCh38:chr19']" + "text/plain": [ + "['GRCh38:19', 'GRCh38:chr19']" + ] }, "execution_count": 6, "metadata": {}, @@ -205,35 +273,26 @@ ], "source": [ "seqrepo_dataproxy.translate_sequence_identifier(\"ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\", \"GRCh38\")" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:22:20.069679Z", - "start_time": "2024-04-18T20:22:19.941670Z" - } - }, - "id": "a16ba639ccca0323", - "execution_count": 6 + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": 
"python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/getting_started/3_Basic_Models.ipynb b/notebooks/getting_started/3_Basic_Models.ipynb index 70871290..87931317 100644 --- a/notebooks/getting_started/3_Basic_Models.ipynb +++ b/notebooks/getting_started/3_Basic_Models.ipynb @@ -2,94 +2,127 @@ "cells": [ { "cell_type": "markdown", - "source": [ - "# 3 Basic Models\n", - "This notebook details how to compose VRS objects using component classes, not by use of a nomenclature string (HGVS/SPDI/Gnomad-VCF).\n" - ], + "id": "3b784af70aaada45", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "3b784af70aaada45" + "source": [ + "# 3. Basic Models\n", + "This notebook details how to compose VRS objects using component classes, not by use of a nomenclature string (HGVS/SPDI/Gnomad-VCF).\n" + ] }, { "cell_type": "markdown", + "id": "6567dd63bcbe6969", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "#### Step 1 - Setup Data Proxy Access\n", "The *DataProxy* provides access to sequence references." 
- ], - "metadata": { - "collapsed": false - }, - "id": "6567dd63bcbe6969" + ] }, { "cell_type": "code", - "outputs": [], - "source": [ - "from ga4gh.vrs.dataproxy import create_dataproxy\n", - "seqrepo_rest_service_url = \"seqrepo+https://services.genomicmedlab.org/seqrepo\"\n", - "seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)" - ], + "execution_count": 1, + "id": "4347f39231fd663c", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:23:57.240826Z", "start_time": "2024-04-18T20:23:57.106544Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "4347f39231fd663c", - "execution_count": 1 + "outputs": [], + "source": [ + "from ga4gh.vrs.dataproxy import create_dataproxy\n", + "seqrepo_rest_service_url = \"seqrepo+https://services.genomicmedlab.org/seqrepo\"\n", + "seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)" + ] }, { "cell_type": "markdown", + "id": "ae2e3ada0adaccdf", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "#### Step 2 - Access the VRS models package\n", "The models package contains the various classes necessary for building VRS objects." 
- ], - "metadata": { - "collapsed": false - }, - "id": "ae2e3ada0adaccdf" + ] }, { "cell_type": "code", - "outputs": [], - "source": [ - "from ga4gh.vrs import models" - ], + "execution_count": 2, + "id": "da9f8a5f19d1ea35", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:23:57.243477Z", "start_time": "2024-04-18T20:23:57.241894Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "da9f8a5f19d1ea35", - "execution_count": 2 + "outputs": [], + "source": [ + "from ga4gh.vrs import models" + ] }, { "cell_type": "markdown", + "id": "63f99fabb02e236f", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "#### Step 3 - Build the Allele\n", "In this example we are going to build a VRS object from the variant \"NC_000005.10:g.80656510delinsTT\". This variant can be viewed in [Clinvar](https://www.ncbi.nlm.nih.gov/clinvar/variation/2673535/)." - ], - "metadata": { - "collapsed": false - }, - "id": "63f99fabb02e236f" + ] }, { "cell_type": "markdown", - "source": [ - "Start by getting the VRS string representation of the sequence reference using the *DataProxy* object." - ], + "id": "c1a3db39d8d2d3ea", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "c1a3db39d8d2d3ea" + "source": [ + "Start by getting the VRS string representation of the sequence reference using the *DataProxy* object." 
+ ] }, { "cell_type": "code", + "execution_count": 3, + "id": "320242aa48ef314d", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:23:57.304096Z", + "start_time": "2024-04-18T20:23:57.244087Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "name": "stdout", @@ -102,33 +135,47 @@ "source": [ "refget_accession = seqrepo_dataproxy.derive_refget_accession('refseq:NM_002439.5')\n", "print(refget_accession)" - ], + ] + }, + { + "cell_type": "markdown", + "id": "6ac6feb2b9ffb0c", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:23:57.304096Z", - "start_time": "2024-04-18T20:23:57.244087Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "320242aa48ef314d", - "execution_count": 3 - }, - { - "cell_type": "markdown", "source": [ "Build a dictionary of type *SequenceReference* containing the refget_accession. Then continue in succession building dictionaries of type *SequenceLocation*, *LiteralSequenceExpression* and *Allele* referencing previously built structures where applicable." 
- ], - "metadata": { - "collapsed": false - }, - "id": "6ac6feb2b9ffb0c" + ] }, { "cell_type": "code", + "execution_count": 4, + "id": "445983b1043c504f", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:23:57.308698Z", + "start_time": "2024-04-18T20:23:57.304734Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'type': 'Allele',\n 'location': {'type': 'SequenceLocation',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.Pw3Ch0x3XWD6ljsnIfmk_NERcZCI9sNM'},\n 'start': 80656509,\n 'end': 80656510},\n 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'TT'}}" + "text/plain": [ + "{'type': 'Allele',\n", + " 'location': {'type': 'SequenceLocation',\n", + " 'sequenceReference': {'type': 'SequenceReference',\n", + " 'refgetAccession': 'SQ.Pw3Ch0x3XWD6ljsnIfmk_NERcZCI9sNM'},\n", + " 'start': 80656509,\n", + " 'end': 80656510},\n", + " 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'TT'}}" + ] }, "execution_count": 4, "metadata": {}, @@ -160,29 +207,35 @@ "}\n", "allele = models.Allele(**allele_dict)\n", "allele.model_dump(exclude_none=True)" - ], + ] + }, + { + "cell_type": "markdown", + "id": "b69c827d943f1ef1", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:23:57.308698Z", - "start_time": "2024-04-18T20:23:57.304734Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "445983b1043c504f", - "execution_count": 4 - }, - { - "cell_type": "markdown", "source": [ "The *Allele* object is displayed above. Since it was built from component dictionaries, it is not yet complete as not all the identifiable objects have VRS identifiers. Note that not all objects in the Allele object are VRS identifiable." 
- ], - "metadata": { - "collapsed": false - }, - "id": "b69c827d943f1ef1" + ] }, { "cell_type": "code", + "execution_count": 5, + "id": "60b41e56a20d4e10", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:23:57.312561Z", + "start_time": "2024-04-18T20:23:57.310213Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "name": "stdout", @@ -205,34 +258,52 @@ "literal_sequence_expression = models.LiteralSequenceExpression(**literal_sequence_expression_dict)\n", "is_identifiable(literal_sequence_expression)\n", "is_identifiable(allele)" - ], + ] + }, + { + "cell_type": "markdown", + "id": "91619f7666c0bcf2", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:23:57.312561Z", - "start_time": "2024-04-18T20:23:57.310213Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "60b41e56a20d4e10", - "execution_count": 5 - }, - { - "cell_type": "markdown", "source": [ "#### Step 4 - Compute the identifiers\n", "To make the *Allele* object a valid VRS object - that is that all identifiable objects have valid VRS identifiers - is to use the *ga4gh_identify* method on the identifiable objects (*SequenceLocation*, and *Allele*)." 
- ], - "metadata": { - "collapsed": false - }, - "id": "91619f7666c0bcf2" + ] }, { "cell_type": "code", + "execution_count": 6, + "id": "5ad675932601aa94", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:23:57.315549Z", + "start_time": "2024-04-18T20:23:57.313165Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:VA.5C67OBmCLuHPgDkCQj7EOMih58BS2Eor',\n 'type': 'Allele',\n 'digest': '5C67OBmCLuHPgDkCQj7EOMih58BS2Eor',\n 'location': {'id': 'ga4gh:SL.lGxOP1JRd4dysmrOVaskO5P_35DyCLnx',\n 'type': 'SequenceLocation',\n 'digest': 'lGxOP1JRd4dysmrOVaskO5P_35DyCLnx',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.Pw3Ch0x3XWD6ljsnIfmk_NERcZCI9sNM'},\n 'start': 80656509,\n 'end': 80656510},\n 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'TT'}}" + "text/plain": [ + "{'id': 'ga4gh:VA.5C67OBmCLuHPgDkCQj7EOMih58BS2Eor',\n", + " 'type': 'Allele',\n", + " 'digest': '5C67OBmCLuHPgDkCQj7EOMih58BS2Eor',\n", + " 'location': {'id': 'ga4gh:SL.lGxOP1JRd4dysmrOVaskO5P_35DyCLnx',\n", + " 'type': 'SequenceLocation',\n", + " 'digest': 'lGxOP1JRd4dysmrOVaskO5P_35DyCLnx',\n", + " 'sequenceReference': {'type': 'SequenceReference',\n", + " 'refgetAccession': 'SQ.Pw3Ch0x3XWD6ljsnIfmk_NERcZCI9sNM'},\n", + " 'start': 80656509,\n", + " 'end': 80656510},\n", + " 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'TT'}}" + ] }, "execution_count": 6, "metadata": {}, @@ -244,45 +315,39 @@ "allele.location.id = ga4gh_identify(allele.location)\n", "allele.id = ga4gh_identify(allele)\n", "allele.model_dump(exclude_none=True)" - ], + ] + }, + { + "cell_type": "markdown", + "id": "37b5e28820b700fd", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:23:57.315549Z", - "start_time": "2024-04-18T20:23:57.313165Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "5ad675932601aa94", - "execution_count": 6 - }, - { - "cell_type": 
"markdown", "source": [ "The output of the *Allele* object represents a complete VRS allele with VRS identifiers and digests on all of the identifiable objects." - ], - "metadata": { - "collapsed": false - }, - "id": "37b5e28820b700fd" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/getting_started/4_Exploring_the_AlleleTranslator.ipynb b/notebooks/getting_started/4_Exploring_the_AlleleTranslator.ipynb index c04f9fe3..d8542b1e 100644 --- a/notebooks/getting_started/4_Exploring_the_AlleleTranslator.ipynb +++ b/notebooks/getting_started/4_Exploring_the_AlleleTranslator.ipynb @@ -2,115 +2,163 @@ "cells": [ { "cell_type": "markdown", + "id": "f3a35f19da823af8", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ - "# 4 Exploring the AlleleTranslator\n", + "# 4. Exploring the AlleleTranslator\n", "There are four variant nomenclatures available in the vrs-python *AlleleTranslator*: SPDI, gnomad/VCF, Beacon and HGVS. In this notebook we will perform a simple Allele translation for each. We will use each of the four nomenclatures for a single variant in translating variants to VRS. his variant can be viewed in \n", "[ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/variation/652570) and in [gnomAD](https://gnomad.broadinstitute.org/variant/5-80656489-C-T)." 
- ], - "metadata": { - "collapsed": false - }, - "id": "f3a35f19da823af8" + ] }, { "cell_type": "markdown", + "id": "5b3ec967f1e08834", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "#### Step 1 - Setup Data Proxy Access\n", "The *DataProxy* provides access to sequence references." - ], - "metadata": { - "collapsed": false - }, - "id": "5b3ec967f1e08834" + ] }, { "cell_type": "code", - "outputs": [], - "source": [ - "from ga4gh.vrs.dataproxy import create_dataproxy\n", - "seqrepo_rest_service_url = \"seqrepo+https://services.genomicmedlab.org/seqrepo\"\n", - "seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)" - ], + "execution_count": 1, + "id": "4dd605526ab7227e", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-19T17:13:46.243963Z", "start_time": "2024-04-19T17:13:46.110956Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "4dd605526ab7227e", - "execution_count": 1 + "outputs": [], + "source": [ + "from ga4gh.vrs.dataproxy import create_dataproxy\n", + "seqrepo_rest_service_url = \"seqrepo+https://services.genomicmedlab.org/seqrepo\"\n", + "seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)" + ] }, { "cell_type": "markdown", - "source": [ - "Import the *AlleleTranslator* class." - ], + "id": "5f7ac7c602d40af7", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "5f7ac7c602d40af7" + "source": [ + "Import the *AlleleTranslator* class." 
+ ] }, { "cell_type": "code", - "outputs": [], - "source": [ - "from ga4gh.vrs.extras.translator import AlleleTranslator" - ], + "execution_count": 2, + "id": "24b10ea2d6ae9b0b", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-19T17:13:46.295212Z", "start_time": "2024-04-19T17:13:46.245063Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "24b10ea2d6ae9b0b", - "execution_count": 2 + "outputs": [], + "source": [ + "from ga4gh.vrs.extras.translator import AlleleTranslator" + ] }, { "cell_type": "markdown", - "source": [ - "The UTA server is required in the environment since we are translating from/to HGVS." - ], + "id": "6381d3a17866d76d", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "6381d3a17866d76d" + "source": [ + "The UTA server is required in the environment since we are translating from/to HGVS." + ] }, { "cell_type": "code", - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"UTA_DB_URL\"] = \"postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20210129b\"" - ], + "execution_count": 3, + "id": "864e5a9c0ee98257", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-19T17:13:46.297528Z", "start_time": "2024-04-19T17:13:46.295903Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "864e5a9c0ee98257", - "execution_count": 3 + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"UTA_DB_URL\"] = \"postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20210129b\"" + ] }, { "cell_type": "markdown", + "id": "87ddbc19e906119f", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "#### From/To HGVS\n", "This example will translate an HGVS variant to VRS using the *AlleleTranslator* *translate_from* method." 
- ], - "metadata": { - "collapsed": false - }, - "id": "87ddbc19e906119f" + ] }, { "cell_type": "code", + "execution_count": 4, + "id": "925b01dd4764ed33", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-19T17:13:48.351951Z", + "start_time": "2024-04-19T17:13:46.298147Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:VA.ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n 'type': 'Allele',\n 'digest': 'ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n 'location': {'id': 'ga4gh:SL.JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n 'type': 'SequenceLocation',\n 'digest': 'JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n 'start': 80656488,\n 'end': 80656489},\n 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}" + "text/plain": [ + "{'id': 'ga4gh:VA.ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n", + " 'type': 'Allele',\n", + " 'digest': 'ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n", + " 'location': {'id': 'ga4gh:SL.JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n", + " 'type': 'SequenceLocation',\n", + " 'digest': 'JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n", + " 'sequenceReference': {'type': 'SequenceReference',\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656488,\n", + " 'end': 80656489},\n", + " 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}" + ] }, "execution_count": 4, "metadata": {}, @@ -121,33 +169,41 @@ "allele_translator = AlleleTranslator(data_proxy=seqrepo_dataproxy)\n", "allele = allele_translator.translate_from(\"NC_000005.10:g.80656489C>T\", \"hgvs\")\n", "allele.model_dump(exclude_none=True)" - ], + ] + }, + { + "cell_type": "markdown", + "id": "f3951db9d1a1b833", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-19T17:13:48.351951Z", - "start_time": "2024-04-19T17:13:46.298147Z" + "jupyter": { + "outputs_hidden": false } }, - "id": 
"925b01dd4764ed33", - "execution_count": 4 - }, - { - "cell_type": "markdown", "source": [ "The output from above is the VRS representation of the *Allele*. Using the *AlleleTranslator* *translate_to* method we can get back to the HGVS representation." - ], - "metadata": { - "collapsed": false - }, - "id": "f3951db9d1a1b833" + ] }, { "cell_type": "code", + "execution_count": 5, + "id": "722919c1d8cdd83b", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-19T17:13:49.804350Z", + "start_time": "2024-04-19T17:13:48.354473Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "['NC_000005.10:g.80656489C>T']" + "text/plain": [ + "['NC_000005.10:g.80656489C>T']" + ] }, "execution_count": 5, "metadata": {}, @@ -156,36 +212,54 @@ ], "source": [ "allele_translator.translate_to(allele, \"hgvs\")" - ], + ] + }, + { + "cell_type": "markdown", + "id": "508d821d68360f36", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-19T17:13:49.804350Z", - "start_time": "2024-04-19T17:13:48.354473Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "722919c1d8cdd83b", - "execution_count": 5 - }, - { - "cell_type": "markdown", "source": [ "The AlleleTranslator class by default will use \"GRCh38\" as the default assembly when performing translation. But the actual assembly used for translation will be inferred from the reference sequence passed as part of the HGVS variant. A specific default assembly may be specified when creating an AlleleTranslator by passing in the keyword argument \"default_assembly_name\" with the assembly:\n", "> AlleleTranslator(data_proxy=seqrepo_dataproxy, default_assembly_name=\"GRCh37\")\n", "\n", "This example is using the GRCh37 representation of the variant." 
- ], - "metadata": { - "collapsed": false - }, - "id": "508d821d68360f36" + ] }, { "cell_type": "code", + "execution_count": 6, + "id": "ef3ba37deafba7ac", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-19T17:13:50.060957Z", + "start_time": "2024-04-19T17:13:49.806216Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:VA.hEyB1sGiQrdrPFIq4u4CF17uAuUs2Wvx',\n 'type': 'Allele',\n 'digest': 'hEyB1sGiQrdrPFIq4u4CF17uAuUs2Wvx',\n 'location': {'id': 'ga4gh:SL.Y-itBtqe9IwbxyL4EVZ4T_X9TUsdbJ22',\n 'type': 'SequenceLocation',\n 'digest': 'Y-itBtqe9IwbxyL4EVZ4T_X9TUsdbJ22',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX'},\n 'start': 79952307,\n 'end': 79952308},\n 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}" + "text/plain": [ + "{'id': 'ga4gh:VA.hEyB1sGiQrdrPFIq4u4CF17uAuUs2Wvx',\n", + " 'type': 'Allele',\n", + " 'digest': 'hEyB1sGiQrdrPFIq4u4CF17uAuUs2Wvx',\n", + " 'location': {'id': 'ga4gh:SL.Y-itBtqe9IwbxyL4EVZ4T_X9TUsdbJ22',\n", + " 'type': 'SequenceLocation',\n", + " 'digest': 'Y-itBtqe9IwbxyL4EVZ4T_X9TUsdbJ22',\n", + " 'sequenceReference': {'type': 'SequenceReference',\n", + " 'refgetAccession': 'SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX'},\n", + " 'start': 79952307,\n", + " 'end': 79952308},\n", + " 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}" + ] }, "execution_count": 6, "metadata": {}, @@ -195,23 +269,28 @@ "source": [ "allele = allele_translator.translate_from(\"NC_000005.9:g.79952308C>T\", \"hgvs\")\n", "allele.model_dump(exclude_none=True)" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "adae25fd8dbca27c", "metadata": { - "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-19T17:13:50.060957Z", - "start_time": "2024-04-19T17:13:49.806216Z" + "end_time": "2024-04-19T17:13:51.648795Z", + "start_time": "2024-04-19T17:13:50.062068Z" + }, + 
"collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "ef3ba37deafba7ac", - "execution_count": 6 - }, - { - "cell_type": "code", "outputs": [ { "data": { - "text/plain": "['NC_000005.9:g.79952308C>T']" + "text/plain": [ + "['NC_000005.9:g.79952308C>T']" + ] }, "execution_count": 7, "metadata": {}, @@ -220,34 +299,52 @@ ], "source": [ "allele_translator.translate_to(allele, \"hgvs\")" - ], + ] + }, + { + "cell_type": "markdown", + "id": "728fbf740565a801", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-19T17:13:51.648795Z", - "start_time": "2024-04-19T17:13:50.062068Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "adae25fd8dbca27c", - "execution_count": 7 - }, - { - "cell_type": "markdown", "source": [ "#### From/To SPDI\n", "Example of translation a SPDI representation of a variant to and from VRS." - ], - "metadata": { - "collapsed": false - }, - "id": "728fbf740565a801" + ] }, { "cell_type": "code", + "execution_count": 8, + "id": "43873e55f82d10a0", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-19T17:13:51.658433Z", + "start_time": "2024-04-19T17:13:51.652705Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:VA.ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n 'type': 'Allele',\n 'digest': 'ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n 'location': {'id': 'ga4gh:SL.JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n 'type': 'SequenceLocation',\n 'digest': 'JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n 'start': 80656488,\n 'end': 80656489},\n 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}" + "text/plain": [ + "{'id': 'ga4gh:VA.ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n", + " 'type': 'Allele',\n", + " 'digest': 'ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n", + " 'location': {'id': 'ga4gh:SL.JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n", + " 'type': 
'SequenceLocation',\n", + " 'digest': 'JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n", + " 'sequenceReference': {'type': 'SequenceReference',\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656488,\n", + " 'end': 80656489},\n", + " 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}" + ] }, "execution_count": 8, "metadata": {}, @@ -257,23 +354,28 @@ "source": [ "allele = allele_translator.translate_from(\"NC_000005.10:80656488:C:T\",\"spdi\")\n", "allele.model_dump(exclude_none=True)" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "dd695b3dfc14a83e", "metadata": { - "collapsed": false, "ExecuteTime": { - "end_time": "2024-04-19T17:13:51.658433Z", - "start_time": "2024-04-19T17:13:51.652705Z" + "end_time": "2024-04-19T17:13:51.662545Z", + "start_time": "2024-04-19T17:13:51.659707Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "43873e55f82d10a0", - "execution_count": 8 - }, - { - "cell_type": "code", "outputs": [ { "data": { - "text/plain": "['NC_000005.10:80656488:1:T']" + "text/plain": [ + "['NC_000005.10:80656488:1:T']" + ] }, "execution_count": 9, "metadata": {}, @@ -282,34 +384,52 @@ ], "source": [ "allele_translator.translate_to(allele, \"spdi\")" - ], + ] + }, + { + "cell_type": "markdown", + "id": "4e0911a7694a060", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-19T17:13:51.662545Z", - "start_time": "2024-04-19T17:13:51.659707Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "dd695b3dfc14a83e", - "execution_count": 9 - }, - { - "cell_type": "markdown", "source": [ "#### From Beacon (VCF-like)\n", "For variants represented in the Beacon nomenclature, the *AlleleTranslator* currently only supports *translate_from* to convert to VRS. *translate_to* is not yet supported." 
- ], - "metadata": { - "collapsed": false - }, - "id": "4e0911a7694a060" + ] }, { "cell_type": "code", + "execution_count": 10, + "id": "57f54e6c3854a48f", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-19T17:13:51.743020Z", + "start_time": "2024-04-19T17:13:51.663593Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:VA.ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n 'type': 'Allele',\n 'digest': 'ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n 'location': {'id': 'ga4gh:SL.JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n 'type': 'SequenceLocation',\n 'digest': 'JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n 'start': 80656488,\n 'end': 80656489},\n 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}" + "text/plain": [ + "{'id': 'ga4gh:VA.ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n", + " 'type': 'Allele',\n", + " 'digest': 'ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n", + " 'location': {'id': 'ga4gh:SL.JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n", + " 'type': 'SequenceLocation',\n", + " 'digest': 'JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n", + " 'sequenceReference': {'type': 'SequenceReference',\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656488,\n", + " 'end': 80656489},\n", + " 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}" + ] }, "execution_count": 10, "metadata": {}, @@ -319,34 +439,52 @@ "source": [ "allele = allele_translator.translate_from(\"5 : 80656489 C > T\", \"beacon\")\n", "allele.model_dump(exclude_none=True)" - ], + ] + }, + { + "cell_type": "markdown", + "id": "c4a6abfd56b8fa1e", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-19T17:13:51.743020Z", - "start_time": "2024-04-19T17:13:51.663593Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "57f54e6c3854a48f", - "execution_count": 10 - }, - { - "cell_type": 
"markdown", "source": [ "#### From gnomAD style VCF\n", "For variants represented in the gnomad nomenclature, the *AlleleTranslator* currently only supports *translate_from* to convert to VRS. *translate_to* is not yet supported." - ], - "metadata": { - "collapsed": false - }, - "id": "c4a6abfd56b8fa1e" + ] }, { "cell_type": "code", + "execution_count": 11, + "id": "7868c365e327d995", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-19T17:13:51.823442Z", + "start_time": "2024-04-19T17:13:51.743770Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:VA.ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n 'type': 'Allele',\n 'digest': 'ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n 'location': {'id': 'ga4gh:SL.JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n 'type': 'SequenceLocation',\n 'digest': 'JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n 'start': 80656488,\n 'end': 80656489},\n 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}" + "text/plain": [ + "{'id': 'ga4gh:VA.ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n", + " 'type': 'Allele',\n", + " 'digest': 'ebezGL6HoAhtGJyVnB_mE5BH18ntKev4',\n", + " 'location': {'id': 'ga4gh:SL.JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n", + " 'type': 'SequenceLocation',\n", + " 'digest': 'JiLRuuyS5wefF_6-Vw7m3Yoqqb2YFkss',\n", + " 'sequenceReference': {'type': 'SequenceReference',\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656488,\n", + " 'end': 80656489},\n", + " 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}" + ] }, "execution_count": 11, "metadata": {}, @@ -356,35 +494,26 @@ "source": [ "allele = allele_translator.translate_from(\"5-80656489-C-T\", \"gnomad\")\n", "allele.model_dump(exclude_none=True)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-19T17:13:51.823442Z", - 
"start_time": "2024-04-19T17:13:51.743770Z" - } - }, - "id": "7868c365e327d995", - "execution_count": 11 + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/getting_started/5_Exploring_the_CnvTranslator.ipynb b/notebooks/getting_started/5_Exploring_the_CnvTranslator.ipynb index 78313d56..44252c7a 100644 --- a/notebooks/getting_started/5_Exploring_the_CnvTranslator.ipynb +++ b/notebooks/getting_started/5_Exploring_the_CnvTranslator.ipynb @@ -2,10 +2,17 @@ "cells": [ { "cell_type": "markdown", + "id": "c940e9d78bc6e98a", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ - "# 5 Exploring the CnvTranslator\n", + "# 5. Exploring the CnvTranslator\n", "The vrs-python model supports two classes of copy number variation: \n", - "* CopyNumberChange - an assessment of loss or gain relative to a location within a gene or system, where loss or gain is represented by the following \"efo\" ontology codes:\n", + "* CopyNumberChange - an assessment of loss or gain relative to a location within a system, where loss or gain is represented by the following [EMBL-EBI Experimental Factor Ontology](https://www.ebi.ac.uk/efo/) (EFO) codes:\n", " * efo:0030064 - regional base ploidy\n", " * efo:0030067 - loss\n", " * efo:0030068 - low-level loss\n", @@ -16,120 +23,164 @@ " * efo:0020073 - high-level loss \n", "* CopyNumberCount - an absolute count of discrete copies of a location within a gene or system\n", "For the CnvTranslator, only HGVS nomenclature is used to describe the variation." 
- ], - "metadata": { - "collapsed": false - }, - "id": "c940e9d78bc6e98a" + ] }, { "cell_type": "markdown", + "id": "ac31eec4a405b218", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "#### Step 1 - Setup Data Proxy Access\n", "The *DataProxy* provides access to sequence references." - ], - "metadata": { - "collapsed": false - }, - "id": "ac31eec4a405b218" + ] }, { "cell_type": "code", - "outputs": [], - "source": [ - "from ga4gh.vrs.dataproxy import create_dataproxy\n", - "seqrepo_rest_service_url = \"seqrepo+https://services.genomicmedlab.org/seqrepo\"\n", - "seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)" - ], + "execution_count": 1, + "id": "b7b0c4864ad5f9dd", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:25:27.328387Z", "start_time": "2024-04-18T20:25:27.194307Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "b7b0c4864ad5f9dd", - "execution_count": 1 + "outputs": [], + "source": [ + "from ga4gh.vrs.dataproxy import create_dataproxy\n", + "seqrepo_rest_service_url = \"seqrepo+https://services.genomicmedlab.org/seqrepo\"\n", + "seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)" + ] }, { "cell_type": "markdown", - "source": [ - "Import the *CnvTranslator* class." - ], + "id": "27a861e38d55ea44", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "27a861e38d55ea44" + "source": [ + "Import the *CnvTranslator* class." 
+ ] }, { "cell_type": "code", - "outputs": [], - "source": [ - "from ga4gh.vrs.extras.translator import CnvTranslator" - ], + "execution_count": 2, + "id": "9fe4f25508590533", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:25:27.379097Z", "start_time": "2024-04-18T20:25:27.329523Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "9fe4f25508590533", - "execution_count": 2 + "outputs": [], + "source": [ + "from ga4gh.vrs.extras.translator import CnvTranslator" + ] }, { "cell_type": "markdown", - "source": [ - "The UTA server is required in the environment since we are translating from/to HGVS." - ], + "id": "e9312b31a06c98e0", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "e9312b31a06c98e0" + "source": [ + "The UTA server is required in the environment since we are translating from/to HGVS." + ] }, { "cell_type": "code", - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"UTA_DB_URL\"] = \"postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20210129b\"" - ], + "execution_count": 3, + "id": "71a288dff7b87f1", "metadata": { - "collapsed": false, "ExecuteTime": { "end_time": "2024-04-18T20:25:27.381511Z", "start_time": "2024-04-18T20:25:27.379793Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, - "id": "71a288dff7b87f1", - "execution_count": 3 + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"UTA_DB_URL\"] = \"postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20210129b\"" + ] }, { "cell_type": "markdown", - "source": [ - "#### Step 2 - CopyNumberChange examples" - ], + "id": "6c8160cff22db940", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "6c8160cff22db940" + "source": [ + "#### Step 2 - CopyNumberChange examples" + ] }, { "cell_type": "markdown", - "source": [ - "This example depicts a 
*CopyNumberChange* representing a deletion, or copy number loss. The \"efo\" ontology code specifying the type of copy number change is passed as a keyword argument \"copy_change\" to *translate_from*. This variant can be viewed in [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/variation/984438)." - ], + "id": "6a2c1c355bbcc494", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "6a2c1c355bbcc494" + "source": [ + "This example depicts a *CopyNumberChange* representing a deletion, or copy number loss. The Experimental Factor Ontology code specifying the type of copy number change is passed as a keyword argument \"copy_change\" to *translate_from*. This variant can be viewed in [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/variation/984438)." + ] }, { "cell_type": "code", + "execution_count": 4, + "id": "c8a6400cb8605b0d", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:28.974970Z", + "start_time": "2024-04-18T20:25:27.382082Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:CX.0M5VkV5v504_laQURFMEsqzZGcOF9YEw',\n 'type': 'CopyNumberChange',\n 'digest': '0M5VkV5v504_laQURFMEsqzZGcOF9YEw',\n 'location': {'id': 'ga4gh:SL.GSJAEJXFDz7Nq6VlJj5NTEku48MmteUU',\n 'type': 'SequenceLocation',\n 'digest': 'GSJAEJXFDz7Nq6VlJj5NTEku48MmteUU',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm'},\n 'start': 45002866,\n 'end': 45015056},\n 'copyChange': 'efo:0030067'}" + "text/plain": [ + "{'id': 'ga4gh:CX.0M5VkV5v504_laQURFMEsqzZGcOF9YEw',\n", + " 'type': ,\n", + " 'digest': '0M5VkV5v504_laQURFMEsqzZGcOF9YEw',\n", + " 'location': {'id': 'ga4gh:SL.GSJAEJXFDz7Nq6VlJj5NTEku48MmteUU',\n", + " 'type': ,\n", + " 'digest': 'GSJAEJXFDz7Nq6VlJj5NTEku48MmteUU',\n", + " 'sequenceReference': {'type': ,\n", + " 'refgetAccession': 
'SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm'},\n", + " 'start': 45002866,\n", + " 'end': 45015056},\n", + " 'copyChange': }" + ] }, "execution_count": 4, "metadata": {}, @@ -138,35 +189,53 @@ ], "source": [ "cnv_translator = CnvTranslator(data_proxy=seqrepo_dataproxy)\n", - "allele = cnv_translator.translate_from(\"NC_000014.9:g.45002867_45015056del\", \"hgvs\", copy_change=\"efo:0030067\")\n", - "allele.model_dump(exclude_none=True)" - ], + "cnc = cnv_translator.translate_from(\"NC_000014.9:g.45002867_45015056del\", \"hgvs\", copy_change=\"efo:0030067\")\n", + "cnc.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "markdown", + "id": "6fc8dfa1340e010d", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:25:28.974970Z", - "start_time": "2024-04-18T20:25:27.382082Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "c8a6400cb8605b0d", - "execution_count": 4 - }, - { - "cell_type": "markdown", "source": [ "This example depicts a CopyNumberChange* representing a duplication, or copy number gain. This variant can be viewed in [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/variation/549625)." 
- ], - "metadata": { - "collapsed": false - }, - "id": "6fc8dfa1340e010d" + ] }, { "cell_type": "code", + "execution_count": 5, + "id": "f4efc189d53d7000", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:29.076267Z", + "start_time": "2024-04-18T20:25:28.976791Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:CX.0BN4vrqPrLPAZYsQEAPnG4IS8AYeBGe1',\n 'type': 'CopyNumberChange',\n 'digest': '0BN4vrqPrLPAZYsQEAPnG4IS8AYeBGe1',\n 'location': {'id': 'ga4gh:SL.tydo6UFL8Y60L5Me3k8AJfljURO9vYn9',\n 'type': 'SequenceLocation',\n 'digest': 'tydo6UFL8Y60L5Me3k8AJfljURO9vYn9',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI'},\n 'start': 75502957,\n 'end': 76045032},\n 'copyChange': 'efo:0030070'}" + "text/plain": [ + "{'id': 'ga4gh:CX.0BN4vrqPrLPAZYsQEAPnG4IS8AYeBGe1',\n", + " 'type': ,\n", + " 'digest': '0BN4vrqPrLPAZYsQEAPnG4IS8AYeBGe1',\n", + " 'location': {'id': 'ga4gh:SL.tydo6UFL8Y60L5Me3k8AJfljURO9vYn9',\n", + " 'type': ,\n", + " 'digest': 'tydo6UFL8Y60L5Me3k8AJfljURO9vYn9',\n", + " 'sequenceReference': {'type': ,\n", + " 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI'},\n", + " 'start': 75502957,\n", + " 'end': 76045032},\n", + " 'copyChange': }" + ] }, "execution_count": 5, "metadata": {}, @@ -174,45 +243,66 @@ } ], "source": [ - "allele = cnv_translator.translate_from(\"NC_000009.12:g.75502958_76045032dup\", \"hgvs\", copy_change=\"efo:0030070\")\n", - "allele.model_dump(exclude_none=True)" - ], + "cnx = cnv_translator.translate_from(\"NC_000009.12:g.75502958_76045032dup\", \"hgvs\", copy_change=\"efo:0030070\")\n", + "cnx.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "markdown", + "id": "aaf041e4b83301cd", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:25:29.076267Z", - "start_time": "2024-04-18T20:25:28.976791Z" + "jupyter": { + "outputs_hidden": 
false } }, - "id": "f4efc189d53d7000", - "execution_count": 5 - }, - { - "cell_type": "markdown", "source": [ "#### Step 3 - CopyNumberCount examples" - ], - "metadata": { - "collapsed": false - }, - "id": "aaf041e4b83301cd" + ] }, { "cell_type": "markdown", - "source": [ - "This example depicts a *CopyNumberCount* with a copy number gain. With copy number count variation, the \"copies\" keyword argument is passed to *translate_from* with the appropriate \"efo\" ontology code. This variant can be viewed in [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/variation/2579174/)." - ], + "id": "ff76cb8a2f1387a5", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "id": "ff76cb8a2f1387a5" + "source": [ + "This example depicts a *CopyNumberCount* with a copy number gain. With copy number count variation, the \"copies\" keyword argument is passed to *translate_from* with the absolute number of copies (here, 3). This variant can be viewed in [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/variation/2579174/)." 
+ ] }, { "cell_type": "code", + "execution_count": 6, + "id": "f057e93172e97a88", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:29.187023Z", + "start_time": "2024-04-18T20:25:29.077128Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:CN.O_QHImmfErh9jDFkJaypPPvUmnj7EM70',\n 'type': 'CopyNumberCount',\n 'digest': 'O_QHImmfErh9jDFkJaypPPvUmnj7EM70',\n 'location': {'id': 'ga4gh:SL.hBVWalem_rNclxjmUuT9CHbEGCdlqW9L',\n 'type': 'SequenceLocation',\n 'digest': 'hBVWalem_rNclxjmUuT9CHbEGCdlqW9L',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc'},\n 'start': 85623,\n 'end': 57073230},\n 'copies': 3}" + "text/plain": [ + "{'id': 'ga4gh:CN.O_QHImmfErh9jDFkJaypPPvUmnj7EM70',\n", + " 'type': ,\n", + " 'digest': 'O_QHImmfErh9jDFkJaypPPvUmnj7EM70',\n", + " 'location': {'id': 'ga4gh:SL.hBVWalem_rNclxjmUuT9CHbEGCdlqW9L',\n", + " 'type': ,\n", + " 'digest': 'hBVWalem_rNclxjmUuT9CHbEGCdlqW9L',\n", + " 'sequenceReference': {'type': ,\n", + " 'refgetAccession': 'SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc'},\n", + " 'start': 85623,\n", + " 'end': 57073230},\n", + " 'copies': 3}" + ] }, "execution_count": 6, "metadata": {}, @@ -220,35 +310,53 @@ } ], "source": [ - "allele = cnv_translator.translate_from(\"NC_000004.12:g.85624_57073230dup\", \"hgvs\", copies=\"3\")\n", - "allele.model_dump(exclude_none=True)" - ], + "cnc = cnv_translator.translate_from(\"NC_000004.12:g.85624_57073230dup\", \"hgvs\", copies=\"3\")\n", + "cnc.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "markdown", + "id": "41df40dd67cb1009", "metadata": { "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:25:29.187023Z", - "start_time": "2024-04-18T20:25:29.077128Z" + "jupyter": { + "outputs_hidden": false } }, - "id": "f057e93172e97a88", - "execution_count": 6 - }, - { - "cell_type": "markdown", "source": [ "This example depicts a 
*CopyNumberCount* with a copy number loss. This variant can be viewed in [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/variation/2579226/)." - ], - "metadata": { - "collapsed": false - }, - "id": "41df40dd67cb1009" + ] }, { "cell_type": "code", + "execution_count": 7, + "id": "412feaeba95751e7", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:29.276073Z", + "start_time": "2024-04-18T20:25:29.187923Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "outputs": [ { "data": { - "text/plain": "{'id': 'ga4gh:CN.WDzlT9oUq4IcQrVRWGH0dZnARnFBotCS',\n 'type': 'CopyNumberCount',\n 'digest': 'WDzlT9oUq4IcQrVRWGH0dZnARnFBotCS',\n 'location': {'id': 'ga4gh:SL.H1Zh5xdBqamBjwVE9orWdY_uBkpEMH1V',\n 'type': 'SequenceLocation',\n 'digest': 'H1Zh5xdBqamBjwVE9orWdY_uBkpEMH1V',\n 'sequenceReference': {'type': 'SequenceReference',\n 'refgetAccession': 'SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8'},\n 'start': 46111352,\n 'end': 46119948},\n 'copies': 1}" + "text/plain": [ + "{'id': 'ga4gh:CN.WDzlT9oUq4IcQrVRWGH0dZnARnFBotCS',\n", + " 'type': ,\n", + " 'digest': 'WDzlT9oUq4IcQrVRWGH0dZnARnFBotCS',\n", + " 'location': {'id': 'ga4gh:SL.H1Zh5xdBqamBjwVE9orWdY_uBkpEMH1V',\n", + " 'type': ,\n", + " 'digest': 'H1Zh5xdBqamBjwVE9orWdY_uBkpEMH1V',\n", + " 'sequenceReference': {'type': ,\n", + " 'refgetAccession': 'SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8'},\n", + " 'start': 46111352,\n", + " 'end': 46119948},\n", + " 'copies': 1}" + ] }, "execution_count": 7, "metadata": {}, @@ -256,37 +364,28 @@ } ], "source": [ - "allele = cnv_translator.translate_from(\"NC_000021.9:g.46111353_46119948del\", \"hgvs\", copies=\"1\")\n", - "allele.model_dump(exclude_none=True)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-04-18T20:25:29.276073Z", - "start_time": "2024-04-18T20:25:29.187923Z" - } - }, - "id": "412feaeba95751e7", - "execution_count": 7 + "cnc = cnv_translator.translate_from(\"NC_000021.9:g.46111353_46119948del\", \"hgvs\", 
copies=\"1\")\n", + "cnc.model_dump(exclude_none=True)" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/getting_started/6_Upcoming_features.ipynb b/notebooks/getting_started/6_Upcoming_features.ipynb new file mode 100644 index 00000000..5f4f31eb --- /dev/null +++ b/notebooks/getting_started/6_Upcoming_features.ipynb @@ -0,0 +1,515 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c940e9d78bc6e98a", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "# 6. New and Upcoming Features in VRS 2.x\n", + "The VRS 2.0 specification is under active development, and several new and upcoming features have been added to VRS-Python in preparation for this upcoming release. This notebook covers several of these upcoming features." + ] + }, + { + "cell_type": "markdown", + "id": "ac31eec4a405b218", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## Prerequisites - Setup Data Proxy Access\n", + "The *DataProxy* provides access to sequence references." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b7b0c4864ad5f9dd", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:27.328387Z", + "start_time": "2024-04-18T20:25:27.194307Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "from ga4gh.vrs.dataproxy import create_dataproxy\n", + "seqrepo_rest_service_url = \"seqrepo+https://services.genomicmedlab.org/seqrepo\"\n", + "seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)" + ] + }, + { + "cell_type": "markdown", + "id": "27a861e38d55ea44", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "Import the *AlleleTranslator* class." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9fe4f25508590533", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:27.379097Z", + "start_time": "2024-04-18T20:25:27.329523Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "from ga4gh.vrs.extras.translator import AlleleTranslator\n", + "translator = AlleleTranslator(data_proxy=seqrepo_dataproxy)" + ] + }, + { + "cell_type": "markdown", + "id": "e9312b31a06c98e0", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "The UTA server is required in the environment since we are translating from/to HGVS." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "71a288dff7b87f1", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:27.381511Z", + "start_time": "2024-04-18T20:25:27.379793Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"UTA_DB_URL\"] = \"postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20210129b\"" + ] + }, + { + "cell_type": "markdown", + "id": "6c8160cff22db940", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## VRS Object Metadata" + ] + }, + { + "cell_type": "markdown", + "id": "abaa2c22-3f44-4239-aa20-10ad69a575b2", + "metadata": {}, + "source": [ + "First, we start with an Allele from our previous examples." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e6f10dc3-6bec-4ad8-9c13-c139e2140211", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'ga4gh:VA.LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'type': ,\n", + " 'digest': 'LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'location': {'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'type': ,\n", + " 'digest': 'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'sequenceReference': {'type': ,\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656509,\n", + " 'end': 80656510},\n", + " 'state': {'type': ,\n", + " 'sequence': 'TT'}}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "allele = translator.translate_from(\"NC_000005.10:80656509:C:TT\", \"spdi\")\n", + "allele.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "markdown", + "id": "669fdc91-f6d9-43dd-84e3-d5e8d5f4759a", + "metadata": {}, + "source": [ + "This Allele, like all variant and location objects in VRS, has several useful fields for describing object metadata." 
+ ] + }, + { + "cell_type": "markdown", + "id": "6a2c1c355bbcc494", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "### Describing a Sequence" + ] + }, + { + "cell_type": "markdown", + "id": "49269576-0e5c-420a-8cb1-e5d83ae4dfdf", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:28.974970Z", + "start_time": "2024-04-18T20:25:27.382082Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "The location of our Allele is a VRS `SequenceLocation` object." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f4efc189d53d7000", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:29.076267Z", + "start_time": "2024-04-18T20:25:28.976791Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'type': ,\n", + " 'digest': 'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'sequenceReference': {'type': ,\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656509,\n", + " 'end': 80656510}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seqloc = allele.location\n", + "seqloc.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "markdown", + "id": "5e4c4ab5", + "metadata": {}, + "source": [ + "The `SequenceLocation` uses a `SequenceReference` object to describe the sequence on which the location is defined:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3ecae98d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'type': ,\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seqref = seqloc.sequenceReference\n", + 
"seqref.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9b5958a3-bd9a-4fb8-889e-918271681554", + "metadata": {}, + "source": [ + "However, many additional metadata fields are available for use:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a1539981-f7d7-4dfc-b72e-2f0a802b6f09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': None,\n", + " 'type': ,\n", + " 'label': None,\n", + " 'description': None,\n", + " 'alternativeLabels': None,\n", + " 'extensions': None,\n", + " 'mappings': None,\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI',\n", + " 'residueAlphabet': None,\n", + " 'circular': None}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seqref.model_dump()" + ] + }, + { + "cell_type": "markdown", + "id": "7e81e6b9-3496-490f-a0a0-cba4e3e563b0", + "metadata": {}, + "source": [ + "This is the minimal representation of the reference; however, additional content may be helpful for describing this sequence when it is received. 
First, we can look up some metadata for it using SeqRepo:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "75c6ba36-d144-4f42-8d08-455b6436b858", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'added': '2016-08-24T08:25:20Z',\n", + " 'aliases': ['Ensembl:5',\n", + " 'ensembl:5',\n", + " 'GRCh38:5',\n", + " 'GRCh38:chr5',\n", + " 'GRCh38.p1:5',\n", + " 'GRCh38.p1:chr5',\n", + " 'GRCh38.p10:5',\n", + " 'GRCh38.p10:chr5',\n", + " 'GRCh38.p11:5',\n", + " 'GRCh38.p11:chr5',\n", + " 'GRCh38.p12:5',\n", + " 'GRCh38.p12:chr5',\n", + " 'GRCh38.p2:5',\n", + " 'GRCh38.p2:chr5',\n", + " 'GRCh38.p3:5',\n", + " 'GRCh38.p3:chr5',\n", + " 'GRCh38.p4:5',\n", + " 'GRCh38.p4:chr5',\n", + " 'GRCh38.p5:5',\n", + " 'GRCh38.p5:chr5',\n", + " 'GRCh38.p6:5',\n", + " 'GRCh38.p6:chr5',\n", + " 'GRCh38.p7:5',\n", + " 'GRCh38.p7:chr5',\n", + " 'GRCh38.p8:5',\n", + " 'GRCh38.p8:chr5',\n", + " 'GRCh38.p9:5',\n", + " 'GRCh38.p9:chr5',\n", + " 'MD5:f7f05fb7ceea78cbc32ce652c540ff2d',\n", + " 'NCBI:NC_000005.10',\n", + " 'refseq:NC_000005.10',\n", + " 'SEGUID:TuMsXqT81pQNOh4t8oKmnG9F9xM',\n", + " 'SHA1:4ee32c5ea4fcd6940d3a1e2df282a69c6f45f713',\n", + " 'VMC:GS_aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI',\n", + " 'sha512t24u:aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI',\n", + " 'ga4gh:SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'],\n", + " 'alphabet': 'ACGNT',\n", + " 'length': 181538259}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ref_namespaced_id = f'ga4gh:{seqref.refgetAccession}'\n", + "seqrepo_dataproxy.get_metadata(ref_namespaced_id)" + ] + }, + { + "cell_type": "markdown", + "id": "cb047244-7b2f-4e36-bf6f-576546d9bf86", + "metadata": {}, + "source": [ + "We can use some of these data to annotate our sequence reference:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "36e891f4-531d-4f6b-943d-5e53af9797a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 
'refseq:NC_000005.10',\n", + " 'type': ,\n", + " 'label': 'GRCh38:chr5',\n", + " 'alternativeLabels': ['GRCh38:5'],\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seqref.id = seqrepo_dataproxy.translate_sequence_identifier(ref_namespaced_id, \"refseq\")[0]\n", + "seqref.label = seqrepo_dataproxy.translate_sequence_identifier(ref_namespaced_id, \"GRCh38\")[0]\n", + "seqref.alternativeLabels = seqrepo_dataproxy.translate_sequence_identifier(ref_namespaced_id, \"GRCh38\")[1:]\n", + "seqref.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e10b693f-4d00-4b7f-a2e4-c5acc4b61256", + "metadata": {}, + "source": [ + "These changes then work their way back up to the parent models:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c43c14a2-cd91-4cb9-a634-8dd855bc0387", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'type': ,\n", + " 'digest': 'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'sequenceReference': {'id': 'refseq:NC_000005.10',\n", + " 'type': ,\n", + " 'label': 'GRCh38:chr5',\n", + " 'alternativeLabels': ['GRCh38:5'],\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656509,\n", + " 'end': 80656510}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seqloc.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5c25308b-56fb-4f3a-8521-2a1e4bc4d232", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'ga4gh:VA.LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'type': ,\n", + " 'digest': 'LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'location': {'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'type': ,\n", + " 'digest': 
'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'sequenceReference': {'id': 'refseq:NC_000005.10',\n", + " 'type': ,\n", + " 'label': 'GRCh38:chr5',\n", + " 'alternativeLabels': ['GRCh38:5'],\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656509,\n", + " 'end': 80656510},\n", + " 'state': {'type': ,\n", + " 'sequence': 'TT'}}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "allele.model_dump(exclude_none=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/ga4gh/core/domain_models.py b/src/ga4gh/core/domain_models.py index d6d0918f..e9a070c7 100644 --- a/src/ga4gh/core/domain_models.py +++ b/src/ga4gh/core/domain_models.py @@ -15,7 +15,7 @@ from pydantic import Field, RootModel -from ga4gh.core.entity_models import _DomainEntity +from ga4gh.core.entity_models import DomainEntity class CommonDomainType(str, Enum): @@ -30,7 +30,7 @@ class CommonDomainType(str, Enum): TR_COMB = "CombinationTherapy" GENE = "Gene" -class Phenotype(_DomainEntity): +class Phenotype(DomainEntity): """An observable characteristic or trait of an organism.""" type: Literal[CommonDomainType.PHENOTYPE] = Field( @@ -39,7 +39,7 @@ class Phenotype(_DomainEntity): ) -class Disease(_DomainEntity): +class Disease(DomainEntity): """A particular abnormal condition that negatively affects the structure or function of all or part of an organism and is not immediately due to any external injury. 
""" @@ -50,7 +50,7 @@ class Disease(_DomainEntity): ) -class TraitSet(_DomainEntity): +class TraitSet(DomainEntity): """A set of phenotype and/or disease concepts that together constitute a condition.""" type: Literal[CommonDomainType.TRAIT_SET] = Field( @@ -73,7 +73,7 @@ class Condition(RootModel): ) -class TherapeuticAction(_DomainEntity): +class TherapeuticAction(DomainEntity): """A therapeutic action taken that is intended to alter or stop a pathologic process.""" type: Literal[CommonDomainType.TR_ACTION] = Field( @@ -82,7 +82,7 @@ class TherapeuticAction(_DomainEntity): ) -class TherapeuticAgent(_DomainEntity): +class TherapeuticAgent(DomainEntity): """An administered therapeutic agent that is intended to alter or stop a pathologic process.""" type: Literal[CommonDomainType.TR_AGENT] = Field( @@ -91,7 +91,7 @@ class TherapeuticAgent(_DomainEntity): ) -class TherapeuticSubstituteGroup(_DomainEntity): +class TherapeuticSubstituteGroup(DomainEntity): """A group of therapeutic procedures that may be treated as substitutes for one another.""" type: Literal[CommonDomainType.TR_SUB] = Field( @@ -105,7 +105,7 @@ class TherapeuticSubstituteGroup(_DomainEntity): ) -class CombinationTherapy(_DomainEntity): +class CombinationTherapy(DomainEntity): """A therapeutic procedure that involves multiple different therapeutic procedures performed in combination. 
""" @@ -133,7 +133,7 @@ class TherapeuticProcedure(RootModel): ) -class Gene(_DomainEntity): +class Gene(DomainEntity): """A basic physical and functional unit of heredity.""" type: Literal[CommonDomainType.GENE] = Field( diff --git a/src/ga4gh/core/entity_models.py b/src/ga4gh/core/entity_models.py index 196af6bb..bb9f1a49 100644 --- a/src/ga4gh/core/entity_models.py +++ b/src/ga4gh/core/entity_models.py @@ -10,6 +10,7 @@ * `import ga4gh.core`, and refer to models using the fully-qualified module name, e.g., `ga4gh.core.entity_models.Coding` """ +from abc import ABC from typing import Any, Dict, Annotated, Optional, Union, List from enum import Enum @@ -152,7 +153,7 @@ class Expression(BaseModel): ######################################### -class _Entity(BaseModel): +class Entity(ABC, BaseModel): """Entity is the root class of the 'gks-common' core information model classes - those that have identifiers and other general metadata like labels, xrefs, urls, descriptions, etc. All common classes descend from and inherit its attributes. @@ -175,7 +176,7 @@ class _Entity(BaseModel): extensions: Optional[List[Extension]] = Field(None, description="A list of extensions to the entity. Extensions are not expected to be natively understood, but may be used for pre-negotiated exchange of message attributes between systems.") -class _DomainEntity(_Entity): +class DomainEntity(Entity, ABC): """An Entity that is specific to a particular biomedical domain such as disease, therapeutics, or genes. Domain Entities are considered as 'concept-level' entities, as opposed to particular instances. e.g. 
'Lung Cancer', not 'patient123's lung diff --git a/src/ga4gh/vrs/models.py b/src/ga4gh/vrs/models.py index e98ea17a..31ebcf3a 100644 --- a/src/ga4gh/vrs/models.py +++ b/src/ga4gh/vrs/models.py @@ -10,6 +10,7 @@ * `import ga4gh.vrs`, and refer to models using the fully-qualified module name, e.g., `ga4gh.vrs.models.Allele` """ +from abc import ABC from typing import List, Literal, Optional, Union, Dict, Annotated from collections import OrderedDict from enum import Enum @@ -31,7 +32,7 @@ from ga4gh.core.pydantic import ( getattr_in ) -from ga4gh.core.entity_models import IRI, Expression, _DomainEntity +from ga4gh.core.entity_models import IRI, Expression, DomainEntity def flatten(vals): @@ -187,7 +188,7 @@ def _recurse_ga4gh_serialize(obj): return obj -class _ValueObject(_DomainEntity): +class _ValueObject(DomainEntity, ABC): """A contextual value whose equality is based on value, not identity. See https://en.wikipedia.org/wiki/Value_object for more on Value Objects. """ @@ -210,7 +211,7 @@ def is_ga4gh_identifiable(): return False -class _Ga4ghIdentifiableObject(_ValueObject): +class _Ga4ghIdentifiableObject(_ValueObject, ABC): """A contextual value object for which a GA4GH computed identifier can be created. All GA4GH Identifiable Objects may have computed digests from the VRS Computed Identifier algorithm. @@ -513,7 +514,7 @@ class ga4gh(_Ga4ghIdentifiableObject.ga4gh): ######################################### -class _VariationBase(_Ga4ghIdentifiableObject): +class _VariationBase(_Ga4ghIdentifiableObject, ABC): """Base class for variation""" expressions: Optional[List[Expression]] = None @@ -665,7 +666,7 @@ class ga4gh(_Ga4ghIdentifiableObject.ga4gh): ######################################### -class _CopyNumber(_VariationBase): +class _CopyNumber(_VariationBase, ABC): """A measure of the copies of a `Location` within a system (e.g. genome, cell, etc.)""" location: Union[IRI, SequenceLocation] = Field(