diff --git a/notebooks/getting_started/6_Upcoming_features.ipynb b/notebooks/getting_started/6_Upcoming_features.ipynb new file mode 100644 index 00000000..5f4f31eb --- /dev/null +++ b/notebooks/getting_started/6_Upcoming_features.ipynb @@ -0,0 +1,515 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c940e9d78bc6e98a", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "# 6. New and Upcoming Features in VRS 2.x\n", + "The VRS 2.0 specification is under active development, and several new and upcoming features have been added to VRS-Python in preparation for this upcoming release. This notebook covers several of these upcoming features." + ] + }, + { + "cell_type": "markdown", + "id": "ac31eec4a405b218", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## Prerequisites - Setup Data Proxy Access\n", + "The *DataProxy* provides access to sequence references." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b7b0c4864ad5f9dd", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:27.328387Z", + "start_time": "2024-04-18T20:25:27.194307Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "from ga4gh.vrs.dataproxy import create_dataproxy\n", + "seqrepo_rest_service_url = \"seqrepo+https://services.genomicmedlab.org/seqrepo\"\n", + "seqrepo_dataproxy = create_dataproxy(uri=seqrepo_rest_service_url)" + ] + }, + { + "cell_type": "markdown", + "id": "27a861e38d55ea44", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "Import the *AlleleTranslator* class." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9fe4f25508590533", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:27.379097Z", + "start_time": "2024-04-18T20:25:27.329523Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "from ga4gh.vrs.extras.translator import AlleleTranslator\n", + "translator = AlleleTranslator(data_proxy=seqrepo_dataproxy)" + ] + }, + { + "cell_type": "markdown", + "id": "e9312b31a06c98e0", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "The UTA server is required in the environment since we are translating from/to HGVS." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "71a288dff7b87f1", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:27.381511Z", + "start_time": "2024-04-18T20:25:27.379793Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"UTA_DB_URL\"] = \"postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20210129b\"" + ] + }, + { + "cell_type": "markdown", + "id": "6c8160cff22db940", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "## VRS Object Metadata" + ] + }, + { + "cell_type": "markdown", + "id": "abaa2c22-3f44-4239-aa20-10ad69a575b2", + "metadata": {}, + "source": [ + "First, we start with an Allele from our previous examples." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e6f10dc3-6bec-4ad8-9c13-c139e2140211", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'ga4gh:VA.LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'type': ,\n", + " 'digest': 'LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'location': {'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'type': ,\n", + " 'digest': 'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'sequenceReference': {'type': ,\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656509,\n", + " 'end': 80656510},\n", + " 'state': {'type': ,\n", + " 'sequence': 'TT'}}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "allele = translator.translate_from(\"NC_000005.10:80656509:C:TT\", \"spdi\")\n", + "allele.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "markdown", + "id": "669fdc91-f6d9-43dd-84e3-d5e8d5f4759a", + "metadata": {}, + "source": [ + "This Allele, like all variant and location objects in VRS, has several useful fields for describing object metadata." + ] + }, + { + "cell_type": "markdown", + "id": "6a2c1c355bbcc494", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "### Describing a Sequence" + ] + }, + { + "cell_type": "markdown", + "id": "49269576-0e5c-420a-8cb1-e5d83ae4dfdf", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:28.974970Z", + "start_time": "2024-04-18T20:25:27.382082Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "source": [ + "The location of our Allele is a VRS `SequenceLocation` object." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f4efc189d53d7000", + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-18T20:25:29.076267Z", + "start_time": "2024-04-18T20:25:28.976791Z" + }, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'type': ,\n", + " 'digest': 'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'sequenceReference': {'type': ,\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656509,\n", + " 'end': 80656510}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seqloc = allele.location\n", + "seqloc.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "markdown", + "id": "5e4c4ab5", + "metadata": {}, + "source": [ + "The `SequenceLocation` uses a `SequenceReference` object to describe the sequence on which the location is defined:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3ecae98d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'type': ,\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seqref = seqloc.sequenceReference\n", + "seqref.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9b5958a3-bd9a-4fb8-889e-918271681554", + "metadata": {}, + "source": [ + "However, many additional metadata fields are available for use:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a1539981-f7d7-4dfc-b72e-2f0a802b6f09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': None,\n", + " 'type': ,\n", + " 'label': None,\n", + " 'description': None,\n", + " 'alternativeLabels': None,\n", + " 'extensions': None,\n", + " 'mappings': None,\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI',\n", + " 'residueAlphabet': None,\n", + " 'circular': None}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seqref.model_dump()" + ] + }, + { + "cell_type": "markdown", + "id": "7e81e6b9-3496-490f-a0a0-cba4e3e563b0", + "metadata": {}, + "source": [ + "This is the minimal representation of the reference; however, additional content may be helpful for describing this sequence when it is received. First, we can look up some metadata for it using SeqRepo:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "75c6ba36-d144-4f42-8d08-455b6436b858", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'added': '2016-08-24T08:25:20Z',\n", + " 'aliases': ['Ensembl:5',\n", + " 'ensembl:5',\n", + " 'GRCh38:5',\n", + " 'GRCh38:chr5',\n", + " 'GRCh38.p1:5',\n", + " 'GRCh38.p1:chr5',\n", + " 'GRCh38.p10:5',\n", + " 'GRCh38.p10:chr5',\n", + " 'GRCh38.p11:5',\n", + " 'GRCh38.p11:chr5',\n", + " 'GRCh38.p12:5',\n", + " 'GRCh38.p12:chr5',\n", + " 'GRCh38.p2:5',\n", + " 'GRCh38.p2:chr5',\n", + " 'GRCh38.p3:5',\n", + " 'GRCh38.p3:chr5',\n", + " 'GRCh38.p4:5',\n", + " 'GRCh38.p4:chr5',\n", + " 'GRCh38.p5:5',\n", + " 'GRCh38.p5:chr5',\n", + " 'GRCh38.p6:5',\n", + " 'GRCh38.p6:chr5',\n", + " 'GRCh38.p7:5',\n", + " 'GRCh38.p7:chr5',\n", + " 'GRCh38.p8:5',\n", + " 'GRCh38.p8:chr5',\n", + " 'GRCh38.p9:5',\n", + " 'GRCh38.p9:chr5',\n", + " 'MD5:f7f05fb7ceea78cbc32ce652c540ff2d',\n", + " 'NCBI:NC_000005.10',\n", + " 'refseq:NC_000005.10',\n", + " 'SEGUID:TuMsXqT81pQNOh4t8oKmnG9F9xM',\n", + " 'SHA1:4ee32c5ea4fcd6940d3a1e2df282a69c6f45f713',\n", + " 'VMC:GS_aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI',\n", + " 'sha512t24u:aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI',\n", + " 'ga4gh:SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'],\n", + " 'alphabet': 'ACGNT',\n", + " 'length': 181538259}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ref_namespaced_id = f'ga4gh:{seqref.refgetAccession}'\n", + "seqrepo_dataproxy.get_metadata(ref_namespaced_id)" + ] + }, + { + "cell_type": "markdown", + "id": "cb047244-7b2f-4e36-bf6f-576546d9bf86", + "metadata": {}, + "source": [ + "We can use some of these data to annotate our sequence reference:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "36e891f4-531d-4f6b-943d-5e53af9797a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'refseq:NC_000005.10',\n", + " 'type': ,\n", + " 'label': 'GRCh38:chr5',\n", + " 'alternativeLabels': ['GRCh38:5'],\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seqref.id = seqrepo_dataproxy.translate_sequence_identifier(ref_namespaced_id, \"refseq\")[0]\n", + "seqref.label = seqrepo_dataproxy.translate_sequence_identifier(ref_namespaced_id, \"GRCh38\")[0]\n", + "seqref.alternativeLabels = seqrepo_dataproxy.translate_sequence_identifier(ref_namespaced_id, \"GRCh38\")[1:]\n", + "seqref.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e10b693f-4d00-4b7f-a2e4-c5acc4b61256", + "metadata": {}, + "source": [ + "These changes then work their way back up to the parent models:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c43c14a2-cd91-4cb9-a634-8dd855bc0387", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'type': ,\n", + " 'digest': 'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'sequenceReference': {'id': 'refseq:NC_000005.10',\n", + " 'type': ,\n", + " 'label': 'GRCh38:chr5',\n", + " 'alternativeLabels': ['GRCh38:5'],\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656509,\n", + " 'end': 80656510}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seqloc.model_dump(exclude_none=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5c25308b-56fb-4f3a-8521-2a1e4bc4d232", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'ga4gh:VA.LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'type': ,\n", + " 'digest': 'LK_4rOVxyEwrEpaOVd-BDFV0ocbO5vgV',\n", + " 'location': {'id': 'ga4gh:SL.nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'type': ,\n", + " 'digest': 'nA5-KovovkH-5p3LF1657nkkeWFwrInI',\n", + " 'sequenceReference': {'id': 'refseq:NC_000005.10',\n", + " 'type': ,\n", + " 'label': 'GRCh38:chr5',\n", + " 'alternativeLabels': ['GRCh38:5'],\n", + " 'refgetAccession': 'SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI'},\n", + " 'start': 80656509,\n", + " 'end': 80656510},\n", + " 'state': {'type': ,\n", + " 'sequence': 'TT'}}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "allele.model_dump(exclude_none=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}