From bb52ef64c1498de8f7d8a9fd4ac561c3b8baf16e Mon Sep 17 00:00:00 2001 From: Anuradha Date: Mon, 24 Apr 2023 06:23:27 +0000 Subject: [PATCH 01/16] Adding test data --- README.md | 7 +- examples/test-data/GUIDE.md | 31 + examples/test-data/chr1.vcf.gz | Bin 0 -> 4838 bytes examples/test-data/chr1.vcf.gz.tbi | Bin 0 -> 149 bytes examples/test-data/submission.json | 1069 ++++++++++++++++++++++++++++ 5 files changed, 1106 insertions(+), 1 deletion(-) create mode 100644 examples/test-data/GUIDE.md create mode 100644 examples/test-data/chr1.vcf.gz create mode 100644 examples/test-data/chr1.vcf.gz.tbi create mode 100644 examples/test-data/submission.json diff --git a/README.md b/README.md index 0bd9dd3..f912e85 100644 --- a/README.md +++ b/README.md @@ -155,7 +155,7 @@ module "serverless-beacon" { region = "REGION" } ``` -Please refer to [./examples](./examples) to find a minimal and a complete setup. +Please refer to [./examples/minimum/](./examples/minimum/) or [./examples/full](./examples/full) to find a minimal and a complete setup. ## Development All the layers needed for the program to run are in the `layers` folder. To add a new layer for immediate use with additional configs, run the following commands. Once the decision to use the library is finalised, update the `init.sh` script to automate the process. @@ -188,6 +188,10 @@ Please make a copy of `backend.tf.template` with suitable parameters and rename it as ## API +### Example data + +Please find the data in [./examples/test-data/](./examples/test-data/) and use the [./examples/test-data/GUIDE.md](./examples/test-data/GUIDE.md) to try the provided test data. + ### Data ingestion API Use the following schemas for data submission @@ -224,3 +228,4 @@ $ ./init.sh -msse4.2 -O3 ### Provider produced inconsistent final plan If `terraform apply --auto-approve` complains about a provider error, please retry. If the issue persists, please raise an issue with the complete terraform log. + diff --git a/examples/test-data/GUIDE.md b/examples/test-data/GUIDE.md new file mode 100644 index 0000000..a5c1f63 --- /dev/null +++ b/examples/test-data/GUIDE.md @@ -0,0 +1,31 @@ +# Getting started with test data + +Please ensure you first upload the `chr1.vcf.gz` and `chr1.vcf.gz.tbi` files to an S3 bucket that is accessible from the sBeacon deployment account. Obtain the S3 URI of the `chr1.vcf.gz` file from the uploaded destination. Note that both `vcf.gz` and `vcf.gz.tbi` files must have the same prefix in S3 for this to work. + +Now edit the `submission.json` file so that it matches the S3 URI of the `vcf.gz` file. + +```json +... + "vcfLocations": [ + "s3:////chr1.vcf.gz" + ] +... +``` + +You can submit the data in two ways. + +### Submission as request body + +You can simply copy the edited JSON content into the API Gateway `/submit` POST endpoint. If you're using a REST client, make sure you add authorization headers before you make the request. For example, Postman supports the AWS Signature authorization type, where you can enter your AWS keys. + +### Submission as an S3 payload + +Alternatively, you can upload the edited `submission.json` file to an S3 location accessible from the deployment. Then you can use the file's S3 URI as follows in the API Gateway or in your REST client. + +```json +{ + "s3Payload": "s3:////submission.json" +} +``` + +This approach is recommended for larger submissions with thousands of metadata entries.
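If you prefer to script the submission instead of using a REST client, a minimal Python sketch along the following lines can sign and send the same request. The invoke URL, stage, and region below are placeholders (assumptions, not values from this repository), and it assumes the `boto3` and `requests` packages plus AWS credentials in the environment; the SigV4 signing for the `execute-api` service is the scripted equivalent of Postman's AWS Signature option.

```python
import boto3
import requests
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest

# Placeholders: substitute your API Gateway invoke URL, stage, and region.
url = "https://<api-id>.execute-api.<region>.amazonaws.com/<stage>/submit"
region = "<region>"

# Read the edited submission body from disk.
with open("submission.json") as f:
    body = f.read()

# Sign the POST request with SigV4 for the execute-api service using the
# caller's AWS credentials.
request = AWSRequest(method="POST", url=url, data=body,
                     headers={"Content-Type": "application/json"})
credentials = boto3.Session().get_credentials()
SigV4Auth(credentials, "execute-api", region).add_auth(request)

# Send the signed request and print the outcome.
response = requests.post(url, headers=dict(request.headers), data=body)
print(response.status_code, response.text)
```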
\ No newline at end of file diff --git a/examples/test-data/chr1.vcf.gz b/examples/test-data/chr1.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..e410ba518be740e8cb4a45b34ef007a727dd2d32 GIT binary patch literal 4838 zcma*rXH*kho(6CXp-L|b3L-^HsCg-&SC!s7211eEL8K=jC`uLS2m}P_Efhg%LJSB9 zh%`eFAd%i{fQ++e&z{+to!z}3?)^RIKKIl8_UB5V1d{xH5y?r^H2@_2Of`91ro10V zKdS^HFynJ^D-}`38@3E4zZ@MjSnGk|Edj{#ZkomTg(a|;Of%wQESLTEkd=26wyjc$ z)Izsr8~N#x_lpI1IEP_;-&sIzok3?_LD#^Yi~Sj2{LL(M<4jtYU4-n%HF|OaSA0Md zhq%S^+2+dB)#Eos-@L zT$!=8z9786wGX8}C=5#p-C65u4Uj`vFEzkXDMyYFIGw+w9ZT9?^=IzlGcRd)r3@8e zgUWFGITp#%ST=D$m6a@9DtA$%l*?R9CM%HmJMqTo=%UE7V2^z9J@746ZJ#HuP4Ouo zzC|y6QfNs|k%>x^^p-Eyk7x?GTRk~vJALX_@jSD2bl6QZk(>@N^pl-dL>Vf0uO!tL z!@aG~F`RAjy_x&V9F@8i*<{7GUk@@$Qo++JSQcmG68l8wsVy*8lid;!yD`nUBo?G6 z=x#Qp$q-yY<`<971o>y(8cf6mcq09?ctmIAXC`9%lq$)#P4{Z~6}OGtquabG_F7d~ zDfu(D!-&iG9#SgWDFJSkzW?p9@YPS9Twk0Mm0UQ98n(Kbm71Q2UViXZFo5$3MbTUS z$FZ2_OFS=_!1kZu?)OEwm)II(Ulz+O(?iW9huPdoV&ZtkG}P`&dF{==G@{WW7m25J zWdQ&xxGjn(>+gO@_U2Uw*q1bX`HO{wsnHF>58+`Ud9&eD?ZKx-4CGWH$xFTJ`^}_6 z!Zt+7{;f!+mi;wY)sx4^ukeE2;t?xE@=w$id%5-JU?cq^F9m@1pE8nAUrWb{!N;Q< zNt{)g2t|S?h@qVmDSO0=XPv7x#&E0oW0^mg}$QRz*0$@L@oQDs9529J7G5NfL zl&#(C5HWQ;pk}F@iuzhb9v@3FdvZ&W@{1OXIqnTqVbEwnjK0T7+Nhgra z7cWI?7|r*2Ru6+w_7L9eE88UYiSldjIN9)8 z6bY-fKkt3;Rfi$%mIHX2@Iuz{&0(kcqtOcwt^4q1b4e(29_%8;D+bGUXn~#@f*$yx z$0vsf!Ak*ri9f|e*9{&buD-2B?1{F)bLMXjZn`L+$>fXQ z99a%H_Y^$BZ8nG=6=kzrRUMa)K=O^n6Z%8aeQDcU?{L`Wa~_GSwDiHg4LGiKeOpao z?|h2`jX8dZr-Io*k==BA>zB67G@XaRFPGbzcI!n$F8|79Is2>aR^4hb{%q4hZdd^8 z@+tWuYrC=z)38aW(BZ-e8S;SqN~5ZLnvU|4f+Z$2@KWvQKFhk8OkLg?+&hh5iqvMz zO5f2PXe|kYnPBc65#Gp)xzA@w`D+t8OOkXyo!sN>Rs*Q3g3|$v3=LFwuccz|zfXxERk2 z-2>Lj{;zxmtOa&wXf5h6K>yB+GZPc8f5`uidW$+t(0*VoUw6iT(cJVsi#gEpuV`Uy zzkAHBZ~m3%XKEL3Cs@aUnnPGGw6oN?xlJI-c^^!HL)>=WG1 zLZT)QkF>0H($bA>O)`J*JDVZ@Tp3w@(`6d9$VJ7tL+^U-(2!g5&O!}1C!Qq zTH>OJO)~W8OS?QRd7+3+HT39DQ#vatRm5f(dJLLIhHANgBb~^JOP^FU>i83eNhk8- z(vKC5F8_q@8f_YgtF_}wnBlH}D-3ZF2_T?-6P6y=+a$XhZV6qQUK5UJWa?~HCYsen z`aivN5FPSdOTbo!fHG~9TvC=Ph%0I`y(d|9XMBTM2U%E+7m|-t4xD8Q#4<5%&>L9K zP7ZvdIo5V#b!<^!s)zl@6 z5SUWRV6)AZka(?+&xnCIx$DL&Yo)z2zMb-?C@k!wO=qHoxNei0bQz*cX=7q7?9-i& zsVc`aGexs-QH9$b@&~Y5{A-2P?1dBo%y|~|cSy=XSl9EDeyO%?SpF9Mi_00D@~xdZ z2vUsDaudCf(wN}!ooUY6;i$YHsnhYbl6y@=oMM+Zx$3xo9%M`=)?aI9RX z(#h~h#IDc=Oiw7qKbPn2YPTNi!?zO+1m7IXW6H}gqnuyj^++7_^q_B#=H$$bN->e^ zt!4>B*2L7iY-#3k-MNB%%Xa&6?td%5SD~&&2mOjLW7on0nQ-W&ApGDU94~r+HBG(D*+LVW z%euB7?)+A+g(wIfto3YM1|_a2u2i)bd;iFplhJ(#9zKt0V~IJWk}Qn)kS%^8^x5Be z!%q}HR*kTcjOZZN;)azMuJS-gU!tJ-IbNlVdVuIq3B4=CwVy}GwYP4->;)hdx`l${B*-qGmjIeR3Iq^KifLoX+hQ=?>=h^-gBTIoz?j3 zUOkxJni_kOiOpi5Ia)s!^!zRzzeEcJF90R+v9UWp*+6O~tZ&+_{6^!LqzP zVdwMOvV7WN@T^ccv`qFQIoiK3rkNwYe;H0(Sw7iWNa><6B%gwWbH#;kDpXym9q3$C z9k$p6j1!mbR7w#?_cJq7IitwlLli>IcQpFOyRcHA%g`EE=#g zO3MZt#|%ov?0hNtVI^4x`Z9BC(8dWT;>L(-MaUlyE%n5IUgD{5S%r424UzS9p;{J6 z>1tM+^S8JhaF3Cp6hfGhP+l_r^YL39^+t$aCC-cy zlYQnM=t!-frLY_ZuJ_MqC2spfIh;}r*VH?-?`^*%ZQ9^!zgCjhh#G5~C!uszESb5` zEmOY=@{0g#RoIix?n>-tt0ER1w7nLLha#tTQ$2* zK_PJc5IAcbrHXg6L!b}$yG~*tALG>&ehS2@(8a-;$s_x@m;A!>rWMSbbn5&%!bIjg zaGTdW7U7mZ;YXi;kZ`X1)0~SEYMish$@Yr~(xQAFN-AjIN8qv7B&JwsP{wO(_7p8SJ3DHHX(B11w9paOe@ zJd1M+Dw+4smIEJqXIvlEiEaTszaOLVZSf6;#>X@0UZzRGYz=yy$fq6^&!SHD{uzxE zEqP4TO`#qyVG_rtVBT<%KyR5s&@@%}vaM?TSMpZOYo@|vu<*-_5HiXJ)=5b8S`AIK ztt)td7VRK_Iq7Mkw1%+iLchZT8>NUj^_p%3ZM1rrgSTr>9Dru(&Rvy7ImV2YR&izmfW4)5 zX+GiLKnK{si<1CK zkbXZ}oy(K<%4hib&;wzY@Hn58ytx;g?c+AA6=M+)(WY_1MN|5Cs?e!LU92Pxr+>YRc1!%(ci)~5xnIUue*3}|pFAC^vjRqr 
z7Wzs&c;MJYl+@^sqhotFje=#@Pmeon_C03qz4I16>VbUKLi^ilpUIQpS;gbc*#898Af!9kjs=I1W1!m;g921k8$x!D(s$|IBMy`e1)BpyE!=toO zd{&~u)~Ttb&;7k9y5ttJNCU~DW5sW_{3{eAI@xVz*=@0x{j<(&c0y0nea1zv%erY> zt6Z1?6eZM7=~c|SbLQx!Gg`)5)NL54RPrXyAJGKMFd^!f=#|m?Xuwqso$^tWI$0!B ziL`{ue3Aa42DjE3{;2P_Z*UHBxMP4__Xk(O!nU_%?BUdV{BB?F`Gzu@NLsWM5;1H` TLc;Y=ldRh$r2jG4`up}bV|#ph literal 0 HcmV?d00001 diff --git a/examples/test-data/chr1.vcf.gz.tbi b/examples/test-data/chr1.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..691236597cd5646bd27c65c8ca3c1c073f8885f4 GIT binary patch literal 149 zcmb2|=3rp}f&Xj_PR>jW6%5>kpHfm%5)u-ak|cPUP6bFEF;7Wgo;1@*KveLXz)ab5 z0=Ke-Ttt25G(Man&~YWjfi*JVuR*QYIsq$bExyOUk2<`NSz*z0UYvVj@xn(2c4zED jRm^)zq-=XG9u{C@Ft+P9{>R9`AdhCTGy^l(9UuY#wuLLG literal 0 HcmV?d00001 diff --git a/examples/test-data/submission.json b/examples/test-data/submission.json new file mode 100644 index 0000000..c2ee649 --- /dev/null +++ b/examples/test-data/submission.json @@ -0,0 +1,1069 @@ +{ + "datasetId": "UNQ_1", + "dataset": { + "id": "UNQ_1", + "createDateTime": "2021-03-21T02:37:00-08:00", + "dataUseConditions": { + "duoDataUse": [ + { + "id": "DUO:0000042", + "label": "general research use", + "version": "17-07-2016" + } + ] + }, + "description": "Simulation set 1.", + "externalUrl": "http://example.org/wiki/Main_Page", + "info": {}, + "name": "Dataset with fake data", + "updateDateTime": "2022-08-05T17:21:00+01:00", + "version": "v1.1" + }, + "assemblyId": "GRCH38", + "cohortId": "UNQ_1", + "cohort": { + "id": "UNQ_1", + "cohortDataTypes": [ + { + "id": "OMIABIS:0000060", + "label": "survey data" + }, + { + "id": "OBI:0000070", + "label": "genotyping assay" + } + ], + "cohortDesign": { + "id": "orcid:0000-0003-3463-0775" + }, + "cohortSize": -1, + "cohortType": "beacon-defined", + "name": "CGG group" + }, + "vcfLocations": [ + "s3:////chr1.vcf.gz" + ], + "individuals": [ + { + "id": "UNQ_1-1", + "ethnicity": { + "id": "SNOMED:52075006", + "label": "Congolese" + }, + "geographicOrigin": { + "id": "SNOMED:223688001", + "label": "United States of America" + }, + "interventionsOrProcedures": [ + { + "procedureCode": { + "id": "NCIT:C79426" + } + }, + { + "procedureCode": { + "id": "NCIT:C64264" + } + } + ], + "karyotypicSex": "XXY", + "sex": { + "id": "SNOMED:407378000", + "label": "Surgically transgendered transsexual, male-to-female" + } + }, + { + "id": "UNQ_1-2", + "diseases": [ + { + "diseaseCode": { + "id": "SNOMED:734099007", + "label": "Neuroblastoma of central nervous system" + } + }, + { + "diseaseCode": { + "id": "SNOMED:135811000119107", + "label": "Lewy body dementia with behavioral disturbance (disorder)" + } + }, + { + "diseaseCode": { + "id": "SNOMED:23853001", + "label": "Disorder of the central nervous system" + } + } + ], + "ethnicity": { + "id": "SNOMED:12556008", + "label": "Tamils" + }, + "geographicOrigin": { + "id": "SNOMED:223688001", + "label": "United States of America" + }, + "interventionsOrProcedures": [ + { + "procedureCode": { + "id": "NCIT:C79426" + } + }, + { + "procedureCode": { + "id": "NCIT:C64264" + } + } + ], + "karyotypicSex": "XXYY", + "sex": { + "id": "SNOMED:407378000", + "label": "Surgically transgendered transsexual, male-to-female" + } + }, + { + "id": "UNQ_1-3", + "diseases": [ + { + "diseaseCode": { + "id": "SNOMED:26929004", + "label": "Alzheimer's disease" + } + }, + { + "diseaseCode": { + "id": "SNOMED:23853001", + "label": "Disorder of the central nervous system" + } + }, + { + "diseaseCode": { + "id": "SNOMED:359642000", + "label": "Diabetes 
mellitus type 2 in nonobese (disorder)" + } + } + ], + "ethnicity": { + "id": "SNOMED:113170005", + "label": "Aymara" + }, + "geographicOrigin": { + "id": "SNOMED:223688001", + "label": "United States of America" + }, + "interventionsOrProcedures": [ + { + "procedureCode": { + "id": "NCIT:C79426" + } + }, + { + "procedureCode": { + "id": "NCIT:C64263" + } + } + ], + "karyotypicSex": "XXX", + "sex": { + "id": "SNOMED:407374003", + "label": "Transsexual" + } + }, + { + "id": "UNQ_1-4", + "ethnicity": { + "id": "SNOMED:10432001", + "label": "Onge" + }, + "geographicOrigin": { + "id": "SNOMED:223600005", + "label": "India" + }, + "interventionsOrProcedures": [ + { + "procedureCode": { + "id": "NCIT:C79426" + } + }, + { + "procedureCode": { + "id": "NCIT:C64264" + } + } + ], + "karyotypicSex": "XYY", + "sex": { + "id": "SNOMED:407377005", + "label": "Female-to-male transsexual" + } + }, + { + "id": "UNQ_1-5", + "diseases": [ + { + "diseaseCode": { + "id": "SNOMED:254955001", + "label": "Pituitary carcinoma" + } + } + ], + "ethnicity": { + "id": "SNOMED:12556008", + "label": "Tamils" + }, + "geographicOrigin": { + "id": "SNOMED:223498002", + "label": "Africa" + }, + "interventionsOrProcedures": [ + { + "procedureCode": { + "id": "NCIT:C64263" + } + }, + { + "procedureCode": { + "id": "NCIT:C64264" + } + } + ], + "karyotypicSex": "XXXX", + "sex": { + "id": "SNOMED:407374003", + "label": "Transsexual" + } + }, + { + "id": "UNQ_1-6", + "diseases": [ + { + "diseaseCode": { + "id": "SNOMED:56265001", + "label": "Heart disease (disorder)" + } + } + ], + "ethnicity": { + "id": "SNOMED:17789004", + "label": "Papuans" + }, + "geographicOrigin": { + "id": "SNOMED:223713009", + "label": "Argentina" + }, + "interventionsOrProcedures": [ + { + "procedureCode": { + "id": "NCIT:C93025" + } + } + ], + "karyotypicSex": "XX", + "sex": { + "id": "SNOMED:248152002", + "label": "Female" + } + }, + { + "id": "UNQ_1-7", + "ethnicity": { + "id": "SNOMED:77502007", + "label": "Atacamenos" + }, + "geographicOrigin": { + "id": "SNOMED:223498002", + "label": "Africa" + }, + "interventionsOrProcedures": [ + { + "procedureCode": { + "id": "NCIT:C79426" + } + } + ], + "karyotypicSex": "XXXY", + "sex": { + "id": "SNOMED:407377005", + "label": "Female-to-male transsexual" + } + }, + { + "id": "UNQ_1-8", + "diseases": [ + { + "diseaseCode": { + "id": "SNOMED:359642000", + "label": "Diabetes mellitus type 2 in nonobese (disorder)" + } + }, + { + "diseaseCode": { + "id": "SNOMED:312991009", + "label": "Senile dementia of the Lewy body type (disorder)" + } + }, + { + "diseaseCode": { + "id": "SNOMED:81531005", + "label": "Diabetes mellitus type 2 in obese (disorder)" + } + } + ], + "ethnicity": { + "id": "SNOMED:89026003", + "label": "Alacaluf" + }, + "geographicOrigin": { + "id": "SNOMED:223498002", + "label": "Africa" + }, + "interventionsOrProcedures": [ + { + "procedureCode": { + "id": "NCIT:C64263" + } + } + ], + "karyotypicSex": "XX", + "sex": { + "id": "SNOMED:407378000", + "label": "Surgically transgendered transsexual, male-to-female" + } + }, + { + "id": "UNQ_1-9", + "diseases": [ + { + "diseaseCode": { + "id": "SNOMED:26929004", + "label": "Alzheimer's disease" + } + }, + { + "diseaseCode": { + "id": "SNOMED:81531005", + "label": "Diabetes mellitus type 2 in obese (disorder)" + } + }, + { + "diseaseCode": { + "id": "SNOMED:135811000119107", + "label": "Lewy body dementia with behavioral disturbance (disorder)" + } + } + ], + "ethnicity": { + "id": "SNOMED:10292001", + "label": "Guamians" + }, + "geographicOrigin": { + 
"id": "SNOMED:223498002", + "label": "Africa" + }, + "karyotypicSex": "XXXX", + "sex": { + "id": "SNOMED:407377005", + "label": "Female-to-male transsexual" + } + }, + { + "id": "UNQ_1-10", + "ethnicity": { + "id": "SNOMED:76460008", + "label": "Yanomama" + }, + "geographicOrigin": { + "id": "SNOMED:223688001", + "label": "United States of America" + }, + "interventionsOrProcedures": [ + { + "procedureCode": { + "id": "NCIT:C64264" + } + } + ], + "karyotypicSex": "XXXY", + "sex": { + "id": "SNOMED:248153007", + "label": "Male" + } + } + ], + "biosamples": [ + { + "id": "UNQ_1-1", + "individualId": "UNQ_1-1", + "biosampleStatus": { + "id": "SNOMED:365641003", + "label": "Minor blood groups - finding" + }, + "collectionDate": "2019-04-23", + "collectionMoment": "P32Y6M1D", + "histologicalDiagnosis": { + "id": "SNOMED:719046005", + "label": "12q14 microdeletion syndrome" + }, + "obtentionProcedure": { + "procedureCode": { + "id": "NCIT:C157179", + "label": "FGFR1 Mutation Analysis" + } + }, + "pathologicalTnmFinding": [ + { + "id": "NCIT:C48725", + "label": "T2a Stage Finding" + } + ], + "sampleOriginDetail": { + "id": "SNOMED:258497007", + "label": "Abscess swab" + }, + "sampleOriginType": { + "id": "SNOMED:31675002", + "label": "Capillary blood" + }, + "tumorProgression": { + "id": "NCIT:C84509", + "label": "Primary Malignant Neoplasm" + }, + "info": {}, + "notes": "" + }, + { + "id": "UNQ_1-2", + "individualId": "UNQ_1-2", + "biosampleStatus": { + "id": "SNOMED:702782002", + "label": "Mitochondrial 1555 A to G mutation positive" + }, + "collectionDate": "2022-04-23", + "collectionMoment": "P32Y6M1D", + "histologicalDiagnosis": { + "id": "SNOMED:771439009", + "label": "14q22q23 microdeletion syndrome" + }, + "pathologicalTnmFinding": [ + { + "id": "NCIT:C48699", + "label": "M0 Stage Finding" + } + ], + "sampleOriginDetail": { + "id": "SNOMED:734336008", + "label": "Specimen from aorta" + }, + "sampleProcessing": { + "id": "SNOMED:18809007", + "label": "Meckel's ganglionectomy" + }, + "tumorGrade": { + "id": "NCIT:C28080", + "label": "Grade 3a" + }, + "info": {}, + "notes": "", + "sampleOriginType": { + "id": "SNOMED:31675002", + "label": "Capillary blood" + } + }, + { + "id": "UNQ_1-3", + "individualId": "UNQ_1-3", + "biosampleStatus": { + "id": "SNOMED:702782002", + "label": "Mitochondrial 1555 A to G mutation positive" + }, + "collectionDate": "2021-04-23", + "collectionMoment": "P32Y6M1D", + "histologicalDiagnosis": { + "id": "SNOMED:771439009", + "label": "14q22q23 microdeletion syndrome" + }, + "pathologicalTnmFinding": [ + { + "id": "NCIT:C48725", + "label": "T2a Stage Finding" + } + ], + "sampleOriginType": { + "id": "SNOMED:702451000", + "label": "Cultured cells" + }, + "sampleProcessing": { + "id": "SNOMED:18809007", + "label": "Meckel's ganglionectomy" + }, + "tumorGrade": { + "id": "NCIT:C28080", + "label": "Grade 3a" + }, + "tumorProgression": { + "id": "NCIT:C4813", + "label": "Recurrent Malignant Neoplasm" + }, + "info": {}, + "notes": "" + }, + { + "id": "UNQ_1-4", + "individualId": "UNQ_1-4", + "biosampleStatus": { + "id": "SNOMED:365641003", + "label": "Minor blood groups - finding" + }, + "collectionDate": "2021-04-23", + "collectionMoment": "P7D", + "histologicalDiagnosis": { + "id": "SNOMED:771439009", + "label": "14q22q23 microdeletion syndrome" + }, + "obtentionProcedure": { + "procedureCode": { + "id": "NCIT:C157179", + "label": "FGFR1 Mutation Analysis" + } + }, + "pathologicalTnmFinding": [ + { + "id": "NCIT:C48725", + "label": "T2a Stage Finding" + } + ], + 
"sampleOriginDetail": { + "id": "SNOMED:258603007", + "label": "Respiratory specimen" + }, + "sampleOriginType": { + "id": "SNOMED:782814004", + "label": "Cultured autograft of skin" + }, + "sampleProcessing": { + "id": "SNOMED:18809007", + "label": "Meckel's ganglionectomy" + }, + "tumorGrade": { + "id": "NCIT:C28080", + "label": "Grade 3a" + }, + "tumorProgression": { + "id": "NCIT:C84509", + "label": "Primary Malignant Neoplasm" + }, + "info": {}, + "notes": "" + }, + { + "id": "UNQ_1-5", + "individualId": "UNQ_1-5", + "biosampleStatus": { + "id": "SNOMED:310294002", + "label": "Mitochondrial antibodies positive" + }, + "collectionDate": "2022-04-23", + "collectionMoment": "P7D", + "histologicalDiagnosis": { + "id": "SNOMED:362965005", + "label": "Disorder of body system (disorder)" + }, + "pathologicalStage": { + "id": "NCIT:C27977", + "label": "Stage IIIA" + }, + "pathologicalTnmFinding": [ + { + "id": "NCIT:C48725", + "label": "T2a Stage Finding" + } + ], + "sampleOriginDetail": { + "id": "SNOMED:258500001", + "label": "Nasopharyngeal swab" + }, + "sampleOriginType": { + "id": "SNOMED:782814004", + "label": "Cultured autograft of skin" + }, + "sampleProcessing": { + "id": "SNOMED:72019009", + "label": "Mechanical vitrectomy by posterior approach" + }, + "info": {}, + "notes": "" + }, + { + "id": "UNQ_1-6", + "individualId": "UNQ_1-6", + "biosampleStatus": { + "id": "SNOMED:276447000", + "label": "Mite present" + }, + "collectionDate": "2018-04-23", + "collectionMoment": "P32Y6M1D", + "histologicalDiagnosis": { + "id": "SNOMED:719046005", + "label": "12q14 microdeletion syndrome" + }, + "obtentionProcedure": { + "procedureCode": { + "id": "NCIT:C15189", + "label": "biopsy" + } + }, + "sampleOriginType": { + "id": "SNOMED:782814004", + "label": "Cultured autograft of skin" + }, + "sampleProcessing": { + "id": "SNOMED:87021001", + "label": "Mechanical vitrectomy by pars plana approach" + }, + "tumorGrade": { + "id": "NCIT:C28080", + "label": "Grade 3a" + }, + "tumorProgression": { + "id": "NCIT:C84509", + "label": "Primary Malignant Neoplasm" + }, + "info": {}, + "notes": "" + }, + { + "id": "UNQ_1-7", + "individualId": "UNQ_1-7", + "biosampleStatus": { + "id": "SNOMED:310294002", + "label": "Mitochondrial antibodies positive" + }, + "collectionDate": "2021-04-23", + "collectionMoment": "P32Y6M1D", + "histologicalDiagnosis": { + "id": "SNOMED:237592006", + "label": "Abnormality of bombesin secretion" + }, + "pathologicalTnmFinding": [ + { + "id": "NCIT:C48699", + "label": "M0 Stage Finding" + } + ], + "sampleOriginDetail": { + "id": "SNOMED:734336008", + "label": "Specimen from aorta" + }, + "sampleOriginType": { + "id": "SNOMED:31675002", + "label": "Capillary blood" + }, + "tumorProgression": { + "id": "NCIT:C84509", + "label": "Primary Malignant Neoplasm" + }, + "info": {}, + "notes": "" + }, + { + "id": "UNQ_1-8", + "individualId": "UNQ_1-8", + "biosampleStatus": { + "id": "SNOMED:702782002", + "label": "Mitochondrial 1555 A to G mutation positive" + }, + "collectionDate": "2015-04-23", + "collectionMoment": "P32Y6M1D", + "histologicalDiagnosis": { + "id": "SNOMED:237592006", + "label": "Abnormality of bombesin secretion" + }, + "sampleOriginDetail": { + "id": "SNOMED:385338007", + "label": "Specimen from anus obtained by transanal disk excision" + }, + "sampleOriginType": { + "id": "SNOMED:422236008", + "label": "Agar medium" + }, + "tumorGrade": { + "id": "NCIT:C28080", + "label": "Grade 3a" + }, + "info": {}, + "notes": "" + }, + { + "id": "UNQ_1-9", + "individualId": "UNQ_1-9", 
+ "biosampleStatus": { + "id": "SNOMED:310293008", + "label": "Mitochondrial antibodies negative" + }, + "collectionDate": "2018-04-23", + "collectionMoment": "P32Y6M1D", + "histologicalDiagnosis": { + "id": "SNOMED:771439009", + "label": "14q22q23 microdeletion syndrome" + }, + "pathologicalTnmFinding": [ + { + "id": "NCIT:C48709", + "label": "N1c Stage Finding" + } + ], + "sampleOriginType": { + "id": "SNOMED:31675002", + "label": "Capillary blood" + }, + "sampleProcessing": { + "id": "SNOMED:87021001", + "label": "Mechanical vitrectomy by pars plana approach" + }, + "info": {}, + "notes": "" + }, + { + "id": "UNQ_1-10", + "individualId": "UNQ_1-10", + "biosampleStatus": { + "id": "SNOMED:365641003", + "label": "Minor blood groups - finding" + }, + "collectionDate": "2022-04-23", + "collectionMoment": "P7D", + "histologicalDiagnosis": { + "id": "SNOMED:719046005", + "label": "12q14 microdeletion syndrome" + }, + "pathologicalTnmFinding": [ + { + "id": "NCIT:C48709", + "label": "N1c Stage Finding" + } + ], + "sampleOriginType": { + "id": "SNOMED:422236008", + "label": "Agar medium" + }, + "sampleProcessing": { + "id": "SNOMED:72019009", + "label": "Mechanical vitrectomy by posterior approach" + }, + "tumorGrade": { + "id": "NCIT:C28080", + "label": "Grade 3a" + }, + "tumorProgression": { + "id": "NCIT:C84509", + "label": "Primary Malignant Neoplasm" + }, + "info": {}, + "notes": "" + } + ], + "runs": [ + { + "id": "UNQ_1-1", + "biosampleId": "UNQ_1-1", + "individualId": "UNQ_1-1", + "libraryLayout": "PAIRED", + "librarySelection": "RANDOM", + "librarySource": { + "id": "GENEPIO:0001969", + "label": "other library source" + }, + "libraryStrategy": "WGS", + "platform": "PacBio", + "platformModel": { + "id": "OBI:0002012", + "label": "PacBio RS II" + }, + "runDate": "2021-10-18" + }, + { + "id": "UNQ_1-2", + "biosampleId": "UNQ_1-2", + "individualId": "UNQ_1-2", + "libraryLayout": "PAIRED", + "librarySelection": "RANDOM", + "librarySource": { + "id": "GENEPIO:0001966", + "label": "genomic source" + }, + "libraryStrategy": "WGS", + "platform": "Illumina", + "platformModel": { + "id": "OBI:0002048", + "label": "Illumina HiSeq 3000" + }, + "runDate": "2021-10-18" + }, + { + "id": "UNQ_1-3", + "biosampleId": "UNQ_1-3", + "individualId": "UNQ_1-3", + "libraryLayout": "PAIRED", + "librarySelection": "RANDOM", + "librarySource": { + "id": "GENEPIO:0001966", + "label": "genomic source" + }, + "libraryStrategy": "WGS", + "platform": "NanoPore", + "platformModel": { + "id": "OBI:0002750", + "label": "Oxford Nanopore MinION" + }, + "runDate": "2021-10-18" + }, + { + "id": "UNQ_1-4", + "biosampleId": "UNQ_1-4", + "individualId": "UNQ_1-4", + "libraryLayout": "PAIRED", + "librarySelection": "RANDOM", + "librarySource": { + "id": "GENEPIO:0001969", + "label": "other library source" + }, + "libraryStrategy": "WGS", + "platform": "NanoPore", + "platformModel": { + "id": "OBI:0002750", + "label": "Oxford Nanopore MinION" + }, + "runDate": "2021-10-18" + }, + { + "id": "UNQ_1-5", + "biosampleId": "UNQ_1-5", + "individualId": "UNQ_1-5", + "libraryLayout": "PAIRED", + "librarySelection": "RANDOM", + "librarySource": { + "id": "GENEPIO:0001969", + "label": "other library source" + }, + "libraryStrategy": "WGS", + "platform": "PacBio", + "platformModel": { + "id": "OBI:0002012", + "label": "PacBio RS II" + }, + "runDate": "2018-01-01" + }, + { + "id": "UNQ_1-6", + "biosampleId": "UNQ_1-6", + "individualId": "UNQ_1-6", + "libraryLayout": "PAIRED", + "librarySelection": "RANDOM", + "librarySource": { + "id": 
"GENEPIO:0001969", + "label": "other library source" + }, + "libraryStrategy": "WGS", + "platform": "PacBio", + "platformModel": { + "id": "OBI:0002012", + "label": "PacBio RS II" + }, + "runDate": "2018-01-01" + }, + { + "id": "UNQ_1-7", + "biosampleId": "UNQ_1-7", + "individualId": "UNQ_1-7", + "libraryLayout": "PAIRED", + "librarySelection": "RANDOM", + "librarySource": { + "id": "GENEPIO:0001969", + "label": "other library source" + }, + "libraryStrategy": "WGS", + "platform": "Illumina", + "platformModel": { + "id": "OBI:0002048", + "label": "Illumina HiSeq 3000" + }, + "runDate": "2021-10-18" + }, + { + "id": "UNQ_1-8", + "biosampleId": "UNQ_1-8", + "individualId": "UNQ_1-8", + "libraryLayout": "PAIRED", + "librarySelection": "RANDOM", + "librarySource": { + "id": "GENEPIO:0001966", + "label": "genomic source" + }, + "libraryStrategy": "WGS", + "platform": "NanoPore", + "platformModel": { + "id": "OBI:0002750", + "label": "Oxford Nanopore MinION" + }, + "runDate": "2021-10-18" + }, + { + "id": "UNQ_1-9", + "biosampleId": "UNQ_1-9", + "individualId": "UNQ_1-9", + "libraryLayout": "PAIRED", + "librarySelection": "RANDOM", + "librarySource": { + "id": "GENEPIO:0001966", + "label": "genomic source" + }, + "libraryStrategy": "WGS", + "platform": "Illumina", + "platformModel": { + "id": "OBI:0002048", + "label": "Illumina HiSeq 3000" + }, + "runDate": "2018-01-01" + }, + { + "id": "UNQ_1-10", + "biosampleId": "UNQ_1-10", + "individualId": "UNQ_1-10", + "libraryLayout": "PAIRED", + "librarySelection": "RANDOM", + "librarySource": { + "id": "GENEPIO:0001969", + "label": "other library source" + }, + "libraryStrategy": "WGS", + "platform": "Illumina", + "platformModel": { + "id": "OBI:0002048", + "label": "Illumina HiSeq 3000" + }, + "runDate": "2022-08-08" + } + ], + "analyses": [ + { + "id": "UNQ_1-1", + "individualId": "UNQ_1-1", + "biosampleId": "UNQ_1-1", + "runId": "UNQ_1-1", + "aligner": "bwa-0.7.8", + "analysisDate": "2020-2-15", + "pipelineName": "pipeline 5", + "pipelineRef": "Example", + "variantCaller": "SoapSNP", + "vcfSampleId": "HG00096" + }, + { + "id": "UNQ_1-2", + "individualId": "UNQ_1-2", + "biosampleId": "UNQ_1-2", + "runId": "UNQ_1-2", + "aligner": "minimap2", + "analysisDate": "2019-3-17", + "pipelineName": "pipeline 1", + "pipelineRef": "Example", + "variantCaller": "GATK4.0", + "vcfSampleId": "HG00097" + }, + { + "id": "UNQ_1-3", + "individualId": "UNQ_1-3", + "biosampleId": "UNQ_1-3", + "runId": "UNQ_1-3", + "aligner": "minimap2", + "analysisDate": "2018-10-2", + "pipelineName": "pipeline 5", + "pipelineRef": "Example", + "variantCaller": "GATK4.0", + "vcfSampleId": "HG00099" + }, + { + "id": "UNQ_1-4", + "individualId": "UNQ_1-4", + "biosampleId": "UNQ_1-4", + "runId": "UNQ_1-4", + "aligner": "bwa-0.7.8", + "analysisDate": "2018-11-9", + "pipelineName": "pipeline 5", + "pipelineRef": "Example", + "variantCaller": "kmer2snp", + "vcfSampleId": "HG00100" + }, + { + "id": "UNQ_1-5", + "individualId": "UNQ_1-5", + "biosampleId": "UNQ_1-5", + "runId": "UNQ_1-5", + "aligner": "bowtie", + "analysisDate": "2019-5-27", + "pipelineName": "pipeline 3", + "pipelineRef": "Example", + "variantCaller": "GATK4.0", + "vcfSampleId": "HG00101" + }, + { + "id": "UNQ_1-6", + "individualId": "UNQ_1-6", + "biosampleId": "UNQ_1-6", + "runId": "UNQ_1-6", + "aligner": "bwa-0.7.8", + "analysisDate": "2021-11-22", + "pipelineName": "pipeline 1", + "pipelineRef": "Example", + "variantCaller": "SoapSNP", + "vcfSampleId": "HG00102" + }, + { + "id": "UNQ_1-7", + "individualId": "UNQ_1-7", + 
"biosampleId": "UNQ_1-7", + "runId": "UNQ_1-7", + "aligner": "bowtie", + "analysisDate": "2018-1-8", + "pipelineName": "pipeline 1", + "pipelineRef": "Example", + "variantCaller": "SoapSNP", + "vcfSampleId": "HG00103" + }, + { + "id": "UNQ_1-8", + "individualId": "UNQ_1-8", + "biosampleId": "UNQ_1-8", + "runId": "UNQ_1-8", + "aligner": "minimap2", + "analysisDate": "2022-3-6", + "pipelineName": "pipeline 1", + "pipelineRef": "Example", + "variantCaller": "GATK4.0", + "vcfSampleId": "HG00105" + }, + { + "id": "UNQ_1-9", + "individualId": "UNQ_1-9", + "biosampleId": "UNQ_1-9", + "runId": "UNQ_1-9", + "aligner": "bowtie", + "analysisDate": "2021-2-17", + "pipelineName": "pipeline 2", + "pipelineRef": "Example", + "variantCaller": "SoapSNP", + "vcfSampleId": "HG00106" + }, + { + "id": "UNQ_1-10", + "individualId": "UNQ_1-10", + "biosampleId": "UNQ_1-10", + "runId": "UNQ_1-10", + "aligner": "bwa-0.7.8", + "analysisDate": "2019-8-13", + "pipelineName": "pipeline 1", + "pipelineRef": "Example", + "variantCaller": "SoapSNP", + "vcfSampleId": "HG00107" + } + ], + "index": true +} \ No newline at end of file From b3efa98996a8ca66f0067b31e636313551f3d923 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Mon, 24 Apr 2023 06:54:33 +0000 Subject: [PATCH 02/16] Refactoring results count, updated examples guide --- examples/test-data/GUIDE.md | 298 +++++++++++++++++- .../route_g_variants_id_biosamples.py | 58 ++-- .../route_g_variants_id_individuals.py | 54 ++-- 3 files changed, 348 insertions(+), 62 deletions(-) diff --git a/examples/test-data/GUIDE.md b/examples/test-data/GUIDE.md index a5c1f63..44256ae 100644 --- a/examples/test-data/GUIDE.md +++ b/examples/test-data/GUIDE.md @@ -12,6 +12,8 @@ Now edit the `submission.json` file such that they match the S3 URI of the `vcf. ... ``` +## Data submission + You can submit the data in two ways. ### Submission as request body @@ -28,4 +30,298 @@ Alternatively, you can upload edited `submission.json` file to an S3 location ac } ``` -This approach is recommended for larger submissions with thousands of metadata entries. \ No newline at end of file +This approach is recommended for larger submissions with thousands of metadata entries. 
+ +## API testing + +### POST request to `/g_variants` with the following payload + +```json +{ + "meta": { + "apiVersion": "v2.0" + }, + "query": { + "pagination": {}, + "includeResultsetResponses": "HIT", + "requestedGranularity": "record", + "filters": [ + ], + "requestParameters": { + "assemblyId": "GRCH38", + "start": [ + 546801 + ], + "end": [ + 546810 + ], + "referenceName": "1" + } + } +} +``` + +Result + +```json +{ + "meta": { + "beaconId": "au.csiro-serverless.beacon", + "apiVersion": "v2.0.0", + "returnedGranularity": "record", + "receivedRequestSummary": { + "apiVersion": "v2.0", + "requestedSchemas": [], + "filters": [], + "req_params": { + "assemblyId": "GRCH38", + "start": [ + 546801 + ], + "end": [ + 546810 + ], + "referenceName": "1" + }, + "includeResultsetResponses": "HIT", + "pagination": { + "skip": 0, + "limit": 10 + }, + "requestedGranularity": "record", + "testMode": false + }, + "returnedSchemas": [ + { + "entityType": "genomicVariation", + "schema": "beacon-g_variant-v2.0.0" + } + ] + }, + "responseSummary": { + "exists": true, + "numTotalResults": 2 + }, + "response": { + "resultSets": [ + { + "id": "", + "setType": "", + "exists": true, + "resultsCount": 2, + "results": [ + { + "variantInternalId": "R1JDSDM4CTEJNTQ2ODAyCUcJQw==", + "variation": { + "referenceBases": "G", + "alternateBases": "C", + "location": { + "interval": { + "start": { + "type": "Number", + "value": 546802 + }, + "end": { + "type": "Number", + "value": 546803 + }, + "type": "SequenceInterval" + }, + "sequence_id": "GRCH38", + "type": "SequenceLocation" + }, + "variantType": "SNP" + } + }, + { + "variantInternalId": "R1JDSDM4CTEJNTQ2ODA1CVQJQw==", + "variation": { + "referenceBases": "T", + "alternateBases": "C", + "location": { + "interval": { + "start": { + "type": "Number", + "value": 546805 + }, + "end": { + "type": "Number", + "value": 546806 + }, + "type": "SequenceInterval" + }, + "sequence_id": "GRCH38", + "type": "SequenceLocation" + }, + "variantType": "SNP" + } + } + ], + "resultsHandover": null + } + ] + }, + "beaconHandovers": [] +} +``` + +### POST request to `/g_variants/R1JDSDM4CTEJNTQ2ODAyCUcJQw==/individuals` with the following payload + +```json +{ + "meta": { + "apiVersion": "v2.0" + }, + "query": { + "requestedGranularity": "record", + "pagination": { + "limit": 1 + }, + "filters": [] + } +} +``` + +Result + +```json +{ + "meta": { + "beaconId": "au.csiro-serverless.beacon", + "apiVersion": "v2.0.0", + "returnedGranularity": "record", + "receivedRequestSummary": { + "apiVersion": "v2.0", + "requestedSchemas": [], + "filters": [], + "req_params": {}, + "includeResultsetResponses": "HIT", + "pagination": { + "skip": 0, + "limit": 1 + }, + "requestedGranularity": "record", + "testMode": false + }, + "returnedSchemas": [ + { + "entityType": "individual", + "schema": "beacon-individual-v2.0.0" + } + ] + }, + "responseSummary": { + "exists": true, + "numTotalResults": 9 + }, + "response": { + "resultSets": [ + { + "id": "", + "setType": "", + "exists": true, + "resultsCount": 9, + "results": [ + { + "diseases": [ + { + "diseaseCode": { + "id": "SNOMED:56265001", + "label": "Heart disease (disorder)" + } + } + ], + "ethnicity": { + "id": "SNOMED:17789004", + "label": "Papuans" + }, + "exposures": "", + "geographicOrigin": { + "id": "SNOMED:223713009", + "label": "Argentina" + }, + "id": "UNQ_1-6", + "info": "", + "interventionsOrProcedures": [ + { + "procedureCode": { + "id": "NCIT:C93025" + } + } + ], + "karyotypicSex": "XX", + "measures": "", + "pedigrees": "", + "phenotypicFeatures":
"", + "sex": { + "id": "SNOMED:248152002", + "label": "Female" + }, + "treatments": "" + } + ], + "resultsHandover": null + } + ] + }, + "beaconHandovers": [] +} +``` + +### POST request to `/individuals` with following payload + +```json +{ + "query": { + "filters": [ + { + "id": "SNOMED:223688001" + } + ], + "requestedGranularity": "count" + }, + "meta": { + "apiVersion": "v2.0" + } +} +``` + +Result + +```json +{ + "meta": { + "beaconId": "au.csiro-serverless.beacon", + "apiVersion": "v2.0.0", + "returnedGranularity": "count", + "receivedRequestSummary": { + "apiVersion": "v2.0", + "requestedSchemas": [], + "filters": [ + { + "id": "SNOMED:223688001" + } + ], + "req_params": {}, + "includeResultsetResponses": "HIT", + "pagination": { + "skip": 0, + "limit": 10 + }, + "requestedGranularity": "count", + "testMode": false + }, + "returnedSchemas": [ + { + "entityType": "individual", + "schema": "beacon-individual-v2.0.0" + } + ] + }, + "responseSummary": { + "exists": true, + "numTotalResults": 4 + }, + "beaconHandovers": [] +} +``` \ No newline at end of file diff --git a/lambda/getGenomicVariants/route_g_variants_id_biosamples.py b/lambda/getGenomicVariants/route_g_variants_id_biosamples.py index 93598a9..bc3d70d 100644 --- a/lambda/getGenomicVariants/route_g_variants_id_biosamples.py +++ b/lambda/getGenomicVariants/route_g_variants_id_biosamples.py @@ -1,4 +1,4 @@ -from collections import defaultdict +from collections import defaultdict, OrderedDict import json import base64 @@ -168,46 +168,42 @@ def route(request: RequestParams, variant_id): ) queries = [] + + dataset_samples_sorted = OrderedDict(sorted(dataset_samples.items())) iterated_biosamples = 0 chosen_biosamples = 0 + total_biosamples = sum([len(sample_names) for sample_names in dataset_samples_sorted.values()]) for dataset_id, sample_names in dataset_samples.items(): - if (len(sample_names)) > 0: - if request.query.requested_granularity == "count": - # query = get_count_query(dataset_id, sample_names) - # queries.append(query) - # TODO optimise for duplicate individuals - iterated_biosamples += len(sample_names) - elif request.query.requested_granularity == Granularity.RECORD: - # TODO optimise for duplicate individuals - chosen_samples = [] - - for sample_name in sample_names: - iterated_biosamples += 1 - if ( - iterated_biosamples > request.query.pagination.skip - and chosen_biosamples < request.query.pagination.limit - ): - chosen_samples.append(sample_name) - chosen_biosamples += 1 - - if chosen_biosamples == request.query.pagination.limit: - break - if len(chosen_samples) > 0: - query = get_record_query(dataset_id, chosen_samples) - queries.append(query) - - if request.query.requested_granularity == "boolean": + if len(sample_names) > 0 and request.query.requested_granularity == Granularity.RECORD: + # TODO optimise for duplicate individuals + chosen_samples = [] + + for sample_name in sample_names: + iterated_biosamples += 1 + if ( + iterated_biosamples > request.query.pagination.skip + and chosen_biosamples < request.query.pagination.limit + ): + chosen_samples.append(sample_name) + chosen_biosamples += 1 + + if chosen_biosamples == request.query.pagination.limit: + break + if len(chosen_samples) > 0: + query = get_record_query(dataset_id, chosen_samples) + queries.append(query) + + if request.query.requested_granularity == Granularity.BOOLEAN: response = build_beacon_boolean_response( {}, 1 if exists else 0, request, {}, DefaultSchemas.BIOSAMPLES ) print("Returning Response: {}".format(json.dumps(response))) return 
bundle_response(200, response) - if request.query.requested_granularity == "count": - count = iterated_biosamples + if request.query.requested_granularity == Granularity.COUNT: response = build_beacon_count_response( - {}, count, request, {}, DefaultSchemas.BIOSAMPLES + {}, total_biosamples, request, {}, DefaultSchemas.BIOSAMPLES ) print("Returning Response: {}".format(json.dumps(response))) return bundle_response(200, response) @@ -217,7 +213,7 @@ def route(request: RequestParams, variant_id): biosamples = Biosample.get_by_query(query) if len(queries) > 0 else [] response = build_beacon_resultset_response( jsons.dump(biosamples, strip_privates=True), - len(biosamples), + total_biosamples, request, {}, DefaultSchemas.BIOSAMPLES, diff --git a/lambda/getGenomicVariants/route_g_variants_id_individuals.py b/lambda/getGenomicVariants/route_g_variants_id_individuals.py index b977a90..645aa00 100644 --- a/lambda/getGenomicVariants/route_g_variants_id_individuals.py +++ b/lambda/getGenomicVariants/route_g_variants_id_individuals.py @@ -139,44 +139,38 @@ def route(request: RequestParams, variant_id): dataset_samples_sorted = OrderedDict(sorted(dataset_samples.items())) iterated_individuals = 0 chosen_individuals = 0 + total_individuals = sum([len(sample_names) for sample_names in dataset_samples_sorted.values()]) for dataset_id, sample_names in dataset_samples_sorted.items(): - if (len(sample_names)) > 0: - if request.query.requested_granularity == "count": - # query = get_count_query(dataset_id, sample_names) - # queries.append(query) - # TODO optimise for duplicate individuals - iterated_individuals += len(sample_names) - elif request.query.requested_granularity == Granularity.RECORD: - # TODO optimise for duplicate individuals - chosen_samples = [] - - for sample_name in sample_names: - iterated_individuals += 1 - if ( - iterated_individuals > request.query.pagination.skip - and chosen_individuals < request.query.pagination.limit - ): - chosen_samples.append(sample_name) - chosen_individuals += 1 - - if chosen_individuals == request.query.pagination.limit: - break - if len(chosen_samples) > 0: - query = get_record_query(dataset_id, chosen_samples) - queries.append(query) - - if request.query.requested_granularity == "boolean": + if len(sample_names) > 0 and request.query.requested_granularity == Granularity.RECORD: + # TODO optimise for duplicate individuals + chosen_samples = [] + + for sample_name in sample_names: + iterated_individuals += 1 + if ( + iterated_individuals > request.query.pagination.skip + and chosen_individuals < request.query.pagination.limit + ): + chosen_samples.append(sample_name) + chosen_individuals += 1 + + if chosen_individuals == request.query.pagination.limit: + break + if len(chosen_samples) > 0: + query = get_record_query(dataset_id, chosen_samples) + queries.append(query) + + if request.query.requested_granularity == Granularity.BOOLEAN: response = build_beacon_boolean_response( {}, 1 if exists else 0, request, {}, DefaultSchemas.INDIVIDUALS ) print("Returning Response: {}".format(json.dumps(response))) return bundle_response(200, response) - if request.query.requested_granularity == "count": - count = iterated_individuals + if request.query.requested_granularity == Granularity.COUNT: response = build_beacon_count_response( - {}, count, request, {}, DefaultSchemas.INDIVIDUALS + {}, total_individuals, request, {}, DefaultSchemas.INDIVIDUALS ) print("Returning Response: {}".format(json.dumps(response))) return bundle_response(200, response) @@ -186,7 +180,7 @@ def 
route(request: RequestParams, variant_id): individuals = Individual.get_by_query(query) if len(queries) > 0 else [] response = build_beacon_resultset_response( jsons.dump(individuals, strip_privates=True), - len(individuals), + total_individuals, request, {}, DefaultSchemas.INDIVIDUALS, From 0c55bb997c4d9b214db8c3a41a7cd2c7050f2f9c Mon Sep 17 00:00:00 2001 From: Anuradha Date: Thu, 27 Apr 2023 00:51:47 +0000 Subject: [PATCH 03/16] merge master --- s3.tf | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/s3.tf b/s3.tf index 1e4c749..9092f25 100644 --- a/s3.tf +++ b/s3.tf @@ -7,7 +7,16 @@ resource "aws_s3_bucket" "variants-bucket" { tags = var.common-tags } +resource "aws_s3_bucket_ownership_controls" "variants_bucket_ownership_controls" { + bucket = aws_s3_bucket.variants-bucket.id + rule { + object_ownership = "BucketOwnerPreferred" + } +} + resource "aws_s3_bucket_acl" "variants_bucket_acl" { + depends_on = [aws_s3_bucket_ownership_controls.variants_bucket_ownership_controls] + bucket = aws_s3_bucket.variants-bucket.id acl = "private" } @@ -38,7 +47,16 @@ resource "aws_s3_bucket" "metadata-bucket" { tags = var.common-tags } +resource "aws_s3_bucket_ownership_controls" "metadata_bucket_ownership_controls" { + bucket = aws_s3_bucket.metadata-bucket.id + rule { + object_ownership = "BucketOwnerPreferred" + } +} + resource "aws_s3_bucket_acl" "metadata" { + depends_on = [aws_s3_bucket_ownership_controls.metadata_bucket_ownership_controls] + bucket = aws_s3_bucket.metadata-bucket.id acl = "private" } @@ -82,7 +100,16 @@ resource "aws_s3_bucket" "lambda-layers-bucket" { tags = var.common-tags } +resource "aws_s3_bucket_ownership_controls" "lambda_layers_bucket_ownership_controls" { + bucket = aws_s3_bucket.lambda-layers-bucket.id + rule { + object_ownership = "BucketOwnerPreferred" + } +} + resource "aws_s3_bucket_acl" "lambda-layers" { + depends_on = [aws_s3_bucket_ownership_controls.lambda_layers_bucket_ownership_controls] + bucket = aws_s3_bucket.lambda-layers-bucket.id acl = "private" } From 8af4d1d5118ad8453bdb0513dab56ffaa9a4f141 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Thu, 27 Apr 2023 00:51:47 +0000 Subject: [PATCH 04/16] Added updated s3 ownership controls --- s3.tf | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/s3.tf b/s3.tf index 1e4c749..9092f25 100644 --- a/s3.tf +++ b/s3.tf @@ -7,7 +7,16 @@ resource "aws_s3_bucket" "variants-bucket" { tags = var.common-tags } +resource "aws_s3_bucket_ownership_controls" "variants_bucket_ownership_controls" { + bucket = aws_s3_bucket.variants-bucket.id + rule { + object_ownership = "BucketOwnerPreferred" + } +} + resource "aws_s3_bucket_acl" "variants_bucket_acl" { + depends_on = [aws_s3_bucket_ownership_controls.variants_bucket_ownership_controls] + bucket = aws_s3_bucket.variants-bucket.id acl = "private" } @@ -38,7 +47,16 @@ resource "aws_s3_bucket" "metadata-bucket" { tags = var.common-tags } +resource "aws_s3_bucket_ownership_controls" "metadata_bucket_ownership_controls" { + bucket = aws_s3_bucket.metadata-bucket.id + rule { + object_ownership = "BucketOwnerPreferred" + } +} + resource "aws_s3_bucket_acl" "metadata" { + depends_on = [aws_s3_bucket_ownership_controls.metadata_bucket_ownership_controls] + bucket = aws_s3_bucket.metadata-bucket.id acl = "private" } @@ -82,7 +100,16 @@ resource "aws_s3_bucket" "lambda-layers-bucket" { tags = var.common-tags } +resource "aws_s3_bucket_ownership_controls" "lambda_layers_bucket_ownership_controls" { + bucket =
aws_s3_bucket.lambda-layers-bucket.id + rule { + object_ownership = "BucketOwnerPreferred" + } +} + resource "aws_s3_bucket_acl" "lambda-layers" { + depends_on = [aws_s3_bucket_ownership_controls.lambda_layers_bucket_ownership_controls] + bucket = aws_s3_bucket.lambda-layers-bucket.id acl = "private" } From 4d7292bca400304d0261254b7159fca77dc8d410 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Thu, 27 Apr 2023 01:41:04 +0000 Subject: [PATCH 05/16] capture granularity from query params --- .../python-modules/python/shared/apiutils/requests.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/shared_resources/python-modules/python/shared/apiutils/requests.py b/shared_resources/python-modules/python/shared/apiutils/requests.py index e742cd4..68cd79b 100644 --- a/shared_resources/python-modules/python/shared/apiutils/requests.py +++ b/shared_resources/python-modules/python/shared/apiutils/requests.py @@ -155,6 +155,8 @@ def from_request(self, query_params) -> Self: self.query.pagination.limit = int(v) elif k == "includeResultsetResponses": self.query.include_resultset_responses = IncludeResultsetResponses(v) + elif k == "requestedGranularity": + self.query.requested_granularity = Granularity(v) else: req_params_dict[k] = v if len(req_params_dict): From 3a8d50bcf2f2d35ce1b61628b5eeaa26a86978f3 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Thu, 27 Apr 2023 03:58:19 +0000 Subject: [PATCH 06/16] Bug fix: indexer --- lambda/indexer/lambda_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambda/indexer/lambda_function.py b/lambda/indexer/lambda_function.py index 484e7c7..f4f7336 100644 --- a/lambda/indexer/lambda_function.py +++ b/lambda/indexer/lambda_function.py @@ -146,7 +146,7 @@ def threaded_request_ontoserver(term, url): "compose": { "include": [ { - "system": data["baseUri"], + "system": "http://snomed.info/sct", "filter": [ { "property": "concept", From 1765c1c4f1b036089fff2513e69bcdc3d173d3ff Mon Sep 17 00:00:00 2001 From: Anuradha Date: Thu, 27 Apr 2023 03:58:44 +0000 Subject: [PATCH 07/16] Update readme --- examples/test-data/GUIDE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/test-data/GUIDE.md b/examples/test-data/GUIDE.md index c5d4073..1e98368 100644 --- a/examples/test-data/GUIDE.md +++ b/examples/test-data/GUIDE.md @@ -16,11 +16,11 @@ Now edit the `submission.json` file so that it matches the S3 URI of the `vcf. You can submit the data in two ways. -### Submission as request body +### Option 1: Submission as request body You can simply copy the edited JSON content into the API Gateway `/submit_dataset` POST endpoint. If you're using a REST client, make sure you add authorization headers before you make the request. For example, Postman supports the AWS Signature authorization type, where you can enter your AWS keys. -### Submission as an S3 payload +### Option 2: Submission as an S3 payload Alternatively, you can upload the edited `submission.json` file to an S3 location accessible from the deployment. Then you can use the file's S3 URI as follows in the API Gateway or in your REST client.
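For illustration, Option 2 can also be scripted with boto3 along the following lines; the bucket and key are placeholders, and the only assumption is that the chosen S3 location is readable by the sBeacon deployment. The printed body is then POSTed to `/submit_dataset` exactly as in Option 1.

```python
import json
import boto3

# Placeholders: use a bucket/prefix that the sBeacon deployment can read.
bucket = "<bucket>"
key = "<prefix>/submission.json"

# Upload the edited submission file to S3.
boto3.client("s3").upload_file("submission.json", bucket, key)

# Build the small s3Payload body; the backend fetches the full
# submission from S3 when it receives this request.
payload = {"s3Payload": f"s3://{bucket}/{key}"}
print(json.dumps(payload))
```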
From 03888810f1fe4aa8c4539966ed83eca47b21c444 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Thu, 27 Apr 2023 04:15:14 +0000 Subject: [PATCH 08/16] Update readme --- examples/test-data/GUIDE.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/test-data/GUIDE.md b/examples/test-data/GUIDE.md index 1e98368..a070ccc 100644 --- a/examples/test-data/GUIDE.md +++ b/examples/test-data/GUIDE.md @@ -1,20 +1,20 @@ # Getting started with test data -Please ensure you first upload the `chr1.vcf.gz` and `chr1.vcf.gz.tbi` files to an S3 bucket that is accessible from the sBeacon deployment account. Obtain the S3 URI of the `chr1.vcf.gz` file from the uploaded destination. Note that both `vcf.gz` and `vcf.gz.tbi` files must have the same prefix in S3 for this to work. +Please ensure you first upload the `chr1.vcf.gz` and `chr1.vcf.gz.tbi` files to an S3 bucket that is accessible from the sBeacon deployment account. Obtain the S3 URI of the `chr1.vcf.gz` file from the uploaded destination. Note that both `vcf.gz` and `vcf.gz.tbi` files must have the same prefix in S3 for this to work. Please note that all the buckets you create in AWS must be in the same region as the deployment. -Now edit the `submission.json` file so that it matches the S3 URI of the `vcf.gz` file. +Now edit the `submission.json` using the S3 URI of the `vcf.gz` file. ```json -... +. . . "vcfLocations": [ "s3:////chr1.vcf.gz" ] -... +. . . ``` ## Data submission -You can submit the data in two ways. +You can submit this data in two ways. ### Option 1: Submission as request body @@ -30,7 +30,7 @@ Alternatively, you can upload the edited `submission.json` file to an S3 location ac } ``` -This approach is recommended for larger submissions with thousands of metadata entries. +Option 2 is recommended for larger submissions with thousands of metadata entries.
## API testing From f328a8e64ab45b82b5b91b3ab62dfdb29626c48b Mon Sep 17 00:00:00 2001 From: Anuradha Date: Mon, 8 May 2023 03:57:07 +0000 Subject: [PATCH 09/16] lru caching for ontology terms, refactoring --- dynamodb.tf | 53 ----- iam.tf | 20 -- lambda/indexer/lambda_function.py | 194 ++---------------- main.tf | 1 - .../python/shared/athena/filters.py | 29 +-- .../python/shared/dynamodb/__init__.py | 1 - .../python/shared/dynamodb/onto_index.py | 80 -------- .../python/shared/ontoutils/__init__.py | 162 +++++++++++++++ 8 files changed, 188 insertions(+), 352 deletions(-) delete mode 100644 shared_resources/python-modules/python/shared/dynamodb/onto_index.py create mode 100644 shared_resources/python-modules/python/shared/ontoutils/__init__.py diff --git a/dynamodb.tf b/dynamodb.tf index 8f0bfe9..c3905f4 100644 --- a/dynamodb.tf +++ b/dynamodb.tf @@ -147,56 +147,3 @@ resource "aws_dynamodb_table" "variant_query_responses" { enabled = true } } - -# ontology term index -resource "aws_dynamodb_table" "ontology_terms" { - billing_mode = "PAY_PER_REQUEST" - hash_key = "id" - name = "OntoIndex" - tags = var.common-tags - - # this is the tab concatenated value of - # tableName, columnName, term - # this must not be repeated - attribute { - name = "id" - type = "S" - } - - attribute { - name = "tableName" - type = "S" - } - - attribute { - name = "tableTerms" - type = "S" - } - - attribute { - name = "term" - type = "S" - } - - # be able to query a term - global_secondary_index { - hash_key = "term" - name = "term_index" - projection_type = "ALL" - } - - # be able to query a tableName - global_secondary_index { - hash_key = "tableName" - name = "table_index" - projection_type = "ALL" - } - - # be able to query a terms in a table - # tab concatenated value of table and term - global_secondary_index { - hash_key = "tableTerms" - name = "tableterms_index" - projection_type = "ALL" - } -} \ No newline at end of file diff --git a/iam.tf b/iam.tf index 34a96bf..855322b 100644 --- a/iam.tf +++ b/iam.tf @@ -827,26 +827,6 @@ data "aws_iam_policy_document" "dynamodb-onto-access" { aws_dynamodb_table.anscestor_terms.arn, ] } - - statement { - actions = [ - "dynamodb:Query", - ] - resources = [ - "${aws_dynamodb_table.datasets.arn}/index/*", - "${aws_dynamodb_table.variant_query_responses.arn}/index/*", - "${aws_dynamodb_table.ontology_terms.arn}/index/*", - ] - } - - statement { - actions = [ - "dynamodb:Scan", - ] - resources = [ - aws_dynamodb_table.ontology_terms.arn, - ] - } } # DynamoDB Ontology Related Write Access diff --git a/lambda/indexer/lambda_function.py b/lambda/indexer/lambda_function.py index f4f7336..41d1ad6 100644 --- a/lambda/indexer/lambda_function.py +++ b/lambda/indexer/lambda_function.py @@ -1,16 +1,13 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from collections import defaultdict import threading -import urllib -import json import time -import re from smart_open import open as sopen -import requests import boto3 -from shared.dynamodb import OntoData, Ontology, Descendants, Anscestors +from shared.dynamodb import Descendants, Anscestors +from shared.ontoutils import request_hierarchy from shared.utils import ENV_ATHENA from ctas_queries import QUERY as CTAS_TEMPLATE from generate_query_index import QUERY as INDEX_QUERY @@ -42,36 +39,7 @@ ) -def get_ontology_details(ontology): - details = None - try: - details = Ontology.get(ontology) - except Ontology.DoesNotExist: - if ontology == "SNOMED": - # use ontoserver - details = Ontology(ontology.upper()) - 
details.data = json.dumps( - {"id": "SNOMED", "baseUri": "http://snomed.info/sct"} - ) - details.save() - else: - # use ENSEMBL - if response := requests.get(f"{ENSEMBL_OLS}/{ontology}"): - response_json = response.json() - details = Ontology(ontology.upper()) - details.data = json.dumps( - { - "id": response_json["ontologyId"].upper(), - "baseUri": response_json["config"]["baseUris"][0], - } - ) - details.save() - - # any other error must be raised - return details - - -def get_ontologies_clusters(): +def get_ontologie_terms_in_beacon(): query = f'SELECT DISTINCT term FROM "{ENV_ATHENA.ATHENA_TERMS_TABLE}"' response = athena.start_query_execution( @@ -83,7 +51,7 @@ execution_id = response["QueryExecutionId"] await_result(execution_id) - ontology_clusters = defaultdict(set) + ontology_terms = list() with sopen( f"s3://{ENV_ATHENA.ATHENA_METADATA_BUCKET}/query-results/{execution_id}.csv" @@ -92,125 +60,27 @@ if n == 0: continue term = line.strip().strip('"') - - # beacon API does not allow non CURIE formatted terms - # however, SNOMED appears are non-CURIE prefixed terms - # following is to support that, however API will not ingest - # always submit in form SNOMED:123212 - if re.match(r"(?i)(^SNOMED)|([0-9]+)", term): - ontology = "SNOMED" - ontology_clusters[ontology].add(term) - else: - ontology = term.split(":")[0] - ontology_clusters[ontology].add(term) - - return ontology_clusters + ontology_terms.append(term) + return ontology_terms -# in future, there could be an issue when descendants entries exceed 400KB +# TODO in future, there could be an issue when descendants entries exceed 400KB # which means we would have roughly 20480, 20 byte entries (unlikely?) # this would also mean our SQL queries would reach the 256KB limit # we should be able to easily spread terms across multiple dynamodb # entries and have multiple queries (as recommended by AWS) def index_terms_tree(): - # START subroutines - # subroutine for ensemble - def threaded_request_ensemble(term, url): - if response := requests.get(url): - response_json = response.json() - anscestors = set() - for response_term in response_json["_embedded"]["terms"]: - obo_id = response_term["obo_id"] - if obo_id: - anscestors.add(obo_id) - return (term, anscestors) - else: - print(f"Error fetching term from Ensembl OLS {term}") - - # subroutine for ontoserver - def threaded_request_ontoserver(term, url): - snomed = "SNOMED" in term.upper() - retries = 1 - response = None - while (not response or response.status_code != 200) and retries < 10: - retries += 1 - response = requests.post( - url, - json={ - "resourceType": "Parameters", - "parameter": [ - { - "name": "valueSet", - "resource": { - "resourceType": "ValueSet", - "compose": { - "include": [ - { - "system": "http://snomed.info/sct", - "filter": [ - { - "property": "concept", - "op": "generalizes", - "value": f"{term.replace('SNOMED:', '')}", - } - ], - } - ] - }, - }, - } - ], - }, - ) - if response.status_code == 200: - response_json = response.json() - anscestors = set() - for response_term in response_json["expansion"]["contains"]: - anscestors.add( - "SNOMED:" + response_term["code"] - if snomed - else response_term["code"] - ) - return (term, anscestors) - else: - time.sleep(1) - - if response.status_code != 200: - print(f"Error fetching term from Ontoserver {term}") - - # END subroutines - - ontology_clusters = get_ontologies_clusters() + terms_in_beacon = get_ontologie_terms_in_beacon() executor =
ThreadPoolExecutor(500) futures = [] - for ontology, terms in ontology_clusters.items(): - if ontology == "SNOMED": - for term in terms: - # fetch only anscestors that aren't fetched yet - try: - data = Anscestors.get(term) - except Anscestors.DoesNotExist: - futures.append( - executor.submit(threaded_request_ontoserver, term, ONTOSERVER) - ) - else: - for term in terms: - # fetch only anscestors that aren't fetched yet - try: - data = Anscestors.get(term) - except Anscestors.DoesNotExist: - # details will be missing if the ontology info is not in OLS - if details := get_ontology_details(ontology): - data = json.loads(details.data) - iri = data["baseUri"] + term.split(":")[1] - iri_double_encoded = urllib.parse.quote_plus( - urllib.parse.quote_plus(iri) - ) - url = f"{ENSEMBL_OLS}/{ontology}/terms/{iri_double_encoded}/hierarchicalAncestors" - futures.append( - executor.submit(threaded_request_ensemble, term, url) - ) + for term in terms_in_beacon: + try: + Anscestors.get(term) + except Anscestors.DoesNotExist: + futures.append(executor.submit(request_hierarchy, term, True)) + + # record ancestors term_anscestors = defaultdict(set) for future in as_completed(futures): @@ -219,17 +89,21 @@ def threaded_request_ontoserver(term, url): term_anscestors[term].update(ancestors) term_anscestors[term].add(term) + # reverse the tree for descendant term search term_descendants = defaultdict(set) + # write ancestors with Anscestors.batch_write() as batch: for term, anscestors in term_anscestors.items(): item = Anscestors(term) item.anscestors = anscestors batch.save(item) + # record descendants for anscestor in anscestors: term_descendants[anscestor].add(term) + # write descendants with Descendants.batch_write() as batch: for term, descendants in term_descendants.items(): # if descendants are recorded, just update, else make record @@ -353,36 +227,6 @@ def record_relations(): await_result(response["QueryExecutionId"]) -# TODO re-evaluate the following function remove or useful?
-def onto_index(): - response = athena.start_query_execution( - QueryString=ONTO_TERMS_QUERY, - QueryExecutionContext={"Database": ENV_ATHENA.ATHENA_METADATA_DATABASE}, - WorkGroup=ENV_ATHENA.ATHENA_WORKGROUP, - ) - execution_id = response["QueryExecutionId"] - await_result(execution_id) - - with sopen( - f"s3://{ENV_ATHENA.ATHENA_METADATA_BUCKET}/query-results/{execution_id}.csv" - ) as s3f: - for n, line in enumerate(s3f): - if n == 0: - continue - term, tablename, colname, type, label = [ - item.strip('"') for item in line.strip().split(",") - ] - entry = OntoData.make_index_entry( - term=term, - tableName=tablename, - columnName=colname, - type=type, - label=label, - ) - entry.save() - return - - def lambda_handler(event, context): # CTAS this must finish before all threads = [] diff --git a/main.tf b/main.tf index b16a97c..e5333d8 100644 --- a/main.tf +++ b/main.tf @@ -77,7 +77,6 @@ locals { DYNAMO_ONTOLOGIES_TABLE = aws_dynamodb_table.ontologies.name DYNAMO_ANSCESTORS_TABLE = aws_dynamodb_table.anscestor_terms.name DYNAMO_DESCENDANTS_TABLE = aws_dynamodb_table.descendant_terms.name - DYNAMO_ONTO_INDEX_TABLE = aws_dynamodb_table.ontology_terms.name } # layers binaries_layer = "${aws_lambda_layer_version.binaries_layer.layer_arn}:${aws_lambda_layer_version.binaries_layer.version}" diff --git a/shared_resources/python-modules/python/shared/athena/filters.py b/shared_resources/python-modules/python/shared/athena/filters.py index 9c02326..97a88d5 100644 --- a/shared_resources/python-modules/python/shared/athena/filters.py +++ b/shared_resources/python-modules/python/shared/athena/filters.py @@ -6,7 +6,10 @@ from .dataset import Dataset from .cohort import Cohort from .run import Run -from shared.dynamodb import Descendants, Anscestors +from shared.ontoutils import ( + get_term_ancestors_in_beacon, + get_term_descendants_in_beacon, +) from shared.utils import ENV_ATHENA from shared.apiutils import ( OntologyFilter, @@ -48,24 +51,6 @@ def _get_comparison_operator(filter: Union[AlphanumericFilter, OntologyFilter]): return "LIKE" if filter.operator == Operator.EQUAL else "NOT LIKE" -def _get_term_ancestors(term): - terms = set() - try: - terms.update(Anscestors.get(term).anscestors) - except Anscestors.DoesNotExist: - terms.add(term) - return terms - - -def _get_term_descendants(term: str): - terms = set() - try: - terms.update(Descendants.get(term).descendants) - except Descendants.DoesNotExist: - terms.add(term) - return terms - - def entity_search_conditions( filters: List[Union[OntologyFilter, AlphanumericFilter, CustomFilter]], id_type: str, @@ -117,12 +102,12 @@ def entity_search_conditions( if f.include_descendant_terms: # process inclusion of term descendants dependant on 'similarity' if f.similarity in (Similarity.HIGH or Similarity.EXACT): - expanded_terms = _get_term_descendants(f.id) + expanded_terms = get_term_descendants_in_beacon(f.id) else: # NOTE: this simplistic similarity method not nessisarily efficient or nessisarily desirable - ancestors = _get_term_ancestors(f.id) + ancestors = get_term_ancestors_in_beacon(f.id) ancestor_descendants = sorted( - [_get_term_descendants(a) for a in ancestors], key=len + [get_term_descendants_in_beacon(a) for a in ancestors], key=len ) if f.similarity == Similarity.MEDIUM: # all terms which have an ancestor half way up diff --git a/shared_resources/python-modules/python/shared/dynamodb/__init__.py b/shared_resources/python-modules/python/shared/dynamodb/__init__.py index d1de649..9873199 100644 --- 
a/shared_resources/python-modules/python/shared/dynamodb/__init__.py +++ b/shared_resources/python-modules/python/shared/dynamodb/__init__.py @@ -1,4 +1,3 @@ from .datasets import Dataset, VcfChromosomeMap -from .onto_index import OntoData, TableIndex, TableTermsIndex, TermIndex from .ontologies import Anscestors, Descendants, Ontology from .variant_queries import VariantQuery, VariantResponse, VariantResponseIndex, S3Location diff --git a/shared_resources/python-modules/python/shared/dynamodb/onto_index.py b/shared_resources/python-modules/python/shared/dynamodb/onto_index.py deleted file mode 100644 index 2b031ac..0000000 --- a/shared_resources/python-modules/python/shared/dynamodb/onto_index.py +++ /dev/null @@ -1,80 +0,0 @@ -import boto3 -from pynamodb.models import Model -from pynamodb.indexes import GlobalSecondaryIndex, AllProjection -from pynamodb.attributes import UnicodeAttribute - -from shared.utils import ENV_DYNAMO - - -SESSION = boto3.session.Session() -REGION = SESSION.region_name - - -# Terms index -class TermIndex(GlobalSecondaryIndex): - class Meta: - index_name = "term_index" - projection = AllProjection() - billing_mode = "PAY_PER_REQUEST" - region = REGION - - term = UnicodeAttribute(hash_key=True) - - -# TableNames -class TableIndex(GlobalSecondaryIndex): - class Meta: - index_name = "table_index" - projection = AllProjection() - billing_mode = "PAY_PER_REQUEST" - region = REGION - - tableName = UnicodeAttribute(hash_key=True) - - -# TableNames -class TableTermsIndex(GlobalSecondaryIndex): - class Meta: - index_name = "tableterms_index" - projection = AllProjection() - billing_mode = "PAY_PER_REQUEST" - region = REGION - - tableTerms = UnicodeAttribute(hash_key=True) - - -# ontoIndex table -class OntoData(Model): - class Meta: - table_name = ENV_DYNAMO.DYNAMO_ONTO_INDEX_TABLE - region = REGION - - id = UnicodeAttribute(hash_key=True) - tableTerms = UnicodeAttribute() - tableName = UnicodeAttribute() - columnName = UnicodeAttribute() - term = UnicodeAttribute() - label = UnicodeAttribute() - type = UnicodeAttribute() - - termIndex = TermIndex() - tableIndex = TableIndex() - tableTermsIndex = TableTermsIndex() - - @classmethod - def make_index_entry(cls, tableName, columnName, term, label, type): - id = f"{tableName}\t{columnName}\t{term}" - tableTerms = f"{tableName}\t{term}" - entry = OntoData(hash_key=id) - entry.tableName = tableName - entry.tableTerms = tableTerms - entry.columnName = columnName - entry.term = term - entry.label = label - entry.type = type - - return entry - - -if __name__ == "__main__": - pass diff --git a/shared_resources/python-modules/python/shared/ontoutils/__init__.py b/shared_resources/python-modules/python/shared/ontoutils/__init__.py new file mode 100644 index 0000000..cb70bad --- /dev/null +++ b/shared_resources/python-modules/python/shared/ontoutils/__init__.py @@ -0,0 +1,162 @@ +import json +import time +import urllib +from functools import lru_cache + +import requests + +from shared.dynamodb import Ontology, Descendants, Anscestors + + +ENSEMBL_OLS = "https://www.ebi.ac.uk/ols/api/ontologies" +ONTOSERVER = "https://r4.ontoserver.csiro.au/fhir/ValueSet/$expand" + + +@lru_cache() +def get_term_ancestors_in_beacon(term): + terms = set() + try: + terms.update(Anscestors.get(term).anscestors) + except Anscestors.DoesNotExist: + terms.add(term) + return terms + + +@lru_cache() +def get_term_descendants_in_beacon(term: str): + terms = set() + try: + terms.update(Descendants.get(term).descendants) + except Descendants.DoesNotExist: + 
terms.add(term) + return terms + + +@lru_cache() +def get_term_all_ancestors(term: str): + term, ancestors = request_hierarchy(term, True) + ancestors.add(term) + + return ancestors + + +@lru_cache() +def get_term_all_descendants(term: str): + term, descendants = request_hierarchy(term, False) + descendants.add(term) + + return descendants + + +@lru_cache() +def get_ontology_details(ontology) -> Ontology: + details = None + try: + details = Ontology.get(ontology) + except Ontology.DoesNotExist: + if ontology == "SNOMED": + # use ontoserver + details = Ontology(ontology.upper()) + details.data = json.dumps( + {"id": "SNOMED", "baseUri": "http://snomed.info/sct"} + ) + details.save() + else: + # use ENSEMBL + if response := requests.get(f"{ENSEMBL_OLS}/{ontology}"): + response_json = response.json() + details = Ontology(ontology.upper()) + details.data = json.dumps( + { + "id": response_json["ontologyId"].upper(), + "baseUri": response_json["config"]["baseUris"][0], + } + ) + details.save() + + return details + + +@lru_cache() +def request_ontoserver_hierarchy(term: str, ancestors=True): + snomed = "SNOMED" in term.upper() + retries = 1 + response = None + while (not response or response.status_code != 200) and retries < 10: + retries += 1 + response = requests.post( + ONTOSERVER, + json={ + "resourceType": "Parameters", + "parameter": [ + { + "name": "valueSet", + "resource": { + "resourceType": "ValueSet", + "compose": { + "include": [ + { + "system": "http://snomed.info/sct", + "filter": [ + { + "property": "concept", + "op": "generalizes" + if ancestors + else "descendent-of", + "value": f"{term.replace('SNOMED:', '')}", + } + ], + } + ] + }, + }, + } + ], + }, + ) + if response.status_code == 200: + response_json = response.json() + members = set() + for response_term in response_json["expansion"]["contains"]: + members.add( + "SNOMED:" + response_term["code"] + if snomed + else response_term["code"] + ) + return (term, members) + else: + time.sleep(1) + + raise Exception(f"Error fetching term from Ontoserver {term}") + + +@lru_cache() +def request_ensembl_hierarchy(term: str, ancestors=True): + ontology, code = term.split(":") + details = get_ontology_details(ontology) + # if no details available, it is probably not an ontology term + if not details: + return (term, set()) + + data = json.loads(details.data) + iri = data["baseUri"] + code + iri_double_encoded = urllib.parse.quote_plus(urllib.parse.quote_plus(iri)) + url = f"{ENSEMBL_OLS}/{ontology}/terms/{iri_double_encoded}/{'hierarchicalAncestors' if ancestors else 'hierarchicalDescendants'}" + + if response := requests.get(url): + response_json = response.json() + members = set() + for response_term in response_json["_embedded"]["terms"]: + obo_id = response_term["obo_id"] + if obo_id: + members.add(obo_id) + return (term, members) + + raise Exception(f"Error fetching term from Ensembl OLS {term}") + + +@lru_cache() +def request_hierarchy(term, ancestors): + if term.startswith("SNOMED"): + return request_ontoserver_hierarchy(term, ancestors) + return request_ensembl_hierarchy(term, ancestors) From fc178351570aef07b1ec199761c1550db419a785 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Mon, 8 May 2023 13:30:50 +0930 Subject: [PATCH 10/16] Refactoring and performance improvements (#71) * Update readme * lru caching for ontology terms, refactoring --- dynamodb.tf | 53 ----- examples/test-data/GUIDE.md | 12 +- iam.tf | 20 -- lambda/indexer/lambda_function.py | 194 ++---------------- main.tf | 1 - .../python/shared/athena/filters.py | 29 
+-- .../python/shared/dynamodb/__init__.py | 1 - .../python/shared/dynamodb/onto_index.py | 80 -------- .../python/shared/ontoutils/__init__.py | 162 +++++++++++++++ 9 files changed, 194 insertions(+), 358 deletions(-) delete mode 100644 shared_resources/python-modules/python/shared/dynamodb/onto_index.py create mode 100644 shared_resources/python-modules/python/shared/ontoutils/__init__.py diff --git a/dynamodb.tf b/dynamodb.tf index 8f0bfe9..c3905f4 100644 --- a/dynamodb.tf +++ b/dynamodb.tf @@ -147,56 +147,3 @@ resource "aws_dynamodb_table" "variant_query_responses" { enabled = true } } - -# ontology term index -resource "aws_dynamodb_table" "ontology_terms" { - billing_mode = "PAY_PER_REQUEST" - hash_key = "id" - name = "OntoIndex" - tags = var.common-tags - - # this is the tab concatenated value of - # tableName, columnName, term - # this must not be repeated - attribute { - name = "id" - type = "S" - } - - attribute { - name = "tableName" - type = "S" - } - - attribute { - name = "tableTerms" - type = "S" - } - - attribute { - name = "term" - type = "S" - } - - # be able to query a term - global_secondary_index { - hash_key = "term" - name = "term_index" - projection_type = "ALL" - } - - # be able to query a tableName - global_secondary_index { - hash_key = "tableName" - name = "table_index" - projection_type = "ALL" - } - - # be able to query a terms in a table - # tab concatenated value of table and term - global_secondary_index { - hash_key = "tableTerms" - name = "tableterms_index" - projection_type = "ALL" - } -} \ No newline at end of file diff --git a/examples/test-data/GUIDE.md b/examples/test-data/GUIDE.md index 1e98368..a070ccc 100644 --- a/examples/test-data/GUIDE.md +++ b/examples/test-data/GUIDE.md @@ -1,20 +1,20 @@ # Getting started with test data -Please ensure you first upload the `chr1.vcf.gz` and `chr1.vcf.gz.tbi` files to an S3 bucket that is accessible from the sBeacon deployment account. Obtain the S3 URI for the `chr1.vcf.gz` from the uploaded desitation. Note that, both `vcf.gz` and `vcf.gz.tbi` files must have the same prefix in S3 for this to work. +Please ensure you first upload the `chr1.vcf.gz` and `chr1.vcf.gz.tbi` files to an S3 bucket that is accessible from the sBeacon deployment account. Obtain the S3 URI for the `chr1.vcf.gz` from the uploaded desitation. Note that, both `vcf.gz` and `vcf.gz.tbi` files must have the same prefix in S3 for this to work. Please note that, all the buckets you create in AWS are in the same region as the deployment. -Now edit the `submission.json` file such that they match the S3 URI of the `vcf.gz` file. +Now edit the `submission.json` using the S3 URI of the `vcf.gz` file. ```json -... +. . . "vcfLocations": [ "s3:////chr1.vcf.gz" ] -... +. . . ``` ## Data submission -You can submit the data in two ways. +You can submit this data in two ways. ### Option 1: Submission as request body @@ -30,7 +30,7 @@ Alternatively, you can upload edited `submission.json` file to an S3 location ac } ``` -This approach is recommended for larger submissions with thousands of metadata entries. +Option 2 is recommended for larger submissions with thousands of metadata entries. 
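+
+For instance, with AWS credentials available to `boto3`, the `s3Payload` request can be signed and sent as in the following minimal Python sketch (the endpoint URL, stage name, and bucket path are placeholders, not part of this repository):
+
+```python
+import json
+
+import boto3
+import requests
+from botocore.auth import SigV4Auth
+from botocore.awsrequest import AWSRequest
+
+# Hypothetical values; replace with your deployment's /submit URL and bucket
+API_URL = "https://<api-id>.execute-api.<region>.amazonaws.com/prod/submit"
+payload = json.dumps({"s3Payload": "s3://<bucket>/<prefix>/submission.json"})
+
+session = boto3.Session()
+request = AWSRequest(
+    method="POST",
+    url=API_URL,
+    data=payload,
+    headers={"Content-Type": "application/json"},
+)
+# Sign the request for API Gateway (SigV4 service name "execute-api")
+SigV4Auth(session.get_credentials(), "execute-api", session.region_name).add_auth(request)
+
+response = requests.post(API_URL, data=payload, headers=dict(request.headers))
+print(response.status_code, response.text)
+```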
## API testing diff --git a/iam.tf b/iam.tf index 34a96bf..855322b 100644 --- a/iam.tf +++ b/iam.tf @@ -827,26 +827,6 @@ data "aws_iam_policy_document" "dynamodb-onto-access" { aws_dynamodb_table.anscestor_terms.arn, ] } - - statement { - actions = [ - "dynamodb:Query", - ] - resources = [ - "${aws_dynamodb_table.datasets.arn}/index/*", - "${aws_dynamodb_table.variant_query_responses.arn}/index/*", - "${aws_dynamodb_table.ontology_terms.arn}/index/*", - ] - } - - statement { - actions = [ - "dynamodb:Scan", - ] - resources = [ - aws_dynamodb_table.ontology_terms.arn, - ] - } } # DynamoDB Ontology Related Write Access diff --git a/lambda/indexer/lambda_function.py b/lambda/indexer/lambda_function.py index f4f7336..41d1ad6 100644 --- a/lambda/indexer/lambda_function.py +++ b/lambda/indexer/lambda_function.py @@ -1,16 +1,13 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from collections import defaultdict import threading -import urllib -import json import time -import re from smart_open import open as sopen -import requests import boto3 -from shared.dynamodb import OntoData, Ontology, Descendants, Anscestors +from shared.dynamodb import Descendants, Anscestors +from shared.ontoutils import request_hierarchy from shared.utils import ENV_ATHENA from ctas_queries import QUERY as CTAS_TEMPLATE from generate_query_index import QUERY as INDEX_QUERY @@ -42,36 +39,7 @@ ) -def get_ontology_details(ontology): - details = None - try: - details = Ontology.get(ontology) - except Ontology.DoesNotExist: - if ontology == "SNOMED": - # use ontoserver - details = Ontology(ontology.upper()) - details.data = json.dumps( - {"id": "SNOMED", "baseUri": "http://snomed.info/sct"} - ) - details.save() - else: - # use ENSEMBL - if response := requests.get(f"{ENSEMBL_OLS}/{ontology}"): - response_json = response.json() - details = Ontology(ontology.upper()) - details.data = json.dumps( - { - "id": response_json["ontologyId"].upper(), - "baseUri": response_json["config"]["baseUris"][0], - } - ) - details.save() - - # any other error must be raised - return details - - -def get_ontologies_clusters(): +def get_ontologie_terms_in_beacon(): query = f'SELECT DISTINCT term FROM "{ENV_ATHENA.ATHENA_TERMS_TABLE}"' response = athena.start_query_execution( @@ -83,7 +51,7 @@ def get_ontologies_clusters(): execution_id = response["QueryExecutionId"] await_result(execution_id) - ontology_clusters = defaultdict(set) + ontology_terms = list() with sopen( f"s3://{ENV_ATHENA.ATHENA_METADATA_BUCKET}/query-results/{execution_id}.csv" @@ -92,125 +60,27 @@ def get_ontologies_clusters(): if n == 0: continue term = line.strip().strip('"') - - # beacon API does not allow non CURIE formatted terms - # however, SNOMED appears are non-CURIE prefixed terms - # following is to support that, however API will not ingest - # always submit in form SNOMED:123212 - if re.match(r"(?i)(^SNOMED)|([0-9]+)", term): - ontology = "SNOMED" - ontology_clusters[ontology].add(term) - else: - ontology = term.split(":")[0] - ontology_clusters[ontology].add(term) - - return ontology_clusters + ontology_terms.append(term) + return ontology_terms -# in future, there could be an issue when descendants entries exceed 400KB +# TODO in future, there could be an issue when descendants entries exceed 400KB # which means we would have roughtly 20480, 20 byte entries (unlikely?) 
# this would also mean, our SQL queries would reach the 256KB limit # we should be able to easily spread terms across multiple dynamodb # entries and have multiple queries (as recommended by AWS) def index_terms_tree(): - # START subroutines - # subroutine for ensemble - def threaded_request_ensemble(term, url): - if response := requests.get(url): - response_json = response.json() - anscestors = set() - for response_term in response_json["_embedded"]["terms"]: - obo_id = response_term["obo_id"] - if obo_id: - anscestors.add(obo_id) - return (term, anscestors) - else: - print(f"Error fetching term from Ensembl OLS {term}") - - # subroutine for ontoserver - def threaded_request_ontoserver(term, url): - snomed = "SNOMED" in term.upper() - retries = 1 - response = None - while (not response or response.status_code != 200) and retries < 10: - retries += 1 - response = requests.post( - url, - json={ - "resourceType": "Parameters", - "parameter": [ - { - "name": "valueSet", - "resource": { - "resourceType": "ValueSet", - "compose": { - "include": [ - { - "system": "http://snomed.info/sct", - "filter": [ - { - "property": "concept", - "op": "generalizes", - "value": f"{term.replace('SNOMED:', '')}", - } - ], - } - ] - }, - }, - } - ], - }, - ) - if response.status_code == 200: - response_json = response.json() - anscestors = set() - for response_term in response_json["expansion"]["contains"]: - anscestors.add( - "SNOMED:" + response_term["code"] - if snomed - else response_term["code"] - ) - return (term, anscestors) - else: - time.sleep(1) - - if response.status_code != 200: - print(f"Error fetching term from Ontoserver {term}") - - # END subroutines - - ontology_clusters = get_ontologies_clusters() + terms_in_beacon = get_ontologie_terms_in_beacon() executor = ThreadPoolExecutor(500) futures = [] - for ontology, terms in ontology_clusters.items(): - if ontology == "SNOMED": - for term in terms: - # fetch only anscestors that aren't fetched yet - try: - data = Anscestors.get(term) - except Anscestors.DoesNotExist: - futures.append( - executor.submit(threaded_request_ontoserver, term, ONTOSERVER) - ) - else: - for term in terms: - # fetch only anscestors that aren't fetched yet - try: - data = Anscestors.get(term) - except Anscestors.DoesNotExist: - # details will be missing if the ontology info is not in OLS - if details := get_ontology_details(ontology): - data = json.loads(details.data) - iri = data["baseUri"] + term.split(":")[1] - iri_double_encoded = urllib.parse.quote_plus( - urllib.parse.quote_plus(iri) - ) - url = f"{ENSEMBL_OLS}/{ontology}/terms/{iri_double_encoded}/hierarchicalAncestors" - futures.append( - executor.submit(threaded_request_ensemble, term, url) - ) + for term in terms_in_beacon: + try: + Anscestors.get(term) + except Anscestors.DoesNotExist: + futures.append(executor.submit(request_hierarchy, term, True)) + + # record ancestors term_anscestors = defaultdict(set) for future in as_completed(futures): @@ -219,17 +89,21 @@ def threaded_request_ontoserver(term, url): term_anscestors[term].update(ancestors) term_anscestors[term].add(term) + # reverse the tree for descendent term search term_descendants = defaultdict(set) + # write ancestors with Anscestors.batch_write() as batch: for term, anscestors in term_anscestors.items(): item = Anscestors(term) item.anscestors = anscestors batch.save(item) + # record descendents for anscestor in anscestors: term_descendants[anscestor].add(term) + # write descendents with Descendants.batch_write() as batch: for term, descendants in 
term_descendants.items(): # if descendants are recorded, just update, else make record @@ -353,36 +227,6 @@ def record_relations(): await_result(response["QueryExecutionId"]) -# TODO re-evaluate the following function remove or useful? -def onto_index(): - response = athena.start_query_execution( - QueryString=ONTO_TERMS_QUERY, - QueryExecutionContext={"Database": ENV_ATHENA.ATHENA_METADATA_DATABASE}, - WorkGroup=ENV_ATHENA.ATHENA_WORKGROUP, - ) - execution_id = response["QueryExecutionId"] - await_result(execution_id) - - with sopen( - f"s3://{ENV_ATHENA.ATHENA_METADATA_BUCKET}/query-results/{execution_id}.csv" - ) as s3f: - for n, line in enumerate(s3f): - if n == 0: - continue - term, tablename, colname, type, label = [ - item.strip('"') for item in line.strip().split(",") - ] - entry = OntoData.make_index_entry( - term=term, - tableName=tablename, - columnName=colname, - type=type, - label=label, - ) - entry.save() - return - - def lambda_handler(event, context): # CTAS this must finish before all threads = [] diff --git a/main.tf b/main.tf index b16a97c..e5333d8 100644 --- a/main.tf +++ b/main.tf @@ -77,7 +77,6 @@ locals { DYNAMO_ONTOLOGIES_TABLE = aws_dynamodb_table.ontologies.name DYNAMO_ANSCESTORS_TABLE = aws_dynamodb_table.anscestor_terms.name DYNAMO_DESCENDANTS_TABLE = aws_dynamodb_table.descendant_terms.name - DYNAMO_ONTO_INDEX_TABLE = aws_dynamodb_table.ontology_terms.name } # layers binaries_layer = "${aws_lambda_layer_version.binaries_layer.layer_arn}:${aws_lambda_layer_version.binaries_layer.version}" diff --git a/shared_resources/python-modules/python/shared/athena/filters.py b/shared_resources/python-modules/python/shared/athena/filters.py index 9c02326..97a88d5 100644 --- a/shared_resources/python-modules/python/shared/athena/filters.py +++ b/shared_resources/python-modules/python/shared/athena/filters.py @@ -6,7 +6,10 @@ from .dataset import Dataset from .cohort import Cohort from .run import Run -from shared.dynamodb import Descendants, Anscestors +from shared.ontoutils import ( + get_term_ancestors_in_beacon, + get_term_descendants_in_beacon, +) from shared.utils import ENV_ATHENA from shared.apiutils import ( OntologyFilter, @@ -48,24 +51,6 @@ def _get_comparison_operator(filter: Union[AlphanumericFilter, OntologyFilter]): return "LIKE" if filter.operator == Operator.EQUAL else "NOT LIKE" -def _get_term_ancestors(term): - terms = set() - try: - terms.update(Anscestors.get(term).anscestors) - except Anscestors.DoesNotExist: - terms.add(term) - return terms - - -def _get_term_descendants(term: str): - terms = set() - try: - terms.update(Descendants.get(term).descendants) - except Descendants.DoesNotExist: - terms.add(term) - return terms - - def entity_search_conditions( filters: List[Union[OntologyFilter, AlphanumericFilter, CustomFilter]], id_type: str, @@ -117,12 +102,12 @@ def entity_search_conditions( if f.include_descendant_terms: # process inclusion of term descendants dependant on 'similarity' if f.similarity in (Similarity.HIGH or Similarity.EXACT): - expanded_terms = _get_term_descendants(f.id) + expanded_terms = get_term_descendants_in_beacon(f.id) else: # NOTE: this simplistic similarity method not nessisarily efficient or nessisarily desirable - ancestors = _get_term_ancestors(f.id) + ancestors = get_term_ancestors_in_beacon(f.id) ancestor_descendants = sorted( - [_get_term_descendants(a) for a in ancestors], key=len + [get_term_descendants_in_beacon(a) for a in ancestors], key=len ) if f.similarity == Similarity.MEDIUM: # all terms which have an ancestor 
half way up diff --git a/shared_resources/python-modules/python/shared/dynamodb/__init__.py b/shared_resources/python-modules/python/shared/dynamodb/__init__.py index d1de649..9873199 100644 --- a/shared_resources/python-modules/python/shared/dynamodb/__init__.py +++ b/shared_resources/python-modules/python/shared/dynamodb/__init__.py @@ -1,4 +1,3 @@ from .datasets import Dataset, VcfChromosomeMap -from .onto_index import OntoData, TableIndex, TableTermsIndex, TermIndex from .ontologies import Anscestors, Descendants, Ontology from .variant_queries import VariantQuery, VariantResponse, VariantResponseIndex, S3Location diff --git a/shared_resources/python-modules/python/shared/dynamodb/onto_index.py b/shared_resources/python-modules/python/shared/dynamodb/onto_index.py deleted file mode 100644 index 2b031ac..0000000 --- a/shared_resources/python-modules/python/shared/dynamodb/onto_index.py +++ /dev/null @@ -1,80 +0,0 @@ -import boto3 -from pynamodb.models import Model -from pynamodb.indexes import GlobalSecondaryIndex, AllProjection -from pynamodb.attributes import UnicodeAttribute - -from shared.utils import ENV_DYNAMO - - -SESSION = boto3.session.Session() -REGION = SESSION.region_name - - -# Terms index -class TermIndex(GlobalSecondaryIndex): - class Meta: - index_name = "term_index" - projection = AllProjection() - billing_mode = "PAY_PER_REQUEST" - region = REGION - - term = UnicodeAttribute(hash_key=True) - - -# TableNames -class TableIndex(GlobalSecondaryIndex): - class Meta: - index_name = "table_index" - projection = AllProjection() - billing_mode = "PAY_PER_REQUEST" - region = REGION - - tableName = UnicodeAttribute(hash_key=True) - - -# TableNames -class TableTermsIndex(GlobalSecondaryIndex): - class Meta: - index_name = "tableterms_index" - projection = AllProjection() - billing_mode = "PAY_PER_REQUEST" - region = REGION - - tableTerms = UnicodeAttribute(hash_key=True) - - -# ontoIndex table -class OntoData(Model): - class Meta: - table_name = ENV_DYNAMO.DYNAMO_ONTO_INDEX_TABLE - region = REGION - - id = UnicodeAttribute(hash_key=True) - tableTerms = UnicodeAttribute() - tableName = UnicodeAttribute() - columnName = UnicodeAttribute() - term = UnicodeAttribute() - label = UnicodeAttribute() - type = UnicodeAttribute() - - termIndex = TermIndex() - tableIndex = TableIndex() - tableTermsIndex = TableTermsIndex() - - @classmethod - def make_index_entry(cls, tableName, columnName, term, label, type): - id = f"{tableName}\t{columnName}\t{term}" - tableTerms = f"{tableName}\t{term}" - entry = OntoData(hash_key=id) - entry.tableName = tableName - entry.tableTerms = tableTerms - entry.columnName = columnName - entry.term = term - entry.label = label - entry.type = type - - return entry - - -if __name__ == "__main__": - pass diff --git a/shared_resources/python-modules/python/shared/ontoutils/__init__.py b/shared_resources/python-modules/python/shared/ontoutils/__init__.py new file mode 100644 index 0000000..cb70bad --- /dev/null +++ b/shared_resources/python-modules/python/shared/ontoutils/__init__.py @@ -0,0 +1,162 @@ +import json +import time +import urllib +from functools import lru_cache + +import requests + +from shared.dynamodb import Ontology, Descendants, Anscestors + + +ENSEMBL_OLS = "https://www.ebi.ac.uk/ols/api/ontologies" +ONTOSERVER = "https://r4.ontoserver.csiro.au/fhir/ValueSet/$expand" + + +@lru_cache() +def get_term_ancestors_in_beacon(term): + terms = set() + try: + terms.update(Anscestors.get(term).anscestors) + except Anscestors.DoesNotExist: + terms.add(term) + 
return terms + + +@lru_cache() +def get_term_descendants_in_beacon(term: str): + terms = set() + try: + terms.update(Descendants.get(term).descendants) + except Descendants.DoesNotExist: + terms.add(term) + return terms + + +@lru_cache() +def get_term_all_ancestors(term: str): + term, ancestors = request_hierarchy(term, True) + ancestors.add(term) + + return ancestors + + +@lru_cache() +def get_term_all_descendants(term: str): + term, descendants = request_hierarchy(term, False) + descendants.add(term) + + return descendants + + +@lru_cache() +def get_ontology_details(ontology) -> Ontology: + details = None + try: + details = Ontology.get(ontology) + except Ontology.DoesNotExist: + if ontology == "SNOMED": + # use ontoserver + details = Ontology(ontology.upper()) + details.data = json.dumps( + {"id": "SNOMED", "baseUri": "http://snomed.info/sct"} + ) + details.save() + else: + # use ENSEMBL + if response := requests.get(f"{ENSEMBL_OLS}/{ontology}"): + response_json = response.json() + details = Ontology(ontology.upper()) + details.data = json.dumps( + { + "id": response_json["ontologyId"].upper(), + "baseUri": response_json["config"]["baseUris"][0], + } + ) + details.save() + + return details + + +@lru_cache() +def request_ontoserver_hierarchy(term: str, ancestors=True): + snomed = "SNOMED" in term.upper() + retries = 1 + response = None + while (not response or response.status_code != 200) and retries < 10: + retries += 1 + response = requests.post( + ONTOSERVER, + json={ + "resourceType": "Parameters", + "parameter": [ + { + "name": "valueSet", + "resource": { + "resourceType": "ValueSet", + "compose": { + "include": [ + { + "system": "http://snomed.info/sct", + "filter": [ + { + "property": "concept", + "op": "generalizes" + if ancestors + else "descendent-of", + "value": f"{term.replace('SNOMED:', '')}", + } + ], + } + ] + }, + }, + } + ], + }, + ) + if response.status_code == 200: + response_json = response.json() + members = set() + for response_term in response_json["expansion"]["contains"]: + members.add( + "SNOMED:" + response_term["code"] + if snomed + else response_term["code"] + ) + return (term, members) + else: + time.sleep(1) + + raise Exception(f"Error fetching term from Ontoserver {term}") + + +@lru_cache() +def request_ensembl_hierarchy(term: str, ancestors=True): + ontology, code = term.split(":") + details = get_ontology_details(ontology) + # if no details available, it is probably not an ontology term + if not details: + return (term, set()) + + data = json.loads(details.data) + iri = data["baseUri"] + code + iri_double_encoded = urllib.parse.quote_plus(urllib.parse.quote_plus(iri)) + url = f"{ENSEMBL_OLS}/{ontology}/terms/{iri_double_encoded}/{'hierarchicalAncestors' if ancestors else 'hierarchicalDescendants'}" + + if response := requests.get(url): + response_json = response.json() + members = set() + for response_term in response_json["_embedded"]["terms"]: + obo_id = response_term["obo_id"] + if obo_id: + members.add(obo_id) + return (term, members) + + raise Exception(f"Error fetching term from Ensembl OLS {term}") + + +@lru_cache() +def request_hierarchy(term, ancestors): + if term.startswith("SNOMED"): + return request_ontoserver_hierarchy(term, ancestors) + return request_ensembl_hierarchy(term, ancestors) From e7154811a62ce94ea84a557b20b3213e9a9e5718 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Wed, 31 May 2023 01:57:03 +0000 Subject: [PATCH 11/16] Bug fix; handle custom filters, condition error --- lambda/indexer/lambda_function.py | 1 - 
.../python-modules/python/shared/apiutils/requests.py | 2 +- .../python-modules/python/shared/athena/filters.py | 10 +++++++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/lambda/indexer/lambda_function.py b/lambda/indexer/lambda_function.py index 41d1ad6..c98f918 100644 --- a/lambda/indexer/lambda_function.py +++ b/lambda/indexer/lambda_function.py @@ -132,7 +132,6 @@ def await_result(execution_id, sleep=2): status = exec["QueryExecution"]["Status"]["State"] if status in ("QUEUED", "RUNNING"): - print(f"Sleeping {sleep} seconds") time.sleep(sleep) retries += 1 diff --git a/shared_resources/python-modules/python/shared/apiutils/requests.py b/shared_resources/python-modules/python/shared/apiutils/requests.py index 68cd79b..fb89c73 100644 --- a/shared_resources/python-modules/python/shared/apiutils/requests.py +++ b/shared_resources/python-modules/python/shared/apiutils/requests.py @@ -131,7 +131,7 @@ def __init__(self, **data): self._filters = data.get("filters", []) @validator("filters", pre=True, each_item=True) - def check_squares(cls, term): + def transform_filters(cls, term): if isinstance(term, str): term = {"id": term} return term diff --git a/shared_resources/python-modules/python/shared/athena/filters.py b/shared_resources/python-modules/python/shared/athena/filters.py index 97a88d5..6c0bf05 100644 --- a/shared_resources/python-modules/python/shared/athena/filters.py +++ b/shared_resources/python-modules/python/shared/athena/filters.py @@ -101,7 +101,7 @@ def entity_search_conditions( # if descendantTerms is false, then similarity measures dont really make sense... if f.include_descendant_terms: # process inclusion of term descendants dependant on 'similarity' - if f.similarity in (Similarity.HIGH or Similarity.EXACT): + if f.similarity in (Similarity.HIGH, Similarity.EXACT): expanded_terms = get_term_descendants_in_beacon(f.id) else: # NOTE: this simplistic similarity method not nessisarily efficient or nessisarily desirable @@ -125,6 +125,14 @@ def entity_search_conditions( join_constraints.append( f""" SELECT RI.{type_relations_table_id[id_type]} FROM "{ENV_ATHENA.ATHENA_RELATIONS_TABLE}" RI JOIN "{ENV_ATHENA.ATHENA_TERMS_INDEX_TABLE}" TI ON RI.{type_relations_table_id[group]}=TI.id WHERE TI.kind='{group}' AND TI.term IN ({expanded_terms}) """ ) + elif isinstance(f, CustomFilter): + # TODO this is a dummy replacement, for future implementation + group = f.scope or default_scope + expanded_terms = "?" + join_execution_parameters += [f.id] + join_constraints.append( + f""" SELECT RI.{type_relations_table_id[id_type]} FROM "{ENV_ATHENA.ATHENA_RELATIONS_TABLE}" RI JOIN "{ENV_ATHENA.ATHENA_TERMS_INDEX_TABLE}" TI ON RI.{type_relations_table_id[group]}=TI.id WHERE TI.kind='{group}' AND TI.term IN ({expanded_terms}) """ + ) # format fragments together to form coherent SQL expression join_constraints = " INTERSECT ".join(join_constraints) From 90f08bc8a2e25acf1ad1ccec6ccc26669489e91a Mon Sep 17 00:00:00 2001 From: Anuradha Date: Thu, 27 Apr 2023 04:15:14 +0000 Subject: [PATCH 12/16] Update readme --- examples/test-data/GUIDE.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/test-data/GUIDE.md b/examples/test-data/GUIDE.md index 1e98368..a070ccc 100644 --- a/examples/test-data/GUIDE.md +++ b/examples/test-data/GUIDE.md @@ -1,20 +1,20 @@ # Getting started with test data -Please ensure you first upload the `chr1.vcf.gz` and `chr1.vcf.gz.tbi` files to an S3 bucket that is accessible from the sBeacon deployment account. 
Obtain the S3 URI for the `chr1.vcf.gz` from the uploaded desitation. Note that, both `vcf.gz` and `vcf.gz.tbi` files must have the same prefix in S3 for this to work. +Please ensure you first upload the `chr1.vcf.gz` and `chr1.vcf.gz.tbi` files to an S3 bucket that is accessible from the sBeacon deployment account. Obtain the S3 URI for the `chr1.vcf.gz` from the uploaded desitation. Note that, both `vcf.gz` and `vcf.gz.tbi` files must have the same prefix in S3 for this to work. Please note that, all the buckets you create in AWS are in the same region as the deployment. -Now edit the `submission.json` file such that they match the S3 URI of the `vcf.gz` file. +Now edit the `submission.json` using the S3 URI of the `vcf.gz` file. ```json -... +. . . "vcfLocations": [ "s3:////chr1.vcf.gz" ] -... +. . . ``` ## Data submission -You can submit the data in two ways. +You can submit this data in two ways. ### Option 1: Submission as request body @@ -30,7 +30,7 @@ Alternatively, you can upload edited `submission.json` file to an S3 location ac } ``` -This approach is recommended for larger submissions with thousands of metadata entries. +Option 2 is recommended for larger submissions with thousands of metadata entries. ## API testing From b08dd6cf746aaeff33f768d3c09d2cfd03e1aae9 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Mon, 8 May 2023 03:57:07 +0000 Subject: [PATCH 13/16] lru caching for ontology terms, refactoring --- dynamodb.tf | 53 ----- iam.tf | 20 -- lambda/indexer/lambda_function.py | 194 ++---------------- main.tf | 1 - .../python/shared/athena/filters.py | 29 +-- .../python/shared/dynamodb/__init__.py | 1 - .../python/shared/dynamodb/onto_index.py | 80 -------- .../python/shared/ontoutils/__init__.py | 162 +++++++++++++++ 8 files changed, 188 insertions(+), 352 deletions(-) delete mode 100644 shared_resources/python-modules/python/shared/dynamodb/onto_index.py create mode 100644 shared_resources/python-modules/python/shared/ontoutils/__init__.py diff --git a/dynamodb.tf b/dynamodb.tf index 8f0bfe9..c3905f4 100644 --- a/dynamodb.tf +++ b/dynamodb.tf @@ -147,56 +147,3 @@ resource "aws_dynamodb_table" "variant_query_responses" { enabled = true } } - -# ontology term index -resource "aws_dynamodb_table" "ontology_terms" { - billing_mode = "PAY_PER_REQUEST" - hash_key = "id" - name = "OntoIndex" - tags = var.common-tags - - # this is the tab concatenated value of - # tableName, columnName, term - # this must not be repeated - attribute { - name = "id" - type = "S" - } - - attribute { - name = "tableName" - type = "S" - } - - attribute { - name = "tableTerms" - type = "S" - } - - attribute { - name = "term" - type = "S" - } - - # be able to query a term - global_secondary_index { - hash_key = "term" - name = "term_index" - projection_type = "ALL" - } - - # be able to query a tableName - global_secondary_index { - hash_key = "tableName" - name = "table_index" - projection_type = "ALL" - } - - # be able to query a terms in a table - # tab concatenated value of table and term - global_secondary_index { - hash_key = "tableTerms" - name = "tableterms_index" - projection_type = "ALL" - } -} \ No newline at end of file diff --git a/iam.tf b/iam.tf index 34a96bf..855322b 100644 --- a/iam.tf +++ b/iam.tf @@ -827,26 +827,6 @@ data "aws_iam_policy_document" "dynamodb-onto-access" { aws_dynamodb_table.anscestor_terms.arn, ] } - - statement { - actions = [ - "dynamodb:Query", - ] - resources = [ - "${aws_dynamodb_table.datasets.arn}/index/*", - 
"${aws_dynamodb_table.variant_query_responses.arn}/index/*", - "${aws_dynamodb_table.ontology_terms.arn}/index/*", - ] - } - - statement { - actions = [ - "dynamodb:Scan", - ] - resources = [ - aws_dynamodb_table.ontology_terms.arn, - ] - } } # DynamoDB Ontology Related Write Access diff --git a/lambda/indexer/lambda_function.py b/lambda/indexer/lambda_function.py index f4f7336..41d1ad6 100644 --- a/lambda/indexer/lambda_function.py +++ b/lambda/indexer/lambda_function.py @@ -1,16 +1,13 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from collections import defaultdict import threading -import urllib -import json import time -import re from smart_open import open as sopen -import requests import boto3 -from shared.dynamodb import OntoData, Ontology, Descendants, Anscestors +from shared.dynamodb import Descendants, Anscestors +from shared.ontoutils import request_hierarchy from shared.utils import ENV_ATHENA from ctas_queries import QUERY as CTAS_TEMPLATE from generate_query_index import QUERY as INDEX_QUERY @@ -42,36 +39,7 @@ ) -def get_ontology_details(ontology): - details = None - try: - details = Ontology.get(ontology) - except Ontology.DoesNotExist: - if ontology == "SNOMED": - # use ontoserver - details = Ontology(ontology.upper()) - details.data = json.dumps( - {"id": "SNOMED", "baseUri": "http://snomed.info/sct"} - ) - details.save() - else: - # use ENSEMBL - if response := requests.get(f"{ENSEMBL_OLS}/{ontology}"): - response_json = response.json() - details = Ontology(ontology.upper()) - details.data = json.dumps( - { - "id": response_json["ontologyId"].upper(), - "baseUri": response_json["config"]["baseUris"][0], - } - ) - details.save() - - # any other error must be raised - return details - - -def get_ontologies_clusters(): +def get_ontologie_terms_in_beacon(): query = f'SELECT DISTINCT term FROM "{ENV_ATHENA.ATHENA_TERMS_TABLE}"' response = athena.start_query_execution( @@ -83,7 +51,7 @@ def get_ontologies_clusters(): execution_id = response["QueryExecutionId"] await_result(execution_id) - ontology_clusters = defaultdict(set) + ontology_terms = list() with sopen( f"s3://{ENV_ATHENA.ATHENA_METADATA_BUCKET}/query-results/{execution_id}.csv" @@ -92,125 +60,27 @@ def get_ontologies_clusters(): if n == 0: continue term = line.strip().strip('"') - - # beacon API does not allow non CURIE formatted terms - # however, SNOMED appears are non-CURIE prefixed terms - # following is to support that, however API will not ingest - # always submit in form SNOMED:123212 - if re.match(r"(?i)(^SNOMED)|([0-9]+)", term): - ontology = "SNOMED" - ontology_clusters[ontology].add(term) - else: - ontology = term.split(":")[0] - ontology_clusters[ontology].add(term) - - return ontology_clusters + ontology_terms.append(term) + return ontology_terms -# in future, there could be an issue when descendants entries exceed 400KB +# TODO in future, there could be an issue when descendants entries exceed 400KB # which means we would have roughtly 20480, 20 byte entries (unlikely?) 
# this would also mean, our SQL queries would reach the 256KB limit # we should be able to easily spread terms across multiple dynamodb # entries and have multiple queries (as recommended by AWS) def index_terms_tree(): - # START subroutines - # subroutine for ensemble - def threaded_request_ensemble(term, url): - if response := requests.get(url): - response_json = response.json() - anscestors = set() - for response_term in response_json["_embedded"]["terms"]: - obo_id = response_term["obo_id"] - if obo_id: - anscestors.add(obo_id) - return (term, anscestors) - else: - print(f"Error fetching term from Ensembl OLS {term}") - - # subroutine for ontoserver - def threaded_request_ontoserver(term, url): - snomed = "SNOMED" in term.upper() - retries = 1 - response = None - while (not response or response.status_code != 200) and retries < 10: - retries += 1 - response = requests.post( - url, - json={ - "resourceType": "Parameters", - "parameter": [ - { - "name": "valueSet", - "resource": { - "resourceType": "ValueSet", - "compose": { - "include": [ - { - "system": "http://snomed.info/sct", - "filter": [ - { - "property": "concept", - "op": "generalizes", - "value": f"{term.replace('SNOMED:', '')}", - } - ], - } - ] - }, - }, - } - ], - }, - ) - if response.status_code == 200: - response_json = response.json() - anscestors = set() - for response_term in response_json["expansion"]["contains"]: - anscestors.add( - "SNOMED:" + response_term["code"] - if snomed - else response_term["code"] - ) - return (term, anscestors) - else: - time.sleep(1) - - if response.status_code != 200: - print(f"Error fetching term from Ontoserver {term}") - - # END subroutines - - ontology_clusters = get_ontologies_clusters() + terms_in_beacon = get_ontologie_terms_in_beacon() executor = ThreadPoolExecutor(500) futures = [] - for ontology, terms in ontology_clusters.items(): - if ontology == "SNOMED": - for term in terms: - # fetch only anscestors that aren't fetched yet - try: - data = Anscestors.get(term) - except Anscestors.DoesNotExist: - futures.append( - executor.submit(threaded_request_ontoserver, term, ONTOSERVER) - ) - else: - for term in terms: - # fetch only anscestors that aren't fetched yet - try: - data = Anscestors.get(term) - except Anscestors.DoesNotExist: - # details will be missing if the ontology info is not in OLS - if details := get_ontology_details(ontology): - data = json.loads(details.data) - iri = data["baseUri"] + term.split(":")[1] - iri_double_encoded = urllib.parse.quote_plus( - urllib.parse.quote_plus(iri) - ) - url = f"{ENSEMBL_OLS}/{ontology}/terms/{iri_double_encoded}/hierarchicalAncestors" - futures.append( - executor.submit(threaded_request_ensemble, term, url) - ) + for term in terms_in_beacon: + try: + Anscestors.get(term) + except Anscestors.DoesNotExist: + futures.append(executor.submit(request_hierarchy, term, True)) + + # record ancestors term_anscestors = defaultdict(set) for future in as_completed(futures): @@ -219,17 +89,21 @@ def threaded_request_ontoserver(term, url): term_anscestors[term].update(ancestors) term_anscestors[term].add(term) + # reverse the tree for descendent term search term_descendants = defaultdict(set) + # write ancestors with Anscestors.batch_write() as batch: for term, anscestors in term_anscestors.items(): item = Anscestors(term) item.anscestors = anscestors batch.save(item) + # record descendents for anscestor in anscestors: term_descendants[anscestor].add(term) + # write descendents with Descendants.batch_write() as batch: for term, descendants in 
term_descendants.items(): # if descendants are recorded, just update, else make record @@ -353,36 +227,6 @@ def record_relations(): await_result(response["QueryExecutionId"]) -# TODO re-evaluate the following function remove or useful? -def onto_index(): - response = athena.start_query_execution( - QueryString=ONTO_TERMS_QUERY, - QueryExecutionContext={"Database": ENV_ATHENA.ATHENA_METADATA_DATABASE}, - WorkGroup=ENV_ATHENA.ATHENA_WORKGROUP, - ) - execution_id = response["QueryExecutionId"] - await_result(execution_id) - - with sopen( - f"s3://{ENV_ATHENA.ATHENA_METADATA_BUCKET}/query-results/{execution_id}.csv" - ) as s3f: - for n, line in enumerate(s3f): - if n == 0: - continue - term, tablename, colname, type, label = [ - item.strip('"') for item in line.strip().split(",") - ] - entry = OntoData.make_index_entry( - term=term, - tableName=tablename, - columnName=colname, - type=type, - label=label, - ) - entry.save() - return - - def lambda_handler(event, context): # CTAS this must finish before all threads = [] diff --git a/main.tf b/main.tf index b16a97c..e5333d8 100644 --- a/main.tf +++ b/main.tf @@ -77,7 +77,6 @@ locals { DYNAMO_ONTOLOGIES_TABLE = aws_dynamodb_table.ontologies.name DYNAMO_ANSCESTORS_TABLE = aws_dynamodb_table.anscestor_terms.name DYNAMO_DESCENDANTS_TABLE = aws_dynamodb_table.descendant_terms.name - DYNAMO_ONTO_INDEX_TABLE = aws_dynamodb_table.ontology_terms.name } # layers binaries_layer = "${aws_lambda_layer_version.binaries_layer.layer_arn}:${aws_lambda_layer_version.binaries_layer.version}" diff --git a/shared_resources/python-modules/python/shared/athena/filters.py b/shared_resources/python-modules/python/shared/athena/filters.py index 9c02326..97a88d5 100644 --- a/shared_resources/python-modules/python/shared/athena/filters.py +++ b/shared_resources/python-modules/python/shared/athena/filters.py @@ -6,7 +6,10 @@ from .dataset import Dataset from .cohort import Cohort from .run import Run -from shared.dynamodb import Descendants, Anscestors +from shared.ontoutils import ( + get_term_ancestors_in_beacon, + get_term_descendants_in_beacon, +) from shared.utils import ENV_ATHENA from shared.apiutils import ( OntologyFilter, @@ -48,24 +51,6 @@ def _get_comparison_operator(filter: Union[AlphanumericFilter, OntologyFilter]): return "LIKE" if filter.operator == Operator.EQUAL else "NOT LIKE" -def _get_term_ancestors(term): - terms = set() - try: - terms.update(Anscestors.get(term).anscestors) - except Anscestors.DoesNotExist: - terms.add(term) - return terms - - -def _get_term_descendants(term: str): - terms = set() - try: - terms.update(Descendants.get(term).descendants) - except Descendants.DoesNotExist: - terms.add(term) - return terms - - def entity_search_conditions( filters: List[Union[OntologyFilter, AlphanumericFilter, CustomFilter]], id_type: str, @@ -117,12 +102,12 @@ def entity_search_conditions( if f.include_descendant_terms: # process inclusion of term descendants dependant on 'similarity' if f.similarity in (Similarity.HIGH or Similarity.EXACT): - expanded_terms = _get_term_descendants(f.id) + expanded_terms = get_term_descendants_in_beacon(f.id) else: # NOTE: this simplistic similarity method not nessisarily efficient or nessisarily desirable - ancestors = _get_term_ancestors(f.id) + ancestors = get_term_ancestors_in_beacon(f.id) ancestor_descendants = sorted( - [_get_term_descendants(a) for a in ancestors], key=len + [get_term_descendants_in_beacon(a) for a in ancestors], key=len ) if f.similarity == Similarity.MEDIUM: # all terms which have an ancestor 
half way up diff --git a/shared_resources/python-modules/python/shared/dynamodb/__init__.py b/shared_resources/python-modules/python/shared/dynamodb/__init__.py index d1de649..9873199 100644 --- a/shared_resources/python-modules/python/shared/dynamodb/__init__.py +++ b/shared_resources/python-modules/python/shared/dynamodb/__init__.py @@ -1,4 +1,3 @@ from .datasets import Dataset, VcfChromosomeMap -from .onto_index import OntoData, TableIndex, TableTermsIndex, TermIndex from .ontologies import Anscestors, Descendants, Ontology from .variant_queries import VariantQuery, VariantResponse, VariantResponseIndex, S3Location diff --git a/shared_resources/python-modules/python/shared/dynamodb/onto_index.py b/shared_resources/python-modules/python/shared/dynamodb/onto_index.py deleted file mode 100644 index 2b031ac..0000000 --- a/shared_resources/python-modules/python/shared/dynamodb/onto_index.py +++ /dev/null @@ -1,80 +0,0 @@ -import boto3 -from pynamodb.models import Model -from pynamodb.indexes import GlobalSecondaryIndex, AllProjection -from pynamodb.attributes import UnicodeAttribute - -from shared.utils import ENV_DYNAMO - - -SESSION = boto3.session.Session() -REGION = SESSION.region_name - - -# Terms index -class TermIndex(GlobalSecondaryIndex): - class Meta: - index_name = "term_index" - projection = AllProjection() - billing_mode = "PAY_PER_REQUEST" - region = REGION - - term = UnicodeAttribute(hash_key=True) - - -# TableNames -class TableIndex(GlobalSecondaryIndex): - class Meta: - index_name = "table_index" - projection = AllProjection() - billing_mode = "PAY_PER_REQUEST" - region = REGION - - tableName = UnicodeAttribute(hash_key=True) - - -# TableNames -class TableTermsIndex(GlobalSecondaryIndex): - class Meta: - index_name = "tableterms_index" - projection = AllProjection() - billing_mode = "PAY_PER_REQUEST" - region = REGION - - tableTerms = UnicodeAttribute(hash_key=True) - - -# ontoIndex table -class OntoData(Model): - class Meta: - table_name = ENV_DYNAMO.DYNAMO_ONTO_INDEX_TABLE - region = REGION - - id = UnicodeAttribute(hash_key=True) - tableTerms = UnicodeAttribute() - tableName = UnicodeAttribute() - columnName = UnicodeAttribute() - term = UnicodeAttribute() - label = UnicodeAttribute() - type = UnicodeAttribute() - - termIndex = TermIndex() - tableIndex = TableIndex() - tableTermsIndex = TableTermsIndex() - - @classmethod - def make_index_entry(cls, tableName, columnName, term, label, type): - id = f"{tableName}\t{columnName}\t{term}" - tableTerms = f"{tableName}\t{term}" - entry = OntoData(hash_key=id) - entry.tableName = tableName - entry.tableTerms = tableTerms - entry.columnName = columnName - entry.term = term - entry.label = label - entry.type = type - - return entry - - -if __name__ == "__main__": - pass diff --git a/shared_resources/python-modules/python/shared/ontoutils/__init__.py b/shared_resources/python-modules/python/shared/ontoutils/__init__.py new file mode 100644 index 0000000..cb70bad --- /dev/null +++ b/shared_resources/python-modules/python/shared/ontoutils/__init__.py @@ -0,0 +1,162 @@ +import json +import time +import urllib +from functools import lru_cache + +import requests + +from shared.dynamodb import Ontology, Descendants, Anscestors + + +ENSEMBL_OLS = "https://www.ebi.ac.uk/ols/api/ontologies" +ONTOSERVER = "https://r4.ontoserver.csiro.au/fhir/ValueSet/$expand" + + +@lru_cache() +def get_term_ancestors_in_beacon(term): + terms = set() + try: + terms.update(Anscestors.get(term).anscestors) + except Anscestors.DoesNotExist: + terms.add(term) + 
return terms + + +@lru_cache() +def get_term_descendants_in_beacon(term: str): + terms = set() + try: + terms.update(Descendants.get(term).descendants) + except Descendants.DoesNotExist: + terms.add(term) + return terms + + +@lru_cache() +def get_term_all_ancestors(term: str): + term, ancestors = request_hierarchy(term, True) + ancestors.add(term) + + return ancestors + + +@lru_cache() +def get_term_all_descendants(term: str): + term, descendants = request_hierarchy(term, False) + descendants.add(term) + + return descendants + + +@lru_cache() +def get_ontology_details(ontology) -> Ontology: + details = None + try: + details = Ontology.get(ontology) + except Ontology.DoesNotExist: + if ontology == "SNOMED": + # use ontoserver + details = Ontology(ontology.upper()) + details.data = json.dumps( + {"id": "SNOMED", "baseUri": "http://snomed.info/sct"} + ) + details.save() + else: + # use ENSEMBL + if response := requests.get(f"{ENSEMBL_OLS}/{ontology}"): + response_json = response.json() + details = Ontology(ontology.upper()) + details.data = json.dumps( + { + "id": response_json["ontologyId"].upper(), + "baseUri": response_json["config"]["baseUris"][0], + } + ) + details.save() + + return details + + +@lru_cache() +def request_ontoserver_hierarchy(term: str, ancestors=True): + snomed = "SNOMED" in term.upper() + retries = 1 + response = None + while (not response or response.status_code != 200) and retries < 10: + retries += 1 + response = requests.post( + ONTOSERVER, + json={ + "resourceType": "Parameters", + "parameter": [ + { + "name": "valueSet", + "resource": { + "resourceType": "ValueSet", + "compose": { + "include": [ + { + "system": "http://snomed.info/sct", + "filter": [ + { + "property": "concept", + "op": "generalizes" + if ancestors + else "descendent-of", + "value": f"{term.replace('SNOMED:', '')}", + } + ], + } + ] + }, + }, + } + ], + }, + ) + if response.status_code == 200: + response_json = response.json() + members = set() + for response_term in response_json["expansion"]["contains"]: + members.add( + "SNOMED:" + response_term["code"] + if snomed + else response_term["code"] + ) + return (term, members) + else: + time.sleep(1) + + raise Exception(f"Error fetching term from Ontoserver {term}") + + +@lru_cache() +def request_ensembl_hierarchy(term: str, ancestors=True): + ontology, code = term.split(":") + details = get_ontology_details(ontology) + # if no details available, it is probably not an ontology term + if not details: + return (term, set()) + + data = json.loads(details.data) + iri = data["baseUri"] + code + iri_double_encoded = urllib.parse.quote_plus(urllib.parse.quote_plus(iri)) + url = f"{ENSEMBL_OLS}/{ontology}/terms/{iri_double_encoded}/{'hierarchicalAncestors' if ancestors else 'hierarchicalDescendants'}" + + if response := requests.get(url): + response_json = response.json() + members = set() + for response_term in response_json["_embedded"]["terms"]: + obo_id = response_term["obo_id"] + if obo_id: + members.add(obo_id) + return (term, members) + + raise Exception(f"Error fetching term from Ensembl OLS {term}") + + +@lru_cache() +def request_hierarchy(term, ancestors): + if term.startswith("SNOMED"): + return request_ontoserver_hierarchy(term, ancestors) + return request_ensembl_hierarchy(term, ancestors) From c533112a663a23f1541e1f02214dbdb4be626760 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Wed, 31 May 2023 01:57:03 +0000 Subject: [PATCH 14/16] Bug fix; handle custom filters, condition error --- lambda/indexer/lambda_function.py | 1 - 
.../python-modules/python/shared/apiutils/requests.py | 2 +- .../python-modules/python/shared/athena/filters.py | 10 +++++++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/lambda/indexer/lambda_function.py b/lambda/indexer/lambda_function.py index 41d1ad6..c98f918 100644 --- a/lambda/indexer/lambda_function.py +++ b/lambda/indexer/lambda_function.py @@ -132,7 +132,6 @@ def await_result(execution_id, sleep=2): status = exec["QueryExecution"]["Status"]["State"] if status in ("QUEUED", "RUNNING"): - print(f"Sleeping {sleep} seconds") time.sleep(sleep) retries += 1 diff --git a/shared_resources/python-modules/python/shared/apiutils/requests.py b/shared_resources/python-modules/python/shared/apiutils/requests.py index 68cd79b..fb89c73 100644 --- a/shared_resources/python-modules/python/shared/apiutils/requests.py +++ b/shared_resources/python-modules/python/shared/apiutils/requests.py @@ -131,7 +131,7 @@ def __init__(self, **data): self._filters = data.get("filters", []) @validator("filters", pre=True, each_item=True) - def check_squares(cls, term): + def transform_filters(cls, term): if isinstance(term, str): term = {"id": term} return term diff --git a/shared_resources/python-modules/python/shared/athena/filters.py b/shared_resources/python-modules/python/shared/athena/filters.py index 97a88d5..6c0bf05 100644 --- a/shared_resources/python-modules/python/shared/athena/filters.py +++ b/shared_resources/python-modules/python/shared/athena/filters.py @@ -101,7 +101,7 @@ def entity_search_conditions( # if descendantTerms is false, then similarity measures dont really make sense... if f.include_descendant_terms: # process inclusion of term descendants dependant on 'similarity' - if f.similarity in (Similarity.HIGH or Similarity.EXACT): + if f.similarity in (Similarity.HIGH, Similarity.EXACT): expanded_terms = get_term_descendants_in_beacon(f.id) else: # NOTE: this simplistic similarity method not nessisarily efficient or nessisarily desirable @@ -125,6 +125,14 @@ def entity_search_conditions( join_constraints.append( f""" SELECT RI.{type_relations_table_id[id_type]} FROM "{ENV_ATHENA.ATHENA_RELATIONS_TABLE}" RI JOIN "{ENV_ATHENA.ATHENA_TERMS_INDEX_TABLE}" TI ON RI.{type_relations_table_id[group]}=TI.id WHERE TI.kind='{group}' AND TI.term IN ({expanded_terms}) """ ) + elif isinstance(f, CustomFilter): + # TODO this is a dummy replacement, for future implementation + group = f.scope or default_scope + expanded_terms = "?" + join_execution_parameters += [f.id] + join_constraints.append( + f""" SELECT RI.{type_relations_table_id[id_type]} FROM "{ENV_ATHENA.ATHENA_RELATIONS_TABLE}" RI JOIN "{ENV_ATHENA.ATHENA_TERMS_INDEX_TABLE}" TI ON RI.{type_relations_table_id[group]}=TI.id WHERE TI.kind='{group}' AND TI.term IN ({expanded_terms}) """ + ) # format fragments together to form coherent SQL expression join_constraints = " INTERSECT ".join(join_constraints) From 97eb502f9cf67e0909e820f4c244c31ed43f8474 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Mon, 24 Apr 2023 06:54:33 +0000 Subject: [PATCH 15/16] Refactoring results count, updated examples guide --- examples/test-data/GUIDE.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/test-data/GUIDE.md b/examples/test-data/GUIDE.md index a070ccc..744ae7f 100644 --- a/examples/test-data/GUIDE.md +++ b/examples/test-data/GUIDE.md @@ -13,6 +13,11 @@ Now edit the `submission.json` using the S3 URI of the `vcf.gz` file. ``` ## Data submission +<<<<<<< HEAD +======= + +You can submit the data in two ways. 
+>>>>>>> b3efa98 (Refactoring results count, updated examples guide) You can submit this data in two ways. @@ -30,7 +35,11 @@ Alternatively, you can upload edited `submission.json` file to an S3 location ac } ``` +<<<<<<< HEAD Option 2 is recommended for larger submissions with thousands of metadata entries. +======= +This approach is recommended for larger submissions with thousands of metadata entries. +>>>>>>> b3efa98 (Refactoring results count, updated examples guide) ## API testing From cc56e2acae528780cba55396366993e4245d77a9 Mon Sep 17 00:00:00 2001 From: Anuradha Date: Thu, 27 Apr 2023 03:58:44 +0000 Subject: [PATCH 16/16] Update read me --- examples/test-data/GUIDE.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/examples/test-data/GUIDE.md b/examples/test-data/GUIDE.md index 744ae7f..a070ccc 100644 --- a/examples/test-data/GUIDE.md +++ b/examples/test-data/GUIDE.md @@ -13,11 +13,6 @@ Now edit the `submission.json` using the S3 URI of the `vcf.gz` file. ``` ## Data submission -<<<<<<< HEAD -======= - -You can submit the data in two ways. ->>>>>>> b3efa98 (Refactoring results count, updated examples guide) You can submit this data in two ways. @@ -35,11 +30,7 @@ Alternatively, you can upload edited `submission.json` file to an S3 location ac } ``` -<<<<<<< HEAD Option 2 is recommended for larger submissions with thousands of metadata entries. -======= -This approach is recommended for larger submissions with thousands of metadata entries. ->>>>>>> b3efa98 (Refactoring results count, updated examples guide) ## API testing
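Beyond the HTTP endpoints, the term-expansion path refactored in these patches can be sanity-checked from a Python shell inside the deployed environment. The following is a minimal sketch, assuming the `shared.ontoutils` module introduced above, an indexed deployment, and a hypothetical CURIE; the first call of each helper hits DynamoDB (or OLS/Ontoserver), while repeat calls are served from the `lru_cache`:

```python
from shared.ontoutils import (
    get_term_ancestors_in_beacon,
    get_term_descendants_in_beacon,
    request_hierarchy,
)

term = "HP:0001250"  # hypothetical term; use one present in your beacon

# Expansion against the indexed Anscestors/Descendants tables;
# both helpers fall back to the term itself if it is not indexed
print(get_term_descendants_in_beacon(term))
print(get_term_ancestors_in_beacon(term))

# Live lookup against Ensembl OLS (or Ontoserver for SNOMED terms),
# bypassing the DynamoDB index; returns a (term, members) tuple
term, ancestors = request_hierarchy(term, True)
print(f"{term} has {len(ancestors)} ancestors")
```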