From 7094d3cf192dfc25ff69456ec7f1e71e7df2c264 Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 10 Sep 2024 16:15:44 +0100 Subject: [PATCH] Use vcf2zarr in GWAS tutorial notebook Build docs with Python 3.10 Use %%bash for running vcf2zarr --- .github/workflows/check-docs.yml | 2 +- .github/workflows/docs.yml | 2 +- docs/examples/1kg.schema.json | 1537 +++ docs/examples/gwas_tutorial.ipynb | 18198 +++++++++++++++++++--------- requirements-doc.txt | 1 + 5 files changed, 14053 insertions(+), 5687 deletions(-) create mode 100644 docs/examples/1kg.schema.json diff --git a/.github/workflows/check-docs.yml b/.github/workflows/check-docs.yml index 5e23f3dd5..8ecf7dcdb 100644 --- a/.github/workflows/check-docs.yml +++ b/.github/workflows/check-docs.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9"] + python-version: ["3.10"] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 3217acaa4..33abd3b5e 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -15,7 +15,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: - python-version: '3.9' + python-version: '3.10' - name: Install dependencies run: | sudo apt update -y diff --git a/docs/examples/1kg.schema.json b/docs/examples/1kg.schema.json new file mode 100644 index 000000000..2142d9e07 --- /dev/null +++ b/docs/examples/1kg.schema.json @@ -0,0 +1,1537 @@ +{ + "format_version": "0.4", + "samples_chunk_size": 1000, + "variants_chunk_size": 10000, + "samples": [ + { + "id": "HG00096" + }, + { + "id": "HG00099" + }, + { + "id": "HG00105" + }, + { + "id": "HG00118" + }, + { + "id": "HG00129" + }, + { + "id": "HG00148" + }, + { + "id": "HG00177" + }, + { + "id": "HG00182" + }, + { + "id": "HG00242" + }, + { + "id": "HG00254" + }, + { + "id": "HG00265" + }, + { + "id": "HG00271" + }, + { + "id": "HG00274" + }, + { + "id": "HG00332" + }, + { + "id": "HG00335" + }, + { + "id": "HG00369" + }, + { + "id": "HG00421" + }, + { + "id": "HG00436" + }, + { + "id": "HG00452" + }, + { + "id": "HG00472" + }, + { + "id": "HG00530" + }, + { + "id": "HG00534" + }, + { + "id": "HG00583" + }, + { + "id": "HG00590" + }, + { + "id": "HG00598" + }, + { + "id": "HG00607" + }, + { + "id": "HG00619" + }, + { + "id": "HG00623" + }, + { + "id": "HG00657" + }, + { + "id": "HG00663" + }, + { + "id": "HG00704" + }, + { + "id": "HG00705" + }, + { + "id": "HG00733" + }, + { + "id": "HG00864" + }, + { + "id": "HG00881" + }, + { + "id": "HG01052" + }, + { + "id": "HG01070" + }, + { + "id": "HG01075" + }, + { + "id": "HG01164" + }, + { + "id": "HG01174" + }, + { + "id": "HG01241" + }, + { + "id": "HG01248" + }, + { + "id": "HG01256" + }, + { + "id": "HG01275" + }, + { + "id": "HG01284" + }, + { + "id": "HG01334" + }, + { + "id": "HG01348" + }, + { + "id": "HG01396" + }, + { + "id": "HG01443" + }, + { + "id": "HG01491" + }, + { + "id": "HG01498" + }, + { + "id": "HG01537" + }, + { + "id": "HG01572" + }, + { + "id": "HG01606" + }, + { + "id": "HG01623" + }, + { + "id": "HG01630" + }, + { + "id": "HG01783" + }, + { + "id": "HG01784" + }, + { + "id": "HG01790" + }, + { + "id": "HG01799" + }, + { + "id": "HG01801" + }, + { + "id": "HG01806" + }, + { + "id": "HG01812" + }, + { + "id": "HG01813" + }, + { + "id": "HG01817" + }, + { + "id": "HG01848" + }, + { + "id": "HG01849" + }, + { + "id": "HG01857" + }, + { + "id": "HG01863" + }, + { + "id": "HG01874" + }, + { + "id": "HG01915" + }, + { + "id": "HG01924" + }, + { + "id": "HG01965" + }, + { + "id": "HG01970" + }, + { + "id": "HG01991" + }, + { + "id": "HG02010" + }, + { + "id": "HG02020" + }, + { + "id": "HG02054" + }, + { + "id": "HG02086" + }, + { + "id": "HG02087" + }, + { + "id": "HG02116" + }, + { + "id": "HG02122" + }, + { + "id": "HG02130" + }, + { + "id": "HG02131" + }, + { + "id": "HG02152" + }, + { + "id": "HG02154" + }, + { + "id": "HG02165" + }, + { + "id": "HG02224" + }, + { + "id": "HG02232" + }, + { + "id": "HG02236" + }, + { + "id": "HG02250" + }, + { + "id": "HG02259" + }, + { + "id": "HG02298" + }, + { + "id": "HG02318" + }, + { + "id": "HG02345" + }, + { + "id": "HG02351" + }, + { + "id": "HG02363" + }, + { + "id": "HG02373" + }, + { + "id": "HG02383" + }, + { + "id": "HG02384" + }, + { + "id": "HG02386" + }, + { + "id": "HG02388" + }, + { + "id": "HG02389" + }, + { + "id": "HG02397" + }, + { + "id": "HG02419" + }, + { + "id": "HG02462" + }, + { + "id": "HG02464" + }, + { + "id": "HG02497" + }, + { + "id": "HG02511" + }, + { + "id": "HG02521" + }, + { + "id": "HG02561" + }, + { + "id": "HG02574" + }, + { + "id": "HG02580" + }, + { + "id": "HG02595" + }, + { + "id": "HG02603" + }, + { + "id": "HG02629" + }, + { + "id": "HG02651" + }, + { + "id": "HG02682" + }, + { + "id": "HG02688" + }, + { + "id": "HG02690" + }, + { + "id": "HG02699" + }, + { + "id": "HG02760" + }, + { + "id": "HG02768" + }, + { + "id": "HG02771" + }, + { + "id": "HG02792" + }, + { + "id": "HG02798" + }, + { + "id": "HG02811" + }, + { + "id": "HG02814" + }, + { + "id": "HG02840" + }, + { + "id": "HG02870" + }, + { + "id": "HG02881" + }, + { + "id": "HG02970" + }, + { + "id": "HG02973" + }, + { + "id": "HG03009" + }, + { + "id": "HG03046" + }, + { + "id": "HG03074" + }, + { + "id": "HG03091" + }, + { + "id": "HG03105" + }, + { + "id": "HG03127" + }, + { + "id": "HG03193" + }, + { + "id": "HG03224" + }, + { + "id": "HG03237" + }, + { + "id": "HG03241" + }, + { + "id": "HG03247" + }, + { + "id": "HG03259" + }, + { + "id": "HG03267" + }, + { + "id": "HG03354" + }, + { + "id": "HG03366" + }, + { + "id": "HG03367" + }, + { + "id": "HG03380" + }, + { + "id": "HG03419" + }, + { + "id": "HG03449" + }, + { + "id": "HG03451" + }, + { + "id": "HG03458" + }, + { + "id": "HG03490" + }, + { + "id": "HG03491" + }, + { + "id": "HG03511" + }, + { + "id": "HG03556" + }, + { + "id": "HG03563" + }, + { + "id": "HG03598" + }, + { + "id": "HG03603" + }, + { + "id": "HG03607" + }, + { + "id": "HG03636" + }, + { + "id": "HG03684" + }, + { + "id": "HG03686" + }, + { + "id": "HG03690" + }, + { + "id": "HG03731" + }, + { + "id": "HG03740" + }, + { + "id": "HG03755" + }, + { + "id": "HG03800" + }, + { + "id": "HG03815" + }, + { + "id": "HG03832" + }, + { + "id": "HG03850" + }, + { + "id": "HG03873" + }, + { + "id": "HG03897" + }, + { + "id": "HG03905" + }, + { + "id": "HG03937" + }, + { + "id": "HG03948" + }, + { + "id": "HG03973" + }, + { + "id": "HG04054" + }, + { + "id": "HG04059" + }, + { + "id": "HG04063" + }, + { + "id": "HG04096" + }, + { + "id": "HG04099" + }, + { + "id": "HG04140" + }, + { + "id": "HG04171" + }, + { + "id": "HG04209" + }, + { + "id": "HG04210" + }, + { + "id": "HG04229" + }, + { + "id": "HG04239" + }, + { + "id": "NA07347" + }, + { + "id": "NA11918" + }, + { + "id": "NA11919" + }, + { + "id": "NA12045" + }, + { + "id": "NA12273" + }, + { + "id": "NA12342" + }, + { + "id": "NA12414" + }, + { + "id": "NA12546" + }, + { + "id": "NA12760" + }, + { + "id": "NA12878" + }, + { + "id": "NA18516" + }, + { + "id": "NA18525" + }, + { + "id": "NA18534" + }, + { + "id": "NA18541" + }, + { + "id": "NA18557" + }, + { + "id": "NA18565" + }, + { + "id": "NA18616" + }, + { + "id": "NA18619" + }, + { + "id": "NA18623" + }, + { + "id": "NA18630" + }, + { + "id": "NA18631" + }, + { + "id": "NA18740" + }, + { + "id": "NA18853" + }, + { + "id": "NA18865" + }, + { + "id": "NA18873" + }, + { + "id": "NA18874" + }, + { + "id": "NA18916" + }, + { + "id": "NA18960" + }, + { + "id": "NA18966" + }, + { + "id": "NA18975" + }, + { + "id": "NA18976" + }, + { + "id": "NA18978" + }, + { + "id": "NA18990" + }, + { + "id": "NA19060" + }, + { + "id": "NA19063" + }, + { + "id": "NA19076" + }, + { + "id": "NA19086" + }, + { + "id": "NA19087" + }, + { + "id": "NA19096" + }, + { + "id": "NA19113" + }, + { + "id": "NA19118" + }, + { + "id": "NA19185" + }, + { + "id": "NA19209" + }, + { + "id": "NA19311" + }, + { + "id": "NA19314" + }, + { + "id": "NA19317" + }, + { + "id": "NA19321" + }, + { + "id": "NA19379" + }, + { + "id": "NA19384" + }, + { + "id": "NA19390" + }, + { + "id": "NA19397" + }, + { + "id": "NA19399" + }, + { + "id": "NA19404" + }, + { + "id": "NA19446" + }, + { + "id": "NA19448" + }, + { + "id": "NA19455" + }, + { + "id": "NA19456" + }, + { + "id": "NA19466" + }, + { + "id": "NA19655" + }, + { + "id": "NA19657" + }, + { + "id": "NA19670" + }, + { + "id": "NA19678" + }, + { + "id": "NA19679" + }, + { + "id": "NA19701" + }, + { + "id": "NA19720" + }, + { + "id": "NA19756" + }, + { + "id": "NA19761" + }, + { + "id": "NA19764" + }, + { + "id": "NA19786" + }, + { + "id": "NA20318" + }, + { + "id": "NA20351" + }, + { + "id": "NA20517" + }, + { + "id": "NA20518" + }, + { + "id": "NA20529" + }, + { + "id": "NA20587" + }, + { + "id": "NA20757" + }, + { + "id": "NA20798" + }, + { + "id": "NA20799" + }, + { + "id": "NA20800" + }, + { + "id": "NA20810" + }, + { + "id": "NA20826" + }, + { + "id": "NA20858" + }, + { + "id": "NA20864" + }, + { + "id": "NA20869" + }, + { + "id": "NA20877" + }, + { + "id": "NA20888" + }, + { + "id": "NA20910" + }, + { + "id": "NA21101" + }, + { + "id": "NA21113" + }, + { + "id": "NA21114" + }, + { + "id": "NA21116" + }, + { + "id": "NA21118" + }, + { + "id": "NA21133" + }, + { + "id": "NA21143" + } + ], + "contigs": [ + { + "id": "1", + "length": 249250621 + }, + { + "id": "2", + "length": 243199373 + }, + { + "id": "3", + "length": 198022430 + }, + { + "id": "4", + "length": 191154276 + }, + { + "id": "5", + "length": 180915260 + }, + { + "id": "6", + "length": 171115067 + }, + { + "id": "7", + "length": 159138663 + }, + { + "id": "8", + "length": 146364022 + }, + { + "id": "9", + "length": 141213431 + }, + { + "id": "10", + "length": 135534747 + }, + { + "id": "11", + "length": 135006516 + }, + { + "id": "12", + "length": 133851895 + }, + { + "id": "13", + "length": 115169878 + }, + { + "id": "14", + "length": 107349540 + }, + { + "id": "15", + "length": 102531392 + }, + { + "id": "16", + "length": 90354753 + }, + { + "id": "17", + "length": 81195210 + }, + { + "id": "18", + "length": 78077248 + }, + { + "id": "19", + "length": 59128983 + }, + { + "id": "20", + "length": 63025520 + }, + { + "id": "21", + "length": 48129895 + }, + { + "id": "22", + "length": 51304566 + }, + { + "id": "X", + "length": 155270560 + }, + { + "id": "Y", + "length": 59373566 + }, + { + "id": "MT", + "length": 16569 + }, + { + "id": "GL000207.1", + "length": 4262 + }, + { + "id": "GL000226.1", + "length": 15008 + }, + { + "id": "GL000229.1", + "length": 19913 + }, + { + "id": "GL000231.1", + "length": 27386 + }, + { + "id": "GL000210.1", + "length": 27682 + }, + { + "id": "GL000239.1", + "length": 33824 + }, + { + "id": "GL000235.1", + "length": 34474 + }, + { + "id": "GL000201.1", + "length": 36148 + }, + { + "id": "GL000247.1", + "length": 36422 + }, + { + "id": "GL000245.1", + "length": 36651 + }, + { + "id": "GL000197.1", + "length": 37175 + }, + { + "id": "GL000203.1", + "length": 37498 + }, + { + "id": "GL000246.1", + "length": 38154 + }, + { + "id": "GL000249.1", + "length": 38502 + }, + { + "id": "GL000196.1", + "length": 38914 + }, + { + "id": "GL000248.1", + "length": 39786 + }, + { + "id": "GL000244.1", + "length": 39929 + }, + { + "id": "GL000238.1", + "length": 39939 + }, + { + "id": "GL000202.1", + "length": 40103 + }, + { + "id": "GL000234.1", + "length": 40531 + }, + { + "id": "GL000232.1", + "length": 40652 + }, + { + "id": "GL000206.1", + "length": 41001 + }, + { + "id": "GL000240.1", + "length": 41933 + }, + { + "id": "GL000236.1", + "length": 41934 + }, + { + "id": "GL000241.1", + "length": 42152 + }, + { + "id": "GL000243.1", + "length": 43341 + }, + { + "id": "GL000242.1", + "length": 43523 + }, + { + "id": "GL000230.1", + "length": 43691 + }, + { + "id": "GL000237.1", + "length": 45867 + }, + { + "id": "GL000233.1", + "length": 45941 + }, + { + "id": "GL000204.1", + "length": 81310 + }, + { + "id": "GL000198.1", + "length": 90085 + }, + { + "id": "GL000208.1", + "length": 92689 + }, + { + "id": "GL000191.1", + "length": 106433 + }, + { + "id": "GL000227.1", + "length": 128374 + }, + { + "id": "GL000228.1", + "length": 129120 + }, + { + "id": "GL000214.1", + "length": 137718 + }, + { + "id": "GL000221.1", + "length": 155397 + }, + { + "id": "GL000209.1", + "length": 159169 + }, + { + "id": "GL000218.1", + "length": 161147 + }, + { + "id": "GL000220.1", + "length": 161802 + }, + { + "id": "GL000213.1", + "length": 164239 + }, + { + "id": "GL000211.1", + "length": 166566 + }, + { + "id": "GL000199.1", + "length": 169874 + }, + { + "id": "GL000217.1", + "length": 172149 + }, + { + "id": "GL000216.1", + "length": 172294 + }, + { + "id": "GL000215.1", + "length": 172545 + }, + { + "id": "GL000205.1", + "length": 174588 + }, + { + "id": "GL000219.1", + "length": 179198 + }, + { + "id": "GL000224.1", + "length": 179693 + }, + { + "id": "GL000223.1", + "length": 180455 + }, + { + "id": "GL000195.1", + "length": 182896 + }, + { + "id": "GL000212.1", + "length": 186858 + }, + { + "id": "GL000222.1", + "length": 186861 + }, + { + "id": "GL000200.1", + "length": 187035 + }, + { + "id": "GL000193.1", + "length": 189789 + }, + { + "id": "GL000194.1", + "length": 191469 + }, + { + "id": "GL000225.1", + "length": 211173 + }, + { + "id": "GL000192.1", + "length": 547496 + } + ], + "filters": [ + { + "id": "PASS", + "description": "All filters passed" + } + ], + "fields": [ + { + "name": "variant_contig", + "dtype": "i1", + "shape": [ + 10879 + ], + "chunks": [ + 10000 + ], + "dimensions": [ + "variants" + ], + "description": "An identifier from the reference genome or an angle-bracketed ID string pointing to a contig in the assembly file", + "vcf_field": null, + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 0, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "variant_filter", + "dtype": "bool", + "shape": [ + 10879, + 1 + ], + "chunks": [ + 10000, + 1 + ], + "dimensions": [ + "variants", + "filters" + ], + "description": "Filter status of the variant", + "vcf_field": null, + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 2, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "variant_allele", + "dtype": "O", + "shape": [ + 10879, + 2 + ], + "chunks": [ + 10000, + 2 + ], + "dimensions": [ + "variants", + "alleles" + ], + "description": "List of the reference and alternate alleles", + "vcf_field": null, + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 0, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "variant_id", + "dtype": "O", + "shape": [ + 10879 + ], + "chunks": [ + 10000 + ], + "dimensions": [ + "variants" + ], + "description": "List of unique identifiers where applicable", + "vcf_field": null, + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 0, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "variant_id_mask", + "dtype": "bool", + "shape": [ + 10879 + ], + "chunks": [ + 10000 + ], + "dimensions": [ + "variants" + ], + "description": "", + "vcf_field": null, + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 2, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "variant_quality", + "dtype": "f4", + "shape": [ + 10879 + ], + "chunks": [ + 10000 + ], + "dimensions": [ + "variants" + ], + "description": "Phred-scaled quality score", + "vcf_field": "QUAL", + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 0, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "variant_position", + "dtype": "i4", + "shape": [ + 10879 + ], + "chunks": [ + 10000 + ], + "dimensions": [ + "variants" + ], + "description": "The reference position", + "vcf_field": "POS", + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 0, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "call_AD", + "dtype": "i1", + "shape": [ + 10879, + 284, + 2 + ], + "chunks": [ + 10000, + 1000, + 2 + ], + "dimensions": [ + "variants", + "samples", + "alleles" + ], + "description": "", + "vcf_field": "FORMAT/AD", + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 0, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "call_DP", + "dtype": "i1", + "shape": [ + 10879, + 284 + ], + "chunks": [ + 10000, + 1000 + ], + "dimensions": [ + "variants", + "samples" + ], + "description": "", + "vcf_field": "FORMAT/DP", + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 0, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "call_GQ", + "dtype": "i1", + "shape": [ + 10879, + 284 + ], + "chunks": [ + 10000, + 1000 + ], + "dimensions": [ + "variants", + "samples" + ], + "description": "", + "vcf_field": "FORMAT/GQ", + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 0, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "call_genotype_phased", + "dtype": "bool", + "shape": [ + 10879, + 284 + ], + "chunks": [ + 10000, + 1000 + ], + "dimensions": [ + "variants", + "samples" + ], + "description": "", + "vcf_field": null, + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 2, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "call_genotype", + "dtype": "i1", + "shape": [ + 10879, + 284, + 2 + ], + "chunks": [ + 10000, + 1000, + 2 + ], + "dimensions": [ + "variants", + "samples", + "ploidy" + ], + "description": "", + "vcf_field": null, + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 2, + "blocksize": 0 + }, + "filters": [] + }, + { + "name": "call_genotype_mask", + "dtype": "bool", + "shape": [ + 10879, + 284, + 2 + ], + "chunks": [ + 10000, + 1000, + 2 + ], + "dimensions": [ + "variants", + "samples", + "ploidy" + ], + "description": "", + "vcf_field": null, + "compressor": { + "id": "blosc", + "cname": "zstd", + "clevel": 7, + "shuffle": 2, + "blocksize": 0 + }, + "filters": [] + } + ] +} \ No newline at end of file diff --git a/docs/examples/gwas_tutorial.ipynb b/docs/examples/gwas_tutorial.ipynb index ad8fee564..b0d45f25f 100644 --- a/docs/examples/gwas_tutorial.ipynb +++ b/docs/examples/gwas_tutorial.ipynb @@ -25,8 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "import sgkit as sg\n", - "from sgkit.io.vcf import vcf_to_zarr" + "import sgkit as sg" ] }, { @@ -79,8 +78,12 @@ "import requests\n", "\n", "if not Path(\"1kg.vcf.bgz\").exists():\n", - " response = requests.get(\"https://storage.googleapis.com/sgkit-gwas-tutorial/1kg.vcf.bgz\")\n", + " response = requests.get(\"https://storage.googleapis.com/sgkit-data/tutorial/1kg.vcf.bgz\")\n", " with open(\"1kg.vcf.bgz\", \"wb\") as f:\n", + " f.write(response.content)\n", + "if not Path(\"1kg.vcf.bgz.tbi\").exists():\n", + " response = requests.get(\"https://storage.googleapis.com/sgkit-data/tutorial/1kg.vcf.bgz.tbi\")\n", + " with open(\"1kg.vcf.bgz.tbi\", \"wb\") as f:\n", " f.write(response.content)" ] }, @@ -97,27 +100,33 @@ "id": "elementary-college", "metadata": {}, "source": [ - "Next, [convert it to Zarr](https://sgkit-dev.github.io/sgkit/latest/user_guide.html#converting-genetic-data-to-zarr), stored on the local filesystem in a directory called _1kg.zarr_." + "Next, convert the VCF file to Zarr using the `vcf2zarr` command in [bio2zarr](https://sgkit-dev.github.io/bio2zarr/), stored on the local filesystem in a directory called _1kg.vcz_." ] }, { "cell_type": "code", "execution_count": 4, - "id": "composed-injury", + "id": "78effc1d-b45e-4af5-85ae-e0e7a40ca049", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[W::bcf_hdr_check_sanity] PL should be declared as Number=G\n" + " Scan: 0%| | 0.00/1.00 [00:00 1kg.schema.json # then edit 1kg.schema.json by hand\n", + "vcf2zarr encode --force -s 1kg.schema.json 1kg.icf 1kg.vcz" ] }, { @@ -125,7 +134,11 @@ "id": "plastic-running", "metadata": {}, "source": [ - "We passed a few arguments to the `vcf_to_zarr` conversion function, so it only converts the first alternate allele (`max_alt_alleles=1`), and to load extra VCF fields we are interested in (`GT`, `DP`, `GQ`, and `AD`). Also, `AD` needed defining as having a `Number` definition of `R` (one value for each allele, including the reference), since the dataset we are using defines it as `.` which means \"unknown\".\n", + "We used the `vcf2zarr explode` command to first convert the VCF to an \"intermediate columnar format\" (ICF), then the `vcf2zarr encode` command to convert the ICF to Zarr, which by convention is stored in a directory with a `vcz` extension.\n", + "\n", + "Note that we specified a JSON schema file that was created with the `vcf2zarr mkschema` command (commented out above), then edited to drop some fields that are not needed for this tutorial (such as `FORMAT/PL`). It was also updated to change the `call_AD` field's third dimension to be `alleles`, which was not set by `vcf2zarr` since the dataset we are using defines `FORMAT/AD` as `.` which means \"unknown\", rather than `R`.\n", + "\n", + "For more information about using `vcf2zarr`, see the tutorial in the [bio2zarr documentation](https://sgkit-dev.github.io/bio2zarr/).\n", "\n", "Now the data has been written as Zarr, all downstream operations on will be much faster. Note that sgkit uses an [Xarray](http://xarray.pydata.org/en/stable/) dataset to represent the VCF data, where Hail uses MatrixTable." ] @@ -137,7 +150,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds = sg.load_dataset(\"1kg.zarr\")" + "ds = sg.load_dataset(\"1kg.vcz\")" ] }, { @@ -198,6 +211,7 @@ "}\n", "\n", "html[theme=dark],\n", + "html[data-theme=dark],\n", "body[data-theme=dark],\n", "body.vscode-dark {\n", " --xr-font-color0: rgba(255, 255, 255, 1);\n", @@ -428,6 +442,11 @@ " grid-column: 4;\n", "}\n", "\n", + ".xr-index-preview {\n", + " grid-column: 2 / 5;\n", + " color: var(--xr-font-color2);\n", + "}\n", + "\n", ".xr-var-name,\n", ".xr-var-dims,\n", ".xr-var-dtype,\n", @@ -449,14 +468,16 @@ "}\n", "\n", ".xr-var-attrs,\n", - ".xr-var-data {\n", + ".xr-var-data,\n", + ".xr-index-data {\n", " display: none;\n", " background-color: var(--xr-background-color) !important;\n", " padding-bottom: 5px !important;\n", "}\n", "\n", ".xr-var-attrs-in:checked ~ .xr-var-attrs,\n", - ".xr-var-data-in:checked ~ .xr-var-data {\n", + ".xr-var-data-in:checked ~ .xr-var-data,\n", + ".xr-index-data-in:checked ~ .xr-index-data {\n", " display: block;\n", "}\n", "\n", @@ -466,13 +487,16 @@ "\n", ".xr-var-name span,\n", ".xr-var-data,\n", + ".xr-index-name div,\n", + ".xr-index-data,\n", ".xr-attrs {\n", " padding-left: 25px !important;\n", "}\n", "\n", ".xr-attrs,\n", ".xr-var-attrs,\n", - ".xr-var-data {\n", + ".xr-var-data,\n", + ".xr-index-data {\n", " grid-column: 1 / -1;\n", "}\n", "\n", @@ -510,7 +534,8 @@ "}\n", "\n", ".xr-icon-database,\n", - ".xr-icon-file-text2 {\n", + ".xr-icon-file-text2,\n", + ".xr-no-icon {\n", " display: inline-block;\n", " vertical-align: middle;\n", " width: 1em;\n", @@ -519,26 +544,26 @@ " stroke: currentColor;\n", " fill: currentColor;\n", "}\n", - "
<xarray.Dataset>\n",
+       "
<xarray.Dataset> Size: 28MB\n",
        "Dimensions:               (variants: 10879, samples: 284, alleles: 2,\n",
        "                           ploidy: 2, contigs: 84, filters: 1)\n",
        "Dimensions without coordinates: variants, samples, alleles, ploidy, contigs,\n",
        "                                filters\n",
        "Data variables: (12/17)\n",
-       "    call_AD               (variants, samples, alleles) int32 dask.array<chunksize=(10000, 284, 2), meta=np.ndarray>\n",
-       "    call_DP               (variants, samples) int32 dask.array<chunksize=(10000, 284), meta=np.ndarray>\n",
-       "    call_GQ               (variants, samples) int32 dask.array<chunksize=(10000, 284), meta=np.ndarray>\n",
-       "    call_genotype         (variants, samples, ploidy) int8 dask.array<chunksize=(10000, 284, 2), meta=np.ndarray>\n",
-       "    call_genotype_mask    (variants, samples, ploidy) bool dask.array<chunksize=(10000, 284, 2), meta=np.ndarray>\n",
-       "    call_genotype_phased  (variants, samples) bool dask.array<chunksize=(10000, 284), meta=np.ndarray>\n",
+       "    call_AD               (variants, samples, alleles) int8 6MB dask.array<chunksize=(10000, 284, 2), meta=np.ndarray>\n",
+       "    call_DP               (variants, samples) int8 3MB dask.array<chunksize=(10000, 284), meta=np.ndarray>\n",
+       "    call_GQ               (variants, samples) int8 3MB dask.array<chunksize=(10000, 284), meta=np.ndarray>\n",
+       "    call_genotype         (variants, samples, ploidy) int8 6MB dask.array<chunksize=(10000, 284, 2), meta=np.ndarray>\n",
+       "    call_genotype_mask    (variants, samples, ploidy) bool 6MB dask.array<chunksize=(10000, 284, 2), meta=np.ndarray>\n",
+       "    call_genotype_phased  (variants, samples) bool 3MB dask.array<chunksize=(10000, 284), meta=np.ndarray>\n",
        "    ...                    ...\n",
-       "    variant_contig        (variants) int8 dask.array<chunksize=(10000,), meta=np.ndarray>\n",
-       "    variant_filter        (variants, filters) bool dask.array<chunksize=(10000, 1), meta=np.ndarray>\n",
-       "    variant_id            (variants) object dask.array<chunksize=(10000,), meta=np.ndarray>\n",
-       "    variant_id_mask       (variants) bool dask.array<chunksize=(10000,), meta=np.ndarray>\n",
-       "    variant_position      (variants) int32 dask.array<chunksize=(10000,), meta=np.ndarray>\n",
-       "    variant_quality       (variants) float32 dask.array<chunksize=(10000,), meta=np.ndarray>\n",
-       "Attributes: (7)