From 57934c899da1b9e4582ae5ff1443f5d6d326c69d Mon Sep 17 00:00:00 2001
From: Alex Kotlar
Date: Tue, 19 Nov 2024 14:46:46 -0500
Subject: [PATCH] remove unneeded code

---
 INSTALL.md | 10 -
 Makefile | 7 +-
 config/.gitignore | 2 -
 config/hg19.yml | 518 -
 config/hg38.yml | 718 --
 go/cmd/dosage/main.go | 118 -
 go/decompress/decompress.go | 216 -
 go/decompress/decompress_test.go | 421 -
 go/go.mod | 33 -
 go/go.sum | 139 -
 go/test/opensearch/testdata/input.txt | 4 -
 install-apt.sh | 107 -
 install-rpm.sh | 107 -
 perl/.gitignore | 19 -
 perl/.perlcriticrc | 17 -
 perl/.perltidyrc | 40 -
 perl/.tidyallrc | 15 -
 perl/INSTALL.md | 322 -
 perl/README.md | 1033 --
 perl/bin/bystro-annotate.pl | 41 -
 perl/bin/bystro-build.pl | 220 -
 perl/bin/bystro-utils.pl | 184 -
 perl/bin/read_db_util.pl | 82 -
 perl/cpanfile | 83 -
 perl/dist.ini | 76 -
 ...example_bystro_vcf_preprocessor_output.tsv | 11 -
 perl/example_vcf.tsv | 9 -
 perl/example_vcf_dosage_matrix.feather | Bin 1218 -> 0 bytes
 perl/issues/subtxn_bug.pl | 176 -
 perl/lib/Interface.pm | 278 -
 perl/lib/Seq.pm | 862 --
 perl/lib/Seq/Base.pm | 121 -
 perl/lib/Seq/Build.pm | 207 -
 perl/lib/Seq/DBManager.pm | 1181 --
 perl/lib/Seq/Definition.pm | 306 -
 perl/lib/Seq/Headers.pm | 250 -
 perl/lib/Seq/InputFile.pm | 179 -
 perl/lib/Seq/Output.pm | 255 -
 perl/lib/Seq/Output/Delimiters.pm | 165 -
 perl/lib/Seq/Output/Fields.pm | 24 -
 perl/lib/Seq/Role/ConfigFromFile.pm | 94 -
 perl/lib/Seq/Role/IO.pm | 480 -
 perl/lib/Seq/Role/Message.pm | 341 -
 perl/lib/Seq/Role/Validator.pm | 55 -
 perl/lib/Seq/Statistics.pm | 171 -
 perl/lib/Seq/Tracks.pm | 471 -
 perl/lib/Seq/Tracks/Base.pm | 310 -
 perl/lib/Seq/Tracks/Base/MapFieldNames.pm | 144 -
 perl/lib/Seq/Tracks/Base/MapTrackNames.pm | 110 -
 perl/lib/Seq/Tracks/Base/Types.pm | 155 -
 perl/lib/Seq/Tracks/Build.pm | 611 -
 perl/lib/Seq/Tracks/Build/CompletionMeta.pm | 92 -
 perl/lib/Seq/Tracks/Build/LocalFilesPaths.pm | 31 -
 perl/lib/Seq/Tracks/Cadd.pm | 98 -
 perl/lib/Seq/Tracks/Cadd/Build.pm | 625 -
 perl/lib/Seq/Tracks/Cadd/Order.pm | 40 -
 perl/lib/Seq/Tracks/Gene.pm | 485 -
 perl/lib/Seq/Tracks/Gene/Build.pm | 647 -
 perl/lib/Seq/Tracks/Gene/Build/TX.pm | 663 -
 perl/lib/Seq/Tracks/Gene/Definition.pm | 39 -
 perl/lib/Seq/Tracks/Gene/Site.pm | 249 -
 perl/lib/Seq/Tracks/Gene/Site/CodonMap.pm | 194 -
 perl/lib/Seq/Tracks/Gene/Site/SiteTypeMap.pm | 175 -
 perl/lib/Seq/Tracks/Get.pm | 114 -
 perl/lib/Seq/Tracks/Nearest.pm | 161 -
 perl/lib/Seq/Tracks/Nearest/Build.pm | 864 --
 perl/lib/Seq/Tracks/README.md | 394 -
 perl/lib/Seq/Tracks/Reference.pm | 33 -
 perl/lib/Seq/Tracks/Reference/Build.pm | 192 -
 perl/lib/Seq/Tracks/Reference/MapBases.pm | 32 -
 perl/lib/Seq/Tracks/Region.pm | 13 -
 perl/lib/Seq/Tracks/Region/Build.pm | 15 -
 perl/lib/Seq/Tracks/Region/RegionTrackPath.pm | 18 -
 perl/lib/Seq/Tracks/Score.pm | 48 -
 perl/lib/Seq/Tracks/Score/Build.pm | 214 -
 perl/lib/Seq/Tracks/Score/Build/Round.pm | 23 -
 perl/lib/Seq/Tracks/Sparse.pm | 19 -
 perl/lib/Seq/Tracks/Sparse/Build.pm | 560 -
 perl/lib/Seq/Tracks/Vcf.pm | 107 -
 perl/lib/Seq/Tracks/Vcf/Build.pm | 670 --
 perl/lib/Utils/Base.pm | 242 -
 perl/lib/Utils/CaddToBed.pm | 148 -
 perl/lib/Utils/DbSnp2FormatInfo.pm | 211 -
 perl/lib/Utils/Fetch.pm | 248 -
 perl/lib/Utils/FilterCadd.pm | 331 -
 perl/lib/Utils/LiftOverCadd.pm | 164 -
 perl/lib/Utils/RefGeneXdbnsfp.pm | 278 -
 perl/lib/Utils/RenameTrack.pm | 104 -
 perl/lib/Utils/SortCadd.pm | 209 -
 perl/lib/Utils/SqlWriter.pm | 213 -
 perl/lib/Utils/SqlWriter/Connection.pm | 64 -
 perl/lib/Utils/scripts/cadd_indel_to_vcf.sh | 126 -
 .../Utils/scripts/dbsnp_rename_chrs_25_39.sh | 63 -
 .../scripts/extract_gnomad_an_af_nhomalt.pl | 58 -
 perl/lib/Utils/scripts/split_vcf_by_chr.pl | 100 -
 .../t/test_extract_gnomad_an_af_nhomalt.pl | 88 -
 perl/t/dbmanager-cursor.t | 118 -
 perl/t/dbmanager-del.t | 107 -
 perl/t/definition.t | 73 -
 perl/t/headers.t | 253 -
 perl/t/inputFile.t | 48 -
 perl/t/lib/TestUtils.pm | 118 -
 perl/t/msgpack.t | 30 -
 perl/t/output.t | 200 -
 perl/t/output/delimiters.t | 47 -
 perl/t/role/02-message.t | 127 -
 perl/t/role/1-sparseArrays.t | 22 -
 perl/t/tracks/base/01-convert.t | 76 -
 perl/t/tracks/base/02-convert-pack.t | 46 -
 perl/t/tracks/base/normalizeWantedChr.t | 116 -
 perl/t/tracks/base/types.t | 27 -
 perl/t/tracks/build/02-local-files.t | 47 -
 .../build/build_field_transformations.t | 71 -
 perl/t/tracks/build/coerceFeatureType.t | 175 -
 perl/t/tracks/build/coerceUndefinedValues.t | 125 -
 perl/t/tracks/build/ref_cannot_be_skipped.t | 36 -
 perl/t/tracks/build/ref_cannot_be_skipped.yml | 12 -
 ...22.organized-by-chr.txt.sorted.29lines.txt | 29 -
 .../tracks/cadd/db/ref/chr22_contrived.fastq | 4 -
 perl/t/tracks/cadd/integration.t | 153 -
 perl/t/tracks/cadd/integration.yml | 20 -
 perl/t/tracks/chrWanted.t | 50 -
 perl/t/tracks/gene/construct.t | 21 -
 .../variant_summary.txt.MT.1600_3250.gz | Bin 523 -> 0 bytes
 .../db/raw/ref/chr10_fake_overlap.fasta.gz | Bin 8080 -> 0 bytes
 perl/t/tracks/gene/db/raw/ref/chrM.fa.gz | Bin 5537 -> 0 bytes
 .../db/raw/ref/fakeRef.hg19.60950_70966.txt | 10017 ----------------
 .../2018-09-06.hg19.kgXref.fetch.orig.gz | Bin 504 -> 0 bytes
 .../gene/db/raw/refSeq/hg19.complex.txt | 2 -
 .../gene/db/raw/refSeq/hg19.kgXref.chrM.gz | Bin 211 -> 0 bytes
 .../gene/db/raw/refSeq/hg19.kgXref.fetch.gz | Bin 523 -> 0 bytes
 .../gene/db/raw/refSeq/hg19.refGene.chrM | 2 -
 .../gene/db/raw/trackNames_meta/data.mdb | Bin 20480 -> 0 bytes
 .../gene/db/raw/trackNames_meta/lock.mdb | Bin 8320 -> 0 bytes
 perl/t/tracks/gene/join.t | 126 -
 perl/t/tracks/gene/join.yml | 77 -
 perl/t/tracks/gene/join_no_build.t | 133 -
 perl/t/tracks/gene/join_no_build.yml | 78 -
 perl/t/tracks/gene/ncrna.t | 211 -
 perl/t/tracks/gene/ncrna.yml | 27 -
 perl/t/tracks/gene/overlap.t | 261 -
 perl/t/tracks/gene/overlap.yml | 64 -
 perl/t/tracks/gene/region.t | 79 -
 perl/t/tracks/gene/region.yml | 27 -
 perl/t/tracks/gene/simple.t | 123 -
 perl/t/tracks/gene/simple.yml | 27 -
 perl/t/tracks/gene/site.t | 56 -
 perl/t/tracks/gene/test-prepare-ref.yml | 13 -
 perl/t/tracks/merge.t | 49 -
 .../tracks/nearest/db/hg19/raw/ref/chrM.fa.gz | Bin 5537 -> 0 bytes
 .../db/hg19/raw/refSeq/hg19.refGene.chrM | 8 -
 perl/t/tracks/nearest/integration.t | 540 -
 perl/t/tracks/nearest/test.yml | 94 -
 .../reference/db/raw/refTest/chrM.fa.gz | Bin 5549 -> 0 bytes
 perl/t/tracks/reference/integration.t | 74 -
 perl/t/tracks/reference/integration.yml | 15 -
 perl/t/tracks/score/build/rounder.t | 32 -
 .../phastCons/chrM.phastCons100way.wigFix.gz | Bin 13098 -> 0 bytes
 perl/t/tracks/score/integration.t | 107 -
 perl/t/tracks/score/integration.yml | 21 -
 perl/t/tracks/sparse/build_clinvar.t | 140 -
 perl/t/tracks/sparse/clinvar-test-config.yml | 380 -
 .../sparse/raw/clinvar/clinvar-small.tsv | 16 -
 perl/t/tracks/vcf/README.md | 11 -
 perl/t/tracks/vcf/clinvar.t | 150 -
 perl/t/tracks/vcf/clinvar.yml | 79 -
 perl/t/tracks/vcf/integration.t | 909 --
 .../integration_scrambled_multiple_files.t | 1012 --
 .../clinvar_alleles.single.b38.vcf.1klines.gz | Bin 62902 -> 0 bytes
 .../clinvar_alleles.single.b38.vcf.1lines.gz | Bin 2627 -> 0 bytes
 .../vcf/raw/gnomad.genomes.scrambled/test.vcf | 200 -
 .../test_split_part1.vcf | 198 -
 .../test_split_part2.vcf.gz | Bin 7257 -> 0 bytes
 perl/t/tracks/vcf/raw/gnomad.genomes/test.vcf | 200 -
 perl/t/tracks/vcf/test.hg38.chr22.yml | 101 -
 .../vcf/test.scrambled_multiple_files.yml | 101 -
 perl/t/utils/dbsnp2FormatInfo.t | 131 -
 perl/t/utils/filterCadd.t | 254 -
 .../raw/cadd/test.filterCadd.cadd.chr1.txt.gz | Bin 466 -> 0 bytes
 .../raw/cadd/test.filterCadd.cadd.chr2.txt | 23 -
 .../raw/cadd/test.filterCadd.cadd.chr22.txt | 23 -
 perl/t/utils/scripts/split_by_chr.t | 92 -
 perl/t/utils/scripts/vcf_example.vcf | 9 -
 perl/t/utils/sqlWriter.t | 147 -
 python/python/bystro/ancestry/__init__.py | 10 -
 .../ancestry/adversarial_autoencoder.py | 419 -
 .../ancestry_model_products/.gitignore | 3 -
 .../python/bystro/ancestry/ancestry_types.py | 139 -
 python/python/bystro/ancestry/asserts.py | 37 -
 python/python/bystro/ancestry/data/.gitignore | 4 -
 .../bystro/ancestry/data/kgp_vcfs/.gitignore | 22 -
 .../python/bystro/ancestry/define_callset.py | 154 -
 python/python/bystro/ancestry/gmm_ancestry.py | 436 -
 python/python/bystro/ancestry/inference.py | 382 -
 .../ancestry/intermediate_data/.gitignore | 1 -
 python/python/bystro/ancestry/model.py | 265 -
 .../preprocess_1kgp_using_gnomad_loadings.sh | 75 -
 .../python/bystro/ancestry/preprocess_vcfs.sh | 42 -
 .../python/bystro/ancestry/tests/__init__.py | 1 -
 .../tests/test_adversarial_autoencoder.py | 32 -
 .../ancestry/tests/test_ancestry_gmm.py | 47 -
 .../ancestry/tests/test_ancestry_types.py | 213 -
 .../ancestry/tests/test_define_callset.py | 66 -
 .../bystro/ancestry/tests/test_inference.py | 128 -
 .../bystro/ancestry/tests/test_train.py | 288 -
 .../bystro/ancestry/tests/test_train_utils.py | 18 -
 python/python/bystro/ancestry/train.py | 615 -
 .../bystro/ancestry/train_chip_model.py | 56 -
 .../bystro/ancestry/train_gnomad_model.py | 55 -
 python/python/bystro/ancestry/train_utils.py | 59 -
 python/python/bystro/ancestry/upload_model.py | 49 -
 python/python/bystro/api/ancestry.py | 180 -
 .../api/tests/ancestry_expected_output.tsv | 4 -
 .../bystro/api/tests/ancestry_input.json | 1 -
 .../bystro/api/tests/test_ancestry_api.py | 63 -
 215 files changed, 2 insertions(+), 43412 deletions(-)
 delete mode 100644 config/.gitignore
 delete mode 100644 config/hg19.yml
 delete mode 100644 config/hg38.yml
 delete mode 100644 go/cmd/dosage/main.go
 delete mode 100644 go/decompress/decompress.go
 delete mode 100644 go/decompress/decompress_test.go
 delete mode 100644 go/go.mod
 delete mode 100644 go/go.sum
 delete mode 100644 go/test/opensearch/testdata/input.txt
 delete mode 100755 install-apt.sh
 delete mode 100755 install-rpm.sh
 delete mode 100644 perl/.gitignore
 delete mode 100644 perl/.perlcriticrc
 delete mode 100644 perl/.perltidyrc
 delete mode 100644 perl/.tidyallrc
 delete mode 100644 perl/INSTALL.md
 delete mode 100644 perl/README.md
 delete mode 100755 perl/bin/bystro-annotate.pl
 delete mode 100755 perl/bin/bystro-build.pl
 delete mode 100755 perl/bin/bystro-utils.pl
 delete mode 100644 perl/bin/read_db_util.pl
 delete mode 100644 perl/cpanfile
 delete mode 100644 perl/dist.ini
 delete mode 100644 perl/example_bystro_vcf_preprocessor_output.tsv
 delete mode 100644 perl/example_vcf.tsv
 delete mode 100644 perl/example_vcf_dosage_matrix.feather
 delete mode 100644 perl/issues/subtxn_bug.pl
 delete mode 100644 perl/lib/Interface.pm
 delete mode 100644 perl/lib/Seq.pm
 delete mode 100644 perl/lib/Seq/Base.pm
 delete mode 100644 perl/lib/Seq/Build.pm
 delete mode 100644 perl/lib/Seq/DBManager.pm
 delete mode 100644 perl/lib/Seq/Definition.pm
 delete mode 100644 perl/lib/Seq/Headers.pm
 delete mode 100644 perl/lib/Seq/InputFile.pm
 delete mode 100644 perl/lib/Seq/Output.pm
 delete mode 100644 perl/lib/Seq/Output/Delimiters.pm
 delete mode 100644 perl/lib/Seq/Output/Fields.pm
 delete mode 100644 perl/lib/Seq/Role/ConfigFromFile.pm
 delete mode 100644 perl/lib/Seq/Role/IO.pm
 delete mode 100644 perl/lib/Seq/Role/Message.pm
 delete mode 100644 perl/lib/Seq/Role/Validator.pm
 delete mode 100644 perl/lib/Seq/Statistics.pm
 delete mode 100644 perl/lib/Seq/Tracks.pm
 delete mode 100644 perl/lib/Seq/Tracks/Base.pm
 delete mode 100644 perl/lib/Seq/Tracks/Base/MapFieldNames.pm
 delete mode 100644 perl/lib/Seq/Tracks/Base/MapTrackNames.pm
 delete mode 100644 perl/lib/Seq/Tracks/Base/Types.pm
 delete mode 100644 perl/lib/Seq/Tracks/Build.pm
 delete mode 100644 perl/lib/Seq/Tracks/Build/CompletionMeta.pm
 delete mode 100644 perl/lib/Seq/Tracks/Build/LocalFilesPaths.pm
 delete mode 100644 perl/lib/Seq/Tracks/Cadd.pm
 delete mode 100644 perl/lib/Seq/Tracks/Cadd/Build.pm
 delete mode 100644 perl/lib/Seq/Tracks/Cadd/Order.pm
 delete mode 100644 perl/lib/Seq/Tracks/Gene.pm
 delete mode 100644 perl/lib/Seq/Tracks/Gene/Build.pm
 delete mode 100644 perl/lib/Seq/Tracks/Gene/Build/TX.pm
 delete mode 100644 perl/lib/Seq/Tracks/Gene/Definition.pm
 delete mode 100644 perl/lib/Seq/Tracks/Gene/Site.pm
 delete mode 100644 perl/lib/Seq/Tracks/Gene/Site/CodonMap.pm
 delete mode 100644 perl/lib/Seq/Tracks/Gene/Site/SiteTypeMap.pm
 delete mode 100644 perl/lib/Seq/Tracks/Get.pm
 delete mode 100644 perl/lib/Seq/Tracks/Nearest.pm
 delete mode 100644 perl/lib/Seq/Tracks/Nearest/Build.pm
 delete mode 100644 perl/lib/Seq/Tracks/README.md
 delete mode 100644 perl/lib/Seq/Tracks/Reference.pm
 delete mode 100644 perl/lib/Seq/Tracks/Reference/Build.pm
 delete mode 100644 perl/lib/Seq/Tracks/Reference/MapBases.pm
 delete mode 100644 perl/lib/Seq/Tracks/Region.pm
 delete mode 100644 perl/lib/Seq/Tracks/Region/Build.pm
 delete mode 100644 perl/lib/Seq/Tracks/Region/RegionTrackPath.pm
 delete mode 100644 perl/lib/Seq/Tracks/Score.pm
 delete mode 100644 perl/lib/Seq/Tracks/Score/Build.pm
 delete mode 100644 perl/lib/Seq/Tracks/Score/Build/Round.pm
 delete mode 100644 perl/lib/Seq/Tracks/Sparse.pm
 delete mode 100644 perl/lib/Seq/Tracks/Sparse/Build.pm
 delete mode 100644 perl/lib/Seq/Tracks/Vcf.pm
 delete mode 100644 perl/lib/Seq/Tracks/Vcf/Build.pm
 delete mode 100644 perl/lib/Utils/Base.pm
 delete mode 100644 perl/lib/Utils/CaddToBed.pm
 delete mode 100644 perl/lib/Utils/DbSnp2FormatInfo.pm
 delete mode 100644 perl/lib/Utils/Fetch.pm
 delete mode 100644 perl/lib/Utils/FilterCadd.pm
 delete mode 100644 perl/lib/Utils/LiftOverCadd.pm
 delete mode 100644 perl/lib/Utils/RefGeneXdbnsfp.pm
 delete mode 100644 perl/lib/Utils/RenameTrack.pm
 delete mode 100644 perl/lib/Utils/SortCadd.pm
 delete mode 100644 perl/lib/Utils/SqlWriter.pm
 delete mode 100644 perl/lib/Utils/SqlWriter/Connection.pm
 delete mode 100755 perl/lib/Utils/scripts/cadd_indel_to_vcf.sh
 delete mode 100755 perl/lib/Utils/scripts/dbsnp_rename_chrs_25_39.sh
 delete mode 100644 perl/lib/Utils/scripts/extract_gnomad_an_af_nhomalt.pl
 delete mode 100644 perl/lib/Utils/scripts/split_vcf_by_chr.pl
 delete mode 100644 perl/lib/Utils/scripts/t/test_extract_gnomad_an_af_nhomalt.pl
 delete mode 100644 perl/t/dbmanager-cursor.t
 delete mode 100644 perl/t/dbmanager-del.t
 delete mode 100644 perl/t/definition.t
 delete mode 100644 perl/t/headers.t
 delete mode 100644 perl/t/inputFile.t
 delete mode 100644 perl/t/lib/TestUtils.pm
 delete mode 100644 perl/t/msgpack.t
 delete mode 100644 perl/t/output.t
 delete mode 100644 perl/t/output/delimiters.t
 delete mode 100644 perl/t/role/02-message.t
 delete mode 100644 perl/t/role/1-sparseArrays.t
 delete mode 100644 perl/t/tracks/base/01-convert.t
 delete mode 100644 perl/t/tracks/base/02-convert-pack.t
 delete mode 100644 perl/t/tracks/base/normalizeWantedChr.t
 delete mode 100644 perl/t/tracks/base/types.t
 delete mode 100644 perl/t/tracks/build/02-local-files.t
 delete mode 100644 perl/t/tracks/build/build_field_transformations.t
 delete mode 100644 perl/t/tracks/build/coerceFeatureType.t
 delete mode 100644 perl/t/tracks/build/coerceUndefinedValues.t
 delete mode 100644 perl/t/tracks/build/ref_cannot_be_skipped.t
 delete mode 100644 perl/t/tracks/build/ref_cannot_be_skipped.yml
 delete mode 100644 perl/t/tracks/cadd/db/cadd/whole_genome_SNVs.tsv.chr22.organized-by-chr.txt.sorted.29lines.txt
 delete mode 100644 perl/t/tracks/cadd/db/ref/chr22_contrived.fastq
 delete mode 100644 perl/t/tracks/cadd/integration.t
 delete mode 100644 perl/t/tracks/cadd/integration.yml
 delete mode 100644 perl/t/tracks/chrWanted.t
 delete mode 100644 perl/t/tracks/gene/construct.t
 delete mode 100644 perl/t/tracks/gene/db/raw/clinvar/variant_summary.txt.MT.1600_3250.gz
 delete mode 100644 perl/t/tracks/gene/db/raw/ref/chr10_fake_overlap.fasta.gz
 delete mode 100644 perl/t/tracks/gene/db/raw/ref/chrM.fa.gz
 delete mode 100644 perl/t/tracks/gene/db/raw/ref/fakeRef.hg19.60950_70966.txt
 delete mode 100644 perl/t/tracks/gene/db/raw/refSeq/2018-09-06.hg19.kgXref.fetch.orig.gz
 delete mode 100644 perl/t/tracks/gene/db/raw/refSeq/hg19.complex.txt
 delete mode 100644 perl/t/tracks/gene/db/raw/refSeq/hg19.kgXref.chrM.gz
 delete mode 100644 perl/t/tracks/gene/db/raw/refSeq/hg19.kgXref.fetch.gz
 delete mode 100644 perl/t/tracks/gene/db/raw/refSeq/hg19.refGene.chrM
 delete mode 100644 perl/t/tracks/gene/db/raw/trackNames_meta/data.mdb
 delete mode 100644 perl/t/tracks/gene/db/raw/trackNames_meta/lock.mdb
 delete mode 100644 perl/t/tracks/gene/join.t
 delete mode 100644 perl/t/tracks/gene/join.yml
 delete mode 100644 perl/t/tracks/gene/join_no_build.t
 delete mode 100644 perl/t/tracks/gene/join_no_build.yml
 delete mode 100644 perl/t/tracks/gene/ncrna.t
 delete mode 100644 perl/t/tracks/gene/ncrna.yml
 delete mode 100644 perl/t/tracks/gene/overlap.t
 delete mode 100644 perl/t/tracks/gene/overlap.yml
 delete mode 100644 perl/t/tracks/gene/region.t
 delete mode 100644 perl/t/tracks/gene/region.yml
 delete mode 100644 perl/t/tracks/gene/simple.t
 delete mode 100644 perl/t/tracks/gene/simple.yml
 delete mode 100644 perl/t/tracks/gene/site.t
 delete mode 100644 perl/t/tracks/gene/test-prepare-ref.yml
 delete mode 100644 perl/t/tracks/merge.t
 delete mode 100644 perl/t/tracks/nearest/db/hg19/raw/ref/chrM.fa.gz
 delete mode 100644 perl/t/tracks/nearest/db/hg19/raw/refSeq/hg19.refGene.chrM
 delete mode 100644 perl/t/tracks/nearest/integration.t
 delete mode 100644 perl/t/tracks/nearest/test.yml
 delete mode 100644 perl/t/tracks/reference/db/raw/refTest/chrM.fa.gz
 delete mode 100644 perl/t/tracks/reference/integration.t
 delete mode 100644 perl/t/tracks/reference/integration.yml
 delete mode 100644 perl/t/tracks/score/build/rounder.t
 delete mode 100644 perl/t/tracks/score/db/raw/phastCons/chrM.phastCons100way.wigFix.gz
 delete mode 100644 perl/t/tracks/score/integration.t
 delete mode 100644 perl/t/tracks/score/integration.yml
 delete mode 100644 perl/t/tracks/sparse/build_clinvar.t
 delete mode 100644 perl/t/tracks/sparse/clinvar-test-config.yml
 delete mode 100644 perl/t/tracks/sparse/raw/clinvar/clinvar-small.tsv
 delete mode 100644 perl/t/tracks/vcf/README.md
 delete mode 100644 perl/t/tracks/vcf/clinvar.t
 delete mode 100644 perl/t/tracks/vcf/clinvar.yml
 delete mode 100644 perl/t/tracks/vcf/integration.t
 delete mode 100644 perl/t/tracks/vcf/integration_scrambled_multiple_files.t
 delete mode 100644 perl/t/tracks/vcf/raw/clinvar.match/clinvar_alleles.single.b38.vcf.1klines.gz
 delete mode 100644 perl/t/tracks/vcf/raw/clinvar.match/clinvar_alleles.single.b38.vcf.1lines.gz
 delete mode 100644 perl/t/tracks/vcf/raw/gnomad.genomes.scrambled/test.vcf
 delete mode 100644 perl/t/tracks/vcf/raw/gnomad.genomes.scrambled/test_split_part1.vcf
 delete mode 100644 perl/t/tracks/vcf/raw/gnomad.genomes.scrambled/test_split_part2.vcf.gz
 delete mode 100644 perl/t/tracks/vcf/raw/gnomad.genomes/test.vcf
 delete mode 100644 perl/t/tracks/vcf/test.hg38.chr22.yml
 delete mode 100644 perl/t/tracks/vcf/test.scrambled_multiple_files.yml
 delete mode 100644 perl/t/utils/dbsnp2FormatInfo.t
 delete mode 100644 perl/t/utils/filterCadd.t
 delete mode 100644 perl/t/utils/raw/cadd/test.filterCadd.cadd.chr1.txt.gz
 delete mode 100644 perl/t/utils/raw/cadd/test.filterCadd.cadd.chr2.txt
 delete mode 100644 perl/t/utils/raw/cadd/test.filterCadd.cadd.chr22.txt
 delete mode 100644 perl/t/utils/scripts/split_by_chr.t
 delete mode 100644 perl/t/utils/scripts/vcf_example.vcf
 delete mode 100644 perl/t/utils/sqlWriter.t
 delete mode 100644 python/python/bystro/ancestry/__init__.py
 delete mode 100644 python/python/bystro/ancestry/adversarial_autoencoder.py
 delete mode 100644 python/python/bystro/ancestry/ancestry_model_products/.gitignore
 delete mode 100644 python/python/bystro/ancestry/ancestry_types.py
 delete mode 100644 python/python/bystro/ancestry/asserts.py
 delete mode 100644 python/python/bystro/ancestry/data/.gitignore
 delete mode 100644 python/python/bystro/ancestry/data/kgp_vcfs/.gitignore
 delete mode 100644 python/python/bystro/ancestry/define_callset.py
 delete mode 100644 python/python/bystro/ancestry/gmm_ancestry.py
 delete mode 100644 python/python/bystro/ancestry/inference.py
 delete mode 100644 python/python/bystro/ancestry/intermediate_data/.gitignore
 delete mode 100644 python/python/bystro/ancestry/model.py
 delete mode 100644 python/python/bystro/ancestry/preprocess_1kgp_using_gnomad_loadings.sh
 delete mode 100644 python/python/bystro/ancestry/preprocess_vcfs.sh
 delete mode 100644 python/python/bystro/ancestry/tests/__init__.py
 delete mode 100644 python/python/bystro/ancestry/tests/test_adversarial_autoencoder.py
 delete mode 100644 python/python/bystro/ancestry/tests/test_ancestry_gmm.py
 delete mode 100644 python/python/bystro/ancestry/tests/test_ancestry_types.py
 delete mode 100644 python/python/bystro/ancestry/tests/test_define_callset.py
 delete mode 100644 python/python/bystro/ancestry/tests/test_inference.py
 delete mode 100644 python/python/bystro/ancestry/tests/test_train.py
 delete mode 100644 python/python/bystro/ancestry/tests/test_train_utils.py
 delete mode 100644 python/python/bystro/ancestry/train.py
 delete mode 100644 python/python/bystro/ancestry/train_chip_model.py
 delete mode 100644 python/python/bystro/ancestry/train_gnomad_model.py
 delete mode 100644 python/python/bystro/ancestry/train_utils.py
 delete mode 100644 python/python/bystro/ancestry/upload_model.py
 delete mode 100644 python/python/bystro/api/ancestry.py
 delete mode 100644 python/python/bystro/api/tests/ancestry_expected_output.tsv
 delete mode 100644 python/python/bystro/api/tests/ancestry_input.json
 delete mode 100644 python/python/bystro/api/tests/test_ancestry_api.py

diff --git a/INSTALL.md b/INSTALL.md
index 7f483898d..dfd1fa2d2 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -19,20 +19,10 @@ Bystro is compatible with Linux and MacOS. Windows support is experimental. If y
 brew install cmake
 ```
 
-## Installing the Bystro Annotator
-
-Besides the Bystro ML library, which lives in bystro/python, we also have a Perl library that is used to annotate genetic data, providing necessary information for the Bystro ML library bioinformatics modules.
-
-The Bystro Annotator, which handles processing genetic data (VCF files), performing quality control, feature labeling (annotation) of variants and samples, and generating an annotation output and genotype dosage matrices, is written in Perl.
-
-To install and configure the Bystro Annotator, follow the instructions in [perl/INSTALL.md](perl/INSTALL.md).
-
 ## Setting up the Bystro project for development
 
 If you wish to stand up a local development environment, we recommend using Miniconda to manage Bystro Python dependencies: https://docs.conda.io/projects/miniconda/en/latest/
 
-Once Bystro annotator installation is complete, and assuming Conda/Miniconda has been installed, run :
-
 ```sh
 # Install Rust
 curl https://sh.rustup.rs -sSf | sh -s -- -y
diff --git a/Makefile b/Makefile
index 613358883..5ebb8b351 100644
--- a/Makefile
+++ b/Makefile
@@ -14,13 +14,10 @@ install-python: build-python
 	pip install "$$WHEEL_FILE"; \
 	fi
 
-install-go:
-	go install github.com/bystrogenomics/bystro-vcf@2.2.3
-
-install: install-python install-go
+install: install-python
 
 uninstall:
 	pip uninstall -y bystro
 	binary_path=$(which bystro-vcf 2>/dev/null) && [ -n "$binary_path" ] && rm "$binary_path"
 
-develop: install-go build-python-dev
\ No newline at end of file
+develop: build-python-dev
\ No newline at end of file
diff --git a/config/.gitignore b/config/.gitignore
deleted file mode 100644
index e7b1c25ad..000000000
--- a/config/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# ignore this during local development
-beanstalk.yml
\ No newline at end of file
diff --git a/config/hg19.yml b/config/hg19.yml
deleted file mode 100644
index 8c53c358c..000000000
--- a/config/hg19.yml
+++ /dev/null
@@ -1,518 +0,0 @@
----
-assembly: hg19
-build_author: alexkotlar
-build_date: 2024-05-06T12:19:00
-chromosomes:
-  - chr1
-  - chr2
-  - chr3
-  - chr4
-  - chr5
-  - chr6
-  - chr7
-  - chr8
-  - chr9
-  - chr10
-  - chr11
-  - chr12
-  - chr13
-  - chr14
-  - chr15
-  - chr16
-  - chr17
-  - chr18
-  - chr19
-  - chr20
-  - chr21
-  - chr22
-  - chrM
-  - chrX
-  - chrY
-database_dir: /mnt/annotator/hg19_v10
-fileProcessors:
-  snp:
-    args: --emptyField NA --minGq .95
-    program: bystro-snp
-  vcf:
-    args: --emptyField NA --sample %sampleList% --keepPos --keepId --dosageOutput %dosageMatrixOutPath%
-    program: bystro-vcf
-files_dir: /mnt/files1/bystro_annotator/raw_files/hg19
-statistics:
-  dbSNPnameField: dbSNP.id
-  exonicAlleleFunctionField: refSeq.exonicAlleleFunction
-  outputExtensions:
-    json: .statistics.json
-    qc: .statistics.qc.tsv
-    tab: .statistics.tsv
-  programPath: bystro-stats
-  refTrackField: ref
-  siteTypeField: refSeq.siteType
-temp_dir: ~
-tracks:
-  outputOrder:
-    - ref
-    - refSeq
-    - nearest.refSeq
-    - nearestTss.refSeq
-    - clinvarVcf
-    - gnomad.exomes
-    - gnomad.genomes
-    - dbSNP
-    - cadd
-    - caddIndel
-  tracks:
-    - build_author: alexkotlar
-      build_date: 2024-05-06T12:19:00
-      local_files:
-        - chr1.fa.gz
-        - chr2.fa.gz
-        - chr3.fa.gz
-        - chr4.fa.gz
-        - chr5.fa.gz
-        - chr6.fa.gz
-        - chr7.fa.gz
-        - chr8.fa.gz
-        - chr9.fa.gz
-        - chr10.fa.gz
-        - chr11.fa.gz
-        - chr12.fa.gz
-        - chr13.fa.gz
- - chr14.fa.gz - - chr15.fa.gz - - chr16.fa.gz - - chr17.fa.gz - - chr18.fa.gz - - chr19.fa.gz - - chr20.fa.gz - - chr21.fa.gz - - chr22.fa.gz - - chrM.fa.gz - - chrX.fa.gz - - chrY.fa.gz - name: ref - type: reference - utils: - - args: - remoteDir: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/chromosomes/ - remoteFiles: - - chr1.fa.gz - - chr2.fa.gz - - chr3.fa.gz - - chr4.fa.gz - - chr5.fa.gz - - chr6.fa.gz - - chr7.fa.gz - - chr8.fa.gz - - chr9.fa.gz - - chr10.fa.gz - - chr11.fa.gz - - chr12.fa.gz - - chr13.fa.gz - - chr14.fa.gz - - chr15.fa.gz - - chr16.fa.gz - - chr17.fa.gz - - chr18.fa.gz - - chr19.fa.gz - - chr20.fa.gz - - chr21.fa.gz - - chr22.fa.gz - - chrM.fa.gz - - chrX.fa.gz - - chrY.fa.gz - completed: 2023-11-09T20:16:00 - name: fetch - version: 36 - - build_author: alexkotlar - build_date: 2024-05-06T12:19:00 - build_field_transformations: - description: split [;] - ensemblID: split [;] - kgID: split [;] - mRNA: split [;] - protAcc: split [;] - rfamAcc: split [;] - spDisplayID: split [;] - spID: split [;] - tRnaName: split [;] - features: - - name - - name2 - - description - - kgID - - mRNA - - spID - - spDisplayID - - protAcc - - rfamAcc - - tRnaName - - ensemblID - - isCanonical - local_files: - - hg19.kgXref.chr1.gz - - hg19.kgXref.chr2.gz - - hg19.kgXref.chr3.gz - - hg19.kgXref.chr4.gz - - hg19.kgXref.chr5.gz - - hg19.kgXref.chr6.gz - - hg19.kgXref.chr7.gz - - hg19.kgXref.chr8.gz - - hg19.kgXref.chr9.gz - - hg19.kgXref.chr10.gz - - hg19.kgXref.chr11.gz - - hg19.kgXref.chr12.gz - - hg19.kgXref.chr13.gz - - hg19.kgXref.chr14.gz - - hg19.kgXref.chr15.gz - - hg19.kgXref.chr16.gz - - hg19.kgXref.chr17.gz - - hg19.kgXref.chr18.gz - - hg19.kgXref.chr19.gz - - hg19.kgXref.chr20.gz - - hg19.kgXref.chr21.gz - - hg19.kgXref.chr22.gz - - hg19.kgXref.chrM.gz - - hg19.kgXref.chrX.gz - - hg19.kgXref.chrY.gz - name: refSeq - type: gene - utils: - - args: - connection: - database: hg19 - sql: | - SELECT - r.*, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.kgID, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS kgID, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.description, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS description, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(e.value, '') SEPARATOR ';') FROM knownToEnsembl e JOIN kgXref x ON x.kgID=e.name WHERE x.refseq=r.name) AS ensemblID, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.tRnaName, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS tRnaName, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.spID, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS spID, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.spDisplayID, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS spDisplayID, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(rl.protAcc, '') SEPARATOR ';') FROM hgFixed.refLink rl WHERE rl.mrnaAcc=r.name) AS protAcc, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.mRNA, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS mRNA, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.rfamAcc, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS rfamAcc, - COALESCE((SELECT MAX(CASE WHEN k.transcript IS NOT NULL THEN 'true' ELSE 'false' END) FROM knownCanonical k WHERE k.transcript IN (SELECT kgID FROM kgXref x WHERE x.refseq=r.name)), 'false') AS isCanonical - FROM - refGene r - WHERE - chrom=%chromosomes%; - completed: 2024-05-06T11:47:00 - name: fetch - version: 5 - - build_author: alexkotlar - build_date: 2024-05-06T12:19:00 - build_row_filters: - AS_FilterStatus: == PASS - features: - - alt - - id - - AN: number - - AF: number - - 
AN_female: number - - AF_female: number - - non_cancer_AN: number - - non_cancer_AF: number - - non_neuro_AN: number - - non_neuro_AF: number - - non_topmed_AN: number - - non_topmed_AF: number - - controls_AN: number - - controls_AF: number - - AN_nfe_seu: number - - AF_nfe_seu: number - - AN_nfe_bgr: number - - AF_nfe_bgr: number - - AN_afr: number - - AF_afr: number - - AN_sas: number - - AF_sas: number - - AN_nfe_onf: number - - AF_nfe_onf: number - - AN_amr: number - - AF_amr: number - - AN_eas: number - - AF_eas: number - - AN_nfe_swe: number - - AF_nfe_swe: number - - AN_nfe_nwe: number - - AF_nfe_nwe: number - - AN_eas_jpn: number - - AF_eas_jpn: number - - AN_eas_kor: number - - AF_eas_kor: number - local_files: - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.1.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.2.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.3.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.4.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.5.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.6.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.7.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.8.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.9.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.10.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.11.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.12.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.13.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.14.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.15.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.16.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.17.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.18.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.19.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.20.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.21.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.22.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.X.vcf.bgz - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/exomes/gnomad.exomes.r2.1.1.sites.Y.vcf.bgz - name: gnomad.exomes - type: vcf - version: 5 - - build_author: alexkotlar - build_date: 2024-05-06T12:19:00 - local_files: - - whole_genome_SNVs.tsv.chr1.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr10.organized-by-chr.txt.sorted.txt.gz - - 
whole_genome_SNVs.tsv.chr11.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr12.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr13.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr14.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr15.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr16.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr17.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr18.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr19.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr2.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr20.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr21.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr22.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr3.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr4.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr5.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr6.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr7.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr8.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr9.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chrX.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chrY.organized-by-chr.txt.sorted.txt.gz - name: cadd - sorted: 1 - type: cadd - version: 7 - - build_author: alexkotlar - build_date: 2024-05-06T12:19:00 - dist: true - features: - - name2 - - name - from: txStart - name: nearest.refSeq - ref: refSeq - to: txEnd - type: nearest - version: 5 - - build_author: alexkotlar - build_date: 2024-05-06T12:19:00 - dist: true - features: - - name2 - - name - from: txStart - name: nearestTss.refSeq - ref: refSeq - type: nearest - version: 5 - - build_author: alexkotlar - build_date: 2024-05-06T12:19:00 - features: - - alt - - id - - AN: number - - AF: number - - AN_female: number - - AF_female: number - - non_neuro_AN: number - - non_neuro_AF: number - - non_topmed_AN: number - - non_topmed_AF: number - - controls_AN: number - - controls_AF: number - - AN_nfe_seu: number - - AF_nfe_seu: number - - AN_afr: number - - AF_afr: number - - AN_nfe_onf: number - - AF_nfe_onf: number - - AN_amr: number - - AF_amr: number - - AN_eas: number - - AF_eas: number - - AN_nfe_nwe: number - - AF_nfe_nwe: number - - AN_nfe_est: number - - AF_nfe_est: number - - AN_nfe: number - - AF_nfe: number - - AN_fin: number - - AF_fin: number - - AN_asj: number - - AF_asj: number - - AN_oth: number - - AF_oth: number - local_files: - - /mnt/files1/bystro_annotator/raw_files/hg19/gnomad2/vcf/genomes/gnomad.genomes.r2.1.1.sites.*.vcf.bgz - name: gnomad.genomes - type: vcf - version: 5 - - build_author: alexkotlar - build_date: 2024-05-06T12:19:00 - features: - - id - - alt - - TOMMO: number - - ExAC: number - - GnomAD: number - - Korea1K: number - - GoNL: number - - KOREAN: number - - TWINSUK: number - - Vietnamese: number - - GENOME_DK: number - - GoESP: number - - GnomAD_exomes: number - - Siberian: number - - PRJEB37584: number - - SGDP_PRJ: number - - 1000Genomes: number - - dbGaP_PopFreq: number - - NorthernSweden: number - - HapMap: number - - TOPMED: number - - ALSPAC: number - - Qatari: number - - MGP: number - local_files: - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.1_processed.vcf.gz - - 
/mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.2_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.3_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.4_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.5_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.6_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.7_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.8_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.9_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.10_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.11_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.12_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.13_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.14_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.15_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.16_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.17_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.18_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.19_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.20_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.21_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.22_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.X_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.Y_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg19/dbSNP/GRCh37.dbSNP155.vcf.MT_processed.vcf.gz - name: dbSNP - type: vcf - utils: - - completed: 2023-11-09T20:19:00 - name: DbSnp2FormatInfo - version: 5 - - build_author: alexkotlar - build_date: 2024-05-06T12:19:00 - build_field_transformations: - CLNDISDB: split [|] - CLNDN: split [|] - CLNSIGCONF: split [|] - CLNSIGINC: split [|] - features: - - id - - alt - - AF_ESP: number - - AF_EXAC: number - - AF_TGP: number - - ALLELEID: number - - CLNDN - - CLNDNINCL - - CLNHGVS - - CLNREVSTAT - - CLNSIG - - CLNSIGCONF - - CLNVCSO - - DBVARID - - ORIGIN - - SSR - - RS - local_files: - - clinvar.vcf.gz - name: clinvarVcf - type: vcf - utils: - - args: - remoteFiles: - - https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz - completed: 2024-05-02T01:02:00 - name: fetch - version: 5 - - build_author: alexkotlar - build_date: 2024-05-06T12:19:00 - features: - - alt - - PHRED: number - local_files: - - /mnt/files1/bystro_annotator/raw_files/hg19/caddIndel/Indels.vcf.gz - name: caddIndel - type: vcf - version: 5 - - based: 1 - build_field_transformations: - chrom: chr . 
- clinicalSignificance: split [;] - origin: split [;] - phenotypeList: split [;] - reviewStatus: split [;] - type: split [;] - build_row_filters: - Assembly: == GRCh37 - features: - - alleleID: number - - phenotypeList - - clinicalSignificance - - type - - origin - - numberSubmitters: number - - reviewStatus - - referenceAllele - - alternateAllele - fieldMap: - "#AlleleID": alleleID - AlternateAllele: alternateAllele - Chromosome: chrom - ClinicalSignificance: clinicalSignificance - NumberSubmitters: numberSubmitters - Origin: origin - PhenotypeIDS: phenotypeIDs - PhenotypeList: phenotypeList - ReferenceAllele: referenceAllele - ReviewStatus: reviewStatus - Start: chromStart - Stop: chromEnd - Type: type - local_files: - - variant_summary.txt.gz - name: clinvar - no_build: true - type: sparse - utils: - - args: - remoteFiles: - - ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - completed: 2024-03-07T12:51:00 - name: fetch -version: 5 diff --git a/config/hg38.yml b/config/hg38.yml deleted file mode 100644 index eae6108a7..000000000 --- a/config/hg38.yml +++ /dev/null @@ -1,718 +0,0 @@ ---- -assembly: hg38 -build_author: alexkotlar -build_date: 2024-08-12T11:02:00 -chromosomes: - - chr1 - - chr2 - - chr3 - - chr4 - - chr5 - - chr6 - - chr7 - - chr8 - - chr9 - - chr10 - - chr11 - - chr12 - - chr13 - - chr14 - - chr15 - - chr16 - - chr17 - - chr18 - - chr19 - - chr20 - - chr21 - - chr22 - - chrM - - chrX - - chrY -database_dir: /mnt/annotator/hg38_v11 -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --sample %sampleList% --keepPos --keepId --dosageOutput %dosageMatrixOutPath% - program: bystro-vcf -files_dir: /mnt/files1/bystro_annotator/raw_files/hg38 -statistics: - dbSNPnameField: dbSNP.name - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tsv - tab: .statistics.tsv - programPath: bystro-stats - refTrackField: ref - siteTypeField: refSeq.siteType -temp_dir: ~ -tracks: - outputOrder: - - ref - - refSeq - - nearest.refSeq - - nearestTss.refSeq - - gnomad.joint - - gnomad.exomes - - gnomad.genomes - - dbSNP - - cadd - - caddIndel - - clinvarVcf - - logofunc - - genebass - tracks: - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - local_files: - - chr1.fa.gz - - chr2.fa.gz - - chr3.fa.gz - - chr4.fa.gz - - chr5.fa.gz - - chr6.fa.gz - - chr7.fa.gz - - chr8.fa.gz - - chr9.fa.gz - - chr10.fa.gz - - chr11.fa.gz - - chr12.fa.gz - - chr13.fa.gz - - chr14.fa.gz - - chr15.fa.gz - - chr16.fa.gz - - chr17.fa.gz - - chr18.fa.gz - - chr19.fa.gz - - chr20.fa.gz - - chr21.fa.gz - - chr22.fa.gz - - chrX.fa.gz - - chrY.fa.gz - - chrM.fa.gz - name: ref - type: reference - utils: - - args: - remoteDir: http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/ - remoteFiles: - - chr1.fa.gz - - chr2.fa.gz - - chr3.fa.gz - - chr4.fa.gz - - chr5.fa.gz - - chr6.fa.gz - - chr7.fa.gz - - chr8.fa.gz - - chr9.fa.gz - - chr10.fa.gz - - chr11.fa.gz - - chr12.fa.gz - - chr13.fa.gz - - chr14.fa.gz - - chr15.fa.gz - - chr16.fa.gz - - chr17.fa.gz - - chr18.fa.gz - - chr19.fa.gz - - chr20.fa.gz - - chr21.fa.gz - - chr22.fa.gz - - chrX.fa.gz - - chrY.fa.gz - - chrM.fa.gz - completed: 2024-03-11T12:19:00 - name: fetch - version: 38 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - build_field_transformations: - description: split [;] - ensemblID: split [;] - kgID: split [;] - mRNA: split [;] - protAcc: split [;] - rfamAcc: split [;] - 
spDisplayID: split [;] - spID: split [;] - tRnaName: split [;] - features: - - name - - name2 - - description - - kgID - - mRNA - - spID - - spDisplayID - - protAcc - - rfamAcc - - tRnaName - - ensemblID - - isCanonical - local_files: - - hg38.kgXref.chr1.gz - - hg38.kgXref.chr2.gz - - hg38.kgXref.chr3.gz - - hg38.kgXref.chr4.gz - - hg38.kgXref.chr5.gz - - hg38.kgXref.chr6.gz - - hg38.kgXref.chr7.gz - - hg38.kgXref.chr8.gz - - hg38.kgXref.chr9.gz - - hg38.kgXref.chr10.gz - - hg38.kgXref.chr11.gz - - hg38.kgXref.chr12.gz - - hg38.kgXref.chr13.gz - - hg38.kgXref.chr14.gz - - hg38.kgXref.chr15.gz - - hg38.kgXref.chr16.gz - - hg38.kgXref.chr17.gz - - hg38.kgXref.chr18.gz - - hg38.kgXref.chr19.gz - - hg38.kgXref.chr20.gz - - hg38.kgXref.chr21.gz - - hg38.kgXref.chr22.gz - - hg38.kgXref.chrM.gz - - hg38.kgXref.chrX.gz - - hg38.kgXref.chrY.gz - name: refSeq - type: gene - utils: - - args: - connection: - database: hg38 - sql: | - SELECT - r.*, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.kgID, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS kgID, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.description, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS description, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(e.value, '') SEPARATOR ';') FROM knownToEnsembl e JOIN kgXref x ON x.kgID=e.name WHERE x.refseq=r.name) AS ensemblID, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.tRnaName, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS tRnaName, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.spID, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS spID, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.spDisplayID, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS spDisplayID, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(rl.protAcc, '') SEPARATOR ';') FROM hgFixed.refLink rl WHERE rl.mrnaAcc=r.name) AS protAcc, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.mRNA, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS mRNA, - (SELECT GROUP_CONCAT(DISTINCT NULLIF(x.rfamAcc, '') SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS rfamAcc, - COALESCE((SELECT MAX(CASE WHEN k.transcript IS NOT NULL THEN 'true' ELSE 'false' END) FROM knownCanonical k WHERE k.transcript IN (SELECT kgID FROM kgXref x WHERE x.refseq=r.name)), 'false') AS isCanonical - FROM - refGene r - WHERE - chrom=%chromosomes%; - completed: 2024-05-05T21:56:00 - name: fetch - version: 38 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - features: - - alt - - id - - spliceai_ds_max: number - - pangolin_largest_ds: number - - phylop: number - - sift_max: number - - polyphen_max: number - - AN: number - - AF: number - - AF_XX: number - - AN_XX: number - - AF_XY: number - - AN_XY: number - - AF_afr: number - - AN_afr: number - - AF_amr: number - - AN_amr: number - - AF_asj: number - - AN_asj: number - - AF_eas: number - - AN_eas: number - - AF_fin: number - - AN_fin: number - - AF_mid: number - - AN_mid: number - - AF_nfe: number - - AN_nfe: number - - AF_non_ukb: number - - AN_non_ukb: number - - AF_non_ukb_afr: number - - AN_non_ukb_afr: number - - AF_non_ukb_amr: number - - AN_non_ukb_amr: number - - AF_non_ukb_asj: number - - AN_non_ukb_asj: number - - AF_non_ukb_eas: number - - AN_non_ukb_eas: number - - AF_non_ukb_fin: number - - AN_non_ukb_fin: number - - AF_non_ukb_mid: number - - AN_non_ukb_mid: number - - AF_non_ukb_nfe: number - - AN_non_ukb_nfe: number - - AF_non_ukb_remaining: number - - AN_non_ukb_remaining: number - - AF_non_ukb_sas: number - - AN_non_ukb_sas: number - - AF_remaining: number - - AN_remaining: 
number - - AF_sas: number - - AN_sas: number - - AF_grpmax: number - - AN_grpmax: number - - AF_grpmax_non_ukb: number - - AN_grpmax_non_ukb: number - local_files: - - gnomad.exomes.v4.1.sites.chr1.vcf.bgz - - gnomad.exomes.v4.1.sites.chr2.vcf.bgz - - gnomad.exomes.v4.1.sites.chr3.vcf.bgz - - gnomad.exomes.v4.1.sites.chr4.vcf.bgz - - gnomad.exomes.v4.1.sites.chr5.vcf.bgz - - gnomad.exomes.v4.1.sites.chr6.vcf.bgz - - gnomad.exomes.v4.1.sites.chr7.vcf.bgz - - gnomad.exomes.v4.1.sites.chr8.vcf.bgz - - gnomad.exomes.v4.1.sites.chr9.vcf.bgz - - gnomad.exomes.v4.1.sites.chr10.vcf.bgz - - gnomad.exomes.v4.1.sites.chr11.vcf.bgz - - gnomad.exomes.v4.1.sites.chr12.vcf.bgz - - gnomad.exomes.v4.1.sites.chr13.vcf.bgz - - gnomad.exomes.v4.1.sites.chr14.vcf.bgz - - gnomad.exomes.v4.1.sites.chr15.vcf.bgz - - gnomad.exomes.v4.1.sites.chr16.vcf.bgz - - gnomad.exomes.v4.1.sites.chr17.vcf.bgz - - gnomad.exomes.v4.1.sites.chr18.vcf.bgz - - gnomad.exomes.v4.1.sites.chr19.vcf.bgz - - gnomad.exomes.v4.1.sites.chr20.vcf.bgz - - gnomad.exomes.v4.1.sites.chr21.vcf.bgz - - gnomad.exomes.v4.1.sites.chr22.vcf.bgz - - gnomad.exomes.v4.1.sites.chrX.vcf.bgz - - gnomad.exomes.v4.1.sites.chrY.vcf.bgz - name: gnomad.exomes - type: vcf - utils: - - args: - remoteFiles: - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr1.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr2.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr3.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr4.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr5.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr6.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr7.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr8.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr9.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr10.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr11.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr12.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr13.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr14.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr15.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr16.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr17.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr18.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr19.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr20.vcf.bgz - - 
https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr22.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chrX.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chrY.vcf.bgz - completed: 2024-08-12T01:37:00 - name: fetch - version: 33 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - local_files: - - whole_genome_SNVs.tsv.chr1.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr10.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr11.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr12.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr13.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr14.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr15.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr16.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr17.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr18.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr19.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr2.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr20.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr21.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr22.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr3.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr4.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr5.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr6.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr7.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr8.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr9.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chrX.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chrY.organized-by-chr.txt.sorted.txt.gz - name: cadd - sorted: 1 - type: cadd - utils: - - completed: 2023-09-09T11:18:00 - name: SortCadd - version: 8 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - features: - - alt - - PHRED: number - local_files: - - /mnt/files1/bystro_annotator/raw_files/hg38/caddIndel/gnomad.genomes.r4.0.indel.vcf.gz - name: caddIndel - type: vcf - version: 10 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - dist: true - features: - - name2 - - name - from: txStart - name: nearest.refSeq - ref: refSeq - to: txEnd - type: nearest - version: 13 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - dist: true - features: - - name2 - - name - from: txStart - name: nearestTss.refSeq - ref: refSeq - type: nearest - version: 8 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - features: - - alt - - id - - AF_exomes: number - - AN_exomes: number - - AF_genomes: number - - AN_genomes: number - - AF_joint: number - - AN_joint: number - - AF_joint_XX: number - - AN_joint_XX: number - - AF_joint_XY: number - - AN_joint_XY: number - - AF_joint_afr: number - - AN_joint_afr: number - - AF_joint_ami: number - - AN_joint_ami: number - - AF_joint_amr: number - - AN_joint_amr: number - - AF_joint_asj: number - - AN_joint_asj: number - - AF_joint_eas: number - - AN_joint_eas: number - - AF_joint_fin: number - - 
AN_joint_fin: number - - AF_joint_mid: number - - AN_joint_mid: number - - AF_joint_nfe: number - - AN_joint_nfe: number - - AF_joint_raw: number - - AN_joint_raw: number - - AF_joint_remaining: number - - AN_joint_remaining: number - - AF_joint_sas: number - - AN_joint_sas: number - - AF_grpmax_joint: number - - AN_grpmax_joint: number - local_files: - - gnomad.joint.v4.1.sites.chr1.vcf.bgz - - gnomad.joint.v4.1.sites.chr2.vcf.bgz - - gnomad.joint.v4.1.sites.chr3.vcf.bgz - - gnomad.joint.v4.1.sites.chr4.vcf.bgz - - gnomad.joint.v4.1.sites.chr5.vcf.bgz - - gnomad.joint.v4.1.sites.chr6.vcf.bgz - - gnomad.joint.v4.1.sites.chr7.vcf.bgz - - gnomad.joint.v4.1.sites.chr8.vcf.bgz - - gnomad.joint.v4.1.sites.chr9.vcf.bgz - - gnomad.joint.v4.1.sites.chr10.vcf.bgz - - gnomad.joint.v4.1.sites.chr11.vcf.bgz - - gnomad.joint.v4.1.sites.chr12.vcf.bgz - - gnomad.joint.v4.1.sites.chr13.vcf.bgz - - gnomad.joint.v4.1.sites.chr14.vcf.bgz - - gnomad.joint.v4.1.sites.chr15.vcf.bgz - - gnomad.joint.v4.1.sites.chr16.vcf.bgz - - gnomad.joint.v4.1.sites.chr17.vcf.bgz - - gnomad.joint.v4.1.sites.chr18.vcf.bgz - - gnomad.joint.v4.1.sites.chr19.vcf.bgz - - gnomad.joint.v4.1.sites.chr20.vcf.bgz - - gnomad.joint.v4.1.sites.chr21.vcf.bgz - - gnomad.joint.v4.1.sites.chr22.vcf.bgz - - gnomad.joint.v4.1.sites.chrX.vcf.bgz - - gnomad.joint.v4.1.sites.chrY.vcf.bgz - name: gnomad.joint - type: vcf - utils: - - args: - remoteFiles: - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr1.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr2.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr3.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr4.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr5.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr6.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr7.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr8.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr9.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr10.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr11.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr12.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr13.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr14.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr15.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr16.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr17.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr18.vcf.bgz - - 
https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr19.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr20.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr21.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr22.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chrX.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chrY.vcf.bgz - completed: 2024-08-12T02:43:00 - name: fetch - version: 31 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - features: - - alt - - id - - spliceai_ds_max: number - - pangolin_largest_ds: number - - phylop: number - - sift_max: number - - polyphen_max: number - - AN: number - - AF: number - - AF_XX: number - - AN_XX: number - - AF_XY: number - - AN_XY: number - - AF_afr: number - - AN_afr: number - - AF_ami: number - - AN_ami: number - - AF_amr: number - - AN_amr: number - - AF_asj: number - - AN_asj: number - - AF_eas: number - - AN_eas: number - - AF_fin: number - - AN_fin: number - - AF_mid: number - - AN_mid: number - - AF_nfe: number - - AN_nfe: number - - AF_remaining: number - - AN_remaining: number - - AF_sas: number - - AN_sas: number - - AF_grpmax: number - - AN_grpmax: number - local_files: - - gnomad.genomes.v4.1.sites.chr1.vcf.bgz - - gnomad.genomes.v4.1.sites.chr2.vcf.bgz - - gnomad.genomes.v4.1.sites.chr3.vcf.bgz - - gnomad.genomes.v4.1.sites.chr4.vcf.bgz - - gnomad.genomes.v4.1.sites.chr5.vcf.bgz - - gnomad.genomes.v4.1.sites.chr6.vcf.bgz - - gnomad.genomes.v4.1.sites.chr7.vcf.bgz - - gnomad.genomes.v4.1.sites.chr8.vcf.bgz - - gnomad.genomes.v4.1.sites.chr9.vcf.bgz - - gnomad.genomes.v4.1.sites.chr10.vcf.bgz - - gnomad.genomes.v4.1.sites.chr11.vcf.bgz - - gnomad.genomes.v4.1.sites.chr12.vcf.bgz - - gnomad.genomes.v4.1.sites.chr13.vcf.bgz - - gnomad.genomes.v4.1.sites.chr14.vcf.bgz - - gnomad.genomes.v4.1.sites.chr15.vcf.bgz - - gnomad.genomes.v4.1.sites.chr16.vcf.bgz - - gnomad.genomes.v4.1.sites.chr17.vcf.bgz - - gnomad.genomes.v4.1.sites.chr18.vcf.bgz - - gnomad.genomes.v4.1.sites.chr19.vcf.bgz - - gnomad.genomes.v4.1.sites.chr20.vcf.bgz - - gnomad.genomes.v4.1.sites.chr21.vcf.bgz - - gnomad.genomes.v4.1.sites.chr22.vcf.bgz - - gnomad.genomes.v4.1.sites.chrX.vcf.bgz - - gnomad.genomes.v4.1.sites.chrY.vcf.bgz - name: gnomad.genomes - type: vcf - utils: - - args: - remoteFiles: - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr1.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr2.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr3.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr4.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr5.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr6.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr7.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr8.vcf.bgz - - 
https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr9.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr10.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr11.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr12.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr13.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr14.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr15.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr16.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr17.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr18.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr19.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr20.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr21.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr22.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chrX.vcf.bgz - - https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chrY.vcf.bgz - completed: 2024-08-11T22:41:00 - name: fetch - version: 31 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - features: - - id - - alt - - TOMMO: number - - ExAC: number - - GnomAD: number - - Korea1K: number - - GoNL: number - - KOREAN: number - - TWINSUK: number - - Vietnamese: number - - GENOME_DK: number - - GoESP: number - - GnomAD_exomes: number - - Siberian: number - - PRJEB37584: number - - SGDP_PRJ: number - - 1000Genomes: number - - dbGaP_PopFreq: number - - NorthernSweden: number - - HapMap: number - - TOPMED: number - - ALSPAC: number - - Qatari: number - - MGP: number - local_files: - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.1_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.2_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.3_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.4_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.5_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.6_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.7_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.8_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.9_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.10_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.11_processed.vcf.gz - - 
/mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.12_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.13_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.14_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.15_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.16_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.17_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.18_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.19_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.20_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.21_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.22_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.X_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.Y_processed.vcf.gz - - /mnt/files1/bystro_annotator/raw_files/hg38/dbSNP/GRCh38.dbSNP155.vcf.MT_processed.vcf.gz - name: dbSNP - type: vcf - utils: - - completed: 2024-03-10T16:30:00 - name: DbSnp2FormatInfo - version: 10 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - features: - - id - - alt - - prediction - - neutral: number - - gof: number - - lof: number - fieldMap: - LoGoFunc_GOF: gof - LoGoFunc_LOF: lof - LoGoFunc_neutral: neutral - local_files: - - LoGoFuncVotingEnsemble_preds_final.vcf.gz - name: logofunc - type: vcf - utils: - - args: - remoteFiles: - - https://bystro-db.s3.amazonaws.com/src/LoGoFuncVotingEnsemble/LoGoFuncVotingEnsemble_preds_final.vcf.gz - completed: 2024-07-29T10:52:00 - name: fetch - version: 3 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - build_field_transformations: - CLNDN: split [|] - CLNSIGCONF: split [|] - CLNSIGINC: split [|] - features: - - id - - alt - - AF_ESP: number - - AF_EXAC: number - - AF_TGP: number - - ALLELEID: number - - CLNDN - - CLNDNINCL - - CLNHGVS - - CLNREVSTAT - - CLNSIG - - CLNSIGCONF - - CLNVCSO - - DBVARID - - ORIGIN - - SSR - - RS - local_files: - - clinvar.vcf.gz - name: clinvarVcf - type: vcf - utils: - - args: - remoteFiles: - - https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz - completed: 2024-08-12T08:44:00 - name: fetch - version: 6 - - build_author: alexkotlar - build_date: 2024-08-12T11:02:00 - build_field_transformations: - description: split [|] - phenocode: split [|] - features: - - id - - alt - - phenocode - - description - local_files: - - processed_genebass_significant.vcf.gz - name: genebass - type: vcf - utils: - - args: - remoteFiles: - - https://bystro-db.s3.amazonaws.com/src/genebass/processed_genebass_significant.vcf.gz - completed: 2024-07-30T21:44:00 - name: fetch - version: 7 -version: 235 diff --git a/go/cmd/dosage/main.go b/go/cmd/dosage/main.go deleted file mode 100644 index 6df1b6f90..000000000 --- a/go/cmd/dosage/main.go +++ /dev/null @@ -1,118 +0,0 @@ -package main - -import ( - "errors" - "flag" - "fmt" - "io" - "log" - "os" - - "github.com/apache/arrow/go/v14/arrow/arrio" - "github.com/apache/arrow/go/v14/arrow/ipc" - "github.com/apache/arrow/go/v14/arrow/memory" -) - -func main() { - // Accept an output arg that is a string, and a variadic number of input files - var ( - 
outputArg string - inputArgs []string - ) - - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "dosage --output \n") - } - - flag.StringVar(&outputArg, "output", "", "output arg that is a string") - flag.Parse() - - inputArgs = flag.Args() - - err := processFiles(outputArg, inputArgs) - if err != nil { - log.Fatal(err) - } -} - -func processFile(ipcFileWriter *ipc.FileWriter, filePath string, pool memory.Allocator) error { - file, err := os.Open(filePath) - if err != nil { - return fmt.Errorf("could not open file %s: %w", filePath, err) - } - defer file.Close() - - r, err := ipc.NewFileReader(file, ipc.WithAllocator(pool)) - if err != nil { - log.Fatalf("could not create file reader for %s: %w", filePath, err) - return err - } - defer r.Close() - - n, err := arrio.Copy(ipcFileWriter, r) - if err != nil { - return fmt.Errorf("could not copy ARROW stream: %w", err) - } - if got, want := n, int64(r.NumRecords()); got != want { - return fmt.Errorf("invalid number of records written (got=%d, want=%d)", got, want) - } - - return nil -} - -func processFiles(outPath string, inPaths []string) error { - dest, err := os.Create(outPath) - if err != nil { - log.Fatal(err) - } - defer dest.Close() - - mem := memory.NewGoAllocator() - - // Read the first file and get the schema, then write that file - r, err := os.Open(inPaths[0]) - if err != nil { - log.Fatal(err) - } - defer r.Close() - - rr, err := ipc.NewFileReader(r, ipc.WithAllocator(mem)) - if err != nil { - if errors.Is(err, io.EOF) { - return nil - } - return err - } - defer rr.Close() - - ww, err := ipc.NewFileWriter(dest, []ipc.Option{ipc.WithAllocator(mem), ipc.WithSchema(rr.Schema()), ipc.WithZstd()}...) - if err != nil { - return fmt.Errorf("could not create ARROW file writer: %w", err) - } - - defer ww.Close() - - n, err := arrio.Copy(ww, rr) - if err != nil { - return fmt.Errorf("could not copy ARROW stream: %w", err) - } - if got, want := n, int64(rr.NumRecords()); got != want { - return fmt.Errorf("invalid number of records written (got=%d, want=%d)", got, want) - } - - // Read the rest of the files and append them - for _, inPath := range inPaths[1:] { - err := processFile(ww, inPath, mem) - - if err != nil { - return err - } - } - - err = ww.Close() - if err != nil { - return fmt.Errorf("could not close output ARROW stream: %w", err) - } - - return nil -} diff --git a/go/decompress/decompress.go b/go/decompress/decompress.go deleted file mode 100644 index 254d24a62..000000000 --- a/go/decompress/decompress.go +++ /dev/null @@ -1,216 +0,0 @@ -package decompress - -import ( - "bufio" - "errors" - "io" - "log" - "os" - "strings" - - "github.com/biogo/hts/bgzf" - gzip "github.com/klauspost/pgzip" -) - -const ExpectedAnnotationFileSuffix = "annotation.tsv.gz" -const DefaultBufferSize = 64 * 1024 * 8 // 8 bgzip blocks at a time - -var ErrBufferSize = errors.New("bufferSize must be greater than 0") - -type BystroReader interface { - ReadLines() ([]byte, error) - ReadLine() ([]byte, error) -} - -type BzfBystroReader struct { - Reader *bgzf.Reader - BufferSize uint -} - -type BufioBystroReader struct { - Reader *bufio.Reader - BufferSize uint -} - -// Read a line up to the next newline character, and return the line excluding the newline character -// Implementation follows the example in the bgzf package documentation: https://github.com/biogo/hts/blob/bb1e21d1bfc7f2b1e124ca0a1ed98493d191db78/bgzf/line_example_test.go#L70 -func readLineBgzipNoTx(r *bgzf.Reader) ([]byte, error) { - var ( - data []byte - b byte - err error - ) - for { - b, err = 
r.ReadByte() - if err != nil { - break - } - if b == '\n' { - break - } - data = append(data, b) - } - return data, err -} - -func readLineBgzip(r *bgzf.Reader) ([]byte, error) { - tx := r.Begin() - data, err := readLineBgzipNoTx(r) - tx.End() - return data, err -} - -func readLinesBgzipWithBuffer(r *bgzf.Reader, bufferSize uint) ([]byte, error) { - if bufferSize == 0 { - return nil, ErrBufferSize - } - - buf := make([]byte, bufferSize) - - tx := r.Begin() - defer tx.End() - - bytesRead, err := r.Read(buf) - - if bytesRead == 0 { - return nil, err - } - - if err != nil { - if buf[bytesRead-1] != '\n' { - return buf[:bytesRead], err - } - - return buf[:bytesRead-1], err - } - - // Since not at EOF, we know that there is more to read - if buf[bytesRead-1] != '\n' { - remainder, err := readLineBgzipNoTx(r) - return append(buf[:bytesRead], remainder...), err - } - // last byte is newline - return buf[:bytesRead-1], err -} - -func readLine(r *bufio.Reader) ([]byte, error) { - var ( - data []byte - b byte - err error - ) - for { - b, err = r.ReadByte() - if err != nil { - break - } - if b == '\n' { - break - } - data = append(data, b) - } - return data, err -} - -func readLinesWithBuffer(r *bufio.Reader, bufferSize uint) ([]byte, error) { - if bufferSize == 0 { - return nil, ErrBufferSize - } - - buf := make([]byte, bufferSize) - - bytesRead, err := io.ReadFull(r, buf) - - if bytesRead == 0 { - return nil, err - } - - // The typical errors will io.ErrUnexpectedEOF (buffer longer than input) and io.EOF (file ended) - if err != nil { - // Ensure that bgzip and gzip implementations are consistent - if err == io.ErrUnexpectedEOF { - err = io.EOF - } - - if buf[bytesRead-1] == '\n' { - return buf[:bytesRead-1], err - } - - return buf[:bytesRead], err - } - - if buf[bytesRead-1] != '\n' { - remainder, err := readLine(r) - return append(buf[:bytesRead], remainder...), err - } - // last byte is newline - return buf[:bytesRead-1], err -} - -func (r *BzfBystroReader) ReadLines() ([]byte, error) { - return readLinesBgzipWithBuffer(r.Reader, DefaultBufferSize) -} - -func (r *BzfBystroReader) ReadLine() ([]byte, error) { - return readLineBgzip(r.Reader) -} - -func (r *BufioBystroReader) ReadLines() ([]byte, error) { - return readLinesWithBuffer(r.Reader, DefaultBufferSize) -} -func (r *BufioBystroReader) ReadLine() ([]byte, error) { - return r.Reader.ReadBytes('\n') -} - -func GetHeaderPaths(b BystroReader) ([][]string, []string) { - line, err := b.ReadLine() - if err != nil { - log.Fatalf("Error reading header line due to: [%s]\n", err) - } - - headers := strings.Fields(string(line)) - - headerPaths := [][]string{} - - for _, header := range headers { - path := strings.Split(header, ".") - - headerPaths = append(headerPaths, path) - } - - return headerPaths, headers -} - -func GetAnnotationFh(input *os.File) (BystroReader, error) { - bzfReader, err := getBgzipReaderFromBgzipAnnotation(input) - - if err == nil { - return bzfReader, nil - } - - input.Seek(0, 0) - - bufioReader, err := getBufioReaderFromGzipAnnotation(input) - - return bufioReader, err -} - -func getBgzipReaderFromBgzipAnnotation(inputFh *os.File) (*BzfBystroReader, error) { - bgzfReader, err := bgzf.NewReader(inputFh, 0) - if err != nil { - return nil, err - } - - return &BzfBystroReader{Reader: bgzfReader}, nil -} - -func getBufioReaderFromGzipAnnotation(inputFh *os.File) (*BufioBystroReader, error) { - gzipReader, err := gzip.NewReader(inputFh) - if err != nil { - return nil, err - } - - bufioReader := bufio.NewReader(gzipReader) - - return 
&BufioBystroReader{Reader: bufioReader}, nil -} diff --git a/go/decompress/decompress_test.go b/go/decompress/decompress_test.go deleted file mode 100644 index 287272c6e..000000000 --- a/go/decompress/decompress_test.go +++ /dev/null @@ -1,421 +0,0 @@ -package decompress - -import ( - "bufio" - "bytes" - "fmt" - "io" - "log" - "math/rand" - "os" - "path/filepath" - "strings" - "sync" - "testing" - - "github.com/biogo/hts/bgzf" - "github.com/klauspost/compress/gzip" -) - -// https://stackoverflow.com/questions/22892120/how-to-generate-a-random-string-of-a-fixed-length-in-go -const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" - -func randStringBytes(n int) string { - b := make([]byte, n) - for i := range b { - b[i] = letterBytes[rand.Intn(len(letterBytes))] - } - return string(b) -} - -// Helper function to create a bgzip compressed data from a string -func createBgzipData(data string) (*bytes.Buffer, error) { - var buffer bytes.Buffer - writer := bgzf.NewWriter(&buffer, 1) - - _, err := writer.Write([]byte(data)) - if err != nil { - writer.Close() - return nil, err - } - - err = writer.Close() - if err != nil { - return nil, err - } - - return &buffer, nil -} - -// Helper function to create a bgzip compressed data from a string -func createGzipData(data string) (*bytes.Buffer, error) { - var buffer bytes.Buffer - writer := gzip.NewWriter(&buffer) - - _, err := writer.Write([]byte(data)) - if err != nil { - writer.Close() - return nil, err - } - - err = writer.Close() - if err != nil { - return nil, err - } - - return &buffer, nil -} - -func Test_readLineBgzip(t *testing.T) { - testData := "hello\nworld" - compressedData, err := createBgzipData(testData) - if err != nil { - t.Fatalf("Failed to create compressed bgzip data: %v", err) - } - - reader, err := bgzf.NewReader(compressedData, 1) - if err != nil { - t.Fatalf("Failed to create bgzf.Reader: %v", err) - } - - // Test the function - line, err := readLineBgzip(reader) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if string(line) != "hello" { - t.Errorf("Expected 'hello', got '%s'", string(line)) - } -} - -func Test_readLineGzip(t *testing.T) { - testData := "hello\nworld" - compressedData, err := createGzipData(testData) - if err != nil { - t.Fatalf("Failed to create compressed bgzip data: %v", err) - } - - reader, err := gzip.NewReader(compressedData) - if err != nil { - t.Fatalf("Failed to create bgzf.Reader: %v", err) - } - - bufioReader := bufio.NewReader(reader) - - // Test the function - line, err := readLine(bufioReader) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if string(line) != "hello" { - t.Errorf("Expected 'hello', got '%s'", string(line)) - } -} - -func Test_readLine(t *testing.T) { - // Create a bufio.Reader with test data - testData := "test line\nanother line" - reader := bufio.NewReader(strings.NewReader(testData)) - - // Test the function - line, err := readLine(reader) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if string(line) != "test line" { - t.Errorf("Expected 'test line', got '%s'", string(line)) - } -} - -func TestReadLinesDefaultBuffer(t *testing.T) { - testReadLinesWithBuffer(t, 100_000, DefaultBufferSize) -} - -func TestReadLinesSmallBuffers1(t *testing.T) { - log.Print("Testing ReadLines with buffer of 1 byte") - testReadLinesWithBuffer(t, 100_000, 1) - log.Print("Testing ReadLines with buffer of 10 bytes") - testReadLinesWithBuffer(t, 100_000, 100) -} - -func TestReadLinesSmallBuffers1Parallel(t *testing.T) { - // Test if 
the code is re-entrant - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - testReadLinesWithBuffer(t, 100_000, 1) - }() - - wg.Add(1) - go func() { - defer wg.Done() - testReadLinesWithBuffer(t, 100_000, 100) - }() - - wg.Wait() -} - -func TestReadLinesSmallBuffers2(t *testing.T) { - log.Print("Testing ReadLines with buffer of 400 bytes") - testReadLinesWithBuffer(t, 100_000, 400) - log.Print("Testing ReadLines with buffer of 1000 bytes") - testReadLinesWithBuffer(t, 100_000, 1000) -} - -func TestReadLinesSmallBuffers2Parallel(t *testing.T) { - // Test if the code is re-entrant - log.Print("Testing ReadLines with buffer of 400 and 1000 bytes, run in parallel") - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - testReadLinesWithBuffer(t, 100_000, 400) - }() - - wg.Add(1) - go func() { - defer wg.Done() - testReadLinesWithBuffer(t, 100_000, 1000) - }() - - wg.Wait() -} - -func testReadLinesWithBuffer(t *testing.T, numLines int, bufferSize uint) { - // Generate tmp directory - tmpDir := t.TempDir() - // Step 1: Generate Test Data - testDataBytes := generateTestData(numLines) - - // Step 2: Compress the Data - random_name := randStringBytes(10) - compressedFileName := filepath.Join(tmpDir, fmt.Sprintf("test_data_%s.bgz", random_name)) - defer os.Remove(compressedFileName) - - compressData(testDataBytes, compressedFileName) - - // Step 3: Read the Compressed Data - expectedLines := strings.Split(string(testDataBytes), "\n") - testDataBytes = nil - - err := readCompressedData(compressedFileName, expectedLines, bufferSize) - - if err != nil { - log.Fatalf("Error reading compressed data: %v", err) - } -} - -func generateTestData(numLines int) []byte { - min := 1 - max := 3000 - - var buffer bytes.Buffer - for i := 0; i < numLines; i++ { - // generate a random line - - buffer.WriteString(fmt.Sprintf("Line %d %s\n", i, randStringBytes(rand.Intn(max-min)+min))) - } - return buffer.Bytes() -} - -func compressData(data []byte, fileName string) { - f, _ := os.Create(fileName) - defer f.Close() - - w := bgzf.NewWriter(f, 1) - defer w.Close() - - w.Write(data) -} - -func readCompressedData(fileName string, expectedLines []string, bufferSize uint) error { - f, _ := os.Open(fileName) - defer f.Close() - - r, _ := bgzf.NewReader(f, 0) - defer r.Close() - - i := 0 - for { - lines, err := readLinesBgzipWithBuffer(r, bufferSize) - - if len(lines) > 0 { - strLines := strings.Split(string(lines), "\n") - - for _, line := range strLines { - if line != expectedLines[i] { - return fmt.Errorf("Line %d does not match. 
Got: %s, expected: %s", i, line, expectedLines[i]) - } - - i += 1 - } - } - - if err != nil { - if err == io.EOF { - return nil - } - return err - } - } -} - -// NOTE: test data comes from `../test/opensearch/testdata/input.txt` -func Test_readLinesWithBuffer(t *testing.T) { - type args struct { - bufferSize uint - input []byte - } - type want struct { - file string - golden []byte - } - tests := []struct { - name string - args args - want []byte - wantErr []error - }{ - { - name: "empty input", - args: args{ - bufferSize: 100, - input: []byte(""), - }, - want: []byte(""), - wantErr: []error{io.EOF, io.EOF, io.EOF}, - }, - { - name: "0 buffer", - args: args{ - bufferSize: 0, - input: []byte(""), - }, - want: []byte(""), - wantErr: []error{ErrBufferSize, ErrBufferSize, ErrBufferSize}, - }, - { - name: "When buffer is not as long as input we expect no errors, and the file is read up until the first newline (exclusive)", - args: args{ - bufferSize: 1, - input: []byte("a\tb.c\td.e.f\n1\tA\t1;2|3;4/5\n1|2\tA\t1;2|3;4/5\na/2\tA;B\t1/2|3;4/5\n"), - }, - want: []byte("a\tb.c\td.e.f"), - wantErr: []error{nil, nil, nil}, - }, - { - name: "When buffer is longer than input, we expected either ErrUnexpectedEOF (bufio read) or EOF (bgzip read)", - args: args{ - bufferSize: 100, - input: []byte("a\tb.c\td.e.f\n1\tA\t1;2|3;4/5\n1|2\tA\t1;2|3;4/5\na/2\tA;B\t1/2|3;4/5\n"), - }, - want: []byte("a\tb.c\td.e.f\n1\tA\t1;2|3;4/5\n1|2\tA\t1;2|3;4/5\na/2\tA;B\t1/2|3;4/5"), - wantErr: []error{io.EOF, io.EOF, io.EOF}, - }, - { - name: "We do not truncate the input when newline is missing from end of file", - args: args{ - bufferSize: 100, - input: []byte("a\tb.c\td.e.f\n1\tA\t1;2|3;4/5\n1|2\tA\t1;2|3;4/5\na/2\tA;B\t1/2|3;4/5"), - }, - want: []byte("a\tb.c\td.e.f\n1\tA\t1;2|3;4/5\n1|2\tA\t1;2|3;4/5\na/2\tA;B\t1/2|3;4/5"), - wantErr: []error{io.EOF, io.EOF, io.EOF}, - }, - { - name: "Extra newlines within middle of file are retained", - args: args{ - bufferSize: 100, - input: []byte("a\tb.c\td.e.f\n\n\n1\tA\t1;2|3;4/5\n1|2\tA\t1;2|3;4/5\na/2\tA;B\t1/2|3;4/5\n"), - }, - want: []byte("a\tb.c\td.e.f\n\n\n1\tA\t1;2|3;4/5\n1|2\tA\t1;2|3;4/5\na/2\tA;B\t1/2|3;4/5"), - wantErr: []error{io.EOF, io.EOF, io.EOF}, - }, - { - name: "We support large buffers", - args: args{ - bufferSize: 1_000_000, - input: []byte("a\tb.c\td.e.f\n\n\n1\tA\t1;2|3;4/5\n1|2\tA\t1;2|3;4/5\na/2\tA;B\t1/2|3;4/5\n"), - }, - want: []byte("a\tb.c\td.e.f\n\n\n1\tA\t1;2|3;4/5\n1|2\tA\t1;2|3;4/5\na/2\tA;B\t1/2|3;4/5"), - wantErr: []error{io.EOF, io.EOF, io.EOF}, - }, - } - - for _, tt := range tests { - log.Println("Running decompressed IO test: ", tt.name) - b := bytes.NewReader(tt.args.input) - r := bufio.NewReader(b) - got, err := readLinesWithBuffer(r, tt.args.bufferSize) - - if err != tt.wantErr[0] { - log.Fatalf("Expected error: %v\ngot: %v", tt.wantErr, err) - } - - if !bytes.Equal(got, tt.want) { - log.Fatalf("Expected:\n%v\ngot:\n%v", string(tt.want), string(got)) - } - } - - for _, tt := range tests { - log.Println("Running gzip test: ", tt.name) - - // Compress the data - var buf bytes.Buffer - writer := gzip.NewWriter(&buf) - - if _, err := writer.Write(tt.args.input); err != nil { - log.Fatal(err) - } - if err := writer.Close(); err != nil { - log.Fatal(err) - } - - b := bytes.NewReader(buf.Bytes()) - r, err := gzip.NewReader(b) - if err != nil { - log.Fatal(err) - } - bufioReader := bufio.NewReader(r) - - got, err := readLinesWithBuffer(bufioReader, tt.args.bufferSize) - - if err != tt.wantErr[1] { - log.Fatalf("Expected error: %v\ngot: %v", 
tt.wantErr, err) - } - - if !bytes.Equal(got, tt.want) { - log.Fatalf("Expected:\n%v\ngot:\n%v", string(tt.want), string(got)) - } - } - - for _, tt := range tests { - log.Println("Running bgzip test: ", tt.name) - - var buf bytes.Buffer - writer := bgzf.NewWriter(&buf, 1) - - if _, err := writer.Write(tt.args.input); err != nil { - log.Fatal(err) - } - if err := writer.Close(); err != nil { - log.Fatal(err) - } - - b := bytes.NewReader(buf.Bytes()) - r, err := bgzf.NewReader(b, 0) - if err != nil { - log.Fatal(err) - } - - got, err := readLinesBgzipWithBuffer(r, tt.args.bufferSize) - - if err != tt.wantErr[2] { - log.Fatalf("Expected error: %v, got: %v", tt.wantErr[1], err) - } - - if !bytes.Equal(got, tt.want) { - log.Fatalf("Expected: %v, got: %v", tt.want, got) - } - } -} diff --git a/go/go.mod b/go/go.mod deleted file mode 100644 index d5e099278..000000000 --- a/go/go.mod +++ /dev/null @@ -1,33 +0,0 @@ -module bystro - -go 1.21 - -require ( - github.com/apache/arrow/go/v14 v14.0.2 - github.com/beanstalkd/go-beanstalk v0.2.0 - github.com/biogo/hts v1.4.4 - github.com/bystrogenomics/bystro-vcf v0.0.0-20240425204515-a3bed256638d - github.com/bytedance/sonic v1.12.3 - github.com/klauspost/compress v1.17.4 - github.com/klauspost/pgzip v1.2.6 - github.com/opensearch-project/opensearch-go/v2 v2.3.0 - github.com/tidwall/btree v1.7.0 - gopkg.in/yaml.v3 v3.0.1 -) - -require ( - github.com/bytedance/sonic/loader v0.2.0 // indirect - github.com/cloudwego/base64x v0.1.4 // indirect - github.com/cloudwego/iasm v0.2.0 // indirect - github.com/goccy/go-json v0.10.2 // indirect - github.com/google/flatbuffers v23.5.26+incompatible // indirect - github.com/klauspost/cpuid/v2 v2.2.6 // indirect - github.com/pierrec/lz4/v4 v4.1.21 // indirect - github.com/twitchyliquid64/golang-asm v0.15.1 // indirect - github.com/zeebo/xxh3 v1.0.2 // indirect - golang.org/x/arch v0.6.0 // indirect - golang.org/x/mod v0.14.0 // indirect - golang.org/x/sys v0.16.0 // indirect - golang.org/x/tools v0.17.0 // indirect - golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect -) diff --git a/go/go.sum b/go/go.sum deleted file mode 100644 index bc49e72e4..000000000 --- a/go/go.sum +++ /dev/null @@ -1,139 +0,0 @@ -github.com/apache/arrow/go/v14 v14.0.2 h1:N8OkaJEOfI3mEZt07BIkvo4sC6XDbL+48MBPWO5IONw= -github.com/apache/arrow/go/v14 v14.0.2/go.mod h1:u3fgh3EdgN/YQ8cVQRguVW3R+seMybFg8QBQ5LU+eBY= -github.com/aws/aws-sdk-go v1.44.263/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= -github.com/aws/aws-sdk-go-v2 v1.18.0/go.mod h1:uzbQtefpm44goOPmdKyAlXSNcwlRgF3ePWVW6EtJvvw= -github.com/aws/aws-sdk-go-v2/config v1.18.25/go.mod h1:dZnYpD5wTW/dQF0rRNLVypB396zWCcPiBIvdvSWHEg4= -github.com/aws/aws-sdk-go-v2/credentials v1.13.24/go.mod h1:jYPYi99wUOPIFi0rhiOvXeSEReVOzBqFNOX5bXYoG2o= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.3/go.mod h1:4Q0UFP0YJf0NrsEuEYHpM9fTSEVnD16Z3uyEF7J9JGM= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.33/go.mod h1:7i0PF1ME/2eUPFcjkVIwq+DOygHEoK92t5cDqNgYbIw= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.27/go.mod h1:UrHnn3QV/d0pBZ6QBAEQcqFLf8FAzLmoUfPVIueOvoM= -github.com/aws/aws-sdk-go-v2/internal/ini v1.3.34/go.mod h1:Etz2dj6UHYuw+Xw830KfzCfWGMzqvUTCjUj5b76GVDc= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.27/go.mod h1:EOwBD4J4S5qYszS5/3DpkejfuK+Z5/1uzICfPaZLtqw= -github.com/aws/aws-sdk-go-v2/service/sso v1.12.10/go.mod h1:ouy2P4z6sJN70fR3ka3wD3Ro3KezSxU6eKGQI2+2fjI= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.14.10/go.mod 
h1:AFvkxc8xfBe8XA+5St5XIHHrQQtkxqrRincx4hmMHOk= -github.com/aws/aws-sdk-go-v2/service/sts v1.19.0/go.mod h1:BgQOMsg8av8jset59jelyPW7NoZcZXLVpDsXunGDrk8= -github.com/aws/smithy-go v1.13.5/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= -github.com/beanstalkd/go-beanstalk v0.2.0 h1:6UOJugnu47uNB2jJO/lxyDgeD1Yds7owYi1USELqexA= -github.com/beanstalkd/go-beanstalk v0.2.0/go.mod h1:/G8YTyChOtpOArwLTQPY1CHB+i212+av35bkPXXj56Y= -github.com/biogo/boom v0.0.0-20150317015657-28119bc1ffc1/go.mod h1:fwtxkutinkQcME9Zlywh66T0jZLLjgrwSLY2WxH2N3U= -github.com/biogo/hts v1.4.4 h1:Z+TminqAKRE/t6nyy5PwI/DL90kdew4GpghB+QdjjFk= -github.com/biogo/hts v1.4.4/go.mod h1:AfPn4uJQ2zxi04Q/4vccdmCX16W+IsHXVguPsdh4HE4= -github.com/bystrogenomics/bystro-vcf v0.0.0-20240425204515-a3bed256638d h1:Y3qdBlf9Q1DJfGFP47GNwU7ytJklAqpuxgoapAs4U80= -github.com/bystrogenomics/bystro-vcf v0.0.0-20240425204515-a3bed256638d/go.mod h1:ssLIZJL1hUm8xIFZO33X2Mp0b3Ju7oJ2liBW0Qov0KQ= -github.com/bytedance/sonic v1.12.3 h1:W2MGa7RCU1QTeYRTPE3+88mVC0yXmsRQRChiyVocVjU= -github.com/bytedance/sonic v1.12.3/go.mod h1:B8Gt/XvtZ3Fqj+iSKMypzymZxw/FVwgIGKzMzT9r/rk= -github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU= -github.com/bytedance/sonic/loader v0.2.0 h1:zNprn+lsIP06C/IqCHs3gPQIvnvpKbbxyXQP1iU4kWM= -github.com/bytedance/sonic/loader v0.2.0/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU= -github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y= -github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= -github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg= -github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= -github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= -github.com/google/flatbuffers v23.5.26+incompatible h1:M9dgRyhJemaM4Sw8+66GHBu8ioaQmyPLg1b8VwK5WJg= -github.com/google/flatbuffers v23.5.26+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= -github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4= -github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= -github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= -github.com/klauspost/compress v1.17.4 h1:Ej5ixsIri7BrIjBkRZLTo6ghwrEtHFk7ijlczPW4fZ4= -github.com/klauspost/compress v1.17.4/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM= -github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= -github.com/klauspost/cpuid/v2 v2.2.6 h1:ndNyv040zDGIDh8thGkXYjnFtiN02M1PVVF+JE/48xc= -github.com/klauspost/cpuid/v2 v2.2.6/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= -github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= -github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= -github.com/knz/go-libedit v1.10.1/go.mod 
h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M= -github.com/kortschak/utter v0.0.0-20190412033250-50fe362e6560/go.mod h1:oDr41C7kH9wvAikWyFhr6UFr8R7nelpmCF5XR5rL7I8= -github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs= -github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/opensearch-project/opensearch-go/v2 v2.3.0 h1:nQIEMr+A92CkhHrZgUhcfsrZjibvB3APXf2a1VwCmMQ= -github.com/opensearch-project/opensearch-go/v2 v2.3.0/go.mod h1:8LDr9FCgUTVoT+5ESjc2+iaZuldqE+23Iq0r1XeNue8= -github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= -github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= -github.com/tidwall/btree v1.7.0 h1:L1fkJH/AuEh5zBnnBbmTwQ5Lt+bRJ5A8EWecslvo9iI= -github.com/tidwall/btree v1.7.0/go.mod h1:twD9XRA5jj9VUQGELzDO4HPQTNJsoWWfYEL+EUQ2cKY= -github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI= -github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= -github.com/ulikunitz/xz v0.5.10/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= -github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= -github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= -github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= -golang.org/x/arch v0.6.0 h1:S0JTfE48HbRj80+4tbvZDYsJ3tGv6BUU3XxyZ7CirAc= -golang.org/x/arch v0.6.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI= 
-golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.14.0 h1:dGoOF9QVLYng8IHTm7BAyWqCqSheQ5pYWGhzW00YJr0= -golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= -golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= -golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= -golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.17.0 h1:FvmRgNOcs3kOa+T20R1uhfP9F6HgG2mfxDv1vrx1Htc= -golang.org/x/tools v0.17.0/go.mod h1:xsh6VxdV005rRVaS6SSAf9oiAqljS7UZUacMZ8Bnsps= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSmiC7MMxXNOb3PU/VUEz+EhU= 
-golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= -gonum.org/v1/gonum v0.12.0 h1:xKuo6hzt+gMav00meVPUlXwSdoEJP46BR+wdxQEFK2o= -gonum.org/v1/gonum v0.12.0/go.mod h1:73TDxJfAAHeA8Mk9mf8NlIppyhQNo5GLTcYeqgo2lvY= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= -gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50= diff --git a/go/test/opensearch/testdata/input.txt b/go/test/opensearch/testdata/input.txt deleted file mode 100644 index 27601102e..000000000 --- a/go/test/opensearch/testdata/input.txt +++ /dev/null @@ -1,4 +0,0 @@ -a b.c d.e.f -1 A 1;2|3;4/5 -1|2 A 1;2|3;4/5 -a/2 A;B 1/2|3;4/5 diff --git a/install-apt.sh b/install-apt.sh deleted file mode 100755 index b1b193de8..000000000 --- a/install-apt.sh +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env bash -set -e -set -o pipefail - -# Default values -DEFAULT_GO_PLATFORM="linux-amd64" -DEFAULT_GO_VERSION="1.21.4" -DEFAULT_PROFILE_FILE=$(./install/detect-shell-profile.sh "$HOME") - -# Function to display usage information -show_help() { - echo "Usage: $0 [OPTIONS]" - echo "" - echo "Options:" - echo " --profile-file PROFILE_FILE Specify the shell profile file to update (default: auto-detected, e.g., ~/.bash_profile)" - echo " --go-platform GO_PLATFORM Specify the Go platform (default: linux-amd64)" - echo " --go-version GO_VERSION Specify the Go version (default: 1.21.4)" - echo " --help Show this help message and exit" - echo "" - exit 0 -} - -# Parse command-line arguments -PROFILE_FILE="$DEFAULT_PROFILE_FILE" -GO_PLATFORM="$DEFAULT_GO_PLATFORM" -GO_VERSION="$DEFAULT_GO_VERSION" -while [[ $# -gt 0 ]]; do - case $1 in - --profile-file) - PROFILE_FILE="$2" - shift 2 - ;; - --go-platform) - GO_PLATFORM="$2" - shift 2 - ;; - --go-version) - GO_VERSION="$2" - shift 2 - ;; - --help) - show_help - ;; - *) - echo "Unknown option: $1" - show_help - ;; - esac -done - -# Use the home directory of the invoking user, not root -if [[ -n "$SUDO_USER" ]]; then - HOME_DIR="$(getent passwd "$SUDO_USER" | cut -d: -f6)" -else - HOME_DIR="$HOME" -fi - -echo "Home directory is $HOME_DIR" - -BYSTRO_INSTALL_DIR=$(pwd) -LOCAL_INSTALL_DIR="$HOME_DIR/.local" -BINARY_INSTALL_DIR="$HOME_DIR/.local/bin" - -echo "Install directory is $BYSTRO_INSTALL_DIR" -echo "PROFILE is $PROFILE_FILE" -echo "Go platform is $GO_PLATFORM" - -# Install RPM dependencies -sudo ./install/install-apt-deps.sh - -# Install HTSlib -./install/install-htslib.sh "$PROFILE_FILE" "$LOCAL_INSTALL_DIR" - -# Install LiftOver -./install/install-liftover-linux.sh "$PROFILE_FILE" "$BINARY_INSTALL_DIR" - -# Install LMDB -sudo ./install/install-lmdb-linux.sh - -# Install Perlbrew -./install/install-perlbrew-linux.sh "$PROFILE_FILE" "$HOME_DIR" perl-5.34.0 - -# Install Go -./install/install-go.sh "$PROFILE_FILE" "$HOME_DIR" "$LOCAL_INSTALL_DIR" "$BYSTRO_INSTALL_DIR" "$GO_PLATFORM" "$GO_VERSION" - -# Export Bystro 
libraries to shell profile -./install/export-bystro-libs.sh "$PROFILE_FILE" "$BYSTRO_INSTALL_DIR" - -# Create logs directory -mkdir -p logs - -echo "\nTesting Bystro installation" - -bash -c ". $PROFILE_FILE && cd perl && prove -r ./t -j$(nproc)" -if [ $? -eq 0 ]; then - echo "\nBystro installation succeeded!" -else - echo "\nBystro installation failed" - exit 1 -fi - -echo -e "\n\nREMEMBER TO INCREASE ULIMIT ABOVE 1024 IF RUNNING MANY FORKS\n\n" - -echo -e "To get started with Bystro, for instance to run Bystro Annotator: \n" -echo "Update your shell to reflect the newly installed programs: 'source $PROFILE_FILE'" -echo "Run Bystro Annotator: 'bystro-annotate.pl --help'" -echo -e "\n\n" diff --git a/install-rpm.sh b/install-rpm.sh deleted file mode 100755 index 8a6acac60..000000000 --- a/install-rpm.sh +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env bash -set -e -set -o pipefail - -# Default values -DEFAULT_GO_PLATFORM="linux-amd64" -DEFAULT_GO_VERSION="1.21.4" -DEFAULT_PROFILE_FILE=$(./install/detect-shell-profile.sh "$HOME") - -# Function to display usage information -show_help() { - echo "Usage: $0 [OPTIONS]" - echo "" - echo "Options:" - echo " --profile-file PROFILE_FILE Specify the shell profile file to update (default: auto-detected, e.g., ~/.bash_profile)" - echo " --go-platform GO_PLATFORM Specify the Go platform (default: linux-amd64)" - echo " --go-version GO_VERSION Specify the Go version (default: 1.21.4)" - echo " --help Show this help message and exit" - echo "" - exit 0 -} - -# Parse command-line arguments -PROFILE_FILE="$DEFAULT_PROFILE_FILE" -GO_PLATFORM="$DEFAULT_GO_PLATFORM" -GO_VERSION="$DEFAULT_GO_VERSION" -while [[ $# -gt 0 ]]; do - case $1 in - --profile-file) - PROFILE_FILE="$2" - shift 2 - ;; - --go-platform) - GO_PLATFORM="$2" - shift 2 - ;; - --go-version) - GO_VERSION="$2" - shift 2 - ;; - --help) - show_help - ;; - *) - echo "Unknown option: $1" - show_help - ;; - esac -done - -# Use the home directory of the invoking user, not root -if [[ -n "$SUDO_USER" ]]; then - HOME_DIR="$(getent passwd "$SUDO_USER" | cut -d: -f6)" -else - HOME_DIR="$HOME" -fi - -echo "Home directory is $HOME_DIR" - -BYSTRO_INSTALL_DIR=$(pwd) -LOCAL_INSTALL_DIR="$HOME_DIR/.local" -BINARY_INSTALL_DIR="$HOME_DIR/.local/bin" - -echo "Install directory is $BYSTRO_INSTALL_DIR" -echo "PROFILE is $PROFILE_FILE" -echo "Go platform is $GO_PLATFORM" - -# Install RPM dependencies -sudo ./install/install-rpm-deps.sh - -# Install HTSlib -./install/install-htslib.sh "$PROFILE_FILE" "$LOCAL_INSTALL_DIR" - -# Install LiftOver -./install/install-liftover-linux.sh "$PROFILE_FILE" "$BINARY_INSTALL_DIR" - -# Install LMDB -sudo ./install/install-lmdb-linux.sh - -# Install Perlbrew -./install/install-perlbrew-linux.sh "$PROFILE_FILE" "$HOME_DIR" perl-5.34.0 - -# Install Go -./install/install-go.sh "$PROFILE_FILE" "$HOME_DIR" "$LOCAL_INSTALL_DIR" "$BYSTRO_INSTALL_DIR" "$GO_PLATFORM" "$GO_VERSION" - -# Export Bystro libraries to shell profile -./install/export-bystro-libs.sh "$PROFILE_FILE" "$BYSTRO_INSTALL_DIR" - -# Create logs directory -mkdir -p logs - -echo "\nTesting Bystro installation" - -bash -c ". $PROFILE_FILE && cd perl && prove -r ./t -j$(nproc)" -if [ $? -eq 0 ]; then - echo "\nBystro installation succeeded!" 
-else - echo "\nBystro installation failed" - exit 1 -fi - -echo -e "\n\nREMEMBER TO INCREASE ULIMIT ABOVE 1024 IF RUNNING MANY FORKS\n\n" - -echo -e "To get started with Bystro, for instance to run Bystro Annotator: \n" -echo "Update your shell to reflect the newly installed programs: 'source $PROFILE_FILE'" -echo "Run Bystro Annotator: 'bystro-annotate.pl --help'" -echo -e "\n\n" diff --git a/perl/.gitignore b/perl/.gitignore deleted file mode 100644 index b3da003d8..000000000 --- a/perl/.gitignore +++ /dev/null @@ -1,19 +0,0 @@ -*.mdb -/.build -/.tidyall.d -/local -Bytro-* -t/db -t/tracks/build/db -t/tracks/gene/db/index -t/tracks/nearest/db/hg19/index -t/tracks/reference/db/index -t/tracks/score/db/index -t/tracks/sparse/index -t/tracks/vcf/index -t/utils/filterCadd.*.yml -t/utils/filterCadd.yml.utils-bak.* -t/utils/index -t/utils/raw/cadd/cadd.*.log -t/utils/raw/cadd/cadd.*.log -t/utils/raw/cadd/test.*/ diff --git a/perl/.perlcriticrc b/perl/.perlcriticrc deleted file mode 100644 index 43bfe6516..000000000 --- a/perl/.perlcriticrc +++ /dev/null @@ -1,17 +0,0 @@ -severity = 3 -verbose = 8 - -[Variables::ProhibitPunctuationVars] -allow = $@ $! $_ - -# Turn these off -[-NamingConventions::Capitalization] -[-BuiltinFunctions::ProhibitStringyEval] -[-ControlStructures::ProhibitPostfixControls] -[-ControlStructures::ProhibitUnlessBlocks] -[-Documentation::RequirePodSections] -[-InputOutput::ProhibitInteractiveTest] -[-References::ProhibitDoubleSigils] -[-RegularExpressions::RequireExtendedFormatting] -[-InputOutput::ProhibitTwoArgOpen] -[-Modules::ProhibitEvilModules] diff --git a/perl/.perltidyrc b/perl/.perltidyrc deleted file mode 100644 index 0cfb02f73..000000000 --- a/perl/.perltidyrc +++ /dev/null @@ -1,40 +0,0 @@ -# modified from DAGOLDEN .perltidyrc file - --se # Errors to STDERR - --l=85 # Max line width target --vmll # variable maximum line length --wc=10 # depth to reduce indentation levels --i=2 # Indent level --ci=2 # Continuation - --vt=0 # vertical tightness --cti=0 # extra indentation for closing brackets --vtc=0 # close parens on own line if possible - --nsot # stack opening --nsct # stack closing - --notr # opening tokens on right of a line --pt=1 # parenthesis tightness --bt=1 # brace tightness --sbt=1 # square bracket tightness --bbt=0 # block brace tightness --cab=1 # break at all commas after => if container is open - --nsfp # no space after function --nsfs # no space before semicolons in for loops - --nolq # Don't outdent long quoted strings --nola # Don't outdent labels --nolc # Don't outdent long comments --nokw # Don't outdent keywords --nhsc # Don't expect hanging side comments --nbbc # No blank before comments --tso # Tight secret operators - --msc=1 # Space to side comment - --wbb="% + - * / x != == >= <= =~ !~ < > | &" # Break before all operators except assignment - --ole=unix # line endings diff --git a/perl/.tidyallrc b/perl/.tidyallrc deleted file mode 100644 index ee2ab7fcc..000000000 --- a/perl/.tidyallrc +++ /dev/null @@ -1,15 +0,0 @@ -; Install Code::TidyAll -; run "tidyall -a" to tidy all files -; run "tidyall -g" to tidy only files modified from git - -[PerlTidy] -argv = --pro="$ROOT/.perltidyrc" -select = {bench,bin,issues,lib,t}/**/*.{pl,pm,t} - -; [PerlCritic] -; select = {bin,lib}/**/*.pm -; argv = --severity 4 - -[SortLines] -select = .gitignore -select = .dockerignore \ No newline at end of file diff --git a/perl/INSTALL.md b/perl/INSTALL.md deleted file mode 100644 index 21f50f8b2..000000000 --- a/perl/INSTALL.md +++ /dev/null @@ -1,322 +0,0 @@ 
-# Bystro Annotator Package Installation and Configuration - -## Installation - -These instructions assume that you are in the `perl` directory of the Bystro repository, e.g. `~/bystro/perl`. - -### Installing Bystro Annotator using Docker - -To build a Docker image using the `Dockerfile`, run the following: - -```bash -cd ../ && docker build -t bystro-annotator -f Dockerfile.perl . -# Run Bystro Annotator from the new Docker container; replace with the desired command -# If no command provided, will automatically run bystro-annotate.pl --help -docker run bystro-annotator -``` - -- Commands: - - Run the annotator: `docker run bystro-annotator bystro-annotate.pl --help` - - Build a new Bystro database: `docker run bystro-annotator bystro-build.pl --help` - - Fetch dependencies, before building: `docker run bystro-annotator bystro-utils.pl --help` - -### Installing Bystro Annotator on Bare Metal / Directly on Host Operating System - -The easiest way to install Bystro directly on your machine is to run: - -- Debian/Ubuntu: `../install-apt.sh` -- Centos/Fedora/Amazon Linux: `../install-rpm.sh` - -You will be prompted for "sudo" access to install the necessary system level dependencies. - -### Manual/Custom Install - -The previous instructions configured a local copy of Perl for you, using Perlbrew. If you want to use your system's Perl, -or otherwise control the installation process, follow the "Manual/Custom Install" instructions to give you greater control over installation. - -Else, just skip to the next section [Configure Bystro Annotator](#configuring-the-bystro-annotator). - -First you'll need to install some prerequisites: - -- Debian/Ubuntu: `sudo ../install/install-apt-deps.sh` -- Centos/Fedora/Amazon Linux: `sudo ../install/install-rpm-deps.sh` -- bgzip: `../install/install-htslib.sh ~/.profile ~/.local` - -Bystro relies on a few `Go` programs, which can be installed with the following: - -```bash -# Where to install the Bystro Go programs (will go into ~/.local/go in this case) -BYSTRO_GO_PROGRAMS_INSTALL_DIR=~/.local -# Where to install Go itself (will go into ~/go in this case) -GOLANG_BINARY_INSTALL_DIR=~/ -# Where to add the Go binaries to your PATH -PROFILE_PATH=~/.profile -# Where Bystro is installed -BYSTRO_INSTALL_DIR=~/bystro -# The platform to install Go for -GO_PLATFORM=linux-amd64 -# The version of Go to install -GO_VERSION=1.21.4 - -# BYSTRO_GO_PROGRAMS_INSTALL_DIR and GO_BINARY_INSTALL_DIR directories must exist -mkdir -p $BYSTRO_GO_PROGRAMS_INSTALL_DIR -mkdir -p $GOLANG_BINARY_INSTALL_DIR - -# Assuming we are installing this on linux, on an x86 processor -# and that our login shell environment is stored in ~/.profile (another common one is ~/.bash_profile) -../install/install-go.sh $PROFILE_PATH $GOLANG_BINARY_INSTALL_DIR $BYSTRO_GO_PROGRAMS_INSTALL_DIR $BYSTRO_INSTALL_DIR $GO_PLATFORM $GO_VERSION - -source ~/.profile -``` - -Next, we need to install the Bystro Perl library and its Perl dependencies. The instructions for installing the Bystro Perl library use [`cpm`](https://metacpan.org/pod/App::cpanminus). - -- Alternatively you can use [cpanm](https://metacpan.org/dist/App-cpanminus/view/bin/cpanm), which can be installed with the following: `curl -fsSL https://cpanmin.us | perl - App::cpanminus` -- Just replace every `cpm install --test` and `cpm install` command with `cpanm` - -
- -To install `cpm`, run the following: - -```bash -# Install cpm -curl -fsSL https://raw.githubusercontent.com/skaji/cpm/main/cpm | perl - install App::cpm -``` - -You will need to configure where Perl stores its libraries. By default, `cpm` will install libraries in `./local` in the current directory. - -- You will need to make sure that this path is in your `PERL5LIB` environment variable: - - ```bash - # Assuming you were in the ~/bystro/perl directory when you ran `cpm install`, you would get a folder `~/bystro/perl/local` with the libraries and binaries cpm installed - # We need to add this to our PERL5LIB and PATH environment variables - # You would put these commands in your ~/.profile or ~/.bash_profile - export PERL5LIB=~/bystro/perl/local/lib/perl5:$PERL5LIB - export PATH=~/bystro/perl/local/bin:$PATH - ``` - -- If you want to install libraries and binaries into a different local directory, replace `cpm install` with `cpm install -L=/path/to`, which will cause libraries to be installed in `/path/to/lib/perl5` and binaries into `/path/to/bin`. You will need to make sure that these paths are in your `PERL5LIB` and `PATH` environment variables respectively: - - ```bash - # Assuming you ran `cpm install -L=/path/to` for all of your cpm install commands - # Put this in your ~/.profile or ~/.bash_profile - export PERL5LIB=/path/to/lib/perl5:$PERL5LIB - export PATH=/path/to/bin:$PATH - ``` - -- If you want to install libraries in the default Perl library path, as specified by Perl's @INC, replace the `cpm install` commands with `cpm install -g` - -
- -A few dependencies must be specially separately installed: - -```bash -cpm install --test https://github.com/bystrogenomics/msgpack-perl.git - -ALIEN_INSTALL_TYPE=share cpm install --test Alien::LMDB -cpm install --test LMDB_File - -# no --test option because it has a trivial failure related to formatting of cli help strings -cpm install MouseX::Getopt -``` - -However, if you are using Perl > 5.36.0, you will need to manually install LMDB_File 0.14, which will require `make`: - -```bash -ALIEN_INSTALL_TYPE=share cpm install --test Alien::LMDB -git clone --depth 1 --recurse-submodules https://github.com/salortiz/LMDB_File.git \ - && cd LMDB_File \ - && git checkout 34acb71d7d86575fe7abb3f7ad95e8653019b282 \ - && perl Makefile.PL && make distmeta \ - && ln -s MYMETA.json META.json && ln -s MYMETA.yml META.yml \ - && cpm install --show-build-log-on-failure --test . \ - && cd .. - && rm -rf LMDB_File -``` - -Now you can install the rest of the dependencies: - -```bash - cpm install -``` - -
- -Now you're ready to try Bystro: - -```bash -# First let's run our test suite -cd ~/bystro/perl -prove -r ./t -j$(nproc) - -# Then let's try running bystro-annotate.pl -bystro-annotate.pl --help - -# Expected output -# usage: bystro-annotate.pl [-?cio] [long options...] -# --[no-]help (or -?) Prints this usage information. -# aka --usage -# --input STR... (or -i) Input files. Supports mulitiple files: -# --in file1 --in file2 --in file3 -# aka --in -# --output STR (or -o) Base path for output files: /path/to/output -# aka --out -# --[no-]json Do you want to output JSON instead? -# Incompatible with run_statistics -# --config STR (or -c) Yaml config file path. -# aka --configuration -# --overwrite INT Overwrite existing output file. -# --[no-]read_ahead For dense datasets, use system read-ahead -# --debug NUM -# --verbose INT -# --compress STR Enable compression. Specify the type of -# compression: lz4 gz bgz. `bgz` is an alias -# for gz (gzip); when bgzip is available, it -# will be used and will generate a block -# gzipped file with index -# --[no-]archive Place all outputs into a tarball? -# --run_statistics INT Create per-sample feature statistics (like -# transition:transversions)? -# --delete_temp INT Delete the temporary directory made during -# annotation -# --wantedChr STR Annotate a single chromosome -# aka --chr, --wanted_chr -# --maxThreads INT Number of CPU threads to use (optional) -# aka --threads -# --publisher STR Tell Bystro how to send messages to a -# plugged-in interface (such as a web -# interface) -# --[no-]ignore_unknown_chr Don't quit if we find a non-reference -# chromosome (like ChrUn) -# --json_config STR JSON config file path. Use this if you -# wish to invoke the annotator by file -# passing. -# --result_summary_path STR Where to output the result summary. -# Defaults to STDOUT -``` - -## Configuring the Bystro Annotator - -Once Bystro is installed, we need to download a database for the species/assembly we're going to be analyzing and then configure the Bystro Annotator to use it. - -Database configurations are stored in YAML files in the `config` directory. By default Bystro ships with configurations for human genome assemblies hg19 (`~/bystro/config/hg19.yml`) and hg38 (`~/bystro/config/hg38.yml`), and the Rat assembly rn7 (`~/bystro/config/rn7.yml`) though you can create your own configurations for other species/assemblies. - -### Example Configuration - -1. Download and unpack the human hg38 Bystro database - - ```bash - MY_DATABASE_DIR=/mnt/annotator - sudo mkdir -p $MY_DATABASE_DIR - sudo chown -R $USER:$USER $MY_DATABASE_DIR - cd $MY_DATABASE_DIR - wget https://s3.amazonaws.com/bystro-db/hg38_v11.tar.gz - bgzip -d -c --threads 32 hg38_v11.tar.gz | tar xvf - - ``` - - - You can chooose a directory other than `/mnt/annotator/`; that is just the default expected by ~/bystro/config/hg38.yml. 
If you choose something else, just update the `database_dir` property in the configuration file - - - with `yq`: - -   ```bash -   MY_DATABASE_DIR=/path/somewhere/else -   # Assuming we downloaded and unpacked the database to /path/somewhere/else/hg38_v11 -   # Update the database_dir property in the configuration file using `yq` -   # You can also do this manually by editing the configuration file (in this example ~/bystro/config/hg38.yml) -   yq write -i ~/bystro/config/hg38.yml database_dir $MY_DATABASE_DIR/hg38_v11 -   ``` - - - `tar` is required to unpack the database, which is stored as a compressed tarball, but you can unzip the tarball using `gzip -d -c` instead of `bgzip -d -c --threads 32` if you don't have `bgzip` installed. It will work, just more slowly. - - - You need ~691GB of free space for hg38 and ~376GB of free space for hg19, including the space for the tar.gz archives. - - - The unpacked databases are ~517GB for hg38 and ~283GB for hg19. - -2. (optional) Configure your Bystro Annotator to use a temporary directory with fast local storage, by editing the configuration file's `temp_dir` property to point to a directory on your fast local storage. This directory must be writable by the user running bystro-annotate.pl. - -   If you've installed `yq` this is easy: - -   ```bash -   MY_FAST_LOCAL_TEMP_STORAGE_FOLDER=/mnt/annotator/tmp -   mkdir -p $MY_FAST_LOCAL_TEMP_STORAGE_FOLDER - -   # Or edit ~/bystro/config/hg38.yml file manually -   yq write -i ~/bystro/config/hg38.yml temp_dir $MY_FAST_LOCAL_TEMP_STORAGE_FOLDER -   ``` - -   If `temp_dir` is not set, the files will be written directly to the output directory (see `--output` option in `bystro-annotate.pl`). - -## Databases - -1. [Human (hg38) database](https://s3.amazonaws.com/bystro-db/hg38_v11.tar.gz) -2. [Human (hg19) database](https://s3.amazonaws.com/bystro-db/hg19_v10.tar.gz) -3. [Rat (rn7) database](https://s3.amazonaws.com/bystro-db/rn7.tar.gz) -4. There are no restrictions on species support, but for the open-source Bystro Annotator we currently only build human and rat genomes, and do not guarantee that the open-source version will be up to date. Please create a GitHub issue if you would like us to support others or need updates to the current databases. - -## Running your first annotation - -Example: Annotate an hg38 VCF file: - -```sh -bystro-annotate.pl --config ~/bystro/config/hg38.yml --threads 32 --input gnomad.genomes.v4.0.sites.chr22.vcf.bgz --output test/my_annotation --compress gz -``` - -The above command will annotate the `gnomad.genomes.v4.0.sites.chr22.vcf.bgz` file with the hg38 database, using 32 threads, write the results to the `test` directory, and use `my_annotation` as the prefix for output files.
- -The result of this command will be: - -```sh -Created completion file -{ - "error" : null, - "totalProgress" : 8599234, - "totalSkipped" : 0, - "results" : { - "header" : "my_annotation.annotation.header.json", - "sampleList" : "my_annotation.sample_list", - "annotation" : "my_annotation.annotation.tsv.gz", - "dosageMatrixOutPath" : "my_annotation.dosage.feather", - "config" : "hg38.yml", - "log" : "my_annotation.annotation.log.txt", - "statistics" : { - "qc" : "my_annotation.statistics.qc.tsv", - "json" : "my_annotation.statistics.json", - "tab" : "my_annotation.statistics.tsv" - } - } -} -``` - -Explanation of the output: - -- `my_annotation.annotation.header.json`: The header of the annotated dataset -- `my_annotation.sample_list`: The list of samples in the annotated dataset -- `my_annotation.annotation.tsv.gz`: A block gzipped TSV file with one row per variant and one column per annotation. Can be decompressed with `bgzip` or any program compatible with the gzip format, like `gzip` and `pigz`. -- `my_annotation.dosage.feather`: The dosage matrix file, where the first column is the `locus` column in the format "chr:pos:ref:alt", and columns following that are sample columns, with the dosage of the variant for that sample (0 for homozygous reference, 1 for 1 copy of the alternate allele, 2 for 2, and so on). -1 indicates missing genotypes. The dosage is the expected number of alternate alleles, given the genotype. This is useful for downstream analyses like imputation, or for calculating polygenic risk scores - - This file is in the [Arrow feather format](https://arrow.apache.org/docs/python/feather.html), also known as the "IPC" format. This is an ultra-efficient format for machine learning, and is widely supported in Python libraries like [Pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html), [Polars](https://docs.pola.rs/api/python/stable/reference/api/polars.read_ipc.html), [PyArrow](https://arrow.apache.org/docs/python/generated/pyarrow.feather.read_feather.html), as well as languages like [R](https://arrow.apache.org/docs/r/reference/read_feather.html) and [Julia](https://github.com/apache/arrow-julia) -- `hg38.yml`: The configuration file used for the annotation. You can use this to either re-build the Bystro database from scratch, or to re-run the annotation with the same configuration -- `my_annotation.annotation.log.txt`: The log file for the annotation -- `my_annotation.statistics.tsv`: A TSV file with sample-wise statistics on the annotation -- `my_annotation.statistics.qc.tsv`: A TSV file that lists any samples that failed quality control checks, currently defined as being outside 3 standard deviations from the mean on any of the sample-wise statistics -- `my_annotation.statistics.json`: A JSON file with the same sample-wise statistics on the annotation -- `totalProgress`: The number of variants processed; this is the number of variants passed to the Bystro annotator by the bystro-vcf pre-processor, which performs primary quality control checks, such as excluding sites that have no samples with non-missing genotypes, or which are not FILTER=PASS in the input VCF. We also exclude sites that are not in the Bystro database. In more detail: - - Variants must have FILTER value of PASS or "."
" - - Variants and ref must be ACTG (no structural variants retained) - - Multiallelics are split into separate records, and annotated separately - - MNPs are split into separate SNPs and annotated separately - - Indels are left-aligned - - The first base of an indel must be the reference base after multiallelic decomposition and left-alignment - - If genotypes are provided, entirely missing sites are dropped - -## Developer Resources - -### Coding style and tidying - -The `.perltidyrc` gives the coding style and `tidyall` from [Code::TidyAll](https://metacpan.org/dist/Code-TidyAll) can be used to tidy all files with `tidyall -a`. -Please tidy all files before submitting patches. - -Install tidyall, perltidy, and perlcritic like so: - -```bash -cpanm Code::TidyAll Perl::Tidy Perl::Critic -``` diff --git a/perl/README.md b/perl/README.md deleted file mode 100644 index b263c8b6e..000000000 --- a/perl/README.md +++ /dev/null @@ -1,1033 +0,0 @@ -# Bystro High Dimensional Genomic Annotator Documentation - -## What is the Bystro Annotator? - -In order to make use of genetic data, which is composed of variants (colloquially known as mutations) and the individuals (samples) that have those variants, we need to clean that data and then carefully describe those variants with as much accurate information as possible. This information can then be used for qualitative and quantitative studies of the variants/samples. These descriptions are known as labels, model parameters, features, or "annotations", depending on your field. These activites are collectively called data curation and data labeling, but in genetics we call them quality control and annotation. - -The Bystro Annotator is the fastest and most comprehensive data curation and labeling library in the world for genetic data. It takes 1 or more VCF ([Variant Call Format](https://samtools.github.io/hts-specs/VCFv4.2.pdf)) or SNP ([PEMapper/PECaller](https://www.pnas.org/doi/full/10.1073/pnas.1618065114)) files as input, and outputs a cleaned and thoroughly labeled (annotated) representation of the data, along with a genotype dosage matrix in the [Arrow Feather V2/IPC format](https://arrow.apache.org/docs/python/feather.html), as well as a set of statistics that describe sample-level characteristics of the data. - -Bystro Annotator annotates variants as well as sample genotypes. It is capable of processing millions of samples and billions of mutations on commodity hardware such as a laptop or a workstation. It is roughly **100,000** times faster than [Variant Effect Predictor](https://www.ensembl.org/vep) (VEP), **100** times faster than [Annovar](https://annovar.openbioinformatics.org/en/latest/), and **50** times faster than [Illumina Connected Annotations](https://developer.illumina.com/illumina-connected-annotations), all while outputting more annotations than any of these tools. What would take VEP years to do, Bystro can do in minutes to hours, all without requiring multiple servers. - -Bystro's performance isn't just about speed, it's also about comprehensiveness. In statistics "dimensionality" refers to how many parameters/covariates/descriptions a model has. Imagine that we are modeling a genetic variant to try to understand how it impacts a disease. The parameters in the model are the annotations we have for that variant. Bystro can output thousands of annotations for each variant, and can do so for millions of variants in seconds. This is why we call it "high dimensional". 
- -For example, Bystro can afford to provide complete annotations from gnomAD v4.1, for all gnomAD populations and subpopulations, from the exomes, genomes, and joint datasets, genome-wide. This means intersecting terabytes of data over the entire genome with each individual variant, all done in microseconds per variant. **No other tool can do this**. - -## Running Your First Annotation - -See the [INSTALL.md#configuring-the-bystro-annotator](./INSTALL.md#configuring-the-bystro-annotator) section for instructions on how to configure the Bystro Annotator. - -```sh -bystro-annotate.pl --config ~/bystro/config/hg38.yml --threads 32 --input gnomad.genomes.v4.0.sites.chr22.vcf.bgz --output test/my_annotation --compress gz -``` - -The above command will annotate the `gnomad.genomes.v4.0.sites.chr22.vcf.bgz` file with the hg38 database, using 32 threads, write the results to the `test` directory, and use `my_annotation` as the prefix for output files. - -The result of this command will be: - -```sh -Created completion file -{ - "error" : null, - "totalProgress" : 8599234, - "totalSkipped" : 0, - "results" : { - "header" : "my_annotation.annotation.header.json", - "sampleList" : "my_annotation.sample_list", - "annotation" : "my_annotation.annotation.tsv.gz", - "dosageMatrixOutPath" : "my_annotation.dosage.feather", - "config" : "hg38.yml", - "log" : "my_annotation.annotation.log.txt", - "statistics" : { - "qc" : "my_annotation.statistics.qc.tsv", - "json" : "my_annotation.statistics.json", - "tab" : "my_annotation.statistics.tsv" - } - } -} -``` - -Explanation of the output: - -- `my_annotation.annotation.header.json`: The header of the annotated dataset - -- `my_annotation.sample_list`: The list of samples in the annotated dataset - -- `my_annotation.annotation.tsv.gz`: A block gzipped TSV file with one row per variant and one column per annotation. Can be decompressed with `bgzip` or any program compatible with the gzip format, like `gzip` and `pigz` - -- `my_annotation.dosage.feather`: The dosage matrix file, where the first column is the `locus` column in the format "chr:pos:ref:alt", and columns following that are sample columns, with the dosage of the variant for that sample (0 for homozygous reference, 1 for 1 copy of the alternate allele, 2 for 2, and so on). We do not assume that the sample is diploid and will count all chromosomes. -1 indicates missing genotypes. The dosage is the number of alternate alleles. This is useful for downstream analyses like calculating polygenic risk scores - - - This file is in the [Arrow Feather V2 format](https://arrow.apache.org/docs/python/feather.html), also known as the "IPC" format. This is an ultra-efficient format for machine learning, and is widely supported in Python libraries like [Pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html), [Polars](https://docs.pola.rs/api/python/stable/reference/api/polars.read_ipc.html), [PyArrow](https://arrow.apache.org/docs/python/generated/pyarrow.feather.read_feather.html), as well as languages like [R](https://arrow.apache.org/docs/r/reference/read_feather.html) and [Julia](https://github.com/apache/arrow-julia) - -- `hg38.yml`: The configuration file used for the annotation.
You can use this to either re-build the Bystro Annotation Database from scratch, or to re-run the annotation with the same configuration - -- `my_annotation.annotation.log.txt`: The log file for the annotation - -- `my_annotation.statistics.tsv`: A TSV file with sample-wise statistics on the annotation - -- `my_annotation.statistics.qc.tsv`: A TSV file that lists any samples that failed quality control checks, currently defined as being outside 3 standard deviations from the mean on any of the sample-wise statistics - -- `my_annotation.statistics.json`: A JSON file with the same sample-wise statistics on the annotation - -- `totalProgress`: The number of variants processed; this is the number of variants passed to the Bystro annotator by the bystro-vcf pre-processor, which performs primary quality control checks, such as excluding sites that have no samples with non-missing genotypes, or which are not FILTER=PASS in the input VCF. We also exclude sites that are not in the Bystro Annotation Database. In more detail: - - - Variants must have FILTER value of PASS or "." - - Variants and ref must be ACTG (no structural variants retained) - - Multiallelics are split into separate records, and annotated separately - - MNPs are split into separate SNPs and annotated separately - - Indels are left-aligned - - The first base of an indel must be the reference base after multiallelic decomposition and left-alignment - - If genotypes are provided, entirely missing sites are dropped - -## Let's Take a Closer Look at the Annotation Output - -The Bystro annotation output is a tab-separated file with one header row, and then N rows of annotated variants, one variant per row. The sample genotypes and sample-level statistics for each variant are stored in each row in sparse fashion. The annotations are divided into several categories, each of which is described in detail in the [Bystro Annotation Fields](#bystro-annotation-fields) section. - -As mentioned, corresponding to the annotation output is a genotype dosage matrix output, which contains the dense representation of genotypes, 1 byte per genotype. It is stored in the [Arrow Feather V2 format](https://arrow.apache.org/docs/python/feather.html), with data compressed using `zstd` compression. Arrow Feather V2 is a columnar datastore, so despite the genotype dosage matrix being typically large, it can be read in a streaming fashion, and even in chunks of samples. We find that we can process thousands of samples within just a few gigabytes of RAM. - -## Bystro Annotation Output In Depth - -One of the key advantages of Bystro's design is that it outputs data in such a complete manner that it is possible to re-create the source files used for annotation from the Bystro annotation output. Bystro's output formats are therefore designed to retain and reflect complex nested relationships between variant descriptions. Here are key aspects of how we output the data: - -1. **Array-Based Fields**: Each annotation field is an array. For fields with multiple values (e.g., transcripts, gene names), the values are separated by semicolons (`;`). The order of the values is maintained across related fields to preserve relationships between the data points. For example: - - - `refSeq.name` might contain `NM1;NM2`, and `refSeq.name2` (gene symbols for these transcripts) could be `Gene1;Gene1`.
This ensures the first transcript, `NM1`, corresponds to `Gene1`, and the second transcript, `NM2`, also corresponds to `Gene1`. These relationships are maintained across all fields within a track. - -2. **Nested Arrays**: Some fields may be nested arrays, where the individual entries are further divided using forward slashes (`/`). For example, if a transcript has alternate IDs, you may see `NM1a/NM1b;NM2`, indicating two alternate IDs for the first transcript (NM1a and NM1b) and one for the second (NM2). This preserves the ordered correspondence between related fields. - -3. **Insertions and Deletions**: For transcript-level annotations like `refSeq.*` (refSeq transcript annotations), `nearest.*`, and `nearestTss.*` (nearest gene by transcript boundaries and by distance to the transcription start site respectively), insertions and deletions affecting multiple nucleotides are separated by pipes (`|`). This allows reporting a transcript consequence per disrupted base. - -4. **Reserved Delimiters**: The reserved delimiters described in points 1-3 (`;`, `/`, and `|`) will be stripped and replaced with a comma if found in source inputs to the Bystro Annotation Database. - -## What Information Can Bystro Annotator Output? - -Bystro Annotator is a general-purpose data curation and labeling engine for genetic data, and has no restrictions on the types of annotations/feature labels it can output. Theoretically it can even support binary data, such as images. - -## Variant Representations - -Bystro's variant representation deviates slightly from the standard VCF format in the name of simplicity. In particular, it drops the rule that the alternate allele must be ACTG. Dropping this single restriction allows us to represent deletions as occurring at the actual first deleted base, rather than the base before, as in the VCF format. This has a number of knock-on benefits: - -- The `inputRef` (reference base) in Bystro's annotation outputs is always exactly 1 base long - -- The `pos` (position) in Bystro's annotation outputs is always the first affected base, except in the case of insertions, where it is the base before the insertion, since the insertion by definition is between two reference bases - -- It is possible to represent all multiallelic sites using a single reference base, a single position, and a list of alleles - -The Bystro Genotype Dosage Matrix is a columnar dataset, generated for every collection of VCFs submitted. Its first column is the `locus`, which is `chr:pos:ref:alt`. Every column after that is labeled by the sample name, and contains a -1 for missing genotypes, 0 for reference, 1 for a single alternate allele, 2 for 2 alternate alleles (homozygous in a diploid organism), and so on. It can be used for many things, such as to calculate polygenic risk scores. - -### Comparing the VCF and Bystro Variant Representations - -Below we'll demonstrate how the Bystro Annotator handles different kinds of variants, using some examples. We'll do these demonstrations using the Bystro VCF preprocessor, which is a Go program used by the Bystro Annotator to convert a VCF into a partially annotated tab-separated output. The Bystro VCF preprocessor is installed with Bystro Annotator (see the [INSTALL.md](./INSTALL.md) file for instructions on how to install Bystro Annotator). If you don't have Bystro Annotator installed, you can still run the examples as long as you install the Bystro VCF preprocessor by running `go install github.com/bystrogenomics/bystro-vcf@2.2.3`.
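- -Before diving into the examples, here is a minimal Python sketch of how the delimiter scheme described above can be unpacked; the cell value is hypothetical: - -```python -# Unpack one Bystro annotation cell through its delimiter levels: -# ';' separates array values (e.g., one per transcript) and '/' -# separates nested values within an entry ('|' is handled analogously -# for per-base indel consequences). -cell = "NM1a/NM1b;NM2"  # hypothetical refSeq-style cell -values = [entry.split("/") for entry in cell.split(";")] -print(values)  # [['NM1a', 'NM1b'], ['NM2']] -```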
- -Please note that we are not showing the full Bystro Annotator output described below. We're showing just the first 17 columns of the output, which are the most important for understanding the variant representation and sample genotype handling. - -```sh -cat ~/bystro/perl/example_vcf.tsv | bystro-vcf --keepId --emptyField "NA" --keepPos -``` - -If you also want to output the **Bystro Genotype Dosage Matrix**, at a small performance hit, you can run: - -```sh -cat ~/bystro/perl/example_vcf.tsv | bystro-vcf --keepId --emptyField "NA" --keepPos --dosageOutput example_vcf_dosage_matrix.feather -``` - -Input Example VCF: - -| CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | FORMAT | NA00001 | NA00002 | NA00003 | -| ----- | ------- | --------------------------- | ---- | ------- | ---- | ------ | --------------------------------- | ----------- | -------------- | -------------- | -------------- | -| 20 | 1 | SIMPLE_SNP | A | T | 50 | PASS | . | GT:GQ:DP:HQ | 0/1:54:7:56,60 | 0/0:48:4:51,51 | 0/0:48:4:51,51 | -| 20 | 1110696 | MULTIALLELIC_SNP | A | G,T | 67 | PASS | NS=2;DP=10;AF=0.333,0.667;AA=T;DB | GT:GQ:DP | 1/2:21:6 | 2/1:2:0 | 2/2:35:4 | -| 20 | 1 | SIMPLE_INSERTION | A | AC | 50 | PASS | . | GT:GQ:DP | 0/0:54:7 | 0/0:48:4 | 1/0:61:2 | -| 20 | 1 | INSERTION_BETWEEN_TWO_BASES | AT | ACCT | 50 | PASS | . | GT:GQ:DP | 0/1:35:4 | 0/1:17:2 | 1/1:40:3 | -| 20 | 1234567 | microsat1 | GTCT | G,GTACT | 50 | PASS | . | GT:GQ:DP | 0/1:35:4 | 0/2:17:2 | 1/1:40:3 | -| 20 | 3 | EXAMPLE_MISSING_MNP | CCC | AAA | 50 | PASS | NS=3;DP=9;AA=G | GT | ./1 | 0/0 | 1/1 | - -Expected Bystro VCF preprocessor Output: - -| chrom | pos | type | inputRef | alt | trTv | heterozygotes | heterozygosity | homozygotes | homozygosity | missingGenos | missingness | ac | an | sampleMaf | vcfPos | id | -| ----- | ------- | ------------ | -------- | --- | ---- | --------------- | -------------- | ----------- | ------------ | ------------ | ----------- | --- | --- | --------- | ------- | --------------------------- | -| chr20 | 1 | SNP | A | T | 2 | NA00001 | 0.333 | NA | 0 | NA | 0 | 1 | 6 | 0.167 | 1 | SIMPLE_SNP | -| chr20 | 1110696 | MULTIALLELIC | A | G | 0 | NA00001;NA00002 | 0.667 | NA | 0 | NA | 0 | 2 | 6 | 0.333 | 1110696 | MULTIALLELIC_SNP | -| chr20 | 1110696 | MULTIALLELIC | A | T | 0 | NA00001;NA00002 | 0.667 | NA00003 | 0.333 | NA | 0 | 4 | 6 | 0.667 | 1110696 | MULTIALLELIC_SNP | -| chr20 | 1 | INS | A | +C | 0 | NA00003 | 0.333 | NA | 0 | NA | 0 | 1 | 6 | 0.167 | 1 | SIMPLE_INSERTION | -| chr20 | 1 | INS | A | +CC | 0 | NA00001;NA00002 | 0.667 | NA00003 | 0.333 | NA | 0 | 4 | 6 | 0.667 | 1 | INSERTION_BETWEEN_TWO_BASES | -| chr20 | 1234568 | MULTIALLELIC | T | -3 | 0 | NA00001 | 0.333 | NA00003 | 0.333 | NA | 0 | 3 | 6 | 0.5 | 1234567 | microsat1 | -| chr20 | 1234568 | MULTIALLELIC | T | +A | 0 | NA00002 | 0.333 | NA | 0 | NA | 0 | 1 | 6 | 0.167 | 1234567 | microsat1 | -| chr20 | 3 | MNP | C | A | 2 | NA | 0 | NA00003 | 0.5 | NA00001 | 0.333 | 2 | 4 | 0.5 | 3 | EXAMPLE_MISSING_MNP | -| chr20 | 4 | MNP | C | A | 2 | NA | 0 | NA00003 | 0.5 | NA00001 | 0.333 | 2 | 4 | 0.5 | 3 | EXAMPLE_MISSING_MNP | -| chr20 | 5 | MNP | C | A | 2 | NA | 0 | NA00003 | 0.5 | NA00001 | 0.333 | 2 | 4 | 0.5 | 3 | EXAMPLE_MISSING_MNP | - -Expected Bystro Genotype Dosage Matrix Output: - -```python -import pandas as pd -df = pd.read_feather('example_vcf_dosage_matrix.feather') -print(df) -``` - -| locus | NA00001 | NA00002 | NA00003 | -| ------------------ | ------- | ------- | ------- | -| chr20:1:A:T | 1 | 0 | 0 | -| chr20:1110696:A:G | 1 |
1 | 0 | -| chr20:1110696:A:T | 1 | 1 | 2 | -| chr20:1:A:+C | 0 | 0 | 1 | -| chr20:1:A:+CC | 1 | 1 | 2 | -| chr20:1234568:T:-3 | 1 | 0 | 2 | -| chr20:1234568:T:+A | 0 | 1 | 0 | -| chr20:3:C:A | -1 | 0 | 2 | -| chr20:4:C:A | -1 | 0 | 2 | -| chr20:5:C:A | -1 | 0 | 2 | - -- Note that missing genotypes are represented as -1 in the genotype dosage matrix output -- If a sample's genotype for a variant has any missing alleles, we consider that sample as missing for the variant: in the annotation output it is added to the `missingGenos` column and the zygosity (typically 2 for humans) is subtracted from `an`, while in the genotype dosage matrix, the sample's genotype dosage is `-1` -- If every sample's genotype for a variant has 1+ missing alleles, the variant is dropped from all outputs - -#### Explanation for SIMPLE_SNP - -VCF Representation: - -| CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | FORMAT | NA00001 | NA00002 | NA00003 | -| ----- | --- | ---------- | --- | --- | ---- | ------ | ---- | ----------- | -------------- | -------------- | -------------- | -| 20 | 1 | SIMPLE_SNP | A | T | 50 | PASS | . | GT:GQ:DP:HQ | 0/1:54:7:56,60 | 0/0:48:4:51,51 | 0/0:48:4:51,51 | - -Bystro Representation: - -| chrom | pos | type | inputRef | alt | trTv | heterozygotes | heterozygosity | homozygotes | homozygosity | missingGenos | missingness | ac | an | sampleMaf | vcfPos | id | -| ----- | --- | ---- | -------- | --- | ---- | ------------- | -------------- | ----------- | ------------ | ------------ | ----------- | --- | --- | --------- | ------ | ---------- | -| chr20 | 1 | SNP | A | T | 2 | NA00001 | 0.333 | NA | 0 | NA | 0 | 1 | 6 | 0.167 | 1 | SIMPLE_SNP | - -Bystro Genotype Dosage Matrix Output: - -| locus | NA00001 | NA00002 | NA00003 | -| ----------- | ------- | ------- | ------- | -| chr20:1:A:T | 1 | 0 | 0 | - -The Bystro and VCF formats for simple, well-normalized SNPs are the same. In addition to the position, variant type, reference, and alternate, Bystro's VCF preprocessor (bystro-vcf) also outputs whether a variant is a transition (1), a transversion (2), or neither (0), descriptive information about the genotypes, including which samples are heterozygotes, homozygotes, or missing genotypes, vcfPos (which describes the original position in the VCF file, pre-normalization), and the VCF ID. Meanwhile the genotype dosage matrix output shows the number of alternate alleles for each sample at each variant.
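- -As a quick sanity check, the per-allele counts in the annotation output can be recomputed from the dosage matrix with a few lines of Python; this sketch assumes diploid samples and treats `-1` as missing, as described above: - -```python -import pandas as pd - -df = pd.read_feather("example_vcf_dosage_matrix.feather") -row = df.set_index("locus").loc["chr20:3:C:A"]  # dosages: -1, 0, 2 -non_missing = row[row >= 0] -an = 2 * len(non_missing)    # two alleles per non-missing diploid sample -ac = int(non_missing.sum())  # each dosage is an alternate allele count -print(ac, an, ac / an)       # 2 4 0.5, matching the annotation table -```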
- -#### Explanation for MULTIALLELIC_SNP - -VCF Representation: - -| CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | FORMAT | NA00001 | NA00002 | NA00003 | -| ----- | ------- | ---------------- | --- | --- | ---- | ------ | --------------------------------- | -------- | -------- | ------- | -------- | -| 20 | 1110696 | MULTIALLELIC_SNP | A | G,T | 67 | PASS | NS=2;DP=10;AF=0.333,0.667;AA=T;DB | GT:GQ:DP | 1/2:21:6 | 2/1:2:0 | 2/2:35:4 | - -Bystro Representation: - -| chrom | pos | type | inputRef | alt | trTv | heterozygotes | heterozygosity | homozygotes | homozygosity | missingGenos | missingness | ac | an | sampleMaf | vcfPos | id | -| ----- | ------- | ------------ | -------- | --- | ---- | --------------- | -------------- | ----------- | ------------ | ------------ | ----------- | --- | --- | --------- | ------- | ---------------- | -| chr20 | 1110696 | MULTIALLELIC | A | G | 0 | NA00001;NA00002 | 0.667 | NA | 0 | NA | 0 | 2 | 6 | 0.333 | 1110696 | MULTIALLELIC_SNP | -| chr20 | 1110696 | MULTIALLELIC | A | T | 0 | NA00001;NA00002 | 0.667 | NA00003 | 0.333 | NA | 0 | 4 | 6 | 0.667 | 1110696 | MULTIALLELIC_SNP | - -The VCF representation shows two different SNPs at the same position. NA00001 and NA00002 have 1 copy of each allele, while NA00003 has 2 copies of the A>T allele and 0 copies of A>G. Bystro's representation decomposes the multiallelic site into two separate rows, one for each allele. The first row shows the A>G allele, and the second row shows the A>T allele. Since NA00001 and NA00002 are heterozygous for both A>G and A>T, on each line they are listed in the heterozygotes column, while NA00003 is homozygous for A>T and is listed in the homozygotes column only for the A>T allele row. The heterozygosity, homozygosity, and sampleMaf (sample minor allele frequency) fields are calculated based on the allele in the row. - -#### Explanation for SIMPLE_INSERTION - -VCF Representation: - -| CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | FORMAT | NA00001 | NA00002 | NA00003 | -| ----- | --- | ---------------- | --- | --- | ---- | ------ | ---- | -------- | -------- | -------- | -------- | -| 20 | 1 | SIMPLE_INSERTION | A | AC | 50 | PASS | . | GT:GQ:DP | 0/0:54:7 | 0/0:48:4 | 1/0:61:2 | - -Bystro Representation: - -| chrom | pos | type | inputRef | alt | trTv | heterozygotes | heterozygosity | homozygotes | homozygosity | missingGenos | missingness | ac | an | sampleMaf | vcfPos | id | -| ----- | --- | ---- | -------- | --- | ---- | ------------- | -------------- | ----------- | ------------ | ------------ | ----------- | --- | --- | --------- | ------ | ---------------- | -| chr20 | 1 | INS | A | +C | 0 | NA00003 | 0.333 | NA | 0 | NA | 0 | 1 | 6 | 0.167 | 1 | SIMPLE_INSERTION | - -The VCF representation shows an insertion of a C base after the A base at position 1. Bystro's representation shows the insertion as occurring at the A base, with the reference base being A and the alternate allele being +C. The heterozygotes column lists NA00003 as heterozygous for the insertion. - -#### Explanation for INSERTION_BETWEEN_TWO_BASES - -VCF Representation: - -| CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | FORMAT | NA00001 | NA00002 | NA00003 | -| ----- | --- | --------------------------- | --- | ---- | ---- | ------ | ---- | -------- | -------- | -------- | -------- | -| 20 | 1 | INSERTION_BETWEEN_TWO_BASES | AT | ACCT | 50 | PASS | .
| GT:GQ:DP | 0/1:35:4 | 0/1:17:2 | 1/1:40:3 | - -Bystro Representation: - -| chrom | pos | type | inputRef | alt | trTv | heterozygotes | heterozygosity | homozygotes | homozygosity | missingGenos | missingness | ac | an | sampleMaf | vcfPos | id | -| ----- | --- | ---- | -------- | --- | ---- | --------------- | -------------- | ----------- | ------------ | ------------ | ----------- | --- | --- | --------- | ------ | --------------------------- | -| chr20 | 1 | INS | A | +CC | 0 | NA00001;NA00002 | 0.667 | NA00003 | 0.333 | NA | 0 | 4 | 6 | 0.667 | 1 | INSERTION_BETWEEN_TWO_BASES | - -The VCF representation shows an insertion of CC between the A and T bases. Bystro's representation shows the insertion as occurring after the A base, with the reference base being A and the alternate allele being +CC. NA00001 and NA00002 are heterozygous for the insertion, while NA00003 is homozygous for the insertion and therefore listed in the homozygotes column. - -#### Explanation for microsat1 - -VCF Representation: - -| CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | FORMAT | NA00001 | NA00002 | NA00003 | -| ----- | ------- | --------- | ---- | ------- | ---- | ------ | ---- | -------- | -------- | -------- | -------- | -| 20 | 1234567 | microsat1 | GTCT | G,GTACT | 50 | PASS | . | GT:GQ:DP | 0/1:35:4 | 0/2:17:2 | 1/1:40:3 | - -Bystro Representation: - -| chrom | pos | type | inputRef | alt | trTv | heterozygotes | heterozygosity | homozygotes | homozygosity | missingGenos | missingness | ac | an | sampleMaf | vcfPos | id | -| ----- | ------- | ------------ | -------- | --- | ---- | ------------- | -------------- | ----------- | ------------ | ------------ | ----------- | --- | --- | --------- | ------- | --------- | -| chr20 | 1234568 | MULTIALLELIC | T | -3 | 0 | NA00001 | 0.333 | NA00003 | 0.333 | NA | 0 | 3 | 6 | 0.5 | 1234567 | microsat1 | -| chr20 | 1234568 | MULTIALLELIC | T | +A | 0 | NA00002 | 0.333 | NA | 0 | NA | 0 | 1 | 6 | 0.167 | 1234567 | microsat1 | - -The VCF representation shows a multiallelic site with two alleles. The first allele is GTCT>G at position 1234567, because the VCF format's POS is the first base of the reference. In reality, this deletion is the deletion of the TCT bases starting at position 1234568, but because of VCF's padding requirements, the VCF format cannot show it as such. Bystro shows this allele at the correct position, 1234568, as a `-3`. Two samples have this allele: NA00001 and NA00003. NA00001 is heterozygous and NA00003 is homozygous, and they are listed as such in the Bystro output. - -The second allele is GTCT>GTACT, with the insertion of an "A" occurring after the "T" base at position 1234568. Again, because of the VCF format's padding rule, this representation cannot be shown directly in the VCF format, but must be inferred. Bystro normalizes the representation, showing the insertion at the correct base, 1234568.
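- -The trimming logic just described can be sketched in a few lines of Python; this is an illustrative re-implementation of the normalization rules, not the actual Go preprocessor code: - -```python -def normalize(pos, ref, alt): -    # Trim the shared suffix, keeping at least one base on each side -    while min(len(ref), len(alt)) > 1 and ref[-1] == alt[-1]: -        ref, alt = ref[:-1], alt[:-1] -    # Trim the shared prefix -    i = 0 -    while i < min(len(ref), len(alt)) and ref[i] == alt[i]: -        i += 1 -    if len(ref) > len(alt):  # deletion, reported at the first deleted base -        return pos + i, ref[i], "-" + str(len(ref) - i) -    if len(alt) > len(ref):  # insertion, reported after the preceding base -        return pos + i - 1, ref[i - 1], "+" + alt[i:] -    return pos, ref, alt  # SNP: already normalized - -print(normalize(1234567, "GTCT", "G"))  # (1234568, 'T', '-3') -print(normalize(1234567, "GTCT", "GTACT"))  # (1234568, 'T', '+A') -```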
- -#### Explanation for EXAMPLE_MISSING_MNP - -VCF Representation: - -| CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | FORMAT | NA00001 | NA00002 | NA00003 | -| ----- | --- | ------------------- | --- | --- | ---- | ------ | -------------- | ------ | ------- | ------- | ------- | -| 20 | 3 | EXAMPLE_MISSING_MNP | CCC | AAA | 50 | PASS | NS=3;DP=9;AA=G | GT | ./1 | 0/0 | 1/1 | - -Bystro Representation: - -| chrom | pos | type | inputRef | alt | trTv | heterozygotes | heterozygosity | homozygotes | homozygosity | missingGenos | missingness | ac | an | sampleMaf | vcfPos | id | -| ----- | --- | ---- | -------- | --- | ---- | ------------- | -------------- | ----------- | ------------ | ------------ | ----------- | --- | --- | --------- | ------ | ------------------- | -| chr20 | 3 | MNP | C | A | 2 | NA | 0 | NA00003 | 0.5 | NA00001 | 0.333 | 2 | 4 | 0.5 | 3 | EXAMPLE_MISSING_MNP | -| chr20 | 4 | MNP | C | A | 2 | NA | 0 | NA00003 | 0.5 | NA00001 | 0.333 | 2 | 4 | 0.5 | 3 | EXAMPLE_MISSING_MNP | -| chr20 | 5 | MNP | C | A | 2 | NA | 0 | NA00003 | 0.5 | NA00001 | 0.333 | 2 | 4 | 0.5 | 3 | EXAMPLE_MISSING_MNP | - -The VCF representation shows a multi-nucleotide polymorphism (MNP) at position 3, where 3 bases are changed from CCC to AAA. An MNP is really 3 single nucleotide polymorphisms next to each other, typically linked on the same chromosome. Bystro decomposes the MNP into 3 separate rows, each with a single nucleotide change. The first row shows the first base change, the second row shows the second base change, and the third row shows the third base change. NA00001 was unsuccessfully typed, with 1 of its 2 chromosomes having an ambiguous or low quality genotype ("."). Bystro, to be conservative ("garbage in means garbage out"), counts this sample as having a missing genotype, and subtracts 2 from the `an` (allele number). - -## What is the Bystro Annotation Database? - -To output annotations, the user must point Bystro Annotator at a Bystro Annotation Database, which is a high-performance embedded memory-mapped database used by the Bystro Annotator to label variants. Three default databases are provided, for humans (hg19 and hg38) and rat (rn7). See the [INSTALL.md#databases](./INSTALL.md#databases) section for more information on how to download these databases. - -Key points: - -- A Bystro Annotation Database is a high-performance memory-mapped key-value database that uses the Lightning Memory-Mapped Database (LMDB) engine. It supports millions of lookups per second on a single machine, and can be used to store and retrieve annotations for millions of variants. - -- Bystro Annotation Databases can be re-created from the YAML configuration file corresponding to that database, and new databases with different information can be created by editing the YAML configuration file, and re-running the Bystro Annotation Database creation process. - -- To create a Bystro Annotation Database, the user needs to provide a YAML configuration file that specifies all of the source file locations, the location to write the database, and the tracks/fields to output, and then runs `bystro-build.pl --config /path/to/config`. This will create a Bystro Annotation Database that can be used to annotate VCF or SNP files. - -## Annotation Fields for Default Human Assembly hg38 and hg19 Bystro Annotation Databases - -The default Bystro Annotation Databases for humans (hg38 and hg19) contain a large number of annotations, including transcript annotations, regulatory annotations, and conservation scores.
Below are the default fields outputted by the Bystro Annotator when using the latest default hg38 and hg19 databases. See [INSTALL.md#databases](./INSTALL.md#databases) for more information on how to download these databases. - -The total number of annotation fields outputted when using the default human hg38 Bystro Annotation Database is 226. - -### Basic Fields - -Sourced from the input file, or calculated based on input fields from the VCF or SNP file pre-processor. - -`chrom` - chromosome, always prepended with "chr" - -`pos` - genomic position after Bystro normalizes variant representations - -- Positions always correspond to the first affected base. - -`type` - the type of variant - -- VCF format types: `SNP`, `INS`, `DEL`, `MULTIALLELIC` - - Multi-nucleotide polymorphisms are decomposed into separate rows of `type` "SNP", each of 1 variant. In the next release of Bystro these will be called "MNP" and have a "linkage" property to enable you to reconstruct the original MNP, while still retaining the full set of per-SNP annotations - - Multiallelics are decomposed into separate rows, but retain the "MULTIALLELIC" `type` -- SNP format types: `SNP`, `INS`, `DEL`, `MULTIALLELIC`, `DENOVO` - -`inputRef` - the reference base, as it is present in the input file, after variant and reference normalization. - -- This is generated by the input file pre-processor (hence `input` in the name), and is always 1 base long - the affected reference base at that position - -`alt` - the alternate/nonreference allele - -- VCF multi-allelic and MNP sites are decomposed into individual entries of a single allele. - - Genotypes are properly segregated per allele - -`trTv` - whether the variant at this position is a transition, a transversion, or neither - -- Possible values: `0`, `1`, `2` - - 0 indicates neither transition nor transversion (which occurs when the alternate allele is an insertion or deletion) - - 1 is a transition (purine -> purine or pyrimidine -> pyrimidine) - - 2 is a transversion (purine -> pyrimidine or vice versa) - -`heterozygotes` - The array of heterozygous sample labels - -`heterozygosity` - The fraction of all samples (excluding missing samples) that are heterozygous for the alternate allele - -`homozygotes` - The array of homozygous sample labels - -`homozygosity` - The fraction of all samples that are homozygous for the alternate allele - -`missingGenos` - The samples that did not have a genotype (e.g., ".") at the site. If an individual has at least 1 missing genotype, they are considered missing for the site.
- -- For instance, `0|.`, `.|.`, `.|0` are all considered missing and result in the sample being added to the `missingGenos` field, and the `an` being decremented by the zygosity (typically 2 for humans) - -`missingness` - The fraction of all samples that have missing genotypes for the alternate allele - -`ac` - The alternate allele count - -`an` - The total non-missing allele count - -`sampleMaf` - The in-sample alternate allele frequency - -`vcfPos` - The original VCF `POS`, unaffected by Bystro normalization transformations - -`id` - The VCF `ID` field, if any - -`discordant` - TRUE if the reference base provided in the input VCF does not match the Bystro-annotated UCSC reference, FALSE otherwise - -`ref` - The Bystro-annotated reference base(s), from the 'ref' track in the Bystro Annotation Database - -- In the default Bystro Annotation Database, this is sourced from the UCSC reference genome -- In custom Bystro Annotation Databases, this can be sourced from any reference genome -- In the case of insertions the `ref` will be 2 bases long, the base just before the insertion, and the one right after -- In the case of deletions, the ref will be as long as the deletion, up to 32 bases (after that, the ref will be truncated) - -
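- -To see these basic fields in practice, the annotation TSV from the earlier example can be loaded directly with pandas; this is an illustrative sketch, with the file name and column names taken from above: - -```python -import pandas as pd - -# pandas transparently decompresses gzip-framed files, including bgzip output -ann = pd.read_csv("my_annotation.annotation.tsv.gz", sep="\t") -print(ann[["chrom", "pos", "type", "inputRef", "alt", "trTv", "sampleMaf"]].head()) -```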
- -### Transcript Annotations - -In the default Bystro Annotation Database, we source transcript annotations from the UCSC refGene track, joined on other UCSC tracks: knownToEnsembl, kgXref, knownCanonical. - -- See [refGene](https://genome.ucsc.edu/cgi-bin/hgTables?db=hg38&hgta_group=genes&hgta_track=refSeqComposite&hgta_table=refGene&hgta_doSchema=describe+table+schema) and [kgXref](https://genome.cse.ucsc.edu/cgi-bin/hgTables?hgsid=1893397768_GDljX7p8FQaqUVJ3FZD1cSUFpeV2&hgta_doSchemaDb=hg38&hgta_doSchemaTable=kgXref) for more information - -- In custom Bystro Annotation Databases, these annotations can be sourced from any UCSC transcript track, and multiple such `gene` type tracks can be defined in a single Bystro Annotation Database (annotations for all will be outputted) - -- **When a site is intergenic, all `refSeq` annotations will be `NA`** - -`refSeq.siteType` - the kind of effect the `alt` allele has on this transcript. - -- Possible types: `intronic`, `exonic`, `UTR3`, `UTR5`, `spliceAcceptor`, `spliceDonor`, `ncRNA` - -`refSeq.exonicAlleleFunction` - The coding effect of the variant - -- Possible values: `synonymous`, `nonSynonymous`, `indel-nonFrameshift`, `indel-frameshift`, `stopGain`, `stopLoss`, `startLoss` -- This will be `NA` for non-coding `siteType` - -`refSeq.refCodon` - The reference codon based on _in silico_ transcription of the reference assembly - -`refSeq.altCodon` - The _in silico_ transcribed codon after modification by the `alt` allele - -`refSeq.refAminoAcid` - The amino acid based on _in silico_ translation of the reference transcript - -`refSeq.altAminoAcid` - The _in silico_ translated amino acid after modification by the `alt` allele - -`refSeq.codonPosition` - The site's position within the codon (1, 2, 3) - -`refSeq.codonNumber` - The codon number within the transcript - -`refSeq.strand` - The positive or negative (Watson or Crick) strand - -`refSeq.name` - RefSeq transcript ID - -`refSeq.name2` - RefSeq gene symbol - -`refSeq.description` - The long form description of the RefSeq transcript - -`refSeq.kgID` - UCSC's Known Genes ID - -`refSeq.mRNA` - The mRNA ID, the transcript ID starting with NM\_ - -`refSeq.spID` - UniProt protein accession number - -`refSeq.spDisplayID` - UniProt display ID - -`refSeq.protAcc` - NCBI protein accession number - -`refSeq.rfamAcc` - Rfam accession number - -`refSeq.tRnaName` - Name from the tRNA track - -`refSeq.ensemblID` - The Ensembl transcript id - -`refSeq.isCanonical` - Whether the transcript is the canonical splice variant for the gene - -
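- -Because these are `;`-delimited array fields, a hedged sketch of filtering for variants with any nonSynonymous transcript consequence (using the `ann` DataFrame from the previous sketch) might look like this: - -```python -# Keep rows where any transcript consequence is nonSynonymous. -# Array fields are ';'-delimited strings, so a substring match on the -# raw cell is a simple, slightly permissive filter. -col = "refSeq.exonicAlleleFunction" -nonsyn = ann[ann[col].astype(str).str.contains("nonSynonymous")] -print(nonsyn[["chrom", "pos", "alt", "refSeq.name2", col]].head()) -```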
- -### nearest.refSeq - -The nearest transcript(s), calculated by transcript start and end boundaries. Transcripts that are equidistant are all outputted. - -`nearest.refSeq.name` - The nearest transcript(s) RefSeq transcript ID - -`nearest.refSeq.name2` - The nearest transcript(s) RefSeq gene symbol - -`nearest.refSeq.dist` - The distance to these transcripts. Negative values indicate the site is downstream of the transcript - -
- -### nearestTss.refSeq - -The nearest transcript(s), calculated by the distance to the nearest transcript start site (TSS). Transcripts with the same TSS are all outputted. - -`nearestTss.refSeq.name` - The nearest transcript(s) RefSeq transcript ID - -`nearestTss.refSeq.name2` - The nearest transcript(s) RefSeq gene symbol - -`nearestTss.refSeq.dist` - A single value; the distance to these transcripts' transcription start site. Negative values indicate the site is downstream of the TSS - -
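- -As a short usage sketch (distance threshold hypothetical, `ann` and pandas as in the earlier sketches), sites close to a transcription start site can be selected by filtering on `nearestTss.refSeq.dist`: - -```python -# Sites within 5 kb of the nearest TSS (hypothetical threshold). -dist = pd.to_numeric(ann["nearestTss.refSeq.dist"], errors="coerce") -near_tss = ann[dist.abs() <= 5000] -print(near_tss[["chrom", "pos", "nearestTss.refSeq.name2"]].head()) -```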
- -### gnomAD Annotations - -Annotations from gnomAD v4.1 (hg38 assembly) or v2.1.1 (hg19 assembly), covering the genome, exome, and (for v4.1) joint call sets. - -Since the data available for hg19 and hg38 differ, we will discuss them separately below. - -
- -### hg38 gnomad.joint - -Annotations from the gnomAD v4.1 (hg38 assembly annotations) joint set - -`gnomad.joint.alt`: The Bystro VCF-preprocessor's ALT record for this gnomAD site. This should always match the row's `alt` field value - -`gnomad.joint.id`: The VCF `ID` field - -`gnomad.joint.AF_exomes`: Alternate allele frequency in exomes - -`gnomad.joint.AN_exomes`: Total number of alleles in exomes - -`gnomad.joint.AF_genomes`: Alternate allele frequency in genomes - -`gnomad.joint.AF_joint`: Alternate allele frequency in joint subset - -`gnomad.joint.AN_joint`: Total number of alleles in joint subset - -`gnomad.joint.AN_genomes`: Total number of alleles in genomes - -`gnomad.joint.AF_joint_XX`: Alternate allele frequency in XX samples in joint subset - -`gnomad.joint.AN_joint_XX`: Total number of alleles in XX samples in joint subset - -`gnomad.joint.AF_joint_XY`: Alternate allele frequency in XY samples in joint subset - -`gnomad.joint.AN_joint_XY`: Total number of alleles in XY samples in joint subset - -`gnomad.joint.AF_joint_afr`: Alternate allele frequency in samples of African/African-American ancestry in joint subset - -`gnomad.joint.AN_joint_afr`: Total number of alleles in samples of African/African-American ancestry in joint subset - -`gnomad.joint.AF_joint_ami`: Alternate allele frequency in samples of Amish ancestry in joint subset - -`gnomad.joint.AN_joint_ami`: Total number of alleles in samples of Amish ancestry in joint subset - -`gnomad.joint.AF_joint_amr`: Alternate allele frequency in samples of Latino ancestry in joint subset - -`gnomad.joint.AN_joint_amr`: Total number of alleles in samples of Latino ancestry in joint subset - -`gnomad.joint.AF_joint_asj`: Alternate allele frequency in samples of Ashkenazi Jewish ancestry in joint subset - -`gnomad.joint.AN_joint_asj`: Total number of alleles in samples of Ashkenazi Jewish ancestry in joint subset - -`gnomad.joint.AF_joint_eas`: Alternate allele frequency in samples of East Asian ancestry in joint subset - -`gnomad.joint.AN_joint_eas`: Total number of alleles in samples of East Asian ancestry in joint subset - -`gnomad.joint.AF_joint_fin`: Alternate allele frequency in samples of Finnish ancestry in joint subset - -`gnomad.joint.AN_joint_fin`: Total number of alleles in samples of Finnish ancestry in joint subset - -`gnomad.joint.AF_joint_mid`: Alternate allele frequency in samples of Middle Eastern ancestry in joint subset - -`gnomad.joint.AN_joint_mid`: Total number of alleles in samples of Middle Eastern ancestry in joint subset - -`gnomad.joint.AF_joint_nfe`: Alternate allele frequency in samples of Non-Finnish European ancestry in joint subset - -`gnomad.joint.AN_joint_nfe`: Total number of alleles in samples of Non-Finnish European ancestry in joint subset - -`gnomad.joint.AF_joint_raw`: Alternate allele frequency in samples, before removing low-confidence genotypes in joint dataset - -`gnomad.joint.AN_joint_raw`: Total number of alleles in samples, before removing low-confidence genotypes in joint dataset - -`gnomad.joint.AF_joint_remaining`: Alternate allele frequency in samples in the Remaining individuals genetic ancestry group in joint dataset - -`gnomad.joint.AN_joint_remaining`: Total number of alleles in samples in the Remaining individuals genetic ancestry group in joint dataset - -`gnomad.joint.AF_joint_sas`: Alternate allele frequency in samples in the South Asian genetic ancestry group in joint dataset - -`gnomad.joint.AN_joint_sas`: Total number of alleles in samples in the South Asian genetic
ancestry group in joint dataset - -`gnomad.joint.AF_grpmax_joint`: Maximum allele frequency across genetic ancestry groups in the joint subset - -`gnomad.joint.AN_grpmax_joint`: Total number of alleles in the genetic ancestry group with the maximum allele frequency in the joint subset - -
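- -As a hedged sketch of how these frequency fields are typically used (threshold hypothetical, `ann` and pandas as in the earlier sketches), one might keep variants that are rare in the gnomAD joint call set: - -```python -# Rare variants: joint gnomAD allele frequency below 0.1%, or absent. -af = pd.to_numeric(ann["gnomad.joint.AF_joint"], errors="coerce") -rare = ann[af.fillna(0) < 0.001] -print(len(rare), "rare variants") -```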
- -### hg38 gnomad.genomes - -Annotations from the gnomAD v4.1 whole-genome set - -`gnomad.genomes.alt`: The Bystro VCF-preprocessor's ALT record for this gnomAD site. This should always match the row's `alt` field value - -`gnomad.genomes.id`: The VCF `ID` field - -`gnomad.genomes.spliceai_ds_max`: Illumina's SpliceAI max delta score; interpreted as the probability of the variant being splice-altering - -`gnomad.genomes.pangolin_largest_ds`: Pangolin's largest delta score across 2 splicing consequences, which reflects the probability of the variant being splice-altering - -`gnomad.genomes.phylop`: Base-wise conservation score across the 241 placental mammals in the Zoonomia project. Score ranges from -20 to 9.28, and reflects acceleration (faster evolution than expected under neutral drift, assigned negative scores) - -`gnomad.genomes.sift_max`: Score reflecting the scaled probability of the amino acid substitution being tolerated, ranging from 0 to 1. Scores below 0.05 are predicted to impact protein function. We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript - -`gnomad.genomes.polyphen_max`: Score that predicts the possible impact of an amino acid substitution on the structure and function of a human protein, ranging from 0.0 (tolerated) to 1.0 (deleterious). We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript - -`gnomad.genomes.AN`: Total number of alleles - -`gnomad.genomes.AF`: Alternate allele frequency - -`gnomad.genomes.AF_XX`: Alternate allele frequency in XX samples - -`gnomad.genomes.AN_XX`: Total number of alleles in XX samples - -`gnomad.genomes.AF_XY`: Alternate allele frequency in XY samples - -`gnomad.genomes.AN_XY`: Total number of alleles in XY samples - -`gnomad.genomes.AF_afr`: Alternate allele frequency in samples of African/African-American ancestry - -`gnomad.genomes.AN_afr`: Total number of alleles in samples of African/African-American ancestry - -`gnomad.genomes.AF_ami`: Alternate allele frequency in samples of Amish ancestry - -`gnomad.genomes.AN_ami`: Total number of alleles in samples of Amish ancestry - -`gnomad.genomes.AF_amr`: Alternate allele frequency in samples of Latino ancestry - -`gnomad.genomes.AN_amr`: Total number of alleles in samples of Latino ancestry - -`gnomad.genomes.AF_asj`: Alternate allele frequency in samples of Ashkenazi Jewish ancestry - -`gnomad.genomes.AN_asj`: Total number of alleles in samples of Ashkenazi Jewish ancestry - -`gnomad.genomes.AF_eas`: Alternate allele frequency in samples of East Asian ancestry - -`gnomad.genomes.AN_eas`: Total number of alleles in samples of East Asian ancestry - -`gnomad.genomes.AF_fin`: Alternate allele frequency in samples of Finnish ancestry - -`gnomad.genomes.AN_fin`: Total number of alleles in samples of Finnish ancestry - -`gnomad.genomes.AF_mid`: Alternate allele frequency in samples of Middle Eastern ancestry - -`gnomad.genomes.AN_mid`: Total number of alleles in samples of Middle Eastern ancestry - -`gnomad.genomes.AF_nfe`: Alternate allele frequency in samples of Non-Finnish European ancestry - -`gnomad.genomes.AN_nfe`: Total number of alleles in samples of Non-Finnish European ancestry - -`gnomad.genomes.AF_remaining`: Alternate allele frequency in samples of Remaining individuals ancestry - -`gnomad.genomes.AN_remaining`: Total number of alleles in samples of Remaining individuals ancestry - -`gnomad.genomes.AF_sas`: Alternate allele 
frequency in samples of South Asian ancestry - -`gnomad.genomes.AN_sas`: Total number of alleles in samples of South Asian ancestry - -`gnomad.genomes.AF_grpmax`: Maximum allele frequency across genetic ancestry groups - -`gnomad.genomes.AN_grpmax`: Total number of alleles in the genetic ancestry group with the maximum allele frequency - -
- -### hg38 gnomad.exomes - -Annotations from the gnomAD v4.1 whole-exome set - -`gnomad.exomes.alt`: The Bystro VCF-preprocessor's ALT record for this gnomAD site. This should always match the row's `alt` field value - -`gnomad.exomes.id`: The VCF `ID` field - -`gnomad.exomes.spliceai_ds_max`: Illumina's SpliceAI max delta score; interpreted as the probability of the variant being splice-altering - -`gnomad.exomes.pangolin_largest_ds`: Pangolin's largest delta score across 2 splicing consequences, which reflects the probability of the variant being splice-altering - -`gnomad.exomes.phylop`: Base-wise conservation score across the 241 placental mammals in the Zoonomia project. Score ranges from -20 to 9.28, and reflects acceleration (faster evolution than expected under neutral drift, assigned negative scores) - -`gnomad.exomes.sift_max`: Score reflecting the scaled probability of the amino acid substitution being tolerated, ranging from 0 to 1. Scores below 0.05 are predicted to impact protein function. We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript - -`gnomad.exomes.polyphen_max`: Score that predicts the possible impact of an amino acid substitution on the structure and function of a human protein, ranging from 0.0 (tolerated) to 1.0 (deleterious). We prioritize max scores for MANE Select transcripts where possible and otherwise report a score for the canonical transcript - -`gnomad.exomes.AN`: Total number of alleles - -`gnomad.exomes.AF`: Alternate allele frequency - -`gnomad.exomes.AF_XX`: Alternate allele frequency in XX samples - -`gnomad.exomes.AN_XX`: Total number of alleles in XX samples - -`gnomad.exomes.AF_XY`: Alternate allele frequency in XY samples - -`gnomad.exomes.AN_XY`: Total number of alleles in XY samples - -`gnomad.exomes.AF_afr`: Alternate allele frequency in samples of African/African-American ancestry - -`gnomad.exomes.AN_afr`: Total number of alleles in samples of African/African-American ancestry - -`gnomad.exomes.AF_amr`: Alternate allele frequency in samples of Latino ancestry - -`gnomad.exomes.AN_amr`: Total number of alleles in samples of Latino ancestry - -`gnomad.exomes.AF_asj`: Alternate allele frequency in samples of Ashkenazi Jewish ancestry - -`gnomad.exomes.AN_asj`: Total number of alleles in samples of Ashkenazi Jewish ancestry - -`gnomad.exomes.AF_eas`: Alternate allele frequency in samples of East Asian ancestry - -`gnomad.exomes.AN_eas`: Total number of alleles in samples of East Asian ancestry - -`gnomad.exomes.AF_fin`: Alternate allele frequency in samples of Finnish ancestry - -`gnomad.exomes.AN_fin`: Total number of alleles in samples of Finnish ancestry - -`gnomad.exomes.AF_mid`: Alternate allele frequency in samples of Middle Eastern ancestry - -`gnomad.exomes.AN_mid`: Total number of alleles in samples of Middle Eastern ancestry - -`gnomad.exomes.AF_nfe`: Alternate allele frequency in samples of Non-Finnish European ancestry - -`gnomad.exomes.AN_nfe`: Total number of alleles in samples of Non-Finnish European ancestry - -`gnomad.exomes.AF_non_ukb`: Alternate allele frequency in non_ukb subset - -`gnomad.exomes.AN_non_ukb`: Total number of alleles in non_ukb subset - -`gnomad.exomes.AF_non_ukb_afr`: Alternate allele frequency in samples of African/African-American ancestry in non_ukb subset - -`gnomad.exomes.AN_non_ukb_afr`: Total number of alleles in samples of African/African-American ancestry in non_ukb subset - -`gnomad.exomes.AF_non_ukb_amr`: Alternate allele
frequency in samples of Latino ancestry in non_ukb subset - -`gnomad.exomes.AN_non_ukb_amr`: Total number of alleles in samples of Latino ancestry in non_ukb subset - -`gnomad.exomes.AF_non_ukb_asj`: Alternate allele frequency in samples of Ashkenazi Jewish ancestry in non_ukb subset - -`gnomad.exomes.AN_non_ukb_asj`: Total number of alleles in samples of Ashkenazi Jewish ancestry in non_ukb subset - -`gnomad.exomes.AF_non_ukb_eas`: Alternate allele frequency in samples of East Asian ancestry in non_ukb subset - -`gnomad.exomes.AN_non_ukb_eas`: Total number of alleles in samples of East Asian ancestry in non_ukb subset - -`gnomad.exomes.AF_non_ukb_fin`: Alternate allele frequency in samples of Finnish ancestry in non_ukb subset - -`gnomad.exomes.AN_non_ukb_fin`: Total number of alleles in samples of Finnish ancestry in non_ukb subset - -`gnomad.exomes.AF_non_ukb_mid`: Alternate allele frequency in samples of Middle Eastern ancestry in non_ukb subset - -`gnomad.exomes.AN_non_ukb_mid`: Total number of alleles in samples of Middle Eastern ancestry in non_ukb subset - -`gnomad.exomes.AF_non_ukb_nfe`: Alternate allele frequency in samples of Non-Finnish European ancestry in non_ukb subset - -`gnomad.exomes.AN_non_ukb_nfe`: Total number of alleles in samples of Non-Finnish European ancestry in non_ukb subset - -`gnomad.exomes.AF_non_ukb_remaining`: Alternate allele frequency in samples of Remaining individuals ancestry in non_ukb subset - -`gnomad.exomes.AN_non_ukb_remaining`: Total number of alleles in samples of Remaining individuals ancestry in non_ukb subset - -`gnomad.exomes.AF_non_ukb_sas`: Alternate allele frequency in samples of South Asian ancestry in non_ukb subset - -`gnomad.exomes.AN_non_ukb_sas`: Total number of alleles in samples of South Asian ancestry in non_ukb subset - -`gnomad.exomes.AF_remaining`: Alternate allele frequency in samples of Remaining individuals ancestry - -`gnomad.exomes.AN_remaining`: Total number of alleles in samples of Remaining individuals ancestry - -`gnomad.exomes.AF_sas`: Alternate allele frequency in samples of South Asian ancestry - -`gnomad.exomes.AN_sas`: Total number of alleles in samples of South Asian ancestry - -`gnomad.exomes.AF_grpmax`: Maximum allele frequency across genetic ancestry groups - -`gnomad.exomes.AN_grpmax`: Total number of alleles in the genetic ancestry group with the maximum allele frequency - -`gnomad.exomes.AF_grpmax_non_ukb`: Maximum allele frequency across genetic ancestry groups in non_ukb subset - -`gnomad.exomes.AN_grpmax_non_ukb`: Total number of alleles in the genetic ancestry group with the maximum allele frequency in non_ukb subset - -`gnomad.exomes.AF_grpmax_joint`: Maximum allele frequency across genetic ancestry groups in joint subset - -`gnomad.exomes.AN_grpmax_joint`: Total number of alleles in the genetic ancestry group with the maximum allele frequency in joint subset - -
- -### hg19 gnomad.genomes (v2.1.1 - latest release for hg19) - -`gnomad.genomes.alt`: The Bystro VCF-preprocessor's ALT record for this gnomAD site. This should always match the row's `alt` field value - -`gnomad.genomes.id`: The VCF `ID` field - -`gnomad.genomes.AN`: Total number of alleles - -`gnomad.genomes.AF`: Alternate allele frequency - -`gnomad.genomes.AN_female`: Total number of alleles in female samples - -`gnomad.genomes.AF_female`: Alternate allele frequency in female samples - -`gnomad.genomes.non_neuro_AN`: Total number of alleles in samples in the non_neuro subset - -`gnomad.genomes.non_neuro_AF`: Alternate allele frequency in samples in the non_neuro subset - -`gnomad.genomes.non_topmed_AN`: Total number of alleles in samples in the non_topmed subset - -`gnomad.genomes.non_topmed_AF`: Alternate allele frequency in samples in the non_topmed subset - -`gnomad.genomes.controls_AN`: Total number of alleles in samples in the controls subset - -`gnomad.genomes.controls_AF`: Alternate allele frequency in samples in the controls subset - -`gnomad.genomes.AN_nfe_seu`: Total number of alleles in samples of Southern European ancestry - -`gnomad.genomes.AF_nfe_seu`: Alternate allele frequency in samples of Southern European ancestry - -`gnomad.genomes.AN_nfe_onf`: Total number of alleles in samples of Other Non-Finnish European ancestry - -`gnomad.genomes.AF_nfe_onf`: Alternate allele frequency in samples of Other Non-Finnish European ancestry - -`gnomad.genomes.AN_amr`: Total number of alleles in samples of Latino ancestry - -`gnomad.genomes.AF_amr`: Alternate allele frequency in samples of Latino ancestry - -`gnomad.genomes.AN_eas`: Total number of alleles in samples of East Asian ancestry - -`gnomad.genomes.AF_eas`: Alternate allele frequency in samples of East Asian ancestry - -`gnomad.genomes.AN_nfe_nwe`: Total number of alleles in samples of Northwestern European ancestry - -`gnomad.genomes.AF_nfe_nwe`: Alternate allele frequency in samples of Northwestern European ancestry - -`gnomad.genomes.AN_nfe_est`: Total number of alleles in samples of Estonian ancestry - -`gnomad.genomes.AF_nfe_est`: Alternate allele frequency in samples of Estonian ancestry - -`gnomad.genomes.AN_nfe`: Total number of alleles in samples of Non-Finnish European ancestry - -`gnomad.genomes.AF_nfe`: Alternate allele frequency in samples of Non-Finnish European ancestry - -`gnomad.genomes.AN_fin`: Total number of alleles in samples of Finnish ancestry - -`gnomad.genomes.AF_fin`: Alternate allele frequency in samples of Finnish ancestry - -`gnomad.genomes.AN_asj`: Total number of alleles in samples of Ashkenazi Jewish ancestry - -`gnomad.genomes.AF_asj`: Alternate allele frequency in samples of Ashkenazi Jewish ancestry - -`gnomad.genomes.AN_oth`: Total number of alleles in samples of Other ancestry - -`gnomad.genomes.AF_oth`: Alternate allele frequency in samples of Other ancestry - -
- -### hg19 gnomad.exomes (v2.1.1 - latest release for hg19) - -Annotations from the gnomAD v2.1.1 exome set - -`gnomad.exomes.alt`: The Bystro VCF-preprocessor's ALT record for this gnomAD site. This should always match the row's `alt` field value - -`gnomad.exomes.id`: The VCF `ID` field - -`gnomad.exomes.AN`: Total number of alleles - -`gnomad.exomes.AF`: Alternate allele frequency - -`gnomad.exomes.AN_female`: Total number of alleles in female samples - -`gnomad.exomes.AF_female`: Alternate allele frequency in female samples - -`gnomad.exomes.non_cancer_AN`: Total number of alleles in samples in the non_cancer subset - -`gnomad.exomes.non_cancer_AF`: Alternate allele frequency in samples in the non_cancer subset - -`gnomad.exomes.non_neuro_AN`: Total number of alleles in samples in the non_neuro subset - -`gnomad.exomes.non_neuro_AF`: Alternate allele frequency in samples in the non_neuro subset - -`gnomad.exomes.non_topmed_AN`: Total number of alleles in samples in the non_topmed subset - -`gnomad.exomes.non_topmed_AF`: Alternate allele frequency in samples in the non_topmed subset - -`gnomad.exomes.controls_AN`: Total number of alleles in samples in the controls subset - -`gnomad.exomes.controls_AF`: Alternate allele frequency in samples in the controls subset - -`gnomad.exomes.AN_nfe_seu`: Total number of alleles in samples of Southern European ancestry - -`gnomad.exomes.AF_nfe_seu`: Alternate allele frequency in samples of Southern European ancestry - -`gnomad.exomes.AN_nfe_bgr`: Total number of alleles in samples of Bulgarian (Eastern European) ancestry - -`gnomad.exomes.AF_nfe_bgr`: Alternate allele frequency in samples of Bulgarian (Eastern European) ancestry - -`gnomad.exomes.AN_afr`: Total number of alleles in samples of African/African-American ancestry - -`gnomad.exomes.AF_afr`: Alternate allele frequency in samples of African/African-American ancestry - -`gnomad.exomes.AN_sas`: Total number of alleles in samples of South Asian ancestry - -`gnomad.exomes.AF_sas`: Alternate allele frequency in samples of South Asian ancestry - -`gnomad.exomes.AN_nfe_onf`: Total number of alleles in samples of Other Non-Finnish European ancestry - -`gnomad.exomes.AF_nfe_onf`: Alternate allele frequency in samples of Other Non-Finnish European ancestry - -`gnomad.exomes.AN_amr`: Total number of alleles in samples of Latino ancestry - -`gnomad.exomes.AF_amr`: Alternate allele frequency in samples of Latino ancestry - -`gnomad.exomes.AN_eas`: Total number of alleles in samples of East Asian ancestry - -`gnomad.exomes.AF_eas`: Alternate allele frequency in samples of East Asian ancestry - -`gnomad.exomes.AN_nfe_swe`: Total number of alleles in samples of Swedish ancestry - -`gnomad.exomes.AF_nfe_swe`: Alternate allele frequency in samples of Swedish ancestry - -`gnomad.exomes.AN_nfe_nwe`: Total number of alleles in samples of Northwestern European ancestry - -`gnomad.exomes.AF_nfe_nwe`: Alternate allele frequency in samples of Northwestern European ancestry - -`gnomad.exomes.AN_eas_jpn`: Total number of alleles in samples of Japanese ancestry - -`gnomad.exomes.AF_eas_jpn`: Alternate allele frequency in samples of Japanese ancestry - -`gnomad.exomes.AN_eas_kor`: Total number of alleles in samples of Korean ancestry - -`gnomad.exomes.AF_eas_kor`: Alternate allele frequency in samples of Korean ancestry - -
- -### [dbSNP](https://www.ncbi.nlm.nih.gov/snp) - -dbSNP 155 annotations. Descriptions taken from UCSC's [reference on dbSNP155](https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=hg38&g=dbSnp155Composite) - -`dbSNP.id`: The dbSNP VCF `ID` - -`dbSNP.alt`: The Bystro VCF-preprocessor's ALT record for this dbSNP site. This should always match the row's `alt` field value - -`dbSNP.TOMMO`: Allele frequency from the Tohoku Medical Megabank Project, which provides an allele frequency panel of 3,552 Japanese individuals, including the X chromosome - -`dbSNP.ExAC`: Allele frequency from the Exome Aggregation Consortium (ExAC) dataset, which contains 60,706 unrelated individuals sequenced as part of various disease-specific and population genetic studies. Individuals affected by severe pediatric disease have been removed - -`dbSNP.GnomAD`: Allele frequency from the gnomAD v3 project. This gnomAD genome dataset includes a catalog containing 602M SNVs and 105M indels based on the whole-genome sequencing of 71,702 samples mapped to the GRCh38 build of the human reference genome. - -`dbSNP.Korea1K`: Allele frequency from the Korea1K dataset, which contains 1,094 Korean personal genomes with clinical information - -`dbSNP.GoNL`: Allele frequency from the Genome of the Netherlands (GoNL) project, which characterizes DNA sequence variation, common and rare, for SNVs, short insertions and deletions (indels), and large deletions in 769 individuals of Dutch ancestry selected from five biobanks under the auspices of the Dutch hub of the Biobanking and Biomolecular Research Infrastructure (BBMRI-NL). - -`dbSNP.KOREAN`: Allele frequency from the Korean Reference Genome Database, which contains data for 1,465 Korean individuals - -`dbSNP.TWINSUK`: Allele frequency from the TwinsUK project. The UK10K - TwinsUK project contains 1,854 samples from the Department of Twin Research and Genetic Epidemiology (DTR). The dataset contains data obtained from 11,000 identical and non-identical twins between the ages of 16 and 85. - -`dbSNP.Vietnamese`: Allele frequency from the Kinh Vietnamese database, which contains 24.81 million variants (22.47 million single nucleotide polymorphisms (SNPs) and 2.34 million indels), of which 0.71 million variants are novel - -`dbSNP.GENOME_DK`: Allele frequency from the Danish reference pan genome [phase II](https://www.ncbi.nlm.nih.gov/bioproject/?term=PRJEB19794). The dataset contains the sequencing of Danish parent-offspring trios to determine genomic variation within the Danish population. - -`dbSNP.GoESP`: Allele frequency from the NHLBI Exome Sequencing Project (ESP) dataset. The NHLBI Grand Opportunity Exome Sequencing Project (GO-ESP) dataset contains 6,503 samples drawn from multiple ESP cohorts and represents all of the ESP exome variant data. - -`dbSNP.GnomAD_exomes`: Allele frequency from the Genome Aggregation Database (gnomAD) exome dataset. The gnomAD exome dataset comprises a total of 16 million SNVs and 1.2 million indels from 125,748 exomes in 14 populations - -`dbSNP.Siberian`: Allele frequency from a dataset that contains paired-end whole-genome sequencing data of 28 modern-day humans from Siberia and Western Russia. - -`dbSNP.PRJEB37584`: Allele frequencies from the [PRJEB37584](https://www.ebi.ac.uk/ena/browser/view/PRJEB37584) dataset. 
The dataset contains a genome-wide genotype analysis that identified copy number variations in cranial meningiomas in Chinese patients, and demonstrated diverse CNV burdens among individuals with diverse clinical features. - -`dbSNP.SGDP_PRJ`: Allele frequencies from the [SGDP_PRJ](https://www.ebi.ac.uk/ena/browser/view/PRJEB9586) dataset. The Simons Genome Diversity Project dataset contains 263 C-panel fully public samples and 16 B-panel fully public samples for a total of 279 samples. - -`dbSNP.1000Genomes`: Allele frequency from the 1000 Genomes Project dataset. The 1000 Genomes Project dataset contains 2,504 individuals from 26 populations across Africa, East Asia, Europe, and the Americas. - -`dbSNP.dbGaP_PopFreq`: Allele frequency from dbGaP aggregated frequency data (>1 million subjects), a new source provided by dbSNP. - -`dbSNP.NorthernSweden`: Allele frequency from a dataset that contains 300 whole-genome sequenced human samples from the county of Västerbotten in northern Sweden. - -`dbSNP.HapMap`: Allele frequency from the HapMap project dataset. The International HapMap Project contains samples from African, Asian, and European populations. - -`dbSNP.TOPMED`: Allele frequencies from the TOPMED dataset, which contains the freeze 8 panel of about 158,000 individuals. The approximate ancestry breakdown is European (41%), African (31%), Hispanic or Latino (15%), East Asian (9%), and unknown (4%). - -`dbSNP.ALSPAC`: Allele frequency from the Avon Longitudinal Study of Parents and Children (ALSPAC) dataset. The UK10K - Avon Longitudinal Study of Parents and Children project contains 1,927 samples, including individuals from the ALSPAC population. This population contains more than 14,000 mothers enrolled during pregnancy in 1991 and 1992. - -`dbSNP.Qatari`: Allele frequency from the Qatar Genome dataset. The dataset contains initial mappings of the genomes of more than 1,000 Qatari nationals. - -`dbSNP.MGP`: MGP contains aggregated information on 267 healthy individuals, representative of the Spanish population, who were used as controls in the MGP (Medical Genome Project). - -
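Because each dbSNP source reports its own allele frequency, a common consumption pattern is to treat a variant as rare only if every source that reported a frequency agrees. A minimal sketch of that pattern follows; `$row` is a hypothetical hashref of column name to value for one annotation row, and each field is assumed to hold a single numeric frequency (multi-valued fields would need to be split first).

```perl
# Sketch: a variant is "rare everywhere" only if every dbSNP-derived source
# that reported a frequency puts it below 1%. Field names come from the list
# above; the single-numeric-value assumption is ours, not a guarantee.
use strict;
use warnings;
use List::Util qw(max);

sub is_rare_everywhere {
  my ($row) = @_;

  my @sources = qw(
    dbSNP.TOPMED dbSNP.GnomAD dbSNP.GnomAD_exomes
    dbSNP.1000Genomes dbSNP.GoESP dbSNP.dbGaP_PopFreq
  );

  # Keep only sources that reported a parseable frequency
  my @freqs = grep { defined $_ && /^[0-9.eE+-]+$/ } map { $row->{$_} } @sources;

  return 1 if !@freqs;    # no source reported a frequency: not seen commonly anywhere
  return max(@freqs) < 0.01 ? 1 : 0;
}
```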
- -### [cadd](http://cadd.gs.washington.edu) - -A PHRED-scaled score >=0 that indicates the predicted deleteriousness of a variant; higher scores are more deleterious, and variants with cadd > 15 are more likely to be deleterious. -See http://cadd.gs.washington.edu. - -
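CADD PHRED scores are rank-based: a score S places a variant roughly in the top 10^(-S/10) fraction of all possible substitutions, so 10 means the top 10%, 20 the top 1%, and 30 the top 0.1%. A small sketch of that conversion:

```perl
# Sketch: convert a CADD PHRED score into the approximate rank fraction it
# represents among all possible substitutions (PHRED 10 => top 10%,
# 20 => top 1%, 30 => top 0.1%).
use strict;
use warnings;

sub cadd_phred_to_rank_fraction {
  my ($phred) = @_;
  return 10**( -$phred / 10 );
}

# The > 15 rule of thumb above corresponds to roughly the top 3% of variants
printf "CADD 15 ~ top %.2f%% of possible substitutions\n",
  100 * cadd_phred_to_rank_fraction(15);
```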
- -### [caddIndel](http://cadd.gs.washington.edu) - -A PHRED-scaled score >=0 that indicates the predicted deleteriousness of a variant; higher scores are more deleterious, and variants with cadd > 15 are more likely to be deleterious. -See http://cadd.gs.washington.edu. - -caddIndel scores are only defined for indels and MNPs. For SNPs, use the `cadd` field. - -- Note that because Bystro decomposes MNPs into "SNP" records, the `caddIndel` field will occasionally be populated for SNPs (which are in fact part of MNPs in the CADD Indel dataset). - -`caddIndel.alt`: The Bystro VCF-preprocessor's ALT record for this CADD site. This should always match the row's `alt` field value - -`caddIndel.id`: The CADD VCF `ID` - -`caddIndel.PHRED`: The CADD PHRED score for the insertion or deletion - -
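Since a decomposed MNP row can carry a `caddIndel` score while ordinary SNP rows carry `cadd`, a consumer typically prefers `caddIndel.PHRED` when it is populated and falls back to `cadd` otherwise. A minimal sketch, with field names assumed from the lists above and `$row` a hypothetical hashref of column name to value:

```perl
# Sketch: pick the applicable CADD score for one annotation row, preferring
# the caddIndel score when present (indels, MNP-derived records) and falling
# back to the SNP cadd score.
sub cadd_score_for_row {
  my ($row) = @_;
  my $indelScore = $row->{'caddIndel.PHRED'};
  return ( defined $indelScore && length $indelScore ) ? $indelScore : $row->{'cadd'};
}
```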
- -### clinvarVcf - -ClinVar annotations, sourced from the ClinVar VCF dataset - -`clinvarVcf.id`: The ClinVar VCF `ID` - -`clinvarVcf.alt`: The Bystro VCF-preprocessor's ALT record for this ClinVar site. This should always match the row's `alt` field value - -`clinvarVcf.AF_ESP`: Allele frequencies from GO-ESP - -`clinvarVcf.AF_EXAC`: Allele frequencies from ExAC - -`clinvarVcf.AF_TGP`: Allele frequencies from the 1000 Genomes Project (TGP) - -`clinvarVcf.ALLELEID`: The ClinVar Allele ID - -`clinvarVcf.CLNDN`: ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB - -`clinvarVcf.CLNDNINCL`: For an included variant: ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB - -`clinvarVcf.CLNHGVS`: Top-level (primary assembly, alt, or patch) HGVS expression - -`clinvarVcf.CLNREVSTAT`: ClinVar review status of germline classification for the Variation ID - -`clinvarVcf.CLNSIG`: Aggregate germline classification for this single variant - -`clinvarVcf.CLNSIGCONF`: Conflicting germline classification for this single variant - -`clinvarVcf.CLNVCSO`: Sequence Ontology id for variant type - -`clinvarVcf.DBVARID`: NSV accessions from dbVar for the variant - -`clinvarVcf.ORIGIN`: Allele origin. One or more of the following values may be added: 0 - unknown; 1 - germline; 2 - somatic; 4 - inherited; 8 - paternal; 16 - maternal; 32 - de-novo; 64 - biparental; 128 - uniparental; 256 - not-tested; 512 - tested-inconclusive; 1073741824 - other - -`clinvarVcf.RS`: dbSNP ID (i.e. rs number) - -
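`ORIGIN` is a bit field: the reported value is the sum of the flags above, so, for example, 17 = 1 + 16 means germline and maternal. A small sketch that decodes it, using only the value/label pairs listed above:

```perl
# Sketch: decode the clinvarVcf.ORIGIN bit field into labels, using the
# value/label pairs listed above.
use strict;
use warnings;

my %ORIGIN = (
  1          => 'germline',
  2          => 'somatic',
  4          => 'inherited',
  8          => 'paternal',
  16         => 'maternal',
  32         => 'de-novo',
  64         => 'biparental',
  128        => 'uniparental',
  256        => 'not-tested',
  512        => 'tested-inconclusive',
  1073741824 => 'other',
);

sub decode_origin {
  my ($origin) = @_;
  return ('unknown') if !$origin;    # 0 means unknown
  return map { $ORIGIN{$_} } grep { $origin & $_ } sort { $a <=> $b } keys %ORIGIN;
}

print join( ';', decode_origin(17) ), "\n";    # prints: germline;maternal
```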
- -### (hg38-only) [LoGoFunc](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10688473/) - -A machine learning method for predicting pathogenic GOF, pathogenic LOF, and neutral genetic variants, trained on a broad range of gene-, protein-, and variant-level features describing diverse biological characteristics. - -`logofunc.id`: The LoGoFunc VCF `ID` - -`logofunc.alt`: The Bystro VCF-preprocessor's ALT record for this LoGoFunc site. This should always match the row's `alt` field value - -`logofunc.prediction`: The LoGoFunc prediction - -`logofunc.neutral`: The LoGoFunc neutral score - -`logofunc.gof`: The LoGoFunc gain of function (GOF) score - -`logofunc.lof`: The LoGoFunc loss of function (LOF) score - -
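The three scores are class scores for neutral, GOF, and LOF. When `logofunc.prediction` is missing, one plausible fallback, assuming the scores are comparable class probabilities (an assumption, not a documented guarantee), is to take the highest-scoring class. `$row` is a hypothetical hashref of column name to value:

```perl
# Sketch: derive a class label from the three LoGoFunc scores. Prefer the
# dataset's own logofunc.prediction when present; the argmax fallback assumes
# the scores are comparable class probabilities, which is our assumption.
sub logofunc_call {
  my ($row) = @_;

  return $row->{'logofunc.prediction'} if defined $row->{'logofunc.prediction'};

  my %scores = (
    neutral => $row->{'logofunc.neutral'} // 0,
    GOF     => $row->{'logofunc.gof'}     // 0,
    LOF     => $row->{'logofunc.lof'}     // 0,
  );

  my ($best) = sort { $scores{$b} <=> $scores{$a} } keys %scores;
  return $best;
}
```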
- -### (hg38-only) [GeneBass](https://app.genebass.org) - -GeneBass provides statistics on the impact of genetic variants from gene-based phenome-wide association study (PheWAS) analysis results. See the link for more information. - -`genebass.id`: The GeneBass VCF `ID` - -`genebass.alt`: The Bystro VCF-preprocessor's ALT record for this GeneBass site. This should always match the row's `alt` field value - -`genebass.phenocode`: The GeneBass phenotype code - -`genebass.description`: The GeneBass description diff --git a/perl/bin/bystro-annotate.pl b/perl/bin/bystro-annotate.pl deleted file mode 100755 index 33c726bc9..000000000 --- a/perl/bin/bystro-annotate.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env perl - -use 5.10.0; -use strict; -use warnings; -use Interface; -use Getopt::Long; -use DDP; - -my $app = Interface->new_with_options(); - -$app->annotate; - -=head1 NAME - -bystro-annotate.pl - -=head1 DESCRIPTION - -This program annotates an input file, using the Bystro database - -=head1 VALID_FILES - -1. PEMapper/PECaller .snp file (typically has a .snp extension, but we accept any extension, as long as the file is properly formatted) - -2. VCF file (typically has a .vcf extension, but we accept any extension, as long as the file is properly formatted) - -=head1 EXAMPLES - - bystro-annotate.pl --in <input_file> --out <output_base_path> --config config/hg19.yml - -=head1 AUTHOR - -Alex Kotlar - -=head1 SEE ALSO - -Seq Package - -=cut diff --git a/perl/bin/bystro-build.pl b/perl/bin/bystro-build.pl deleted file mode 100755 index 199710f68..000000000 --- a/perl/bin/bystro-build.pl +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env perl - -use 5.10.0; -use strict; -use warnings; - -use Carp qw/ croak /; -use Getopt::Long; -use Path::Tiny qw/path/; -use Pod::Usage; -use Log::Any::Adapter; -use YAML::XS qw/ LoadFile /; - -use DDP; - -use Seq::Build; - -my ( - $yaml_config, $wantedType, $wantedName, $verbose, - $maxThreads, $help, $wantedChr, $dryRunInsertions, - $logDir, $debug, $overwrite, $delete, - $regionTrackOnly, $skipCompletionCheck -); - -$debug = 0; - -# usage -GetOptions( - 'c|config=s' => \$yaml_config, - 't|type=s' => \$wantedType, - 'n|name=s' => \$wantedName, - 'v|verbose=i' => \$verbose, - 'h|help' => \$help, - 'd|debug=i' => \$debug, - 'o|overwrite' => \$overwrite, - 'chr=s' => \$wantedChr, - 'delete' => \$delete, - 'build_region_track_only' => \$regionTrackOnly, - 'skip_completion_check' => \$skipCompletionCheck, - 'dry_run_insertions|dry|dryRun' => \$dryRunInsertions, - 'log_dir=s' => \$logDir, - 'threads=i' => \$maxThreads -) or pod2usage(2); - -if ($help) { - pod2usage(1); - exit; -} - -unless ($yaml_config) { - pod2usage("Error: --config is required"); -} - -if ($debug) { - say STDERR "Running with the following parameters:"; - my $options = { - config => $yaml_config, - wantedChr => $wantedChr, - wantedType => $wantedType, - wantedName => $wantedName, - overwrite => $overwrite || 0, - debug => $debug || 0, - delete => !!$delete, - build_region_track_only => !!$regionTrackOnly, - skipCompletionCheck => !!$skipCompletionCheck, - dryRunInsertions => !!$dryRunInsertions, - logDir => $logDir, - maxThreads => $maxThreads, - verbose => $verbose, - }; - - p $options; -} - -# read config file to determine genome name for log and check validity -my $config_href = LoadFile($yaml_config); -# get absolute path for YAML file and db_location -$yaml_config = path($yaml_config)->absolute->stringify; - -my ( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst ) = localtime(); - -$year += 1900; -# set log 
file name -my $log_name = - join '.', 'build', $config_href->{assembly}, $wantedType - || $wantedName - || 'allTracks', $wantedChr - || 'allChr', - "$mday\_$mon\_$year\_$hour\:$min\:$sec", 'log'; - -if ( !$logDir ) { - $logDir = $config_href->{database_dir}; - - # make or silently fail - path($logDir)->mkpath(); -} -my $logPath = path($logDir)->child($log_name)->absolute->stringify; - -my $builder_options_href = { - config => $yaml_config, - wantedChr => $wantedChr, - wantedType => $wantedType, - wantedName => $wantedName, - overwrite => $overwrite || 0, - debug => $debug || 0, - logPath => $logPath, - delete => !!$delete, - build_region_track_only => !!$regionTrackOnly, - skipCompletionCheck => !!$skipCompletionCheck, - dryRun => !!$dryRunInsertions, - verbose => $verbose, -}; - -if ( defined $maxThreads ) { - $builder_options_href->{maxThreads} = $maxThreads; -} - -my $builder = Seq::Build->new_with_config($builder_options_href); - -__END__ - -=head1 NAME - -build_genome_assembly - Builds a binary genome assembly - -=head1 SYNOPSIS - -build_genome_assembly [options] - - Options: - -c, --config YAML configuration file - -t, --type Type of build (e.g., genome, conserv, transcript_db, gene_db, snp_db) - -n, --name Name of the build - -v, --verbose Verbosity level - -h, --help Display this help message - -d, --debug Debug level (default: 0) - -o, --overwrite For a given track, overwrite existing track values, rather than merging them - --chr Chromosome to build (if applicable) - --delete Delete the track instead of building it - --build_region_track_only Build region track only - --skip_completion_check Skip completion check - --dry_run_insertions, --dry Perform a dry run - --log_dir Directory for log files - --threads Number of threads to use - -=head1 DESCRIPTION - -C<build_genome_assembly> takes a YAML configuration file and reads raw genomic -data that has been previously downloaded into the 'raw' folder to create the binary -index of the genome and associated annotations in the LMDB database. - -=head1 OPTIONS - -=over 8 - -=item B<-c>, B<--config> - -config: A YAML genome assembly configuration file that specifies the various -tracks and data associated with the assembly. This is the same file that is -used by the Bystro Package to annotate VCF and SNP files. - -=item B<-t>, B<--type> - -type: Build all tracks in the configuration file with `type: <type>`. - -=item B<-n>, B<--name> - -name: Build the track specified in the configuration file with `name: <name>`. - -=item B<--chr> - -chr: Chromosome to build, if building gene or SNP; will build all if not specified. - -=item B<-v>, B<--verbose> - -verbose: Verbosity level (default: 0). - -=item B<-d>, B<--debug> - -debug: Debug level (default: 0). - -=item B<-o>, B<--overwrite> - -overwrite: For a given track, overwrite existing track values, rather than merging them - -=item B<--delete> - -delete: Delete the track instead of building it. - -=item B<--build_region_track_only> - -build_region_track_only: Build region track only. - -=item B<--skip_completion_check> - -skip_completion_check: Skip completion check. - -=item B<--dry_run_insertions>, B<--dry> - -dry_run_insertions: Perform a dry run - -=item B<--log_dir> - -log_dir: Directory for log files. - -=item B<--threads> - -threads: Number of threads to use. 
- -=back - -=head1 AUTHOR - -Bystro Team - -=head1 SEE ALSO - -Bystro Package - -=cut diff --git a/perl/bin/bystro-utils.pl b/perl/bin/bystro-utils.pl deleted file mode 100755 index 15cf97336..000000000 --- a/perl/bin/bystro-utils.pl +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env perl - -use 5.10.0; -use strict; -use warnings; - -use Getopt::Long; -use Path::Tiny qw/path/; -use Pod::Usage; -use YAML::XS qw/LoadFile/; -use String::Strip qw/StripLTSpace/; - -use DDP; - -use Utils::CaddToBed; -use Utils::Fetch; -use Utils::LiftOverCadd; -use Utils::SortCadd; -use Utils::RenameTrack; -use Utils::FilterCadd; -use Utils::RefGeneXdbnsfp; -use Utils::DbSnp2FormatInfo; - -use Seq::Build; - -# TODO: refactor to automatically call util by string value -# e.g.: --util filterCadd launches Utils::FilterCadd -my ( - $yaml_config, $names, $sortCadd, $filterCadd, - $renameTrack, $utilName, $help, $liftOverCadd, - $liftOverPath, $liftOverChainPath, $debug, $overwrite, - $fetch, $caddToBed, $compress, $toBed, - $renameTrackTo, $verbose, $dryRunInsertions, $maxThreads, -); - -# usage -GetOptions( - 'c|config=s' => \$yaml_config, - 'n|name=s' => \$names, - 'h|help' => \$help, - 'u|util=s' => \$utilName, - 'd|debug=i' => \$debug, - 'o|overwrite' => \$overwrite, - 'v|verbose=i' => \$verbose, - 'r|dryRun' => \$dryRunInsertions, - 'm|maxThreads=i' => \$maxThreads, -); - -if ( $help || !$yaml_config ) { - Pod::Usage::pod2usage(); -} - -if ( !$names ) { - my $config = LoadFile($yaml_config); - my @tracks; - for my $track ( @{ $config->{tracks}{tracks} } ) { - my $hasUtils = !!$track->{utils}; - - if ($hasUtils) { - push @tracks, $track->{name}; - } - } - - $names = join( ",", @tracks ); -} - -if ( !$names ) { - say STDERR "No tracks found with 'utils' property"; -} - -say "Running utils for : " . $names; - -for my $wantedName ( split ',', $names ) { - # modifies in place - StripLTSpace($wantedName); - - my $config = LoadFile($yaml_config); - my $utilConfigs; - my $trackIdx = 0; - - my %options = ( - config => $yaml_config, - name => $wantedName, - debug => $debug, - overwrite => $overwrite || 0, - verbose => $verbose, - dryRun => $dryRunInsertions, - ); - - if ($maxThreads) { - $options{maxThreads} = $maxThreads; - } - - for my $track ( @{ $config->{tracks}{tracks} } ) { - if ( $track->{name} eq $wantedName ) { - $utilConfigs = $track->{utils}; - last; - } - - $trackIdx++; - } - - if ( !$utilConfigs ) { - die "The $wantedName track must have 'utils' property"; - } - - for ( my $utilIdx = 0; $utilIdx < @$utilConfigs; $utilIdx++ ) { - if ( $utilName && $utilConfigs->[$utilIdx]{name} ne $utilName ) { - next; - } - - # config may be mutated by the last utility - $config = LoadFile($yaml_config); - my $utilConfig = $config->{tracks}{tracks}[$trackIdx]{utils}[$utilIdx]; - - my $utilName = $utilConfig->{name}; - say $utilName; - - # Uppercase the first letter of the utility class name - # e.g. the user may specify "fetch" and we grab Utils::Fetch - my $className = - 'Utils::' - . uc( substr( $utilName, 0, 1 ) ) - . 
substr( $utilName, 1, length($utilName) - 1 ); - my $args = $utilConfig->{args} || {}; - - my %finalOpts = ( %options, %$args, ( utilIdx => $utilIdx, utilName => $utilName ) ); - - my $instance = $className->new( \%finalOpts ); - $instance->go(); - } -} - -__END__ - -=head1 NAME - -run_utils - Runs items in lib/Utils - -=head1 SYNOPSIS - -run_utils - --config - --name - [--debug] - [--verbose] - [--maxThreads] - [--dryRun] - [--overwrite] - [--help] - -=head1 DESCRIPTION - -C<run_utils> lets you run utility functions in lib/Utils - -=head1 OPTIONS - -=over 8 - -=item B<-c>, B<--config> - -Config: A YAML genome assembly configuration file that specifies the various -tracks and data associated with the assembly. This is the same file that is -used by the Seq Package to annotate snpfiles. - -=item B<-n>, B<--name> - -name: The name of the track in the YAML config file - -=back - -=head1 AUTHOR - -Alex Kotlar - -=head1 SEE ALSO - -Seq Package - -=cut diff --git a/perl/bin/read_db_util.pl b/perl/bin/read_db_util.pl deleted file mode 100644 index 0bfecfcf9..000000000 --- a/perl/bin/read_db_util.pl +++ /dev/null @@ -1,82 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package MockBuilder; -use Mouse; -extends 'Seq::Base'; - -1; - -#Supply config, track, chromosome, start, stop -use Scalar::Util qw/looks_like_number/; -use DDP; - -my $config = $ARGV[0]; -my $track = $ARGV[1]; -my $chrom = $ARGV[2]; -my $start = $ARGV[3]; -my $stop = $ARGV[4]; - -if ( !looks_like_number($start) || !looks_like_number($stop) ) { - die 'config trackName chrom start stop'; -} - -my $seq = MockBuilder->new_with_config( { config => $config, readOnly => 1 } ); - -my $tracks = $seq->tracksObj; - -my $refTrackGetter = $tracks->getRefTrackGetter(); -my $trackGetter = $tracks->getTrackGetterByName($track); - -my $isRef; -if ( $refTrackGetter eq $trackGetter ) { - $isRef = 1; -} - -if ( !$trackGetter ) { - die "$track not found in config $config"; -} - -my $db = Seq::DBManager->new(); - -my @positions = ( $start .. $stop ); - -my @results = ( $start .. $stop ); - -my $data = $db->dbRead( $chrom, \@results ); - -say STDERR "Read data for $chrom $start"; -p $data; -say STDERR "Showing results for $track"; -my %alt = ( 'A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A' ); - -my $idx = 0; -for my $d (@$data) { - my $pos = $positions[$idx]; - $idx++; - - if ($isRef) { - say $refTrackGetter->get($d); - - next; - } - - my $ref = $refTrackGetter->get($d); - - for my $alt (qw/A C G T/) { - if ( $alt eq $ref ) { - next; - } - - my $out = []; - say STDERR "$chrom:$pos, ref: $ref, alt: $alt:"; - - $trackGetter->get( $d, $chrom, $ref, $alt, 0, $out, $start - 1 ); - - p $out; - } -} - -# if($trackGetter->type eq 'gene') { -# p $trackGetter->{_db}->dbReadAll( $trackGetter->regionTrackPath($chrom) ); -# } diff --git a/perl/cpanfile b/perl/cpanfile deleted file mode 100644 index b27922470..000000000 --- a/perl/cpanfile +++ /dev/null @@ -1,83 +0,0 @@ -# Do not edit this file directly. To change prereqs, edit the `dist.ini` file. 
- -requires "Archive::Extract" => "0"; -requires "Beanstalk::Client" => "0"; -requires "Carp" => "0"; -requires "Clone" => "0"; -requires "Cpanel::JSON::XS" => "0"; -requires "Cwd" => "0"; -requires "DBD::MariaDB" => "1.23"; -requires "DBI" => "0"; -requires "DDP" => "0"; -requires "Data::MessagePack" => "0"; -requires "Digest::MD5" => "0"; -requires "Fcntl" => "0"; -requires "File::Basename" => "0"; -requires "File::Glob" => "0"; -requires "File::Which" => "0"; -requires "Getopt::Long" => "0"; -requires "Getopt::Long::Descriptive" => "0"; -requires "Hash::Merge::Simple" => "0"; -requires "JSON::XS" => "0"; -requires "LMDB_File" => "0"; -requires "List::MoreUtils" => "0"; -requires "List::Util" => "0"; -requires "Log::Any::Adapter" => "0"; -requires "Log::Fast" => "0"; -requires "MCE::Loop" => "0"; -requires "Math::SigFigs" => "0"; -requires "Mouse" => "2"; -requires "Mouse::Role" => "2"; -requires "Mouse::Util::TypeConstraints" => "0"; -requires "MouseX::Getopt" => "0"; -requires "MouseX::NativeTraits" => "0"; -requires "POSIX" => "0"; -requires "Parallel::ForkManager" => "0"; -requires "Path::Tiny" => "0"; -requires "PerlIO::gzip" => "0"; -requires "PerlIO::utf8_strict" => "0"; -requires "Pod::Usage" => "0"; -requires "Scalar::Util" => "0"; -requires "String::Strip" => "0"; -requires "Sys::CpuAffinity" => "0"; -requires "Time::HiRes" => "0"; -requires "Time::localtime" => "0"; -requires "Try::Tiny" => "0"; -requires "Type::Params" => "0"; -requires "Types::Path::Tiny" => "0"; -requires "Types::Standard" => "0"; -requires "YAML::XS" => "0"; -requires "namespace::autoclean" => "0"; -requires "perl" => "v5.16.0"; -requires "strict" => "0"; -requires "warnings" => "0"; - -on 'test' => sub { - requires "Exporter" => "0"; - requires "ExtUtils::MakeMaker" => "0"; - requires "File::Spec" => "0"; - requires "File::Temp" => "0"; - requires "IO::Compress::Gzip" => "0"; - requires "IO::Uncompress::Gunzip" => "0"; - requires "Test::Exception" => "0"; - requires "Test::More" => "0"; - requires "Types::Common::String" => "0"; - requires "YAML::Tiny" => "0"; -}; - -on 'test' => sub { - recommends "CPAN::Meta" => "2.120900"; -}; - -on 'configure' => sub { - requires "ExtUtils::MakeMaker" => "0"; -}; - -on 'develop' => sub { - requires "Test::CPAN::Meta" => "0"; - requires "Test::EOF" => "0"; - requires "Test::EOL" => "0"; - requires "Test::MinimumVersion" => "0"; - requires "Test::More" => "0.88"; - requires "Test::NoTabs" => "0"; -}; diff --git a/perl/dist.ini b/perl/dist.ini deleted file mode 100644 index 73f5e0b14..000000000 --- a/perl/dist.ini +++ /dev/null @@ -1,76 +0,0 @@ -name = Bystro -author = The Bystro Authors -license = Apache_2_0 -copyright_holder = The Bystro Authors -copyright_year = 2023 - -version = 2.0.0 - -; install author dependencies with `dzil authordeps --missing | cpanm` -; install package dependencies with `dzil listdeps --missing | cpanm` - -[GatherDir] -exclude_match = Dockerfile -exclude_match = cpanfile -exclude_match = dist.ini -exclude_match = entrypoint.sh - -[Encoding] -encoding = bytes -match = .*\.gz$ -match = .*\.mdb$ - -[ExecDir] -dir = bin - -[ShareDir] - -[PruneCruft] - -[MakeMaker] - -[Manifest] - -[ManifestSkip] - -[License] - -[Readme] - -[MetaYAML] - -[CPANFile] - -[ConfirmRelease] - -[AutoPrereqs] - -[RemovePrereqs] -remove = lib - -; Tests -[MetaTests] - -[Test::ReportPrereqs] -verify_prereqs = 1 - -[Test::NoTabs] -finder = :InstallModules -finder = :ExecFiles - -[Test::EOL] -[Test::EOF] -strict = 0 - -[Test::MinimumVersion] - -[TestRelease] - -[RunExtraTests] 
- -[CopyFilesFromBuild::Filtered] -copy = cpanfile - -; Releaser Plugin -[UploadToCPAN] - diff --git a/perl/example_bystro_vcf_preprocessor_output.tsv b/perl/example_bystro_vcf_preprocessor_output.tsv deleted file mode 100644 index 36342be7d..000000000 --- a/perl/example_bystro_vcf_preprocessor_output.tsv +++ /dev/null @@ -1,11 +0,0 @@ -chrom pos type ref alt trTv heterozygotes heterozygosity homozygotes homozygosity missingGenos missingness ac an sampleMaf vcfPos id -chr20 1 SNP A T 2 NA00001 0.333 NA 0 NA 0 1 6 0.167 1 SIMPLE_SNP -chr20 1110696 MULTIALLELIC A G 0 NA00001;NA00002 0.667 NA 0 NA 0 2 6 0.333 1110696 MULTIALLELIC_SNP -chr20 1110696 MULTIALLELIC A T 0 NA00001;NA00002 0.667 NA00003 0.333 NA 0 4 6 0.667 1110696 MULTIALLELIC_SNP -chr20 1 INS A +C 0 NA00003 0.333 NA 0 NA 0 1 6 0.167 1 SIMPLE_INSERTION -chr20 1 INS A +CC 0 NA00001;NA00002 0.667 NA00003 0.333 NA 0 4 6 0.667 1 INSERTION_BETWEEN_TWO_BASES -chr20 1234568 MULTIALLELIC T -3 0 NA00001 0.333 NA00003 0.333 NA 0 3 6 0.5 1234567 microsat1 -chr20 1234568 MULTIALLELIC T +A 0 NA00002 0.333 NA 0 NA 0 1 6 0.167 1234567 microsat1 -chr20 3 MNP C A 2 NA 0 NA00003 0.5 NA00001 0.333 2 4 0.5 3 EXAMPLE_MISSING_MNP -chr20 4 MNP C A 2 NA 0 NA00003 0.5 NA00001 0.333 2 4 0.5 3 EXAMPLE_MISSING_MNP -chr20 5 MNP C A 2 NA 0 NA00003 0.5 NA00001 0.333 2 4 0.5 3 EXAMPLE_MISSING_MNP diff --git a/perl/example_vcf.tsv b/perl/example_vcf.tsv deleted file mode 100644 index 8bf8cb192..000000000 --- a/perl/example_vcf.tsv +++ /dev/null @@ -1,9 +0,0 @@ -##fileformat=VCFv4.2 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -20 1 SIMPLE_SNP A T 50 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|1:54:7:56,60 0|0:48:4:51,51 0/0:61:2:51,51 -20 1110696 MULTIALLELIC_SNP A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP 1|2:21:6 2|1:2:0 2/2:35:4 -20 1 SIMPLE_INSERTION A AC 47 PASS NS=3;DP=13;AA=T GT:GQ:DP 0|0:54:7 0|0:48:4 1/0:61:2 -20 1 INSERTION_BETWEEN_TWO_BASES AT ACCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 -20 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 -20 3 EXAMPLE_MISSING_MNP CCC AAA 50 PASS NS=3;DP=9;AA=G GT ./1 0/0 1/1 - diff --git a/perl/example_vcf_dosage_matrix.feather b/perl/example_vcf_dosage_matrix.feather deleted file mode 100644 index 29dc004fd36c25bab7d18bf1193b9078b6e5a408..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1218 zcmd^9ODKd<6h8m;4`xhZ8qC5gNfsl<$YVi95tE3G$I4^lF-2)gR+LgKWW~b5g0h-y zMA;~cD3pzbt+JqO6^-xQhsR)H?cBcq-g~|?=YIeB&z(?rcV|z4C=T;kL>a{9A~)b9 z8&B=CY)Zu-_|%hEMEgJlsFe)S2=Z_am4}2EYqkyeF5SbW`PU-wOHr|N(nD=p$ zkFqHXIc=OwlnTz6j7egDK6uupVhRyZ3~eb#Kv1Tm8VRQ=g8&R*|MJ2>z1zXW`;)h{Cm@}aDC0)OSq;XY2<8W^I-CX&L3_c ziyQ&lNdsxC?T54DMGI~7IkD!r*8q*fLqE?yFWkf9^95`>%JB?_x;) GcfA47C5R#b diff --git a/perl/issues/subtxn_bug.pl b/perl/issues/subtxn_bug.pl deleted file mode 100644 index a92d84931..000000000 --- a/perl/issues/subtxn_bug.pl +++ /dev/null @@ -1,176 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package DBManager; -use LMDB_File qw/:all/; -my %envs; - -sub new { - my $class = shift; - my $self = {}; - bless $self, $class; -} - -sub dbPut { - my ( $self, $dbName, $key, $data, $skipCommit ) = @_; - - # 0 to create database if not found - my $db = $self->_getDbi($dbName); - - if ( !$db->{db}->Alive ) { - $db->{db}->Txn = $db->{env}->BeginTxn(); - # not strictly necessary, but I am concerned about hard to trace abort bugs related to scope - $db->{db}->Txn->AutoCommit(1); - } - - $db->{db}->Txn->put( $db->{dbi}, $key, $data 
); - - $db->{db}->Txn->commit() unless $skipCommit; - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_KEYEXIST ) { - die $LMDB_File::last_err; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - } - - return 0; -} - -sub dbReadOne { - my ( $self, $dbName, $key, $skipCommit ) = @_; - - my $db = $self->_getDbi($dbName) or return undef; - - if ( !$db->{db}->Alive ) { - $db->{db}->Txn = $db->{env}->BeginTxn(); - # not strictly necessary, but I am concerned about hard to trace abort bugs related to scope - $db->{db}->Txn->AutoCommit(1); - } - - $db->{db}->Txn->get( $db->{dbi}, $key, my $data ); - - # Commit unless the user specifically asks not to - #if(!$skipCommit) { - $db->{db}->Txn->commit() unless $skipCommit; - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_NOTFOUND ) { - die $LMDB_File::last_err; - } - - $LMDB_File::last_err = 0; - } - - return $data; -} - -sub dbStartCursorTxn { - my ( $self, $dbName ) = @_; - - my $db = $self->_getDbi($dbName) or return; - - my $txn = $db->{env}->BeginTxn(); - - # Help LMDB_File track our cursor - LMDB::Cursor::open( $txn, $db->{dbi}, my $cursor ); - - # Unsafe, private LMDB_File method access but Cursor::open does not track cursors - $LMDB::Txn::Txns{$$txn}{Cursors}{$$cursor} = 1; - - return [ $txn, $cursor ]; -} - -sub _getDbi { - # Exists and not defined, because in read only database we may discover - # that some chromosomes don't have any data (example: hg38 refSeq chrM) - - # $_[0] $_[1], $_[2] - # Don't create used by dbGetNumberOfEntries - my ( $self, $dbPath ) = @_; - - if ( $envs{$dbPath} ) { - return $envs{$dbPath}; - } - - my $env = LMDB::Env->new( - $dbPath, - { - mapsize => 128 * 1024 * 1024 * 1024, # Plenty space, don't worry - #maxdbs => 20, # Some databases - mode => 0600, - maxdbs => - 0, # Some databases; else we get a MDB_DBS_FULL error (max db limit reached) - } - ); - - if ( !$env ) { - die 'No env'; - } - - my $txn = $env->BeginTxn(); - - my $dbFlags; - - my $DB = $txn->OpenDB( undef, MDB_INTEGERKEY ); - - # ReadMode 1 gives memory pointer for perf reasons, not safe - $DB->ReadMode(1); - - if ($LMDB_File::last_err) { - die $LMDB_File::last_err; - } - - # Now db is open - my $err = $txn->commit(); - - if ($err) { - die $err; - } - - $envs{$dbPath} = { env => $env, dbi => $DB->dbi, db => $DB }; - - return $envs{$dbPath}; -} - -1; - -use Test::More; -use DDP; -my $db = DBManager->new(); - -my $dbIdx = 1; -my $pos = 99; -my $val = "HELLO WORLD"; - -system('rm -rf ./test && mkdir ./test'); - -#### WORKS GREAT #### -my $cursor; -$cursor = $db->dbStartCursorTxn('test'); - -### Test Unsafe Transactions (Manually Managed) ########## -$db->dbPut( 'test', $pos, [], 1 ); - -$db->dbReadOne( 'test', $pos ); - -p %LMDB::Env::Envs; - -$db->dbReadOne( 'test', $pos ); -undef $db; -undef $cursor; - -system('rm -rf ./test && mkdir ./test'); - -$db = DBManager->new(); -#### DIES MISERABLE DEATH #### -say "The reverse order doesn't work"; - -$db->dbPut( 'test', $pos, [], 1 ); -$cursor = $db->dbStartCursorTxn('test'); -$db->dbReadOne( 'test', $pos ); - -say "We will never see this"; diff --git a/perl/lib/Interface.pm b/perl/lib/Interface.pm deleted file mode 100644 index 085ca4ef6..000000000 --- a/perl/lib/Interface.pm +++ /dev/null @@ -1,278 +0,0 @@ -#!/usr/bin/env perl -use 5.10.0; - -package Interface; - -use File::Basename; - -use Mouse; - -use Path::Tiny; -use Mouse::Util::TypeConstraints; - -use namespace::autoclean; - -use YAML::XS qw/LoadFile/; -use 
JSON::XS; - -use Getopt::Long::Descriptive; - -use Try::Tiny; - -use Seq; -with 'MouseX::Getopt'; - -##########Parameters accepted from command line################# -has input => ( - is => 'ro', - isa => 'ArrayRef[Str]', - metaclass => 'Getopt', - cmd_aliases => [qw/i in/], - documentation => - 'Input files. Supports multiple files: --in file1 --in file2 --in file3', -); - -has output => ( - is => 'ro', - isa => 'Str', - cmd_aliases => [qw/o out/], - metaclass => 'Getopt', - documentation => 'Base path for output files: /path/to/output', -); - -has json => ( - is => 'ro', - isa => 'Bool', - metaclass => 'Getopt', - documentation => - 'Do you want to output JSON instead? Incompatible with run_statistics', -); - -has config => ( - is => 'ro', - isa => 'Str', - coerce => 1, - required => 0, - metaclass => 'Getopt', - cmd_aliases => [qw/c configuration/], - documentation => 'YAML config file path.', -); - -has overwrite => ( - is => 'ro', - isa => 'Int', - default => 0, - required => 0, - metaclass => 'Getopt', - documentation => 'Overwrite existing output file.', -); - -has read_ahead => ( - is => 'ro', - isa => 'Bool', - default => 0, - coerce => 1, - required => 0, - metaclass => 'Getopt', - documentation => 'For dense datasets, use system read-ahead', -); - -has debug => ( - is => 'ro', - isa => 'Num', - default => 0, - required => 0, - metaclass => 'Getopt', -); - -has verbose => ( - is => 'ro', - isa => 'Int', - required => 0, - metaclass => 'Getopt', -); - -has compress => ( - is => 'ro', - isa => 'Str', - metaclass => 'Getopt', - documentation => - 'Enable compression. Specify the type of compression: lz4 gz bgz. `bgz` is an alias for gz (gzip); when bgzip is available, it will be used and will generate a block gzipped file with index', - default => 0, -); - -has archive => ( - is => 'ro', - isa => 'Bool', - metaclass => 'Getopt', - documentation => 'Place all outputs into a tarball?', - default => 0, -); - -has run_statistics => ( - is => 'ro', - isa => 'Int', - metaclass => 'Getopt', - documentation => - 'Create per-sample feature statistics (like transition:transversions)?', - default => 1, -); - -has delete_temp => ( - is => 'ro', - isa => 'Int', - documentation => 'Delete the temporary directory made during annotation', - default => 1, -); - -has wantedChr => ( - is => 'ro', - isa => 'Str', - metaclass => 'Getopt', - cmd_aliases => [qw/chr wanted_chr/], - documentation => 'Annotate a single chromosome', -); - -has maxThreads => ( - is => 'ro', - isa => 'Int', - metaclass => 'Getopt', - cmd_aliases => [qw/threads/], - documentation => 'Number of CPU threads to use (optional)', -); - -has publisher => ( - is => 'ro', - isa => 'Str', - required => 0, - metaclass => 'Getopt', - documentation => - 'Tell Bystro how to send messages to a plugged-in interface (such as a web interface)' -); - -has ignore_unknown_chr => ( - is => 'ro', - isa => 'Bool', - default => 1, - required => 0, - metaclass => 'Getopt', - documentation => 'Don\'t quit if we find a non-reference chromosome (like ChrUn)' -); - -has json_config => ( - is => 'ro', - isa => 'Str', - required => 0, - metaclass => 'Getopt', - documentation => - 'JSON config file path. Use this if you wish to invoke the annotator by file passing.', -); - -has result_summary_path => ( - is => 'ro', - isa => 'Str', - required => 0, - metaclass => 'Getopt', - documentation => 'Where to output the result summary. 
Defaults to STDOUT', -); - -sub annotate { - my $self = shift; - - my $args; - - if ( $self->json_config ) { - my $json_config_data = path( $self->json_config )->slurp; - # p $json_config_data; - $args = decode_json($json_config_data); - } - else { - my $publisher; - - if ( $self->publisher ) { - if ( !ref $self->publisher ) { - $publisher = decode_json( $self->publisher ); - } - else { - $publisher = $self->publisher; - } - } - - if ( defined $self->verbose ) { - $args->{verbose} = $self->verbose; - } - - if ( defined $self->maxThreads ) { - $args->{maxThreads} = $self->maxThreads; - } - - if ( defined $self->json ) { - $args->{outputJson} = $self->json; - - if ( $self->run_statistics ) { - say STDERR "--json incompatible with --run_statistics 1"; - exit(1); - } - } - - $args = { - config => $self->config, - input_files => $self->input, - output_file_base => $self->output, - debug => $self->debug, - wantedChr => $self->wantedChr, - ignore_unknown_chr => $self->ignore_unknown_chr, - overwrite => $self->overwrite, - publisher => $publisher, - compress => $self->compress, - archive => $self->archive, - run_statistics => !!$self->run_statistics, - delete_temp => !!$self->delete_temp, - readAhead => $self->read_ahead - }; - } - - my ( $err, $results, $totalProgress, $totalSkipped ); - - try { - my $annotator = Seq->new_with_config($args); - ( $err, $results, $totalProgress, $totalSkipped ) = $annotator->annotate(); - } - catch { - $err = $_; - }; - - my $formattedResults = JSON::XS->new->pretty(1)->encode( - { - error => $err, - results => $results, - totalProgress => $totalProgress, - totalSkipped => $totalSkipped, - } - ); - - if ( $self->result_summary_path ) { - path( $self->result_summary_path )->spew($formattedResults); - } - else { - say $formattedResults; - } -} - -__PACKAGE__->meta->make_immutable; - -1; - -=item messenger - -Contains a hash reference (also accepts a JSON representation of a hash) that -tells Bystro how to send data to a plugged-in interface. 
- -Example: { - room: jobObj.userID, - message: { - publicID: jobObj.publicID, - data: tData, - }, - }; -=cut diff --git a/perl/lib/Seq.pm b/perl/lib/Seq.pm deleted file mode 100644 index 697a6495b..000000000 --- a/perl/lib/Seq.pm +++ /dev/null @@ -1,862 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq; - -our $VERSION = '0.001'; -# ABSTRACT: Annotate a snp file - -use Fcntl qw(:flock); -use Mouse 2; -use Types::Path::Tiny qw/AbsFile/; - -use namespace::autoclean; - -use MCE::Loop; - -use Seq::InputFile; -use Seq::Output; -use Seq::Output::Delimiters; -use Seq::Headers; -use JSON::XS; -use Seq::DBManager; -use Path::Tiny; -use Scalar::Util qw/looks_like_number/; -use Try::Tiny; - -extends 'Seq::Base'; - -our $ANNOTATION_COMPLETE_FILE_NAME = 'bystro_annotation.complete'; -our $ANNOTATION_LOCK_FILE_NAME = 'bystro_annotation.lock'; - -# We add a few of our own annotation attributes -# These will be re-used in the body of the annotation processor below -# Users may configure these -has input_files => ( is => 'rw', isa => 'ArrayRef', required => 1 ); - -# Maximum (signed) size of del allele -has maxDel => ( is => 'ro', isa => 'Int', default => -32, writer => 'setMaxDel' ); - -# TODO: formalize: check that they have name and args properties -has fileProcessors => ( is => 'ro', isa => 'HashRef', required => 1 ); - -# Defines most of the properties that can be configured at run time -# Requires logPath to be provided (currently found in Seq::Base) -with 'Seq::Definition', 'Seq::Role::Validator'; - -# To initialize Seq::Base with only getters -has '+readOnly' => ( init_arg => undef, default => 1 ); - -# https://stackoverflow.com/questions/1609467/in-perl-is-there-a-built-in-way-to-compare-two-arrays-for-equality -sub _arraysEqual { - my ( $xref, $yref ) = @_; - return unless @$xref == @$yref; - - my $i; - for my $e (@$xref) { - return unless $e eq $yref->[ $i++ ]; - } - return 1; -} - -# TODO: further reduce complexity -sub BUILD { - my $self = shift; - - if ( $self->maxDel > 0 ) { - $self->setMaxDel( -$self->maxDel ); - } - - ################## Make the full output path ###################### - # The output path always respects the $self->output_file_base attribute path; - $self->{_outPath} = - $self->_workingDir->child( $self->outputFilesInfo->{annotation} ); - - $self->{_headerPath} = $self->_workingDir->child( $self->outputFilesInfo->{header} ); - - # Must come before statistics, which relies on a configured Seq::Tracks - #Expects DBManager to have been given a database_dir - $self->{_db} = Seq::DBManager->new(); - - # When tracksObj accessor is called, a function is executed that results in the database being set to read only - # when read only mode is called for (Seq::Base instantiated with readOnly => 1) - # Therefore it is important to call tracksObj before any other database calls - # to make sure that read-only semantics are enforced throughout the annotation process - # To ensure that we don't accidentally defer setting database mode until child processes begin to annotate (in which case they will race to set the database mode) - # from hereon we will only use $self->{_tracks} to refer to tracks objects. 
$self->{_tracks} = $self->tracksObj; -} - -sub annotate { - my $self = shift; - - $self->log( 'info', 'Checking input file format' ); - - my $firstFileType; - for my $file ( @{ $self->input_files } ) { - my ( $err, $fileType ) = $self->validateInputFile($file); - - if ($err) { - $self->_errorWithCleanup($err); - return ( $err, undef ); - } - - if ( !$firstFileType ) { - $firstFileType = $fileType; - } - elsif ( $fileType ne $firstFileType ) { - $self->_errorWithCleanup("All input files must be of the same type"); - return ( "All input files must be of the same type", undef ); - } - } - - $self->log( 'info', 'Beginning annotation' ); - return $self->annotateFile($firstFileType); -} - -sub annotateFile { - #Inspired by T.S Wingo: https://github.com/wingolab-org/GenPro/blob/master/bin/vcfToSnp - my $self = shift; - my $type = shift; - - my $lockFh; - my $lockPath = $self->outDir->child($ANNOTATION_LOCK_FILE_NAME)->stringify; - my $outDir = $self->outDir->stringify; - open $lockFh, ">", $lockPath or die $!; - flock $lockFh, LOCK_EX | LOCK_NB - or die - "Multiple Bystro annotator instances are competing to process $outDir. If you are running the Bystro Annotator from the Bystro UI, this is likely because of a network interruption that resulted in double submission. Please retry this job. Error: $!"; - - # Check if $ANNOTATION_COMPLETE_FILE_NAME exists in the outDir, and if it does, exit - my $annotationCompletePath = - $self->outDir->child($ANNOTATION_COMPLETE_FILE_NAME)->stringify; - if ( -e $annotationCompletePath ) { - my $annotationOutputDir = $self->outDir->stringify; - $self->_errorWithCleanup('Annotation already completed'); - return ( - "Skipping annotation. We found the annotation status file `$ANNOTATION_COMPLETE_FILE_NAME` in the target output directory `$annotationOutputDir`, which suggests that this directory already contains a completed annotation. This is most likely caused by one of two reasons: 1) You are trying to re-run an annotation and are outputting to an existing output directory, or 2) There are network connectivity issues, such as with a load balancing server, and this annotation was accidentally double-submitted. If you think this is an error: 1) if running Bystro Annotator from the command line: delete `$annotationCompletePath` (or all files in the directory) and try annotating again. 
2) If running from Bystro UI, re-submit the annotation.", - undef - ); - } - - my ( $err, $inFhs, $outFh, $statsFh, $headerFh, $preOutArgs ) = - $self->_getFileHandles($type); - - if ($err) { - $self->_errorWithCleanup($err); - return ( $err, undef ); - } - - # Create a file "bystro_annotation_lock" in the destination directory - # and get an exclusive lock - # If that fails, we know that another process is running, and we should exit - - ########################## Write the header ################################## - my $header; - for my $inFh (@$inFhs) { - $header = <$inFh>; - - if ( !$header ) { - $self->_errorWithCleanup("Empty input file"); - return ( "Empty input file", undef ); - } - } - - $self->setLineEndings($header); - - my ( $finalHeader, $numberSplitFields ) = $self->_getFinalHeader($header); - - ## A programmatically useful representation of the header - say $headerFh encode_json( $finalHeader->getOrderedHeader() ); - my $outputHeader = $finalHeader->getString(); - - if ( !$self->outputJson ) { - say $outFh $outputHeader; - } - - if ($statsFh) { - say $statsFh $outputHeader; - } - - ######################## Build the fork pool ################################# - my $abortErr; - - my $messageFreq = ( 2e4 / 4 ) * $self->maxThreads; - - # Report every 1e4 lines, to avoid thrashing receiver - my $progressFunc = - $self->makeLogProgressAndPrint( \$abortErr, $outFh, $statsFh, $messageFreq ); - MCE::Loop::init { - max_workers => $self->maxThreads || 8, - use_slurpio => 1, - chunk_size => 'auto', - gather => $progressFunc, - }; - - # We separate out the reference track getter so that we can check for discordant - # bases, and pass the true reference base to other getters that may want it (like CADD) - - # To avoid the Moose/Mouse accessor penalty, store reference to underlying data - my $db = $self->{_db}; - my $refTrackGetter = $self->{_tracks}->getRefTrackGetter(); - my @trackGettersExceptReference = - @{ $self->{_tracks}->getTrackGettersExceptReference() }; - my @trackIndicesExceptReference = 0 .. 
$#trackGettersExceptReference; - - my $outIndicesMap = $finalHeader->getParentIndices(); - - my @outIndicesExceptReference = - map { $outIndicesMap->{ $_->name } } @trackGettersExceptReference; - - ######### Set Outputter ######### - my @allOutIndices = - map { $outIndicesMap->{ $_->name } } @{ $self->{_tracks}->trackGetters }; - - # Now that header is prepared, make the outputter - # Note that the only features that we need to iterate over - # Are the features that come from our database - # Meaning we can skip anything forwarded from the pre-processor - - my $outputter = Seq::Output->new( - { - header => $finalHeader, - trackOutIndices => \@allOutIndices, - refTrackName => $refTrackGetter->name - } - ); - - ###### Processes pre-processor output passed from file reader/producer ####### - my $discordantIdx = $outIndicesMap->{ $self->discordantField }; - my $refTrackOutIdx = $outIndicesMap->{ $refTrackGetter->name }; - - #Accessors are amazingly slow; it takes as long to call ->name as track->get - #after accounting for the number of calls to ->name - my %wantedChromosomes = %{ $refTrackGetter->chromosomes }; - my $maxDel = $self->maxDel; - - my $outJson = $self->outputJson; - - for my $inFh (@$inFhs) { - mce_loop_f { - #my ($mce, $slurp_ref, $chunk_id) = @_; - # $_[0], $_[1], $_[2] - open my $MEM_FH, '<', $_[1]; - binmode $MEM_FH, ':raw'; - - my $total = 0; - - my @indelDbData; - my @indelRef; - my @lines; - my $dataFromDbAref; - my $zeroPos; - - # This is going to be copied on write... avoid a bunch of function calls - # Each thread will get its own %cursors object - # But start in child because relying on COW seems like it could lead to - # future bugs (in, say Rust if sharing between user threads) - my %cursors = (); - - # Each line is expected to be - # chrom \t pos \t type \t inputRef \t alt \t hets \t homozygotes \n - # the chrom is always in ucsc form, chr (the golang program guarantees it) - my $outputJson = $self->outputJson; - while ( my $line = $MEM_FH->getline() ) { - chomp $line; - - my @fields = split( '\t', $line, $numberSplitFields ); - - $total++; - - if ( !$wantedChromosomes{ $fields[0] } ) { - next; - } - - $zeroPos = $fields[1] - 1; - - # Caveat: It seems that, per database ($chr), we can have only one - # read-only transaction; so ... yeah can't combine with dbRead, dbReadOne - if ( !$cursors{ $fields[0] } ) { - $cursors{ $fields[0] } = $db->dbStartCursorTxn( $fields[0] ); - } - - $dataFromDbAref = $db->dbReadOneCursorUnsafe( $cursors{ $fields[0] }, $zeroPos ); - - if ( !defined $dataFromDbAref ) { - $self->_errorWithCleanup("Wrong assembly? $fields[0]\: $fields[1] not found."); - # Store a reference to the error, allowing us to exit with a useful fail message - MCE->gather( 0, 0, "Wrong assembly? $fields[0]\: $fields[1] not found." ); - $_[0]->abort(); - return; - } - - if ( length( $fields[4] ) > 1 ) { - # INS or DEL - if ( looks_like_number( $fields[4] ) ) { - # We ignore -1 alleles, treat them just like SNPs - if ( $fields[4] < -1 ) { - # Grab everything from + 1 the already fetched position to the $pos + number of deleted bases - 1 - # Note that position_1_based - (negativeDelLength + 2) == position_0_based + (delLength - 1) - if ( $fields[4] < $maxDel ) { - @indelDbData = ( $fields[1] .. $fields[1] - ( $maxDel + 2 ) ); - } - else { - @indelDbData = ( $fields[1] .. 
$fields[1] - ( $fields[4] + 2 ) ); - } - - #last argument: skip commit - $db->dbReadCursorUnsafe( $cursors{ $fields[0] }, \@indelDbData ); - - #Note that the first position keeps the same $inputRef - #This means in the (rare) discordant multiallelic situation, the reference - #Will be identical between the SNP and DEL alleles - #faster than perl-style loop (much faster than c-style) - @indelRef = ( $fields[3], map { $refTrackGetter->get($_) } @indelDbData ); - - #Add the db data that we already have for this position - unshift @indelDbData, $dataFromDbAref; - } - } - else { - #It's an insertion, we always read + 1 to the position being annotated - # which itself is + 1 from the db position, so we read $out[1][0][0] to get the + 1 base - # Read without committing by using 1 as last argument - @indelDbData = ( - $dataFromDbAref, $db->dbReadOneCursorUnsafe( $cursors{ $fields[0] }, $fields[1] ) - ); - - #Note that the first position keeps the same $inputRef - #This means in the (rare) discordant multiallelic situation, the reference - #Will be identical between the SNP and DEL alleles - @indelRef = ( $fields[3], $refTrackGetter->get( $indelDbData[1] ) ); - } - } - - if (@indelDbData) { - ############### Gather all track data (besides reference) ################# - for my $posIdx ( 0 .. $#indelDbData ) { - for my $trackIndex (@trackIndicesExceptReference) { - $fields[ $outIndicesExceptReference[$trackIndex] ] //= []; - - $trackGettersExceptReference[$trackIndex]->get( - $indelDbData[$posIdx], $fields[0], $indelRef[$posIdx], $fields[4], $posIdx, - $fields[ $outIndicesExceptReference[$trackIndex] ], - $zeroPos + $posIdx - ); - } - - $fields[$refTrackOutIdx][$posIdx] = $indelRef[$posIdx]; - } - - # If we have multiple indel alleles at one position, need to clear stored values - @indelDbData = (); - @indelRef = (); - } - else { - for my $trackIndex (@trackIndicesExceptReference) { - $fields[ $outIndicesExceptReference[$trackIndex] ] //= []; - - $trackGettersExceptReference[$trackIndex] - ->get( $dataFromDbAref, $fields[0], $fields[3], $fields[4], 0, - $fields[ $outIndicesExceptReference[$trackIndex] ], $zeroPos ); - } - - $fields[$refTrackOutIdx][0] = $refTrackGetter->get($dataFromDbAref); - } - - # 3 holds the input reference, we'll replace this with the discordant status - $fields[$discordantIdx] = - $refTrackGetter->get($dataFromDbAref) ne $fields[3] ? "true" : "false"; - - push @lines, \@fields; - } - - close $MEM_FH; - - if (@lines) { - if ($outJson) { - MCE->gather( scalar @lines, $total - @lines, undef, encode_json( \@lines ) ); - } - else { - MCE->gather( - scalar @lines, - $total - @lines, - undef, $outputter->makeOutputString( \@lines ) - ); - } - } - else { - MCE->gather( 0, $total ); - } - - } - $inFh; - } - - # Force flush - my ( $totalAnnotated, $totalSkipped ) = $progressFunc->( 0, 0, undef, undef, 1 ); - - MCE::Loop::finish(); - - # Unfortunately, MCE::Shared::stop() removes the value of $abortErr - # according to documentation, and I did not see mention of a way - # to copy the data from a scalar, and don't want to use a hash for this alone - # So, using a scalar ref to abortErr in the gather function. - if ($abortErr) { - # Database & tx need to be closed - $db->cleanUp(); - - return ( 'Job aborted due to error', undef ); - } - - ################ Finished writing file. 
If statistics, print those ########## - # Sync to ensure all files written - # This simply tries each close/sync/move operation in order - # And returns an error early, or proceeds to next operation - my $configOutPath = $self->_workingDir->child( $self->outputFilesInfo->{config} ); - - $err = - $self->safeClose($outFh) - || ( $statsFh && $self->safeClose($statsFh) ) - || $self->safeSystem( "cp " . $self->config . " $configOutPath" ) - || $self->safeSystem('sync'); - - if ($err) { - my $humanErr = "Failed to close files"; - $self->_errorWithCleanup($humanErr); - return ( $humanErr, undef ); - } - - $db->cleanUp(); - - # If there are multiple input files, we will have multiple pre-processor outputs - # should the pre-processor be configured to output sampleList or dosageMatrixOutPath - # We need to combine these into a single file each - - # 1) For the sampleList, we need to: - ## 1a) check that the sampleList files are identical - ## 1b) if they are, we can simply move one to the final outBaseName.sample_list destination - ## 1c) if they are not, we need to combine them and note that they are not identical - # 2) for the dosageMatrixOutPath, we need to call the dosage-combiner - ## 2a) the dosage-combiner will check that the dosageMatrixOutPath schemas are identical - ## 2b) if they are, it will combine them into a single file - ## 2c) if they are not, it will combine them into a single file, such that the schema is the union of all schemas - ## 2c) meaning that the number of samples is the total across all dosage files - ## 2c) and the number of variants is the total across all dosage files - ## 2c) with missing values filled in with 0 (reference allele) - - # Step 1: - if ( @$preOutArgs > 1 ) { - $self->log( "info", "Has multiple pre-processor outputs; combining them" ); - my @sampleLists; - my @dosageMatrixOutPaths; - for my $preOutArgHref (@$preOutArgs) { - if ( $preOutArgHref->{sampleList} ) { - push @sampleLists, $preOutArgHref->{sampleList}; - } - - if ( $preOutArgHref->{dosageMatrixOutPath} ) { - push @dosageMatrixOutPaths, $preOutArgHref->{dosageMatrixOutPath}; - } - } - - # Read the sample lists, and check that they are identical - my $allSampleListsIdentical = 1; - if (@sampleLists) { - $self->log( "info", "Combining sample lists" ); - - my $sampleList = $self->_workingDir->child( $self->outputFilesInfo->{sampleList} ); - - my $sampleListContents; - my $sampleListErr; - my %uniqueSamples; - my @uniqueSamples; - - my $idx = 0; - my $hasNonUniqueSamples = 0; - - my @canonicalSampleList; - for my $sampleListPath (@sampleLists) { - my $sampleListContentsNew = path($sampleListPath)->slurp; - - # We could have heterogenous files, some with samples and some without - if ( !$sampleListContentsNew ) { - next; - } - - my @samples = split( '\n', $sampleListContentsNew ); - - if ( $idx == 0 ) { - @canonicalSampleList = @samples; - } - elsif ( !_arraysEqual( \@canonicalSampleList, \@samples ) ) { - $allSampleListsIdentical = 0; - } - - for (@samples) { - if ( $uniqueSamples{$_} ) { - next; - } - - $uniqueSamples{$_} = 1; - push @uniqueSamples, $_; - - if ( $idx > 0 ) { - $hasNonUniqueSamples = 1; - } - } - - $idx += 1; - } - - $sampleListContents = join( "\n", @uniqueSamples ); - - my $finalSampleListDestination = - $self->_workingDir->child( $self->outputFilesInfo->{sampleList} ); - $err = - $self->safeSystem("echo \"$sampleListContents\" > $finalSampleListDestination"); - - if ($err) { - my $humanErr = "Failed to write combined sample list file"; - $self->_errorWithCleanup($humanErr); - 
return ( $humanErr, undef ); - } - - # Remove the intermediate sample lists - for my $sampleListPath (@sampleLists) { - $err = $self->safeSystem("rm $sampleListPath"); - - if ($err) { - my $humanErr = "Failed to remove intermediate sample list files"; - $self->_errorWithCleanup($humanErr); - return ( $humanErr, undef ); - } - } - } - - # This is technically a warning; the rest of the annotation will work - # However we have not threaded through Bystro optional dosage matrices - # Nor can we yet combine them if they have different sample lists - if ( !$allSampleListsIdentical ) { - my $humanErr = - "Bystro currently requires identical samples per input file. Different sample lists found"; - $self->_errorWithCleanup($humanErr); - return ( $humanErr, undef ); - } - - # Step 2: - if (@dosageMatrixOutPaths) { - $self->log( "info", "Combining dosage matrix outputs" ); - - # Find all non-empty dosageMatrixOutPaths, by stat-ing them - my @nonEmptyDosageMatrixOutPaths; - for my $dosageMatrixOutPath (@dosageMatrixOutPaths) { - if ( -s $dosageMatrixOutPath ) { - push @nonEmptyDosageMatrixOutPaths, $dosageMatrixOutPath; - } - } - - my $finalOutPath = - $self->_workingDir->child( $self->outputFilesInfo->{dosageMatrixOutPath} ); - - if ( @nonEmptyDosageMatrixOutPaths != @dosageMatrixOutPaths ) { - $self->log( "warn", - "Some empty dosage matrix outputs found. Combining non-empty files" ); - } - - if ( !@nonEmptyDosageMatrixOutPaths ) { - $self->log( "warn", "No non-empty dosage matrix outputs found" ); - - # Create an empty file in the final dosageMatrixOutPath destination - $err = $self->safeSystem("touch $finalOutPath"); - - if ($err) { - my $humanErr = "Failed to create empty dosage matrix output file"; - $self->_errorWithCleanup($humanErr); - return ( $humanErr, undef ); - } - } - else { - my $err = - $self->safeSystem( 'dosage --output ' - . $finalOutPath . " " - . 
join( " ", @nonEmptyDosageMatrixOutPaths ) ); - - if ($err) { - my $humanErr = "Failed to combine dosage matrix outputs"; - $self->_errorWithCleanup($humanErr); - return ( $humanErr, undef ); - } - - # Remove the intermediate dosageMatrixOutPaths - for my $dosageMatrixOutPath (@dosageMatrixOutPaths) { - $err = $self->safeSystem("rm $dosageMatrixOutPath"); - - if ($err) { - my $humanErr = "Failed to remove intermediate dosage matrix files"; - $self->_errorWithCleanup($humanErr); - return ( $humanErr, undef ); - } - } - - $self->log( "info", "Finished combining dosage matrix outputs" ); - } - } - } - - $err = $self->safeSystem('sync') || $self->_moveFilesToOutputDir(); - if ($err) { - my $humanErr = "Failed to move files to output directory"; - $self->_errorWithCleanup($humanErr); - return ( $humanErr, undef ); - } - - my $completionPath = $self->outDir->child($ANNOTATION_COMPLTE_FILE_NAME)->stringify; - - $err = $self->safeSystem("touch $completionPath"); - - if ($err) { - my $humanErr = "Failed to create completion file"; - $self->_errorWithCleanup($humanErr); - return ( $humanErr, undef ); - } - - # Post moving files to output directory, we don't want to regenerate log file in the tmp dir - # so just log to STDERR - say STDERR "Created completion file"; - - try { - close($lockFh); - unlink($lockPath); - } - catch { - say STDERR "Failed to close and delete lock file: $_"; - }; - - return ( $err, $self->outputFilesInfo, $totalAnnotated, $totalSkipped ); -} - -sub makeLogProgressAndPrint { - my ( $self, $abortErrRef, $outFh, $statsFh, $throttleThreshold ) = @_; - - my $totalAnnotated = 0; - my $totalSkipped = 0; - - my $publish = $self->hasPublisher; - - my $thresholdAnn = 0; - my $thresholdSkipped = 0; - - if ( !$throttleThreshold ) { - $throttleThreshold = 1e4; - } - - return sub { - #$annotatedCount, $skipCount, $err, $outputLines, $forcePublish = @_; - ## $_[0], $_[1] , $_[2], $_[3]. 
, $_[4]
-    if ( $_[2] ) {
-      $$abortErrRef = $_[2];
-      return ( $totalAnnotated, $totalSkipped );
-    }
-
-    $totalAnnotated += $_[0];
-    $totalSkipped   += $_[1];
-
-    if ($publish) {
-      $thresholdAnn     += $_[0];
-      $thresholdSkipped += $_[1];
-
-      if ( $_[4] || $thresholdAnn + $thresholdSkipped >= $throttleThreshold ) {
-        $self->publishProgress( $totalAnnotated, $totalSkipped );
-
-        $thresholdAnn     = 0;
-        $thresholdSkipped = 0;
-      }
-    }
-
-    if ( $_[3] ) {
-      if ($statsFh) {
-        print $statsFh $_[3];
-      }
-
-      print $outFh $_[3];
-    }
-
-    return ( $totalAnnotated, $totalSkipped );
-  }
-}
-
-sub _getFileHandles {
-  my ( $self, $type ) = @_;
-
-  my ( $outFh, $statsFh, @inFhs, @preOutArgs, $headerFh, $err );
-
-  my $index = 0;
-  my $total = @{ $self->input_files };
-  for my $file ( @{ $self->input_files } ) {
-    my ( $err, $inFh, $preOutArgHref ) =
-      $self->_openAnnotationPipe( $type, $file, $index, $total );
-
-    if ($err) {
-      return ( $err, undef, undef, undef, undef, undef );
-    }
-
-    push @inFhs,      $inFh;
-    push @preOutArgs, $preOutArgHref;
-
-    $index += 1;
-  }
-
-  if ( $self->run_statistics ) {
-    ########################## Tell stats program about our annotation ##############
-    my $statArgs = $self->_statisticsRunner->getStatsArguments();
-
-    $err = $self->safeOpen( $statsFh, "|-", $statArgs );
-
-    if ($err) {
-      return ( $err, undef, undef, undef, undef, undef );
-    }
-  }
-
-  # $fhs{stats} = $$statsFh;
-  ( $err, $outFh ) = $self->getWriteFh( $self->{_outPath} );
-
-  if ($err) {
-    return ( $err, undef, undef, undef, undef, undef );
-  }
-
-  ( $err, $headerFh ) = $self->getWriteFh( $self->{_headerPath} );
-
-  if ($err) {
-    return ( $err, undef, undef, undef, undef, undef );
-  }
-
-  return ( undef, \@inFhs, $outFh, $statsFh, $headerFh, \@preOutArgs );
-}
-
-sub _preparePreprocessorProgram {
-  my ( $self, $type, $inPath, $index, $total ) = @_;
-
-  if ( !$self->fileProcessors->{$type} ) {
-    $self->_errorWithCleanup("No fileProcessors defined for $type file type");
-  }
-
-  my $basename = path($inPath)->basename;
-
-  # Remove all non-alphanumeric characters to make sure
-  # the shell/bash command doesn't crash
-  $basename =~ s/[^a-zA-Z0-9-_]/_/g;
-
-  $inPath = quotemeta($inPath);
-
-  my $errPath = $self->_workingDir->child( $basename . '.file-log.log' );
-
-  #cat is wasteful, but we expect no one reads large uncompressed files
-  my $echoProg = $self->getReadArgs($inPath) || "cat $inPath";
-
-  my $fp = $self->fileProcessors->{$type};
-
-  my $finalProgram;
-  if ( $fp->{no_stdin} ) {
-    $finalProgram = $fp->{program} . " " . $inPath;
-  }
-  else {
-    $finalProgram = $echoProg . " | " . 
$fp->{program}; - } - - my %finalPreprocessArgs; - if ( $fp->{args} ) { - my $args = $fp->{args}; - - my $potentialPreArgs = - $self->prepareBystroPreprocessorOutputsForMultiFile( $index, $total ); - - for my $type ( keys %{$potentialPreArgs} ) { - if ( index( $args, "\%$type\%" ) > -1 ) { - my $arg = $self->_workingDir->child( $potentialPreArgs->{$type} ); - substr( $args, index( $args, "\%$type\%" ), length("\%$type\%") ) = $arg; - - $finalPreprocessArgs{$type} = $arg; - } - } - - $finalProgram .= " $args"; - } - - return ( $finalProgram, $errPath, \%finalPreprocessArgs ); -} - -sub _openAnnotationPipe { - my ( $self, $type, $inPath, $index, $total ) = @_; - - my ( $finalProgram, $errPath, $preOutArgs ) = - $self->_preparePreprocessorProgram( $type, $inPath, $index, $total ); - - my $fh; - my $err = $self->safeOpen( $fh, '-|', "$finalProgram 2> $errPath" ); - - return ( $err, $fh, $preOutArgs ); -} - -sub _getFinalHeader { - my ( $self, $header ) = @_; - chomp $header; - - ######### Build the header, and write it as the first line ############# - my $finalHeader = Seq::Headers->new(); - - # Bystro takes data from a file pre-processor, which spits out a common - # intermediate format - # This format is very flexible, in fact Bystro doesn't care about the output - # of the pre-processor, provided that the following is found in the corresponding - # indices: - # idx 0: chromosome, - # idx 1: position - # idx 3: the reference (we rename this to discordant) - # idx 4: the alternate allele - # idx 5 on: variable: anything the preprocessor provides - my $numberSplitFields; - my @headerFields; - if ( $self->outputJson ) { - @headerFields = split( '\t', $header ); - $numberSplitFields = @headerFields; - } - else { - # Avoid unnecessary work splitting parts of the file we will not be extracting individual fields from - $numberSplitFields = 5 + 1; - @headerFields = split( '\t', $header, $numberSplitFields ); - } - - # We need to ensure that the ref field of the pre-processor is renamed - # so to not conflict with the ref field of the reference track - # because we store field names in a hash - $headerFields[3] = $self->inputRefField; - - # Our header class checks the name of each feature - # It may be, more than likely, that the pre-processor names the 4th column 'ref' - # We replace this column with trTv - # This not only now reflects its actual function - # but prevents name collision issues resulting in the wrong header idx - # being generated for the ref track - push @headerFields, $self->discordantField; - - # Prepend all of the headers created by the pre-processor - $finalHeader->addFeaturesToHeader( \@headerFields, undef, 1 ); - - return ( $finalHeader, $numberSplitFields ); -} - -sub _errorWithCleanup { - my ( $self, $msg ) = @_; - - $self->log( 'error', $msg ); - - $self->{_db}->cleanUp(); - - return $msg; -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Base.pm b/perl/lib/Seq/Base.pm deleted file mode 100644 index e30c6466a..000000000 --- a/perl/lib/Seq/Base.pm +++ /dev/null @@ -1,121 +0,0 @@ -use 5.10.0; -use strict; -use warnings; -our $VERSION = '0.002'; - -package Seq::Base; - -# ABSTRACT: Configures singleton log and sets database directory - -# Also exports db object, since we configure the database class anyway - -# VERSION - -# TODO: -# Rename database_dir to databaseDir - -use Mouse 2; -use namespace::autoclean; -use Seq::DBManager; -use Seq::Tracks; - -#exports new_with_config -with 'Seq::Role::ConfigFromFile', - #setLogLevel, setLogPath, setPublisher - 
'Seq::Role::Message';
-############# Required Arguments ###########
-
-has database_dir => ( is => 'ro', required => 1 );
-
-has tracks => ( is => 'ro', required => 1 );
-
-############ Public Exports ###################
-has readOnly => ( is => 'ro', default => 0 );
-
-has tracksObj => (
-  is       => 'ro',
-  init_arg => undef,
-  lazy     => 1,
-  default  => sub {
-    my $self = shift;
-
-    my %config = ( %{ $self->tracks }, ( gettersOnly => $self->readOnly ) );
-    return Seq::Tracks->new(%config);
-  }
-);
-
-############# Optional Arguments #############
-has publisher => ( is => 'ro' );
-
-has logPath => ( is => 'ro' );
-
-has verbose => ( is => 'ro' );
-
-has debug => ( is => 'ro', default => 0 );
-
-has readAhead => ( is => 'ro', default => 0 );
-
-sub BUILD {
-  my $self = shift;
-
-  # DBManager acts as a singleton. It is configured once, and then consumed repeatedly
-  # However, in long running processes, this can lead to misconfiguration issues
-  # and worse, environments created in one process, then copied to others during forking
-  # To combat this, every time Seq::Base is called, we re-set/initialize the static
-  # properties that create this behavior
-  # Initialize it before BUILD, to make this class less dependent on inheritance order
-  # Spend no time in an unconfigured state; readOnly needs to be applied immediately,
-  # because otherwise we could corrupt the database
-  # Inspiration: https://peter.bourgon.org/go-best-practices-2016/#repository-structure
-  Seq::DBManager::initialize(
-    {
-      databaseDir => $self->database_dir,
-      readOnly    => $self->readOnly,
-      readAhead   => $self->readAhead,
-    }
-  );
-
-  # Similarly Seq::Role::Message acts as a singleton
-  # Clear previous consumer's state, if in long-running process
-  Seq::Role::Message::initialize();
-
-  # Each track getter adds its own features to Seq::Headers, which is a singleton
-  # Since instantiating Seq::Tracks also instantiates getters at this point
-  # We must clear Seq::Headers here to ensure our tracks can properly do this
-  # TODO: Make Seq::Headers idempotent, such that one track cannot add its own
-  # headers multiple times
-  Seq::Headers::initialize();
-
-  # Not really needed for Seq::Tracks, but for clarity
-  Seq::Tracks::initialize();
-
-  # Seq::Role::Message settings
-  # We manually set the publisher, logPath, verbosity, and debug, because
-  # Seq::Role::Message is meant to be consumed globally, but configured once
-  # Treating publisher, logPath, verbose, debug as instance variables
-  # would result in having to configure this class in every consuming class
-  # TODO: move to static methods, to understand where the functions are defined
-  if ( defined $self->publisher ) {
-    $self->setPublisher( $self->publisher );
-  }
-
-  if ( defined $self->logPath ) {
-    $self->setLogPath( $self->logPath );
-  }
-
-  if ( defined $self->verbose ) {
-    $self->setVerbosity( $self->verbose );
-  }
-
-  #todo: finish; for now we have only one level
-  if ( $self->debug ) {
-    $self->setLogLevel('DEBUG');
-  }
-  else {
-    $self->setLogLevel('INFO');
-  }
-}
-
-__PACKAGE__->meta->make_immutable;
-
-1;
diff --git a/perl/lib/Seq/Build.pm b/perl/lib/Seq/Build.pm
deleted file mode 100644
index 8b2b7d025..000000000
--- a/perl/lib/Seq/Build.pm
+++ /dev/null
@@ -1,207 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package Seq::Build;
-our $VERSION = '0.001';
-
-# ABSTRACT: A class for building all files associated with a genome assembly
-# VERSION
-
-=head1 DESCRIPTION
-
-  @class Seq::Build
-  Iterates all of the builders present in the config file
-  And executes their 
buildTrack method - Also guarantees that the reference track will be built first - - @example - -=cut - -use Mouse 2; -use namespace::autoclean; -extends 'Seq::Base'; - -use Seq::Tracks; -use Seq::Tracks::Base::Types; -use Utils::Base; -use List::Util qw/first/; -use YAML::XS qw/LoadFile Dump/; -use Path::Tiny qw/path/; -use String::Strip qw/StripLTSpace/; - -use Time::localtime; - -has wantedType => ( is => 'ro', isa => 'Maybe[Str]', lazy => 1, default => undef ); - -#TODO: allow building just one track, identified by name -has wantedName => ( is => 'ro', isa => 'Maybe[Str]', lazy => 1, default => undef ); - -has meta_only => ( is => 'ro', default => 0 ); - -# The config file path, used to update the config file with build version date & author -has config => ( is => 'ro', required => 1 ); - -#Figures out what track type was asked for -#and then builds that track by calling the tracks -#"buildTrack" method -sub BUILD { - my $self = shift; - - # From Seq::Base; - my $tracks = $self->tracksObj; - - my $buildDate = Utils::Base::getDate(); - - #http://stackoverflow.com/questions/1378221/how-can-i-get-name-of-the-user-executing-my-perl-script - my $buildAuthor = $ENV{LOGNAME} || $ENV{USER} || getpwuid($<); - # Meta tracks are built during instantiation, so if we only want to build the - # meta data, we can return here safely. - if ( $self->meta_only ) { - return; - } - - my @builders; - my @allBuilders = $tracks->allTrackBuilders; - - if ( $self->wantedType ) { - my @types = split( /,/, $self->wantedType ); - - for my $type (@types) { - # modifies in place - StripLTSpace($type); - - my $buildersOfType = $tracks->getTrackBuildersByType($type); - - if ( !defined $buildersOfType ) { - $self->log( 'fatal', "Track type \"$type\" not recognized" ); - return; - } - - push @builders, @$buildersOfType; - } - } - elsif ( $self->wantedName ) { - my @names = split( /,/, $self->wantedName ); - - for my $name (@names) { - # modifies in place - StripLTSpace($name); - - my $builderOfName = $tracks->getTrackBuilderByName($name); - - if ( !defined $builderOfName ) { - $self->log( 'fatal', "Track name \"$name\" not recognized" ); - return; - } - - push @builders, $builderOfName; - } - } - else { - @builders = @allBuilders; - - #If we're building all tracks, reference should be first - if ( $builders[0]->name ne $tracks->getRefTrackBuilder()->name ) { - $self->log( 'fatal', "Reference track should be listed first" ); - } - } - - if ( $tracks->getRefTrackBuilder()->no_build ) { - $self->log( 'fatal', "Reference track is marked as no_build, but must be built" ); - } - - #TODO: return error codes from the rest of the buildTrack methods - my $decodedConfig = LoadFile( $self->config ); - - for my $builder (@builders) { - if ( $builder->no_build ) { - $self->log( 'info', "Marked as no_build, skipping: " . $builder->name . "\n" ); - next; - } - - $self->log( 'info', "Started building " . $builder->name . "\n" ); - - #TODO: implement errors for all tracks - #Currently we expect buildTrack to die if it didn't properly complete - $builder->buildTrack(); - - my $track = - first { $_->{name} eq $builder->name } @{ $decodedConfig->{tracks}{tracks} }; - - $track->{build_date} = $buildDate; - $track->{build_author} = $buildAuthor; - $track->{version} = $track->{version} ? ++$track->{version} : 1; - - $self->log( 'info', "Finished building " . $builder->name . "\n" ); - } - - $self->log( 'info', - "finished building all requested tracks: " - . join( ", ", map { $_->name } @builders ) - . 
"\n" ); - - $decodedConfig->{build_date} = $buildDate; - $decodedConfig->{build_author} = $buildAuthor; - $decodedConfig->{version} = - $decodedConfig->{version} ? ++$decodedConfig->{version} : 1; - - # If this is already a symlink, remove it - if ( -l $self->config ) { - unlink $self->config; - } - else { - my $backupPath = $self->config . ".build-bak.$buildDate"; - if ( - system( - "rm -f $backupPath; mv " - . $self->config . " " - . $self->config - . ".build-bak.$buildDate" - ) != 0 - ) - { - $self->log( 'fatal', "Failed to back up " . $self->config ); - } - } - - my $newConfigPath = $self->config . ".build.$buildDate"; - my $newConfigPathBase = path($newConfigPath)->basename; - - # Write a copy of the new config file to the database-containing folder - $newConfigPath = - path( $decodedConfig->{database_dir} )->child($newConfigPathBase)->stringify; - open( my $fh, '>', $newConfigPath ) - or $self->log( 'fatal', "Couldn't open $newConfigPath for writing" ); - - say $fh Dump($decodedConfig); - - close($fh); - - # Write a 2nd copy to the original path of the config file. - open( $fh, '>', $self->config ); - - say $fh Dump($decodedConfig); - - close($fh); - - # Create a clean copy, free of file paths, for github - $decodedConfig->{database_dir} = '~'; - $decodedConfig->{files_dir} = '~'; - $decodedConfig->{temp_dir} = '~'; - - $newConfigPathBase = path( $self->config )->basename; - $newConfigPathBase = - substr( $newConfigPathBase, 0, rindex( $newConfigPathBase, '.' ) ) . ".clean.yml"; - - $newConfigPath = path( $self->config )->parent->child($newConfigPathBase)->stringify; - - open( $fh, '>', $newConfigPath ); - - say $fh Dump($decodedConfig); -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/DBManager.pm b/perl/lib/Seq/DBManager.pm deleted file mode 100644 index 5242fb3c2..000000000 --- a/perl/lib/Seq/DBManager.pm +++ /dev/null @@ -1,1181 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::DBManager; - -our $VERSION = '0.001'; - -# ABSTRACT: Manages Database connection -# VERSION - -#TODO: Better errors; Seem to get bad perf if copy error after each db call -#TODO: Allow missing database only if in $dbReadOnly mode -#TODO: Better singleton handling - -use Mouse 2; -with 'Seq::Role::Message'; - -use Data::MessagePack; -use LMDB_File qw(:all); -use Types::Path::Tiny qw/AbsPath/; - -use Hash::Merge::Simple qw/ merge /; -use Path::Tiny; -use Scalar::Util qw/looks_like_number/; - -# We will maintain our own, internal database log for errors -use Cwd; -use Log::Fast; - -# Most common error is "MDB_NOTFOUND" which isn't nec. bad. 
-$LMDB_File::die_on_err = 0;
-
-######### Public Attributes
-# Flag for deleting tracks instead of inserting during patch* methods
-has delete => ( is => 'rw', isa => 'Bool', default => 0, lazy => 1 );
-
-has dryRun => ( is => 'rw', isa => 'Bool', default => 0, lazy => 1 );
-
-# DBManager maintains its own, internal log, so that in a multi-user environment
-# a DBA can keep track of key errors
-# TODO: Warning: cwd may fill up if left unchecked
-my $internalLog = Log::Fast->new(
-  {
-    path => path( getcwd() )->child('dbManager-error.log')->stringify,
-    pid  => $$,
-  }
-);
-
-# instanceConfig variable holding our databases; this way it can be used
-# in environments where the calling process never dies
-# {
-#   database_dir => {
-#     env => $somEnv, dbi => $someDbi
-#   }
-# }
-####################### Static Properties ############################
-my $instance;
-# instanceConfig contains
-# databaseDir =>
-# readOnly =>
-my %instanceConfig;
-# Each process should open each environment only once.
-# http://www.lmdb.tech/doc/starting.html
-my %envs;
-# We are enforcing a single transaction per environment for the moment, especially
-# in light of the apparent LMDB_File restriction to this effect
-my %cursors;
-
-# Can call as class method (DBManager->setDefaultDatabaseDir), or as an instance method
-# Prepares the class for consumption; should be run before the program can fork
-# To ensure that all old data is cleared, if executing from a long-running process
-sub initialize {
-  my $data = @_ == 2 ? $_[1] : $_[0];
-
-  if (%instanceConfig) {
-    $internalLog->WARN("dbManager already initialized; clearing state");
-  }
-
-  cleanUp();
-
-  undef $instance;
-  undef %instanceConfig;
-  undef %envs;
-  undef %cursors;
-
-  if ( !$data->{databaseDir} ) {
-    $internalLog->ERR("dbManager requires a databaseDir");
-    die;
-  }
-
-  $instanceConfig{databaseDir} = path( $data->{databaseDir} );
-  if ( !$instanceConfig{databaseDir}->exists ) {
-    $instanceConfig{databaseDir}->mkpath;
-  }
-
-  if ( $data->{readOnly} ) {
-    $instanceConfig{readOnly} = 1;
-  }
-
-  if ( $data->{readAhead} ) {
-    $instanceConfig{readAhead} = 1;
-  }
-
-  shift;
-  return __PACKAGE__->new(@_);
-}
-
-around 'new' => sub {
-  my $orig = shift;
-  my $self = shift;
-
-  return $instance //= $self->$orig(@_);
-};
-
-sub BUILD {
-  my $self = shift;
-
-  # TODO: think about better way to initialize this class w.r.t databaseDir
-  if ( !$instanceConfig{databaseDir} ) {
-    $self->_errorWithCleanup("DBManager requires databaseDir");
-  }
-
-  if ( !$instanceConfig{databaseDir}->is_dir ) {
-    $self->_errorWithCleanup('databaseDir not a directory');
-  }
-}
-
-# Our packing function
-#treat "1" as an integer, save more space
-#treat .00012 as a single precision float, saving 4 bytes. 
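-#
-# A small sketch (illustrative only) of what those packer flags buy:
-# with prefer_integer, the string "1" packs to a one-byte fixint instead of a
-# two-byte string, and with prefer_float32, a float packs into 5 bytes
-# instead of the 9-byte float64 default:
-sub _examplePackedSizes {
-  my $packer = Data::MessagePack->new()->prefer_integer()->prefer_float32();
-
-  return {
-    int_bytes   => length( $packer->pack("1") ),     # 1 byte (positive fixint)
-    float_bytes => length( $packer->pack(0.00012) ), # 5 bytes (float32 tag + payload)
-  };
-}
-
-# The module-level packer configured with those flags: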
-my $mp = Data::MessagePack->new()->prefer_integer()->prefer_float32(); - -################### DB Read, Write methods ############################ -# Unsafe for $_[2] ; will be modified if an array is passed -# Read transactions are committed by default -sub dbReadOne { - #my ($self, $chr, $posAref, $skipCommit, $stringKeys,) = @_; - #== $_[0], $_[1], $_[2], $_[3], $_[4] (don't assign to avoid copy) - - #It is possible not to find a database in $dbReadOnly mode (for ex: refSeq for a while didn't have chrM) - #http://ideone.com/uzpdZ8 - # #$name, $dontCreate, $stringKeys - my $db = $_[0]->_getDbi( $_[1], 1, $_[4] ) or return; - - if ( !$db->{db}->Alive ) { - $db->{db}->Txn = $db->{env}->BeginTxn(); - # not strictly necessary, but I am concerned about hard to trace abort bugs related to scope - $db->{db}->Txn->AutoCommit(1); - } - - $db->{db}->Txn->get( $db->{dbi}, $_[2], my $json ); - - # Commit unless the user specifically asks not to - #if(!$skipCommit) { - $db->{db}->Txn->commit() unless $_[3]; - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_NOTFOUND ) { - $_[0]->_errorWithCleanup("dbRead LMDB error $LMDB_File::last_err"); - return 255; - } - - $LMDB_File::last_err = 0; - } - - return defined $json ? $mp->unpack($json) : undef; -} - -# Unsafe for $_[2] ; will be modified if an array is passed -# Read transactions are committed by default -sub dbRead { - #my ($self, $chr, $posAref, $skipCommit, $stringKeys) = @_; - #== $_[0], $_[1], $_[2], $_[3], $_[4] (don't assign to avoid copy) - if ( !ref $_[2] ) { - goto &dbReadOne; - } - - #It is possible not to find a database in dbReadOnly mode (for ex: refSeq for a while didn't have chrM) - #http://ideone.com/uzpdZ8 - # #$name, $dontCreate, $stringKeys - my $db = $_[0]->_getDbi( $_[1], 0, $_[4] ) or return []; - my $dbi = $db->{dbi}; - - if ( !$db->{db}->Alive ) { - $db->{db}->Txn = $db->{env}->BeginTxn(); - # not strictly necessary, but I am concerned about hard to trace abort bugs related to scope - $db->{db}->Txn->AutoCommit(1); - } - - my $txn = $db->{db}->Txn; - - my $json; - - # Modifies $posAref ($_[2]) to avoid extra allocation - for my $pos ( @{ $_[2] } ) { - $txn->get( $dbi, $pos, $json ); - - $pos = defined $json ? 
$mp->unpack($json) : undef; - } - - # Commit unless the user specifically asks not to - #if(!$skipCommit) { - $txn->commit() unless $_[3]; - - #substantial to catch any errors - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_NOTFOUND ) { - $_[0]->_errorWithCleanup("dbRead LMDB error after loop: $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - } - - #will return a single value if we were passed one value - #return \@out; - return $_[2]; -} - -#Assumes that the posHref is -# { -# position => { -# feature_name => { -# ...everything that belongs to feature_name -# } -# } -# } - -# Method to write one key => value pair to the database, as a hash -# $pos can be any string, identifies a key within the kv database -# dataHref should be {someTrackName => someData} that belongs at $chr:$pos -# Currently only used for region tracks (currently only the Gene Track region track) -sub dbPatchHash { - my ( $self, $chr, $pos, $dataHref, $mergeFunc, $skipCommit, $overwrite, $stringKeys ) - = @_; - - if ( ref $dataHref ne 'HASH' ) { - $self->_errorWithCleanup("dbPatchHash requires a 1-element hash of a hash"); - return 255; - } - - # 0 argument means "create if not found" - # last argument means we want string keys rather than integer keys - my $db = $self->_getDbi( $chr, 0, $stringKeys ); - - if ( !$db ) { - $self->_errorWithCleanup( "Couldn't open $chr database. readOnly is " - . ( $instanceConfig{readOnly} ? "set" : "not set" ) ); - return 255; - } - - my $dbi = $db->{dbi}; - - if ( !$db->{db}->Alive ) { - $db->{db}->Txn = $db->{env}->BeginTxn(); - # not strictly necessary, but I am concerned about hard to trace abort bugs related to scope - $db->{db}->Txn->AutoCommit(1); - } - - #zero-copy read, don't modify $json - $db->{db}->Txn->get( $dbi, $pos, my $json ); - - if ( $LMDB_File::last_err && $LMDB_File::last_err != MDB_NOTFOUND ) { - $self->_errorWithCleanup("dbPatchHash LMDB error during get: $LMDB_File::last_err"); - return 255; - } - - $LMDB_File::last_err = 0; - - my $href; - my $skip; - if ($json) { - $href = $mp->unpack($json); - - my ( $trackKey, $trackValue ) = %{$dataHref}; - - if ( !defined $trackKey || ref $trackKey ) { - $self->_errorWithCleanup("dbPatchHash requires scalar trackKey"); - return 255; - } - - # Allows undefined trackValue - - if ( defined $href->{$trackKey} ) { - # Deletion and insertion are mutually exclusive - if ( $self->delete ) { - delete $href->{$trackKey}; - } - elsif ($overwrite) { - # Merge with righthand hash taking precedence, https://ideone.com/SBbfYV - # Will overwrite any keys of the same name (why it's called $overwrite) - $href = merge $href, $dataHref; - } - elsif ( defined $mergeFunc ) { - ( my $err, $href->{$trackKey} ) = - &$mergeFunc( $chr, $pos, $href->{$trackKey}, $trackValue ); - - if ($err) { - $self->_errorWithCleanup("dbPatchHash mergeFunc error: $err"); - return 255; - } - } - else { - # Nothing to do, value exists, we're not deleting, overwrite, or merging - $skip = 1; - } - } - elsif ( $self->delete ) { - # We want to delete a non-existant key, skip - $skip = 1; - } - else { - $href->{$trackKey} = $trackValue; - } - } - elsif ( $self->delete ) { - # If we want to delete, and no data, there's nothing to do, skip - $skip = 1; - } - - #insert href if we have that (only truthy if defined), or the data provided as arg - if ( !$skip ) { - if ( $self->dryRun ) { - $self->log( 'info', "DBManager dry run: would have dbPatchHash $chr\:$pos" ); - 
} - else { - $db->{db}->Txn->put( $db->{dbi}, $pos, $mp->pack( $href || $dataHref ) ); - } - } - - $db->{db}->Txn->commit() unless $skipCommit; - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_KEYEXIST ) { - $self->_errorWithCleanup("dbPut LMDB error: $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - } - - return 0; -} - -#Method to write a single position into the main databse -# Write transactions are by default committed -# Removed delete, overwrite capacities -sub dbPatch { - #my ($self, $chr, $trackIndex, $pos, $trackValue, $mergeFunc, $skipCommit, $stringKeys) = @_; - #. $_[0], $_[1] $_[2] $_[3] $_[4] $_[5] $_[6] $_[7] - - # 0 argument means "create if not found" - #my $db = $self->_getDbi($chr, 0, $stringKeys); - my $db = $_[0]->_getDbi( $_[1], 0, $_[7] ) or return 255; - - if ( !$db ) { - $_[0]->_errorWithCleanup( "Couldn't open $_[1] database. readOnly is " - . ( $instanceConfig{readOnly} ? "set" : "not set" ) ); - return 255; - } - - if ( !$db->{db}->Alive ) { - $db->{db}->Txn = $db->{env}->BeginTxn(); - # not strictly necessary, but I am concerned about hard to trace abort bugs related to scope - $db->{db}->Txn->AutoCommit(1); - } - - my $txn = $db->{db}->Txn; - - #zero-copy - #$db->{db}->Txn->get($db->{dbi}, $pos, my $json); - $txn->get( $db->{dbi}, $_[3], my $json ); - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_NOTFOUND ) { - #$self-> - $_[0]->_errorWithCleanup("dbPatch LMDB error $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - } - - my $aref = defined $json ? $mp->unpack($json) : []; - - #Undefined track values are allowed as universal-type "missing data" signal - #$aref->[$trackIndex] - if ( defined $aref->[ $_[2] ] ) { - #if($mergeFunc) { - if ( $_[5] ) { - #$aref->[$trackIndex]) = $mergeFunc->($chr, $pos, $aref->[$trackIndex], $trackValue); - ( my $err, $aref->[ $_[2] ] ) = $_[5]->( $_[1], $_[3], $aref->[ $_[2] ], $_[4] ); - - if ($err) { - #$self - $_[0]->_errorWithCleanup("mergeFunc error: $err"); - return 255; - } - - # Nothing to update - if ( !defined $aref->[ $_[2] ] ) { - return 0; - } - } - else { - # No overriding - return 0; - } - } - else { - #[$trackIndex] = $trackValue - $aref->[ $_[2] ] = $_[4]; - } - - #if($self->dryRun) { - if ( $_[0]->dryRun ) { - #$self-> - $_[0]->log( 'info', "DBManager dry run: would have dbPatch $_[1]\:$_[3]" ); - } - else { - #$txn->put($db->{dbi}, $pos, $mp->pack($aref)); - $txn->put( $db->{dbi}, $_[3], $mp->pack($aref) ); - } - - #if(!$skipCommit) { - $txn->commit() unless $_[6]; - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_KEYEXIST ) { - #$self-> - $_[0]->_errorWithCleanup("dbPatch put or commit LMDB error $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - } - - return 0; -} - -# Write transactions are by default committed -sub dbPut { - my ( $self, $chr, $pos, $data, $skipCommit, $stringKeys ) = @_; - - if ( $self->dryRun ) { - $self->log( 'info', "DBManager dry run: would have dbPut $chr:$pos" ); - return 0; - } - - if ( !( defined $chr && defined $pos ) ) { - $self->_errorWithCleanup("dbPut requires position"); - return 255; - } - - # 0 to create database if not found - my $db = $self->_getDbi( $chr, 0, $stringKeys ); - - if ( !$db ) { - $self->_errorWithCleanup( "Couldn't open $chr 
database. readOnly is " - . ( $instanceConfig{readOnly} ? "set" : "not set" ) ); - return 255; - } - - if ( !$db->{db}->Alive ) { - $db->{db}->Txn = $db->{env}->BeginTxn(); - # not strictly necessary, but I am concerned about hard to trace abort bugs related to scope - $db->{db}->Txn->AutoCommit(1); - } - - $db->{db}->Txn->put( $db->{dbi}, $pos, $mp->pack($data) ); - - $db->{db}->Txn->commit() unless $skipCommit; - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_KEYEXIST ) { - $self->_errorWithCleanup("dbPut LMDB error: $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - } - - return 0; -} - -sub dbDelete { - my ( $self, $chr, $pos, $stringKeys ) = @_; - - if ( $self->dryRun ) { - $self->log( 'info', "DBManager dry run: Would have dbDelete $chr\:$pos" ); - return 0; - } - - if ( !( defined $chr && defined $pos ) ) { - $self->_errorWithCleanup("dbDelete requires chr and position"); - return 255; - } - - my $db = $self->_getDbi( $chr, $stringKeys ); - - if ( !$db->{db}->Alive ) { - $db->{db}->Txn = $db->{env}->BeginTxn(); - # not strictly necessary, but I am concerned about hard to trace abort bugs related to scope - $db->{db}->Txn->AutoCommit(1); - } - - # Error with LMDB_File api, means $data is required as 3rd argument, - # even if it is undef - $db->{db}->Txn->del( $db->{dbi}, $pos, undef ); - - if ( $LMDB_File::last_err && $LMDB_File::last_err != MDB_NOTFOUND ) { - $self->_errorWithCleanup("dbDelete LMDB error: $LMDB_File::last_err"); - return 255; - } - - $LMDB_File::last_err = 0; - - $db->{db}->Txn->commit(); - - if ($LMDB_File::last_err) { - $self->_errorWithCleanup("dbDelete commit LMDB error: $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - return 0; -} - -#cursor version -# Read transactions are by default not committed -sub dbReadAll { - my ( $self, $chr, $skipCommit, $stringKeys ) = @_; - #== $_[0], $_[1], $_[2] - - #It is possible not to find a database in dbReadOnly mode (for ex: refSeq for a while didn't have chrM) - #http://ideone.com/uzpdZ8 - my $db = $self->_getDbi( $chr, 0, $stringKeys ) or return; - - if ( !$db->{db}->Alive ) { - $db->{db}->Txn = $db->{env}->BeginTxn(); - # not strictly necessary, but I am concerned about hard to trace abort bugs related to scope - $db->{db}->Txn->AutoCommit(1); - } - - # We store data in sequential, integer order - # in all but the meta tables, which don't use this function - # LMDB::Cursor::open($txn, $db->{dbi}, my $cursor); - my $cursor = $db->{db}->Cursor; - - my ( $key, $value, @out ); - my $first = 1; - while (1) { - if ($first) { - $cursor->_get( $key, $value, MDB_FIRST ); - $first = 0; - } - else { - $cursor->_get( $key, $value, MDB_NEXT ); - } - - #because this error is generated right after the get - #we want to capture it before the next iteration - #hence this is not inside while( ) - if ( $LMDB_File::last_err == MDB_NOTFOUND ) { - $LMDB_FILE::last_err = 0; - last; - } - - if ($LMDB_FILE::last_err) { - $_[0]->_errorWithCleanup("dbReadAll LMDB error $LMDB_FILE::last_err"); - return 255; - } - - push @out, $mp->unpack($value); - } - - # !$skipCommit - $db->{db}->Txn->commit() unless $skipCommit; - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_NOTFOUND ) { - $_[0]->_errorWithCleanup("dbReadAll LMDB error at end: $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy 
error reporting later - $LMDB_File::last_err = 0; - } - - return \@out; -} - -# Delete all values within a database; a necessity if we want to update a single track -# TODO: this may inflate database size, because very long-lived transaction -# maybe should allow to commit -sub dbDeleteAll { - my ( $self, $chr, $dbName, $stringKeys ) = @_; - - #It is possible not to find a database in dbReadOnly mode (for ex: refSeq for a while didn't have chrM) - #http://ideone.com/uzpdZ8 - my $db = $self->_getDbi( $chr, 0, $stringKeys ) or return; - - if ( !$db->{db}->Alive ) { - $db->{db}->Txn = $db->{env}->BeginTxn(); - # not strictly necessary, but I am concerned about hard to trace abort bugs related to scope - $db->{db}->Txn->AutoCommit(1); - } - - # We store data in sequential, integer order - # in all but the meta tables, which don't use this function - # LMDB::Cursor::open($txn, $db->{dbi}, my $cursor); - my $cursor = $db->{db}->Cursor; - - my ( $key, $value, @out ); - my $first = 1; - while (1) { - if ($first) { - $cursor->_get( $key, $value, MDB_FIRST ); - $first = 0; - } - else { - $cursor->_get( $key, $value, MDB_NEXT ); - } - - #because this error is generated right after the get - #we want to capture it before the next iteration - #hence this is not inside while( ) - if ( $LMDB_File::last_err == MDB_NOTFOUND ) { - $LMDB_FILE::last_err = 0; - last; - } - - if ($LMDB_FILE::last_err) { - $_[0]->_errorWithCleanup("dbReadAll LMDB error $LMDB_FILE::last_err"); - return 255; - } - - my $vals = $mp->unpack($value); - - if ( $vals->[$dbName] ) { - $vals->[$dbName] = undef; - - $cursor->_put( $key, $mp->pack($vals), MDB_CURRENT ); - } - } - - $db->{db}->Txn->commit(); - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_NOTFOUND ) { - $_[0]->_errorWithCleanup("dbReadAll LMDB error at end: $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - } - - return 0; -} - -################################################################################ -###### For performance reasons we may want to manage our own transactions ###### -######################## WARNING: *UNSAFE* ##################################### -sub dbStartCursorTxn { - my ( $self, $chr ) = @_; - - if ( $cursors{$chr} ) { - return $cursors{$chr}; - } - - #It is possible not to find a database in $dbReadOnly mode (for ex: refSeq for a while didn't have chrM) - #http://ideone.com/uzpdZ8 - - my $db = $self->_getDbi($chr); - - # TODO: Better error handling; since a cursor may be used to read or write - # in most cases a database not existing indicates we set readOnly or are need to return an error if the database doesn't exist - if ( !$db ) { - $self->_errorWithCleanup( - "Couldn't open $chr database because it doesn't exist. readOnly is " - . ( $instanceConfig{readOnly} ? 
"set" : "not set" ) ); - return 255; - } - - # TODO: Investigate why a subtransaction isn't successfully made - # when using BeginTxn() - # If we create a txn and assign it to DB->Txn, from $db->{env}, before creating a txn here - # upon trying to use the parent transaction, we will get a crash (-30782 / BAD_TXN) - # no such issue arises the other way around; i.e creating this transaction, then having - # a normal DB->Txn created as a nested transaction - if ( $db->{db}->Alive ) { - $self->_errorWithCleanup( - "DB alive when calling dbStartCursorTxn, LMDB_File allows only 1 txn per environment: commit DB->Txn before dbStartCursorTxn" - ); - return 255; - } - - # Will throw errors saying "should be nested transaction" unlike env->BeginTxn(); - # to protect against the above BAD_TXN issue - my $txn = LMDB::Txn->new( $db->{env}, $db->{tflags} ); - - $txn->AutoCommit(1); - - # This means LMDB_File will not track our cursor, must close/delete manually - LMDB::Cursor::open( $txn, $db->{dbi}, my $cursor ); - - # TODO: better error handling - if ( !$cursor ) { - $self->_errorWithCleanup("Couldn't open cursor for $_[1]"); - return 255; - } - - # Unsafe, private LMDB_File method access but Cursor::open does not track cursors - $LMDB::Txn::Txns{$$txn}{Cursors}{$$cursor} = 1; - - $cursors{$chr} = [ $txn, $cursor ]; - - # We store data in sequential, integer order - # in all but the meta tables, which don't use this function - # LMDB::Cursor::open($txn, $db->{dbi}, my $cursor); - return $cursors{$chr}; -} - -# Assumes user manages their own transactions -# Don't copy variables on the stack, since this may be called billions of times -sub dbReadOneCursorUnsafe { - #my ($self, $cursor, $pos) = @_; - #$_[0]. $_[1]. $_[2] - - #$cursor->[1]->_get($pos) - $_[1]->[1]->_get( $_[2], my $json, MDB_SET ); - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_NOTFOUND ) { - #$self->_errorWithCleanup - $_[0]->_errorWithCleanup("dbEndCursorTxn LMDB error: $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - } - - return defined $json ? $mp->unpack($json) : undef; -} - -# Don't copy variables on the stack, since this may be called billions of times -# Instead, modify the passed $posAref (arg 3) -sub dbReadCursorUnsafe { - #my ($self, $cursor, $posAref) = @_; - #$_[0]. $_[1]. $_[2] - - #foreach(@{$posAref}) - foreach ( @{ $_[2] } ) { - #$cursor->[1]->_get($_, my $json, MDB_SET); - $_[1]->[1]->_get( $_, my $json, MDB_SET ); - - $_ = defined $json ? $mp->unpack($json) : undef; - } - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_NOTFOUND ) { - #$self - $_[0]->_errorWithCleanup("dbEndCursorTxn LMDB error: $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - } - - #return $posAref; - return $_[2]; -} - -# When you need performance, especially for genome-wide insertions -# Be an adult, manage your own cursor -# LMDB tells you : if you commit the cursor is closed, needs to be renewed -# Don't copy variables on the stack, since this may be called billions of times -sub dbPatchCursorUnsafe { - #my ( $self, $cursor, $chr, $dbName, $pos, $newValue, $mergeFunc) = @_; - # $_[0]. $_[1]. $_[2]. $_[3]. $_[4] $_[5]. $_[6] - - #$cursor->[1]->_get($pos, my $json, MDB_SET); - $_[1]->[1]->_get( $_[4], my $json, MDB_SET ); - - my $existingValue = defined $json ? 
$mp->unpack($json) : []; - # [$dbName] - if ( defined $existingValue->[ $_[3] ] ) { - # ($mergeFunc) - if ( $_[6] ) - { #[$dbName]=$mergeFunc->($chr, $pos, $existingValue->[$dbName], $newValue); - ( my $err, $existingValue->[ $_[3] ] ) = - $_[6]->( $_[2], $_[4], $existingValue->[ $_[3] ], $_[5] ); - - if ($err) { - $_[0]->_errorWithCleanup("dbPatchCursor mergeFunc error: $err"); - return 255; - } - - # nothing to do; no value returned - if ( !defined $existingValue->[ $_[3] ] ) { - return 0; - } - } - else { - # No overwrite allowed by default - # just like dbPatch, but no overwrite option - # Overwrite is impossible when mergeFunc is defined - # TODO: remove overwrite from dbPatch - return 0; - } - } - else { - #$existingValue->[$dbName]= $newValue; - $existingValue->[ $_[3] ] = $_[5]; - } - - #_put as used here will not return errors if the cursor is inactive - # hence, "unsafe" - if ( defined $json ) { - #$cursor->[1]->_put($pos, $mp->pack($existingValue), MDB_CURRENT); - $_[1]->[1]->_put( $_[4], $mp->pack($existingValue), MDB_CURRENT ); - } - else { - #$cursor->[1]->_put($pos, $mp->pack($existingValue)); - $_[1]->[1]->_put( $_[4], $mp->pack($existingValue) ); - } - - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_NOTFOUND && $LMDB_File::last_err != MDB_KEYEXIST ) { - #$self->_errorWithCleanup... - $_[0]->_errorWithCleanup("dbEndCursorTxn LMDB error: $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - } - - return 0; -} - -# commit and close a self-managed cursor object -# TODO: Don't close cursor if not needed -sub dbEndCursorTxn { - my ( $self, $chr ) = @_; - - if ( !defined $cursors{$chr} ) { - return 0; - } - - $cursors{$chr}->[1]->close(); - - # closes a write cursor as well; the above $cursor->close() is to be explicit - # will not close a MDB_RDONLY cursor - $cursors{$chr}->[0]->commit(); - - delete $cursors{$chr}; - - # Allow two relatively innocuous errors, kill for anything else - if ($LMDB_File::last_err) { - if ( $LMDB_File::last_err != MDB_NOTFOUND && $LMDB_File::last_err != MDB_KEYEXIST ) { - $self->_errorWithCleanup("dbEndCursorTxn LMDB error: $LMDB_File::last_err"); - return 255; - } - - #reset the class error variable, to avoid crazy error reporting later - $LMDB_File::last_err = 0; - } - - return 0; -} - -################################################################################ - -sub dbGetNumberOfEntries { - my ( $self, $chr ) = @_; - - #get database, but don't create it if it doesn't exist - my $db = $self->_getDbi( $chr, 1 ); - - return $db ? $db->{env}->stat->{entries} : 0; -} - -#to store any records -#For instanceConfig, here we can store our feature name mappings, our type mappings -#whether or not a particular track has completed writing, etc -state $metaDbNamePart = '_meta'; - -#We allow people to update special "Meta" databases -#The difference here is that for each $databaseName, there is always -#only one meta database. Makes storing multiple meta documents in a single -#meta collection easy -#For example, users may want to store field name mappings, how many rows inserted -#whether building the database was a success, and more -sub dbReadMeta { - my ( $self, $databaseName, $metaKey, $skipCommit ) = @_; - - # pass 1 to use string keys for meta properties - return $self->dbReadOne( $databaseName . 
$metaDbNamePart, $metaKey, $skipCommit, 1 ); -} - -#@param $databaseName : whatever the user wishes to prefix the meta name with -#@param $metaKey : this is our "position" in the meta database -# a.k.a the top-level key in that meta database, what type of meta data this is -#@param $data : {someField => someValue} or a scalar value -sub dbPatchMeta { - my ( $self, $databaseName, $metaKey, $data ) = @_; - - my $dbName = $databaseName . $metaDbNamePart; - # If the user treats this metaKey as a scalar value, overwrite whatever was there - if ( !ref $data ) { - # undef : commit every transcation - # 1 : use string keys - $self->dbPut( $dbName, $metaKey, $data, undef, 1 ); - } - else { - # Pass 1 to merge $data with whatever was kept at this metaKey - # Pass 1 to use string keys for meta databases - $self->dbPatchHash( $dbName, $metaKey, $data, undef, undef, 1, 1 ); - } - - # Make sure that we update/sync the meta data asap, since this is critical - # to db integrity - $self->dbForceCommit($dbName); - return; -} - -sub dbDeleteMeta { - my ( $self, $databaseName, $metaKey ) = @_; - - #dbDelete returns nothing - # last argument means non-integer keys - $self->dbDelete( $databaseName . $metaDbNamePart, $metaKey, 1 ); - return; -} - -sub dbDropDatabase { - my ( $self, $chr, $remove, $stringKeys ) = @_; - - #dbDelete returns nothing - # 0 means don't create - # last argument means non-integer keys - my $db = $self->_getDbi( $chr, 0, $stringKeys ); - if ( !$db->{db}->Alive ) { - $db->{db}->Txn = $db->{env}->BeginTxn(); - # not strictly necessary, but I am concerned about hard to trace abort bugs related to scope - $db->{db}->Txn->AutoCommit(1); - } - - # if $remove is not truthy, database is emptied rather than dropped - $db->{db}->drop($remove); - - $instanceConfig{databaseDir}->child($chr)->remove_tree(); -} - -sub _getDbi { - # Exists and not defined, because in read only database we may discover - # that some chromosomes don't have any data (example: hg38 refSeq chrM) - if ( $envs{ $_[1] } ) { - return $envs{ $_[1] }; - } - - # $_[0] $_[1], $_[2] - # Don't create used by dbGetNumberOfEntries - my ( $self, $name, $dontCreate, $stringKeys ) = @_; - - my $dbPath = $instanceConfig{databaseDir}->child($name); - - # Create the database, only if that is what is intended - if ( !$dbPath->is_dir ) { - # If dbReadOnly flag set, this database will NEVER be created during the - # current execution cycle - if ( $instanceConfig{readOnly} ) { - return; - } - elsif ($dontCreate) { - # dontCreate does not imply the database will never be created, - # so we don't want to update $self->_envs - return; - } - else { - $dbPath->mkpath; - } - } - - $dbPath = $dbPath->stringify; - - my $flags = 0; - if ( $instanceConfig{readOnly} ) { - $flags = MDB_NOLOCK | MDB_RDONLY; - } - - if ( !$instanceConfig{readAhead} ) { - $flags = $flags | MDB_NORDAHEAD; - } - - my $env = LMDB::Env->new( - $dbPath, - { - mapsize => 128 * 1024 * 1024 * 1024, # Plenty space, don't worry - #maxdbs => 20, # Some databases - mode => 0600, - #can't just use ternary that outputs 0 if not read only... 
-      #MDB_RDONLY can also be set per-transaction; it's just not mentioned
-      #in the docs
-      flags => $flags,
-      maxdbs =>
-        0, # Use the single unnamed database; else we get a MDB_DBS_FULL error (max db limit reached)
-      maxreaders => 128,
-    }
-  );
-
-  if ( !$env ) {
-    $self->_errorWithCleanup(
-      "Failed to create environment $name for $instanceConfig{databaseDir} because of $LMDB_File::last_err"
-    );
-    return;
-  }
-
-  my $txn = $env->BeginTxn();
-
-  my $dbFlags;
-
-  # Much faster random, somewhat faster sequential performance
-  # Much smaller database size (4 byte keys, vs 6-10 byte keys)
-  if ( !$stringKeys ) {
-    $dbFlags = MDB_INTEGERKEY;
-  }
-
-  my $DB = $txn->OpenDB( undef, $dbFlags );
-
-  # ReadMode 1 gives memory pointer for perf reasons, not safe
-  $DB->ReadMode(1);
-
-  if ($LMDB_File::last_err) {
-    $self->_errorWithCleanup(
-      "Failed to open database $name for $instanceConfig{databaseDir} because of $LMDB_File::last_err"
-    );
-    return;
-  }
-
-  # Now db is open
-  my $err = $txn->commit();
-
-  if ($err) {
-    $self->_errorWithCleanup("Failed to commit open db tx because: $err");
-    return;
-  }
-
-  # Goal is to allow database open commit to happen with fsync on
-  # And afterwards disable fsync
-  $env->set_flags( MDB_NOSYNC, 1 );
-
-  $envs{$name} = { env => $env, dbi => $DB->dbi, db => $DB, tflags => $flags };
-
-  return $envs{$name};
-}
-
-sub dbForceCommit {
-  my ( $self, $envName, $noSync ) = @_;
-
-  if ( defined $envs{$envName} ) {
-    if ( $envs{$envName}{db}->Alive ) {
-      $envs{$envName}{db}->Txn->commit();
-    }
-
-    # Sync in case MDB_NOSYNC, MDB_MAPASYNC, or MDB_NOMETASYNC were enabled
-    # I assume that if the user is forcing commit, they also want the state of the
-    # db updated
-    # sync(1) flag needed to ensure that disk buffer is flushed with MDB_NOSYNC, MAPASYNC
-    $envs{$envName}{env}->sync(1) unless $noSync;
-  }
-  else {
-    $self->_errorWithCleanup('dbManager expects existing environment in dbForceCommit');
-  }
-}
-
-# This can be called without instantiating Seq::DBManager, either as :: or -> class method
-# @param $self (optional)
-# @param $envName (optional) : the name of a specific environment
-sub cleanUp {
-  if ( !%envs && !%cursors ) {
-    return 0;
-  }
-
-  if ( !%envs && %cursors ) {
-    _fatalError('dbManager expects no cursors if no environments opened');
-
-    return 255;
-  }
-
-  # We track the unsafe stuff, just as a precaution
-  foreach ( keys %cursors ) {
-    # Check defined because database may be empty (and will be stored as undef)
-    if ( defined $cursors{$_} ) {
-
-      $cursors{$_}[1]->close();
-      $cursors{$_}[0]->commit();
-
-      delete $cursors{$_};
-
-      if ( $LMDB_File::last_err
-        && $LMDB_File::last_err != MDB_NOTFOUND
-        && $LMDB_File::last_err != MDB_KEYEXIST )
-      {
-        _fatalError("dbCleanUp LMDB error: $LMDB_File::last_err");
-
-        return 255;
-      }
-    }
-  }
-
-  foreach ( keys %envs ) {
-    # Check defined because database may be empty (and will be stored as undef)
-    if ( defined $envs{$_} ) {
-      if ( defined $envs{$_}{db} && $envs{$_}{db}->Alive ) {
-        $envs{$_}{db}->Txn->commit();
-      }
-
-      if ( defined $envs{$_}{env} ) {
-        # Sync in case MDB_NOSYNC, MDB_MAPASYNC, or MDB_NOMETASYNC were enabled
-        # sync(1) flag needed to ensure that disk buffer is flushed with MDB_NOSYNC, MAPASYNC
-        $envs{$_}{env}->sync(1);
-        $envs{$_}{env}->Clean();
-      }
-
-      delete $envs{$_};
-
-      if ( $LMDB_File::last_err
-        && $LMDB_File::last_err != MDB_NOTFOUND
-        && $LMDB_File::last_err != MDB_KEYEXIST )
-      {
-        _fatalError("dbCleanUp LMDB error: $LMDB_File::last_err");
-
-        return 255;
-      }
-    }
-  }
-
-  return 0;
-}
-
-# Like DESTROY, but 
Moosier
-sub DEMOLISH {
-  my $self = shift;
-  $self->cleanUp();
-}
-
-# For now, we'll throw the error, until program is changed to expect error/success
-# status from functions
-sub _errorWithCleanup {
-  my $msg = @_ == 2 ? $_[1] : $_[0];
-
-  cleanUp();
-
-  _fatalError($msg);
-}
-
-sub _fatalError {
-  my $msg = @_ == 2 ? $_[1] : $_[0];
-
-  $internalLog->ERR($msg);
-
-  # Reset error message, not sure if this is the best way
-  $LMDB_File::last_err = 0;
-
-  __PACKAGE__->log( 'fatal', $msg );
-  die $msg;
-}
-
-1;
diff --git a/perl/lib/Seq/Definition.pm b/perl/lib/Seq/Definition.pm
deleted file mode 100644
index a49708243..000000000
--- a/perl/lib/Seq/Definition.pm
+++ /dev/null
@@ -1,306 +0,0 @@
-# TODO //2024-03-24 @akotlar: Split out pre-processor outputs into separate attribute
-use 5.10.0;
-use strict;
-use warnings;
-
-package Seq::Definition;
-use Mouse::Role 2;
-use Path::Tiny;
-
-use Types::Path::Tiny qw/AbsPath AbsFile AbsDir/;
-use List::MoreUtils qw/first_index/;
-use Mouse::Util::TypeConstraints;
-use Sys::CpuAffinity;
-
-# TODO: Explore portability on windows-based systems w/File::Copy
-
-use Seq::Statistics;
-
-with 'Seq::Role::IO';
-with 'Seq::Output::Fields';
-# Note: All init_arg undef methods must be lazy if they rely on arguments that are
-# not init_arg => undef, and do not have defaults (aka are required)
-######################## Required ##############################
-
-# output_file_base contains the absolute path to a file base name
-# Ex: /dir/child/BaseName ; BaseName is appended with .annotated.tsv , .annotated-log.txt, etc
-# for the various outputs
-has output_file_base => (
-  is       => 'ro',
-  isa      => AbsPath,
-  coerce   => 1,
-  required => 1,
-  handles  => { outDir => 'parent', outBaseName => 'basename' }
-);
-
-############################### Optional #####################################
-# String, allowing us to ignore it if not truthy
-# Acceptable values include ~ in YAML (undef/null)
-has temp_dir => ( is => 'ro', isa => 'Maybe[Str]' );
-
-# Do we want to compress? 
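-# For example (a sketch mirroring the naming logic in outputFilesInfo below,
-# with a hypothetical base name):
-#
-#   my $ext  = $outputJson ? 'json' : 'tsv';
-#   my $name = $outBaseName
-#     . ".annotation.$ext"
-#     . ( $compress ? "." . $compressType : "" );
-#   # e.g. BaseName.annotation.tsv.gz with the defaults declared below
-#
-# The flags themselves: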
-has compress => ( is => 'ro', isa => 'Str', default => 1 ); - -has compressType => ( is => 'ro', isa => enum( [qw/lz4 gz bgz/] ), default => 'gz' ); - -# Do we want to tarball our results -has archive => ( is => 'ro', isa => 'Bool', default => 0 ); - -# The statistics configuration options, usually defined in a YAML config file -has statistics => ( is => 'ro', isa => 'HashRef' ); - -# Users may not need statistics -has run_statistics => - ( is => 'ro', isa => 'Bool', default => sub { !!$_[0]->statistics } ); - -has maxThreads => ( - is => 'ro', - isa => 'Int', - lazy => 1, - default => sub { - return Sys::CpuAffinity::getNumCpus(); - } -); - -has outputJson => ( is => 'ro', isa => 'Bool', default => 0 ); - -# The path to the annotation configuration file -has config => ( is => 'ro', isa => 'Str', required => 1 ); - -# has badSamplesField => (is => 'ro', default => 'badSamples', lazy => 1); - -################ Public Exports ################## -#@ params -# filePaths @params: -# compressed : the name of the compressed folder holding annotation, stats, etc (only if $self->compress) -# converted : the name of the converted folder -# annotation : the name of the annotation file -# log : the name of the log file -# stats : the { statType => statFileName } object -# Allows us to use all to extract just the file we're interested in from the compressed tarball -has outputFilesInfo => ( - is => 'ro', - isa => 'HashRef', - init_arg => undef, - lazy => 1, - default => sub { - my $self = shift; - - my %out; - - $out{log} = path( $self->logPath )->basename; - - # Must be lazy in order to allow "revealing module pattern", with output_file_base below - my $outBaseName = $self->outBaseName; - - my $extension = $self->outputJson ? 'json' : 'tsv'; - - $out{annotation} = - $outBaseName - . ".annotation.$extension" - . ( $self->compress ? "." . $self->compressType : "" ); - $out{header} = $outBaseName . ".annotation.header.json"; - - # sampleList and dosageMatrixOutPath are available to be used by the preprocessor - $out{sampleList} = $outBaseName . '.sample_list'; - $out{dosageMatrixOutPath} = $outBaseName . '.dosage.feather'; - - # Must be lazy in order to allow "revealing module pattern", with _statisticsRunner below - if ( $self->run_statistics ) { - $out{statistics} = { - json => path( $self->_statisticsRunner->jsonFilePath )->basename, - tab => path( $self->_statisticsRunner->tabFilePath )->basename, - qc => path( $self->_statisticsRunner->qcFilePath )->basename, - }; - } - - if ( $self->archive ) { - # Seq::Role::IO method - # Only compress the tarball if we're not compressing the inner file - # because this wastes a lot of time, since the compressed inner annotation, - # which dominates 99% of the archive, cannot be compressed further - $out{archived} = $self->makeTarballName( $outBaseName, !$self->compress ); - } - - $out{config} = path( $self->config )->basename; - - return \%out; - } -); - -############################ Private ################################### -# Must be lazy... Mouse doesn't seem to respect attribute definition order at all times -# Leading to situations where $self->outDir doesn't exist by the time _workingDir -# is created.
This can lead to the contents of the current working directory being accidentally compressed -into $self->outputFilesInfo->{archived} -has _workingDir => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { - my $self = shift; - - $self->outDir->mkpath; - - if ( $self->temp_dir ) { - my $dir = path( $self->temp_dir ); - $dir->mkpath; - - my $tmp = Path::Tiny->tempdir( DIR => $dir, CLEANUP => 1 ); - - return $tmp; - } - - return $self->outDir; - } -); - -### Override logPath to use the working directory / output_file_base basename ### -has logPath => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { - my $self = shift; - return $self->_workingDir->child( $self->outBaseName . '.annotation.log.txt' ) - ->stringify(); - } -); - -# Must be lazy because needs run_statistics and statistics -has _statisticsRunner => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { - my $self = shift; - - my $basePath = $self->_workingDir->child( $self->outBaseName )->stringify; - # Assumes that if run_statistics is specified, $self->statistics exists - if ( $self->run_statistics ) { - my %args = ( - altField => $self->altField, - homozygotesField => $self->homozygotesField, - heterozygotesField => $self->heterozygotesField, - outputBasePath => $basePath, - ); - - %args = ( %args, %{ $self->statistics } ); - - return Seq::Statistics->new( \%args ); - } - - return undef; - } -); - -sub prepareBystroPreprocessorOutputsForMultiFile { - # If we have multiple files, we need to update the outputs that the pre-processor - # generates, so that they are unique to each preprocessor instance - my $self = shift; - - # the index of the current input file - my $index = shift; - - # the total number of expected input files - my $totalCount = shift; - - # Update sampleList and dosageMatrixOutPath based on the index - # But if the index is 0 and totalCount is 1, we don't need to update the file names - my %preprocessorOutputs; - if ( !( $index == 0 && $totalCount == 1 ) ) { - $preprocessorOutputs{sampleList} = $self->outputFilesInfo->{sampleList} . ".$index"; - $preprocessorOutputs{dosageMatrixOutPath} = - $self->outputFilesInfo->{dosageMatrixOutPath} . 
".$index"; - } - else { - $preprocessorOutputs{sampleList} = $self->outputFilesInfo->{sampleList}; - $preprocessorOutputs{dosageMatrixOutPath} = - $self->outputFilesInfo->{dosageMatrixOutPath}; - } - - return \%preprocessorOutputs; -} - -# TODO: This assumes that if workingDir != outDir, the working dir is only for annotation files -# TODO: Properly set write permissions -sub _moveFilesToOutputDir { - my $self = shift; - - my $err; - my $outDir = $self->outDir->stringify(); - - if ( $self->archive ) { - my $an = $self->outputFilesInfo->{annotation}; - my @files = grep { $_ !~ $an } glob( $self->_workingDir->child('*')->stringify ); - - for my $file (@files) { - $err = $self->safeSystem("cp $file $outDir"); - - if ($err) { - return $err; - } - } - - # Without the 2 sync operations, support files go missing - # This simply executes each operation in turn, or receivs an error early - $err = $self->safeSystem("sync") - || $self->compressDirIntoTarball( $self->_workingDir, - $self->outputFilesInfo->{archived} ) - || $self->safeSystem("sync"); - - if ($err) { - return $err; - } - } - - if ( $self->outDir eq $self->_workingDir ) { - $self->log( 'debug', "Nothing to move, workingDir equals outDir" ); - return; - } - - $self->log( 'info', "Moving output files to output directory" ); - - my $workDir = $self->_workingDir->stringify(); - - $err = $self->safeSystem("mv $workDir/* $outDir && chmod a+r $outDir/*; sync"); - - return $err; -} - -# Replaces periods with _ -# Database like Mongodb don't like periods -# Modifies the array -sub _normalizeSampleNames { - my ( $self, $inputHeader, $sampleIndicesAref ) = @_; - - for my $idx (@$sampleIndicesAref) { - $inputHeader->[$idx] =~ tr/./_/; - } - - return $inputHeader; -} - -# If we need another way of instantiating workingDir that is less error-prone -# (because of the extreme dependence on laziness) -around 'BUILD' => sub { - my $orig = shift; - my $self = shift; - - # Ensure that the output directory exists - $self->outDir->mkpath(); - - $self->$orig(@_); - - if ( $self->archive && !$self->temp_dir ) { - $self->log( 'fatal', "If you wish to 'archive', must specify 'temp_dir'" ); - } -}; - -no Mouse::Role; -1; diff --git a/perl/lib/Seq/Headers.pm b/perl/lib/Seq/Headers.pm deleted file mode 100644 index 617dd1a39..000000000 --- a/perl/lib/Seq/Headers.pm +++ /dev/null @@ -1,250 +0,0 @@ -package Seq::Headers; -use Mouse 2; - -# # Abstract: Responsible for building the header object and string -use 5.10.0; -use strict; -use warnings; -use namespace::autoclean; -use List::Util qw/first/; - -with 'Seq::Role::Message'; -#stored as array ref to preserve order -# [ { $parent => [ $child1, $child2 ] }, $feature2, $feature3, etc ] -state $orderedHeaderFeaturesAref = []; -# { $parent => [ $child1, $child2 ] } -state $parentChild = {}; - -# [ [ $child1, $child2 ], $feature2, $feature3, etc ] -state $orderedHeaderCache = []; -state $strHeaderCache = ''; -# { childFeature1 => idx, childFeature2 => idx; -state $orderMapCache = {}; -# { $parent => { $child1 => idxChild1, $child2 => idxChild2 }} -my %parentChildHash; - -# All singleton tracks have an initialize method, which clears -sub initialize { - _clearCache(); - - $orderedHeaderFeaturesAref = []; - $parentChild = {}; - - return; -} - -sub _clearCache { - # These get initialize/cleared every time feature added - # They simply track different views of - $orderedHeaderCache = []; - $orderMapCache = {}; - undef %parentChildHash; - $strHeaderCache = ''; - - return; -} - -sub get { - return $orderedHeaderFeaturesAref; -} 
- -sub getParentFeatures { - my ( $self, $parentName ) = @_; - return $parentChild->{$parentName}; -} - -sub getFeatureIdx { - my ( $self, $parentName, $childName ) = @_; - - if ( !%parentChildHash ) { - my $i = -1; - for my $entry ( values @$orderedHeaderFeaturesAref ) { - $i++; - - if ( ref $entry ) { - # One key only, the parent name (trackName) - my ($trackName) = keys %{$entry}; - - my %children; - my $y = -1; - for my $childName ( @{ $entry->{$trackName} } ) { - $y++; - $children{$childName} = $y; - } - - $parentChildHash{$trackName} = \%children; - next; - } - - $parentChildHash{'_masterBystro_'} //= {}; - $parentChildHash{'_masterBystro_'}{$entry} = $i; - } - } - - $parentName ||= '_masterBystro_'; - return $parentChildHash{$parentName}{$childName}; -} - -sub getOrderedHeader() { - if (@$orderedHeaderCache) { - return $orderedHeaderCache; - } - - for my $i ( 0 .. $#$orderedHeaderFeaturesAref ) { - if ( ref $orderedHeaderFeaturesAref->[$i] ) { - my $trackName = ( keys %{ $orderedHeaderFeaturesAref->[$i] } )[0]; - - $orderedHeaderCache->[$i] = $orderedHeaderFeaturesAref->[$i]{$trackName}; - } - else { - $orderedHeaderCache->[$i] = $orderedHeaderFeaturesAref->[$i]; - } - } - - return $orderedHeaderCache; -} - -# Retrieves child feature -sub getParentIndices() { - if (%$orderMapCache) { - return $orderMapCache; - } - - for my $i ( 0 .. $#$orderedHeaderFeaturesAref ) { - if ( ref $orderedHeaderFeaturesAref->[$i] ) { - $orderMapCache->{ ( keys %{ $orderedHeaderFeaturesAref->[$i] } )[0] } = $i; - } - else { - $orderMapCache->{ $orderedHeaderFeaturesAref->[$i] } = $i; - } - } - - return $orderMapCache; -} - -sub getString { - my $self = shift; - - if ($strHeaderCache) { - return $strHeaderCache; - } - - my @out; - for my $feature (@$orderedHeaderFeaturesAref) { - #this is a parentName => [$feature1, $feature2, $feature3] entry - if ( ref $feature ) { - my ($parentName) = %$feature; - foreach ( @{ $feature->{$parentName} } ) { - push @out, "$parentName.$_"; - } - next; - } - push @out, $feature; - } - - $strHeaderCache = join( "\t", @out ); - return $strHeaderCache; -} - -#######################addFeaturesToHeader####################### -# Description: Add a single feature to the header -# @param $child: A feature name (required) -# @param $parent: A parent name that the $child belongs to (optional) -# @param $prepend: Whether or not to add the $child to the beginning of -# the features array, or to the beginning of the $parent feature array if !!$parent -sub addFeaturesToHeader { - my ( $self, $child, $parent, $prepend ) = @_; - - _clearCache(); - - if ( ref $child eq 'ARRAY' ) { - goto &_addFeaturesToHeaderBulk; - } - - if ($parent) { - my $parentFound = 0; - - for my $headerEntry (@$orderedHeaderFeaturesAref) { - if ( !ref $headerEntry ) { - if ( $parent eq $headerEntry ) { - $self->log( - 'warning', "$parent equals $headerEntry, which has no - child features, which was not what we expected" - ); - } - next; - } - - my ( $key, $valuesAref ) = %$headerEntry; - - if ( $key eq $parent ) { - # If we have already added this feature, exit the function - if ( defined( first { $_ eq $child } @$valuesAref ) ) { - return; - } - - if ($prepend) { - unshift @$valuesAref, $child; - } - else { - push @$valuesAref, $child; - } - - $parentChild->{$parent} = $valuesAref; - - return; - } - } - - # No parent found, no need to check if feature has previously been added - my $val = { $parent => [$child] }; - - if ($prepend) { - unshift @$orderedHeaderFeaturesAref, $val; - } - else { - push 
@$orderedHeaderFeaturesAref, $val; - } - - $parentChild->{$parent} = [$child]; - - return; - } - - ######## No parent provided; we expect that the child is the only ########## - ####### value stored, rather than a parentName => [value1, value2] ########## - - # If the value was previously added, exit function; - if ( defined( first { $_ eq $child } @$orderedHeaderFeaturesAref ) ) { - return; - } - - if ($prepend) { - unshift @$orderedHeaderFeaturesAref, $child; - } - else { - push @$orderedHeaderFeaturesAref, $child; - } - - return; -} - -sub _addFeaturesToHeaderBulk { - my ( $self, $childrenAref, $parent, $prepend ) = @_; - - if ( !ref $childrenAref ) { - goto &addFeaturesToHeader; - } - - my @array = $prepend ? reverse @$childrenAref : @$childrenAref; - - for my $child (@array) { - $self->addFeaturesToHeader( $child, $parent, $prepend ); - } - - return; -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/InputFile.pm b/perl/lib/Seq/InputFile.pm deleted file mode 100644 index da79f608d..000000000 --- a/perl/lib/Seq/InputFile.pm +++ /dev/null @@ -1,179 +0,0 @@ -package Seq::InputFile; - -our $VERSION = '0.001'; - -# ABSTRACT: Checks validity of input file, and knows about input file header -# VERSION - -use 5.10.0; -use strict; -use warnings; - -use Mouse 2; - -use Types::Path::Tiny qw/AbsPath/; -use Mouse::Util::TypeConstraints; -use File::Which qw(which); -use File::Basename; -use List::MoreUtils qw(firstidx); -use List::Util qw( max ); - -use namespace::autoclean; - -with 'Seq::Role::Message'; - -# the minimum required snp headers that we actually have -# we use singleton pattern because we expect to annotate only one file -# per run -# order matters, we expect the first N fields to be what is defined here - -# TODO : Simplify this; just look for any order of headers in the first 5-6 columns -# state $requiredInputHeaderFields = { -# snp_1 => [qw/ Fragment Position Reference Minor_allele Type /], -# snp_2 => [qw/ Fragment Position Reference Alleles Allele_Counts Type/], -# snp_3 => [qw/ Fragment Position Reference Type Alleles Allele_Counts/] -# }; - -state $requiredInputHeaderFields = { - chrField => qr/Fragment$|Chromosome[s]{0,1}$|Chrom$|Chr$/i, - positionField => qr/Position$|Pos$/i, - referenceField => qr/Reference$|Ref$/i, - #will match Minor_alleles as well (just looks for prefix) - alleleField => qr/Alt$|Alternate$|Allele[s]{0,1}$|Minor_allele[s]{0,1}$/i, -}; - -state $optionalInputHeaderFields = { - alleleCountField => qr/Allele_Counts/i, - typeField => qr/Type/i, -}; - -# @ public only the common fields exposed -has chrFieldName => ( is => 'ro', init_arg => undef ); - -has positionFieldName => ( is => 'ro', init_arg => undef ); - -has referenceFieldName => ( is => 'ro', init_arg => undef ); - -has typeFieldName => ( is => 'ro', init_arg => undef ); - -has alleleFieldName => ( is => 'ro', init_arg => undef ); - -has alleleCountFieldName => ( is => 'ro', init_arg => undef ); - -has chrFieldIdx => ( is => 'ro', init_arg => undef ); - -has positionFieldIdx => ( is => 'ro', init_arg => undef ); - -has referenceFieldIdx => ( is => 'ro', init_arg => undef ); - -has alleleFieldIdx => ( is => 'ro', init_arg => undef ); - -has typeFieldIdx => ( is => 'ro', init_arg => undef ); - -has alleleCountFieldIdx => ( is => 'ro', init_arg => undef ); - -# The last field containing snp data; 5th or 6th -# Set in checkInputFileHeader -has lastSnpFileFieldIdx => - ( is => 'ro', init_arg => undef, writer => '_setLastSnpFileFieldIdx' ); - -# Returns genotype indices -# Assumes all 
other fields occur in a contiguous section before the start -# of the sample/genotype block section -# @ return -sub getSampleNamesGenos { - my ( $self, $fAref ) = @_; - my $strt = $self->lastSnpFileFieldIdx + 1; - - # every other field column name is blank, holds genotype probability - # for preceding column's sample; - # don't just check for ne '', to avoid simple header issues - my @genosIdx; - - # We expect that if this is a .snp file containing genotypes it will have - # a genotype call and then a confidence - # If there isn't at least one field past the $strt, we don't have genotypes - if ( $#$fAref < $strt ) { - return undef; - } - - for ( my $i = $strt; $i <= $#$fAref; $i += 2 ) { - push @genosIdx, $i; - } - - return \@genosIdx; -} - -#uses the input file headers to figure out what the file type is -# @return $err : the error, if any -sub checkInputFileHeader { - my ( $self, $inputFieldsAref, $dontDieOnUnkown ) = @_; - - if ( !defined $inputFieldsAref ) { - return "No tab-separated header fields found"; - } - - my $totalHeaderKeys = scalar( keys %{$requiredInputHeaderFields} ) - + scalar( keys %{$optionalInputHeaderFields} ); - - my @firstFields = @$inputFieldsAref[ 0 .. $totalHeaderKeys - 1 ]; - - if ( $firstFields[0] !~ $requiredInputHeaderFields->{chrField} - || $firstFields[1] !~ $requiredInputHeaderFields->{positionField} - || $firstFields[2] !~ $requiredInputHeaderFields->{referenceField} ) - { - return "First three fields must be chrom, pos, ref"; - } - - my $notFound; - my $found; - my @indicesFound; - - REQ_LOOP: for my $fieldType ( keys %$requiredInputHeaderFields ) { - $found = 0; - for ( my $i = 0; $i < @firstFields; $i++ ) { - if ( defined $firstFields[$i] - && $firstFields[$i] =~ $requiredInputHeaderFields->{$fieldType} ) - { - $self->{ $fieldType . "Name" } = $firstFields[$i]; - $self->{ $fieldType . "Idx" } = $i; - - push @indicesFound, $i; - $found = 1; - } - } - - if ( !$found ) { - $notFound = $fieldType; - last; - } - } - - OPTIONAL: for my $fieldType ( keys %$optionalInputHeaderFields ) { - for ( my $i = 0; $i < @firstFields; $i++ ) { - if ( defined $firstFields[$i] - && $firstFields[$i] =~ $optionalInputHeaderFields->{$fieldType} ) - { - $self->{ $fieldType . "Name" } = $firstFields[$i]; - $self->{ $fieldType . 
"Idx" } = $i; - - push @indicesFound, $i; - } - } - } - - my $lastSnpFileFieldIdx = max(@indicesFound); - - $self->_setLastSnpFileFieldIdx($lastSnpFileFieldIdx); - - # $self->_setFirstSampleIdx($lastSnpFileFieldIdx + 1); - - if ($notFound) { - return "Couldn't find required field $notFound"; - } - - return; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Output.pm b/perl/lib/Seq/Output.pm deleted file mode 100644 index 6b2b00a63..000000000 --- a/perl/lib/Seq/Output.pm +++ /dev/null @@ -1,255 +0,0 @@ -package Seq::Output; -use 5.10.0; -use strict; -use warnings; - -use Mouse 2; - -use List::Util qw/min max/; - -use Seq::Output::Delimiters; -use Seq::Headers; - -with 'Seq::Role::Message'; - -has header => ( is => 'ro', isa => 'Seq::Headers', required => 1 ); -has trackOutIndices => ( is => 'ro', isa => 'ArrayRef', required => 1 ); - -has delimiters => ( - is => 'ro', - isa => 'Seq::Output::Delimiters', - default => sub { - return Seq::Output::Delimiters->new(); - } -); - -has refTrackName => ( - is => 'ro', - isa => 'Str', -); - -sub BUILD { - my $self = shift; - - my @trackOutIndices = @{ $self->trackOutIndices }; - my $minIdx = min(@trackOutIndices); - my $maxIdx = max(@trackOutIndices); - - # Cache delimiters to avoid method calls in hot loop - $self->{_emptyFieldChar} = $self->delimiters->emptyFieldChar; - $self->{_overlapDelim} = $self->delimiters->overlapDelimiter; - $self->{_valDelim} = $self->delimiters->valueDelimiter; - - $self->{_trackOutIndices} = []; - $self->{_trackFeatCount} = []; - - my @header = @{ $self->header->getOrderedHeader() }; - - # Use array for faster lookup in hot loop - @{ $self->{_trackFeatCount} } = @header; - - my $outIdx = -1; - for my $trackName (@header) { - $outIdx++; - - if ( $outIdx < $minIdx || $outIdx > $maxIdx ) { - next; - } - - push @{ $self->{_trackOutIndices} }, $outIdx; - - if ( $trackName eq $self->refTrackName ) { - $self->{_refTrackIdx} = $outIdx; - next; - } - - if ( ref $trackName ) { - $self->{_trackFeatCounts}[$outIdx] = $#$trackName; - } - else { - $self->{_trackFeatCounts}[$outIdx] = 0; - } - } -} - -sub uniqueify { - my %count; - my $undefCount = 0; - - if ( !@{ $_[0] } ) { - return $_[0]; - } - - foreach my $value ( @{ $_[0] } ) { - if ( !defined $value ) { - $undefCount++; - } - else { - $count{$value} = 1; - } - } - - if ( $undefCount == @{ $_[0] } ) { - return [ $_[0]->[0] ]; - } - - if ( $undefCount == 0 && scalar keys %count == 1 ) { - return [ $_[0]->[0] ]; - } - - return $_[0]; -} - -sub mungeRow { - # $_[0] = $self - # $_[1] = $row - - for my $row ( @{ $_[1] } ) { - if ( !defined $row ) { - $row = $_[0]->{_emptyFieldChar}; - next; - } - - if ( ref $row ) { - $row = join( - $_[0]->{_overlapDelim}, - map { defined $_ ? 
$_ : $_[0]->{_emptyFieldChar} } @{ uniqueify($row) } - ); - } - } - - return join $_[0]->{_valDelim}, @{ uniqueify( $_[1] ) }; -} - -# ABSTRACT: Knows how to make an output string -# VERSION - -#takes an array of data that is what we grabbed from the database -#and whatever else we added to it -#and an array of input data, which contains our original input fields -#which we are going to re-use in our output (namely chr, position, type alleles) -sub makeOutputString { - my ( $self, $outputDataAref ) = @_; - - # Re-assigning these isn't a big deal because makeOutputString - # is called very few times; expected to be called every few thousand rows - my $missChar = $self->delimiters->emptyFieldChar; - my $posDelim = $self->delimiters->positionDelimiter; - my $fieldSep = $self->delimiters->fieldSeparator; - my $featCounts = $self->{_trackFeatCounts}; - - for my $row (@$outputDataAref) { - next if !$row; - # info = [$outIdx, $numFeatures, $missingValue] - # if $numFeatures == 0, this track has no features - TRACK_LOOP: for my $oIdx ( @{ $self->{_trackOutIndices} } ) { - if ( $oIdx == $self->{_refTrackIdx} ) { - $row->[$oIdx] = join '', @{ $row->[$oIdx] }; - next; - } - - # If this track has no features - if ( $featCounts->[$oIdx] == 0 ) { - # We always expect output, for any track - # to be at least a 1 member array - # because we need to know where in an indel we are - # or whether we're in a snp - # So... reference for instance is [A] for a snp - # and maybe [A, T, C] for a 3 base deletion - - # Most common case, not an indel - # Currently we have no 0-feature tracks - if ( @{ $row->[$oIdx] } == 1 ) { - if ( !defined $row->[$oIdx][0] ) { - $row->[$oIdx] = $missChar; - next; - } - - if ( ref $row->[$oIdx][0] ) { - $row->[$oIdx] = $self->mungeRow( $row->[$oIdx][0] ); - next; - } - - $row->[$oIdx] = $row->[$oIdx][0]; - next; - } - - # For things without features, we currently support - # ref (scalar), phastCons, phyloP, cadd, which are all scalars - - # If it's not a scalar, it's because we have an indel - # Then, for each position, the thing may be defined, or not - # It's an array, for instance, CADD scores are - $row->[$oIdx] = join( - $posDelim, - @{ - uniqueify( - [ - map { !defined $_ ? $missChar : ref $_ ? $self->mungeRow($_) : $_ } - @{ $row->[$oIdx] } - ] - ) - } - ); - - next; - } - - # If this track is missing altogether it will be an empty array - # But it will be an array - - for my $featIdx ( 0 .. 
$featCounts->[$oIdx] ) { - if ( !defined $row->[$oIdx][$featIdx] ) { - $row->[$oIdx][$featIdx] = $missChar; - next; - } - - # Typically, we have no indel - # Which means the feature has only 1 value - if ( @{ $row->[$oIdx][$featIdx] } == 1 ) { - if ( !defined $row->[$oIdx][$featIdx][0] ) { - $row->[$oIdx][$featIdx] = $missChar; - next; - } - - # Typically we have a scalar - if ( !ref $row->[$oIdx][$featIdx][0] ) { - $row->[$oIdx][$featIdx] = $row->[$oIdx][$featIdx][0]; - next; - } - - $row->[$oIdx][$featIdx] = $self->mungeRow( $row->[$oIdx][$featIdx][0] ); - next; - } - - for my $posData ( @{ $row->[$oIdx][$featIdx] } ) { - if ( !defined $posData ) { - $posData = $missChar; - next; - } - - # At this position in the indel, value is scalar - if ( !ref $posData ) { - next; - } - - $posData = $self->mungeRow($posData); - } - - $row->[$oIdx][$featIdx] = - join( $posDelim, @{ uniqueify( $row->[$oIdx][$featIdx] ) } ); - } - - # Fields are separated by something like tab - $row->[$oIdx] = join( $fieldSep, @{ $row->[$oIdx] } ); - } - - # Tracks are also separated by something like tab - $row = join( $fieldSep, @$row ); - } - - return join( "\n", @$outputDataAref ) . "\n"; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Output/Delimiters.pm b/perl/lib/Seq/Output/Delimiters.pm deleted file mode 100644 index 3bafbe2fb..000000000 --- a/perl/lib/Seq/Output/Delimiters.pm +++ /dev/null @@ -1,165 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Output::Delimiters; -use Mouse 2; - -with 'Seq::Role::Message'; - -# TODO: initialize as singleton - -has valueDelimiter => ( is => 'ro', isa => 'Str', default => ';' ); - -has positionDelimiter => ( is => 'ro', isa => 'Str', default => '|' ); - -# Allows 1:n (or n:m) relationships between features of a single track -# Typically occurs with dbSNP (1 rs# for at least 2 alleles; in cases of 2 rs# -# need rs1;rs2 allele1_a/allele1_b;allele2_a/allele2_b to keep the order -# of rs1 => [allele1_a, allele1_b] rs2 => [allele2_a,allele2_b] -# So in short is expected to be used for the 3rd dimension of a 3D array (3-tensor) -has overlapDelimiter => ( is => 'ro', isa => 'Str', default => '/' ); - -has fieldSeparator => ( is => 'ro', isa => 'Str', default => "\t" ); - -has emptyFieldChar => ( is => 'ro', isa => 'Str', default => "NA" ); - -# What to replace the flagged characters with if found in a string -has globalReplaceChar => ( is => 'ro', isa => 'Str', default => ',' ); - -# Memoized cleaning function -has cleanDelims => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { - my $self = shift; - - my $vD = $self->valueDelimiter; - my $pD = $self->positionDelimiter; - my $oD = $self->overlapDelimiter; - my $gr = $self->globalReplaceChar; - - my $re = qr/[$vD$pD$oD]+/; - my $reEnd = qr/[$vD$pD$oD$gr]+$/; - - return sub { - #my ($line) = @_; - # $_[0] - - # modified $line ($_[0]) directly - #/s modifier to include newline - $_[0] =~ s/$re/$gr/gs; - $_[0] =~ s/$reEnd//gs; - } - } -); - -has splitByField => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { - my $self = shift; - - my $d = $self->fieldSeparator; - - my $re = qr/[$d]/; - - # Returns unmodified value, or a list of split values - # Used in list context either 1 or many values emitted - return sub { - #my ($line) = @_; - # $_[0] - - # Since we always expect multiple fields, no need to check index - - # modified $line ($_[0]) directly - return split /$re/, $_[0]; - } - } -); - -has splitByPosition => ( - is => 'ro', - init_arg => undef, - lazy 
=> 1, - default => sub { - my $self = shift; - - my $d = $self->positionDelimiter; - - my $re = qr/[$d]/; - - # Returns unmodified value, or a list of split values - # Used in list context either 1 or many values emitted - return sub { - #my ($line) = @_; - # $_[0] - - if ( index( $_[0], $d ) == -1 ) { - return $_[0]; - } - - # modified $line ($_[0]) directly - return split /$re/, $_[0]; - } - } -); - -has splitByOverlap => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { - my $self = shift; - - my $d = $self->overlapDelimiter; - - my $re = qr/[$d]/; - - # Returns unmodified value, or a list of split values - # Used in list context either 1 or many values emitted - return sub { - #my ($line) = @_; - # $_[0] - - if ( index( $_[0], $d ) == -1 ) { - return $_[0]; - } - - # modified $line ($_[0]) directly - return split /$re/, $_[0]; - } - } -); - -has splitByValue => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { - my $self = shift; - - my $d = $self->valueDelimiter; - - my $re = qr/[$d]/; - - # Returns unmodified value, or a list of split values - # Used in list context either 1 or many values emitted - return sub { - #my ($line) = @_; - # $_[0] - - if ( index( $_[0], $d ) == -1 ) { - return $_[0]; - } - - # modified $line ($_[0]) directly - return split /$re/, $_[0]; - } - } -); - -__PACKAGE__->meta->make_immutable(); -1; diff --git a/perl/lib/Seq/Output/Fields.pm b/perl/lib/Seq/Output/Fields.pm deleted file mode 100644 index 9f4765921..000000000 --- a/perl/lib/Seq/Output/Fields.pm +++ /dev/null @@ -1,24 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Output::Fields; -use Mouse::Role 2; - -has chromField => ( is => 'ro', default => 'chrom', lazy => 1 ); -has posField => ( is => 'ro', default => 'pos', lazy => 1 ); -has typeField => ( is => 'ro', default => 'type', lazy => 1 ); -has inputRefField => ( is => 'ro', default => 'inputRef', lazy => 1 ); -has discordantField => ( is => 'ro', default => 'discordant', lazy => 1 ); -has altField => ( is => 'ro', default => 'alt', lazy => 1 ); -has trTvField => ( is => 'ro', default => 'trTv', lazy => 1 ); -has heterozygotesField => ( is => 'ro', default => 'heterozygotes', lazy => 1 ); -has heterozygosityField => ( is => 'ro', default => 'heterozygosity', lazy => 1 ); -has homozygotesField => ( is => 'ro', default => 'homozygotes', lazy => 1 ); -has homozygosityField => ( is => 'ro', default => 'homozygosity', lazy => 1 ); -has missingField => ( is => 'ro', default => 'missingGenos', lazy => 1 ); -has missingnessField => ( is => 'ro', default => 'missingness', lazy => 1 ); -has sampleMafField => ( is => 'ro', default => 'sampleMaf', lazy => 1 ); - -no Mouse::Role; -1; diff --git a/perl/lib/Seq/Role/ConfigFromFile.pm b/perl/lib/Seq/Role/ConfigFromFile.pm deleted file mode 100644 index 160ceef1f..000000000 --- a/perl/lib/Seq/Role/ConfigFromFile.pm +++ /dev/null @@ -1,94 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Role::ConfigFromFile; - -our $VERSION = '0.001'; - -# ABSTRACT: A moose role for configuring a class from a YAML file -# VERSION - -=head1 DESCRIPTION - - @role Seq::Role::ConfigFromFile - #TODO: Check description - - @example with 'Seq::Role::ConfigFromFile' - -Used in: -=for :list -* Seq::Annotate -* Seq::Assembly -* Seq::Fetch - -Extended by: None - -=cut - -use Mouse::Role 2; - -use Carp qw/ croak /; -use namespace::autoclean; -use Type::Params qw/ compile /; -use Types::Standard qw/ :types /; -use Scalar::Util qw/ reftype /; -use YAML::XS qw/ LoadFile /; - 
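The role defined below merges every global YAML key into each individual track, as the comments that follow explain; here is a minimal sketch of that merge, using hypothetical keys ('assembly', 'name'):

    my %opts = (
      assembly => 'hg38',    # global key; applies to every track
      tracks   => { tracks => [ { name => 'ref' }, { name => 'refSeq' } ] },
    );

    my @nonTrackKeys = grep { $_ ne 'tracks' } keys %opts;

    for my $trackHref ( @{ $opts{tracks}{tracks} } ) {
      for my $key (@nonTrackKeys) {
        $trackHref->{$key} = $opts{$key};    # each track now carries assembly => 'hg38'
      }
    }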
-with 'Seq::Role::IO', 'MouseX::Getopt'; - -state $tracksKey = 'tracks'; -#The only "Trick" added here is that we take everything that is outside the -#"tracks" key, and push that stuff in each tracks array item -#The logic is that our YAML config file has 2 levels of content -#1) Global : key => value pairs that apply to every track -#2) Track-level : key => value pairs that only apply to that track -sub new_with_config { - state $check = compile( Str, HashRef ); - my ( $class, $opts ) = $check->(@_); - my %opts; - - my $config = $opts->{config}; - - if ( !defined $config ) { - croak "new_with_config() expects config"; - } - - my $hash = LoadFile($config); - %opts = ( %$hash, %$opts ); - - # If no "tracks" object, nothing left to do - if ( !$opts{$tracksKey} ) { - $class->new( \%opts ); - return; - } - - my $trackConfig; - - if ( ref $opts{$tracksKey} eq 'ARRAY' ) { - # Back compatibility with b10 - my $temp = $opts{$tracksKey}; - - $opts{$tracksKey} = { $tracksKey => $temp }; - } - - #Now push every single global option into each individual track - #Since they are meant to operate as independent units - my @nonTrackKeys = grep { $_ ne $tracksKey } keys %opts; - - if ( ref $opts{$tracksKey}{$tracksKey} ne 'ARRAY' ) { - croak "expect $tracksKey to contain an array of data"; - } - - for my $trackHref ( @{ $opts{$tracksKey}{$tracksKey} } ) { - for my $key (@nonTrackKeys) { - $trackHref->{$key} = $opts{$key}; - } - } - - $class->new( \%opts ); -} - -no Mouse::Role; - -1; diff --git a/perl/lib/Seq/Role/IO.pm b/perl/lib/Seq/Role/IO.pm deleted file mode 100644 index 5017d9646..000000000 --- a/perl/lib/Seq/Role/IO.pm +++ /dev/null @@ -1,480 +0,0 @@ -use 5.10.0; -use strict; -use warnings; -# TODO: Also support reading zipped files (right now only gzip files) -package Seq::Role::IO; - -our $VERSION = '0.001'; - -# ABSTRACT: A moose role for all of our file handle needs -# VERSION - -use Mouse::Role; - -use PerlIO::utf8_strict; -use PerlIO::gzip; -use File::Which qw/which/; -use Sys::CpuAffinity; - -use Path::Tiny; -use Try::Tiny; - -use Scalar::Util qw/looks_like_number/; -with 'Seq::Role::Message'; -# tried various ways of assigning this to an attrib, with the intention that -# one could change the taint checking characters allowed but this is the simplest -# one that worked; wanted it precompiled to improve the speed of checking -my $taintCheckRegex = qr{\A([\+\,\.\-\=\:\/\t\|\s\w\d/]+)\z}; - -has taintCheckRegex => ( - is => 'ro', - lazy => 1, - init_arg => undef, - default => sub { $taintCheckRegex }, -); - -has delimiter => ( - is => 'ro', - lazy => 1, - default => '\t', - writer => '_setDelimiter', -); - -my $tar = which('tar'); -my $gzip = which('bgzip') || which('pigz') || which('gzip'); -my $lz4 = which('lz4'); - -# Without this, pigz -d -c issues many system calls (futex) -# Thanks to Meltdown, this slows decompression substantially -# Up to 1 core will be used solely for meltdown-related overhead -# So disable multi-threading -# For compression, tradeoff still worth it -my $gzipDcmpArgs = '-d -c'; -if ( $gzip =~ /pigz/ ) { - $gzipDcmpArgs = "-p 1 $gzipDcmpArgs"; -} -elsif ( $gzip =~ /bgzip/ ) { - $gzipDcmpArgs = "--threads " . Sys::CpuAffinity::getNumCpus() . " $gzipDcmpArgs"; -} - -my $gzipCmpArgs = '-c'; -if ( $gzip =~ /bgzip/ ) { - $gzipCmpArgs = "--threads " . 
Sys::CpuAffinity::getNumCpus(); -} - -my $tarCompressedGzip = "$tar --use-compress-program=$gzip"; -my $tarCompressedLZ4 = "$tar --use-compress-program=$lz4"; - -has gzip => ( - is => 'ro', - isa => 'Str', - init_arg => undef, - lazy => 1, - default => sub { $gzip } -); -has decompressArgs => ( - is => 'ro', - isa => 'Str', - init_arg => undef, - lazy => 1, - default => sub { $gzipDcmpArgs } -); - -sub getReadArgs { - my ( $self, $filePath ) = @_; - - my ( $remoteCpCmd, $remoteFileSizeCmd ) = $self->getRemoteProg($filePath); - my $outerCommand = $self->getCompressProgWithArgs($filePath); - - if ($remoteCpCmd) { - if ($outerCommand) { - $outerCommand = "$remoteCpCmd | $outerCommand -"; - } - else { - $outerCommand = "$remoteCpCmd"; - } - } - elsif ($outerCommand) { - $outerCommand = "$outerCommand $filePath"; - } - - return $outerCommand; -} -#@param {Path::Tiny} $file : the Path::Tiny object representing a single input file -#@param {Str} $errCode : what log level to use if we can't open the file -#@return file handle -sub getReadFh { - my ( $self, $file, $errCode ) = @_; - - # By default, we'll return an error, rather than die-ing with log - # We wont be able to catch pipe errors however - if ( !$errCode ) { - $errCode = 'error'; - } - - my $filePath; - if ( ref $file eq 'Path::Tiny' ) { - $filePath = $file->stringify; - } - else { - $filePath = $file; - } - - my $outerCommand = $self->getReadArgs($filePath); - - my ( $err, $fh ); - - if ($outerCommand) { - $err = $self->safeOpen( $fh, '-|', "$outerCommand", $errCode ); - } - else { - $err = $self->safeOpen( $fh, '<', $filePath, $errCode ); - } - - my $compressed = !!$outerCommand; - - return ( $err, $compressed, $fh ); -} - -sub getRemoteProg { - my ( $self, $filePath ) = @_; - - if ( $filePath =~ /^s3:\/\// ) { - return ( "aws s3 cp $filePath -", "" ); - } - - if ( $filePath =~ /^gs:\/\// ) { - return ( "gsutil cp $filePath -", "" ); - } - - return ""; -} - -sub getInnerFileCommand { - my ( $self, $filePath, $innerFile, $errCode ) = @_; - - if ( !$errCode ) { - $errCode = 'error'; - } - - my $compressed = - $innerFile =~ /[.]gz$/ - || $innerFile =~ /[.]bgz$/ - || $filePath =~ /[.]lz4$/; - - my $innerCommand; - if ( $filePath =~ /[.]lz4$/ ) { - $innerCommand = $compressed ? "\"$innerFile\" | $lz4 -d -c -" : "\"$innerFile\""; - } - else { - $innerCommand = - $compressed ? 
"\"$innerFile\" | $gzip $gzipDcmpArgs -" : "\"$innerFile\""; - } - - # We do this because we have not built in error handling from opening streams - - my $err; - my $command; - my $outerCompressed; - - if ( $filePath =~ /[.]tar/ ) { - $command = "$tar -O -xf - $innerCommand"; - } - else { - $err = "When inner file provided, must provde a parent file.tar or file.tar.gz"; - - $self->log( $errCode, $err ); - - return ( $err, undef, undef ); - } - - # If an innerFile is passed, we assume that $file is a path to a tarball - - return ( $err, $compressed, $command ); -} - -# Is the file a single compressed file -sub isCompressedSingle { - my ( $self, $filePath ) = @_; - - my $basename = path($filePath)->basename(); - - if ( $basename =~ /tar[.]gz$/ ) { - return 0; - } - - if ( $basename =~ /[.]gz$/ || $basename =~ /[.]bgz$/ ) { - return "gzip"; - } - - if ( $basename =~ /[.]lz4$/ ) { - return "lz4"; - } - - return ""; -} - -sub getCompressProgWithArgs { - my ( $self, $filePath ) = @_; - - my $ext = $self->isCompressedSingle($filePath); - - if ( !$ext ) { - return ""; - } - if ( $ext eq 'gzip' ) { - return "$gzip $gzipDcmpArgs"; - } - if ( $ext eq 'lz4' ) { - return "$lz4 -d -c"; - } -} - -# TODO: return error if failed -sub getWriteFh { - my ( $self, $file, $compress, $errCode ) = @_; - - # By default, we'll return an error, rather than die-ing with log - # We wont be able to catch pipe errors however - if ( !$errCode ) { - $errCode = 'error'; - } - - my $err; - - if ( !$file ) { - $err = 'get_fh() expected a filename'; - $self->log( $errCode, $err ); - - return ( $err, undef ); - } - - my $fh; - my $hasGz = $file =~ /[.]gz$/ || $file =~ /[.]bgz$/; - my $hasLz4 = $file =~ /[.]lz4$/; - if ( $hasGz || $hasLz4 || $compress ) { - if ( $hasLz4 || ( $compress && $compress =~ /[.]lz4$/ ) ) { - $err = $self->safeOpen( $fh, "|-", "$lz4 -c > $file", $errCode ); - } - else { - $err = $self->safeOpen( $fh, "|-", "$gzip $gzipCmpArgs > $file", $errCode ); - } - - } - else { - $err = $self->safeOpen( $fh, ">", $file, $errCode ); - } - - return ( $err, $fh ); -} - -# Allows user to return an error; dies with logging by default -sub safeOpen { - #my ($self, $fh, $operator, $operand, $errCode) = @_; - # $_[0], $_[1], $_[2], $_[3], $_[4] - - # In some cases, file attempting to be read may not have been flushed - # Clearest case is Log::Fast - my $err = $_[0]->safeSystem('sync'); - - # Modifies $fh/$_[1] by reference - if ( $err || !open( $_[1], $_[2], $_[3] ) ) { - $err = $err || $!; - - #$self #$errCode #$operand - $_[0]->log( $_[4] || 'debug', "Couldn't open $_[3]: $err ($?)" ); - return $err; - } - - return; -} - -sub safeClose { - my ( $self, $fh, $errCode ) = @_; - - my $err = $self->safeSystem('sync'); - - if ($err) { - $self->log( $errCode || 'debug', "Couldn't sync before close due to: $err" ); - return $err; - } - - if ( !close($fh) ) { - $self->log( $errCode || 'debug', "Couldn't close due to: $! 
($?)" ); - return $!; - } - - return; -} - -sub getCleanFields { - my ( $self, $line ) = @_; - - chomp $line; - if ( $line =~ m/$taintCheckRegex/xm ) { - my @out; - - push @out, split $self->delimiter, $1; - - return \@out; - } - - return undef; -} - -sub getLineEndings { - return $/; -} - -sub setLineEndings { - my ( $self, $firstLine ) = @_; - - if ( $firstLine =~ /\r\n$/ ) { - $/ = "\r\n"; - } - elsif ( $firstLine =~ /\n$/ ) { - $/ = "\n"; - } - elsif ( $firstLine =~ /\015/ ) { - # Match ^M (MacOS style line endings, which Excel outputs on Macs) - $/ = "\015"; - } - else { - return "Cannot discern line endings: Not Mac, Unix, or Windows style"; - } - - return ""; -} - -sub checkDelimiter { - my ( $self, $line ) = @_; - - if ( $line =~ /^\s*\S+\t\S+/ ) { - return 1; - } - - return 0; -} - -sub safeSystem { - my ( $self, $cmd, $errCode ) = @_; - - my $return = system($cmd); - - if ( $return > 0 ) { - $self->log( $errCode || 'error', "Failed to execute $cmd. Return code: $?" ); - return $return; - } - - return; -} - -sub setDelimiter { - my ( $self, $line ) = @_; - - if ( $line =~ /^\s*\S+\t\S+/ ) { - $self->_setDelimiter('\t'); - } - elsif ( $line =~ /^\s*\S+,\S+/ ) { - $self->_setDelimiter(','); - } - else { - return "Line is not tab or comma delimited"; - } - - return ""; -} - -sub makeTarballName { - my ( $self, $baseName, $compress ) = @_; - - return $baseName . ( $compress ? '.tar.gz' : '.tar' ); -} - -# Assumes if ref's are passed for dir, baseName, or compressedName, they are path tiny -sub compressDirIntoTarball { - my ( $self, $dir, $tarballName ) = @_; - - if ( !$tar ) { - $self->log( 'warn', 'No tar program found' ); - return 'No tar program found'; - } - - if ( ref $dir ) { - $dir = $dir->stringify; - } - - if ( !$tarballName ) { - $self->log( 'warn', 'must provide baseName or tarballName' ); - return 'Must provide baseName or tarballName'; - } - - if ( ref $tarballName ) { - $tarballName = $tarballName->stringify; - } - - $self->log( 'info', 'Compressing all output files' ); - - my @files = glob $dir; - - if ( !@files ) { - $self->log( 'warn', "Directory is empty" ); - return 'Directory is empty'; - } - - my $tarProg = - $tarballName =~ /tar.gz$/ - ? $tarCompressedGzip - : ( $tarballName =~ /tar.lz4$/ ? 
$tarCompressedLZ4 : "tar" ); - my $tarCommand = sprintf( - "cd %s; $tarProg --exclude '.*' --exclude %s -cf %s * --remove-files", - $dir, - $tarballName, #and don't include our new compressed file in our tarball - $tarballName, # the name of our tarball - ); - - $self->log( 'debug', "compress command: $tarCommand" ); - - my $err = $self->safeSystem($tarCommand); - - return $err; -} - -# returns chunk size in kbytes -sub getChunkSize { - my ( $self, $filePath, $parts, $min, $max ) = @_; - - # If given 0 - $parts ||= 1; - - if ( !$min ) { - $min = 512; - } - - if ( !$max ) { - $max = 32768; - } - - my $size = path($filePath)->stat()->size; - - # Use 15x the size of the file as a heuristic - # VCF files compress roughly this well - $size *= 15; - - my $chunkSize = CORE::int( $size / ( $parts * 4096 ) ); - - if ( $chunkSize < $min ) { - return ( undef, $min ); - } - - # Cap to make sure memory usage doesn't grow uncontrollably - if ( $chunkSize > $max ) { - return ( undef, $max ); - } - - return ( undef, $chunkSize ); -} - -no Mouse::Role; - -1; diff --git a/perl/lib/Seq/Role/Message.pm b/perl/lib/Seq/Role/Message.pm deleted file mode 100644 index a3c1fada4..000000000 --- a/perl/lib/Seq/Role/Message.pm +++ /dev/null @@ -1,341 +0,0 @@ -package Seq::Role::Message; -use 5.10.0; -use strict; -use warnings; - -our $VERSION = '0.001'; - -# ABSTRACT: A class for communicating to log and to some plugged in messaging service -# VERSION -use Mouse::Role 2; - -#doesn't work with Parallel::ForkManager; -#for more on AnyEvent::Log -#http://search.cpan.org/~mlehmann/AnyEvent-7.12/lib/AnyEvent/Log.pm -# use AnyEvent; -# use AnyEvent::Log; - -use Log::Fast; -use namespace::autoclean; -use Beanstalk::Client; -use Cpanel::JSON::XS; -use DDP return_value => 'dump'; -use Carp qw/croak/; -use Time::HiRes qw(time); -use Try::Tiny; - -my $PUBLISHER_ACTION_TIMEOUT = 20; -my $PUBLISHER_CONNECT_TIMEOUT = 30; -my $MAX_PUT_MESSAGE_TIMEOUT = 5; -# How many consecutive failures to connect to the publisher before we stop trying -my $MAX_PUBLISHER_FAILURES_IN_A_ROW = 5; - -$Seq::Role::Message::LOG = Log::Fast->new( - { - level => 'INFO', - prefix => '%D %T ', - type => 'fh', - fh => \*STDERR, - } -); - -$Seq::Role::Message::mapLevels = { - info => 'INFO', #\&{$LOG->INFO} - INFO => 'INFO', - ERR => 'ERR', - error => 'ERR', - fatal => 'ERR', - warn => 'WARN', - WARN => 'WARN', - debug => 'DEBUG', - DEBUG => 'DEBUG', - NOTICE => 'NOTICE', -}; - -my %mapSeverity = ( - debug => 0, - info => 1, - warn => 2, - error => 3, - fatal => 4, -); - -# Static variables; these need to be cleared by the consuming class -state $debug = 0; -state $verbose = 1000; -state $publisher; -state $messageBase; -state $lastPublisherInteractionTime; -state $publisherConsecutiveConnectionFailures = 0; - -# whether log level or verbosity is at the debug level -# shoud only be accessed after setLogLevel and/or setVerbosity executed if program doesn't want default -has hasDebugLevel => ( - is => 'ro', - isa => 'Bool', - init_arg => undef, - lazy => 1, - default => sub { - return $debug || $verbose == 0; - } -); -# should only be run after setPublisher is executed if program doesn't want default -has hasPublisher => ( - is => 'ro', - isa => 'Bool', - init_arg => undef, - lazy => 1, - default => sub { - return !!$publisher; - } -); - -sub initialize { - $debug = 0; - $verbose = 10000; - $publisher = undef; - $lastPublisherInteractionTime = 0; - $publisherConsecutiveConnectionFailures = 0; - $messageBase = undef; -} - -sub putMessageWithTimeout { - my ( 
$publisher, $timeout, @args ) = @_; - - eval { - local $SIG{ALRM} = sub { die "TIMED_OUT_CLIENT_SIDE" }; - alarm($timeout); - - $publisher->put(@args); # Execute the passed function with arguments - - alarm(0); # Disable the alarm - }; - - if ($@) { - return $@; - } - - return; -} - -sub setLogPath { - my ( $self, $path ) = @_; - #open($Seq::Role::Message::Fh, '<', $path); - - #Results in deep recursion issue if we include Seq::Role::IO (which requires Role::Message - open( my $fh, '>', $path ) or die "Couldn't open log path $path"; - - #$AnyEvent::Log::LOG->log_to_file ($path); - $Seq::Role::Message::LOG->config( { fh => $fh, } ); -} - -sub setLogLevel { - my ( $self, $level ) = @_; - - our $mapLevels; - - if ( $level =~ /debug/i ) { - $debug = 1; - } - - $Seq::Role::Message::LOG->level( $mapLevels->{$level} ); -} - -sub setVerbosity { - my ( $self, $verboseLevel ) = @_; - - if ( $verboseLevel != 0 && $verboseLevel != 1 && $verboseLevel != 2 ) { - say STDERR "Verbose level must be 0, 1, or 2, setting to 10000 (no verbose output)"; - $verbose = 10000; - return; - } - - $verbose = $verboseLevel; -} - -sub setPublisher { - my ( $self, $publisherConfig ) = @_; - - if ( ref $publisherConfig ne 'HASH' ) { - return $self->log( 'fatal', 'setPublisher requires a hash' ); - } - - if ( - !( - defined $publisherConfig->{server} - && defined $publisherConfig->{queue} - && defined $publisherConfig->{messageBase} - ) - ) - { - return $self->log( 'fatal', 'setPublisher requires server, queue, messageBase properties' ); - } - - $publisher = Beanstalk::Client->new( - { - server => $publisherConfig->{server}, - default_tube => $publisherConfig->{queue}, - connect_timeout => $PUBLISHER_CONNECT_TIMEOUT, - } - ); - - $lastPublisherInteractionTime = time(); - $publisherConsecutiveConnectionFailures = 0; - - $messageBase = $publisherConfig->{messageBase}; -} - -sub _incrementPublishFailuresAndWarn { - $publisherConsecutiveConnectionFailures++; - if ( $publisherConsecutiveConnectionFailures >= $MAX_PUBLISHER_FAILURES_IN_A_ROW ) { - say STDERR - "Exceeded maximum number of publisher reconnection attempts. Disabling publisher until job completion."; - } -} - -# note, accessing hash directly because traits don't work with Maybe types -sub publishMessage { - # my ( $self, $msg ) = @_; - # to save on perf, $_[0] == $self, $_[1] == $msg; - - return unless $publisher; - - if ( $publisherConsecutiveConnectionFailures >= $MAX_PUBLISHER_FAILURES_IN_A_ROW ) { - return; - } - - my $timeSinceLastInteraction = time() - $lastPublisherInteractionTime; - if ( $timeSinceLastInteraction >= $PUBLISHER_ACTION_TIMEOUT ) { - say STDERR - "Attempting to reconnect to publisher in publishMessage because time since last interaction is $timeSinceLastInteraction seconds."; - - $publisher->disconnect(); - $publisher->connect(); - - # Ensure that we space apart reconnection attempts - $lastPublisherInteractionTime = time(); - - if ( $publisher->error ) { - say STDERR "Failed to connect to publisher in publishMessage: " . $publisher->error; - - _incrementPublishFailuresAndWarn(); - return; - } - - say STDERR "Successfully reconnected to publisher in publishMessage"; - - $publisherConsecutiveConnectionFailures = 0; - } - - $messageBase->{data} = $_[1]; - - my $error = putMessageWithTimeout( $publisher, $MAX_PUT_MESSAGE_TIMEOUT, - { priority => 0, data => encode_json($messageBase) } ); - - if ( $error || $publisher->error ) { - my $err = $publisher->error ? $publisher->error : $error; - say STDERR "Failed to publish message: " . 
$err; - - return; - } - - $lastPublisherInteractionTime = time(); -} - -sub publishProgress { - # my ( $self, $annotatedCount, $skippedCount ) = @_; - # $_[0], $_[1], $_[2] - - return unless $publisher; - - if ( $publisherConsecutiveConnectionFailures >= $MAX_PUBLISHER_FAILURES_IN_A_ROW ) { - return; - } - - my $timeSinceLastInteraction = time() - $lastPublisherInteractionTime; - if ( $timeSinceLastInteraction >= $PUBLISHER_ACTION_TIMEOUT ) { - say STDERR - "Attempting to reconnect publisher in publishProgress because time since last interaction is $timeSinceLastInteraction seconds."; - - $publisher->disconnect(); - $publisher->connect(); - - # Ensure that we space apart reconnection attempts - $lastPublisherInteractionTime = time(); - - if ( $publisher->error ) { - say STDERR "Failed to connect to publisher in publishProgress: " . $publisher->error; - - _incrementPublishFailuresAndWarn(); - return; - } - - say STDERR "Successfully reconnected to publisher in publishProgress"; - - $publisherConsecutiveConnectionFailures = 0; - } - - $messageBase->{data} = { progress => $_[1], skipped => $_[2] }; - - my $error = putMessageWithTimeout( $publisher, $MAX_PUT_MESSAGE_TIMEOUT, - { priority => 0, data => encode_json($messageBase) } ); - if ( $error || $publisher->error ) { - my $err = $publisher->error ? $publisher->error : $error; - say STDERR "Failed to publish progress: " . $err; - - return; - } - - $lastPublisherInteractionTime = time(); -} - -sub log { - #my ( $self, $log_method, $msg ) = @_; - #$_[0] == $self, $_[1] == $log_method, $_[2] == $msg; - - if ( ref $_[2] ) { - $_[2] = p $_[2]; - } - - if ( $_[1] eq 'info' ) { - $Seq::Role::Message::LOG->INFO("[$_[1]] $_[2]"); - - $_[0]->publishMessage( $_[2] ); - } - elsif ( $_[1] eq 'debug' ) { - $Seq::Role::Message::LOG->DEBUG("[$_[1]] $_[2]"); - - # we may publish too many debug messages. to enable: - # $_[0]->publishMessage( "Debug: $_[2]" ); - } - elsif ( $_[1] eq 'warn' ) { - $Seq::Role::Message::LOG->WARN("[$_[1]] $_[2]"); - - # we may publish too many warnings. 
to enable: - # $_[0]->publishMessage( "Warning: $_[2]" ); - } - elsif ( $_[1] eq 'error' ) { - $Seq::Role::Message::LOG->ERR("[$_[1]] $_[2]"); - - $_[0]->publishMessage("Error: $_[2]"); - } - elsif ( $_[1] eq 'fatal' ) { - $Seq::Role::Message::LOG->ERR("[$_[1]] $_[2]"); - - $_[0]->publishMessage("Fatal: $_[2]"); - - croak("[$_[1]] $_[2]"); - } - else { - return; - } - - if ( $verbose <= $mapSeverity{ $_[1] } ) { - say STDERR "[$_[1]] $_[2]"; - } - - return; -} - -no Mouse::Role; -1; diff --git a/perl/lib/Seq/Role/Validator.pm b/perl/lib/Seq/Role/Validator.pm deleted file mode 100644 index 8c41aade7..000000000 --- a/perl/lib/Seq/Role/Validator.pm +++ /dev/null @@ -1,55 +0,0 @@ -## Interface Class -use 5.10.0; - -package Seq::Role::Validator; - -use Mouse::Role; -use namespace::autoclean; - -#also provides ->is_file function -use Types::Path::Tiny qw/File AbsFile AbsPath AbsDir/; - -use Path::Tiny; -use Cwd 'abs_path'; - -use YAML::XS; -use Archive::Extract; -use Try::Tiny; -use File::Which; -use Carp qw(cluck confess); - -use Seq::InputFile; - -with 'Seq::Role::IO', 'Seq::Role::Message'; - -has _inputFileBaseName => ( - isa => 'Str', - is => 'ro', - init_arg => undef, - required => 0, - lazy => 1, - default => sub { - my $self = shift; - return $self->snpfile->basename(qr/\..*/); - }, -); - -sub validateInputFile { - my ( $self, $inputFilePath ) = @_; - - my @parts = split( "/", $inputFilePath ); - - my $last = $parts[-1]; - - # TODO: support more types - for my $type ( ( "vcf", "snp" ) ) { - my ( $format, $gz ) = $last =~ /\.($type)(\.\w+)?/; - - if ($format) { - return ( 0, lc($format) ); - } - } - - return ( "Couldn't identify format of $inputFilePath", "" ); -} -1; diff --git a/perl/lib/Seq/Statistics.pm b/perl/lib/Seq/Statistics.pm deleted file mode 100644 index a02bb7a4b..000000000 --- a/perl/lib/Seq/Statistics.pm +++ /dev/null @@ -1,171 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Statistics; - -use Mouse 2; -use namespace::autoclean; - -use Seq::Output::Delimiters; -use Types::Path::Tiny qw/AbsPath AbsFile AbsDir/; -use File::Which qw/which/; - -with 'Seq::Role::Message'; - -############################ Exports ######################################### -has jsonFilePath => ( is => 'ro', init_arg => undef, writer => '_setJsonFilePath' ); - -has tabFilePath => ( is => 'ro', init_arg => undef, writer => '_setTabFilePath' ); - -has qcFilePath => ( is => 'ro', init_arg => undef, writer => '_setQcFilePath' ); - -################################# Required ################################ -# The path to the output file without extensions -has outputBasePath => ( is => 'ro', isa => 'Str', required => 1 ); - -# Comes from YAML or command line: what bystro features are called -has dbSNPnameField => ( is => 'ro', isa => 'Str', default => '' ); - -# Comes from YAML or command line: what bystro features are called -has siteTypeField => ( is => 'ro', isa => 'Str', required => 1 ); - -# Comes from YAML or command line: The tracks configuration -has exonicAlleleFunctionField => ( is => 'ro', isa => 'Str', required => 1 ); - -# Optional. Can be specified in YAML, command line, or passed. 
- # If not passed, -has refTrackField => ( is => 'ro', isa => 'Str', required => 1 ); - -has altField => ( is => 'ro', isa => 'Str', required => 1 ); - -has homozygotesField => ( is => 'ro', isa => 'Str', required => 1 ); - -has heterozygotesField => ( is => 'ro', isa => 'Str', required => 1 ); - -############################### Optional ################################## - -# The statistics package config options -# This is by default the go program we use to calculate statistics -has programPath => ( is => 'ro', isa => 'Str', default => 'bystro-stats' ); - -has outputExtensions => ( - is => 'ro', - isa => 'HashRef', - default => sub { - return { - json => '.statistics.json', - tab => '.statistics.tab', - qc => '.statistics.qc.tab', - }; - } -); - -# TODO: Store error, return that from BUILD, instead of err -sub BUILDARGS { - my ( $self, $data ) = @_; - - if ( defined $data->{outputExtensions} ) { - if ( - !( - $data->{outputExtensions}{json} - && $data->{outputExtensions}{qc} - && $data->{outputExtensions}{tab} - ) - ) - { - $self->log( 'fatal', "outputExtensions property requires json, qc, tab values" ); - return; - } - } - - return $data; -} - -sub BUILD { - my $self = shift; - - $self->{_delimiters} = Seq::Output::Delimiters->new(); - - #TODO: Should we store the which path at $self->statistics_program - if ( !which( $self->programPath ) ) { - $self->log( 'fatal', "Couldn't find statistics program at " . $self->programPath ); - return; - } - - $self->_setJsonFilePath( $self->outputBasePath . $self->outputExtensions->{json} ); - $self->_setTabFilePath( $self->outputBasePath . $self->outputExtensions->{tab} ); - $self->_setQcFilePath( $self->outputBasePath . $self->outputExtensions->{qc} ); -} - -sub getStatsArguments { - my $self = shift; - - # Accumulate the delimiters: Note that $alleleDelimiter isn't necessary - # because the bystro_statistics script never operates on multiallelic sites - my $valueDelimiter = $self->{_delimiters}->valueDelimiter; - - my $fieldSeparator = $self->{_delimiters}->fieldSeparator; - my $emptyFieldString = $self->{_delimiters}->emptyFieldChar; - - my $refColumnName = $self->refTrackField; - my $alleleColumnName = $self->altField; - - my $homozygotesColumnName = $self->homozygotesField; - my $heterozygotesColumnName = $self->heterozygotesField; - - my $jsonOutPath = $self->jsonFilePath; - my $tabOutPath = $self->tabFilePath; - my $qcOutPath = $self->qcFilePath; - - my $siteTypeColumnName = $self->siteTypeField; - my $snpNameColumnName = $self->dbSNPnameField; - my $exonicAlleleFuncColumnName = $self->exonicAlleleFunctionField; - - my $statsProg = which( $self->programPath ); - - if ( - !$statsProg - || !( $exonicAlleleFuncColumnName - && $emptyFieldString - && $valueDelimiter - && $refColumnName - && $alleleColumnName - && $siteTypeColumnName - && $homozygotesColumnName - && $heterozygotesColumnName - && $jsonOutPath - && $tabOutPath - && $qcOutPath ) - ) - { - return ( - "Need refColumnName, alleleColumnName, siteTypeColumnName, homozygotesColumnName," - . "heterozygotesColumnName, jsonOutPath, tabOutPath, qcOutPath, " - . "primaryDelimiter, fieldSeparator, and " - . "numberHeaderLines must equal 1 for statistics", - undef, undef - ); - } - - my $dbSNPpart = ""; - - if ($snpNameColumnName) { - $dbSNPpart = "-dbSnpNameColumn $snpNameColumnName "; - } - - return ( undef, - "$statsProg -outJsonPath $jsonOutPath -outTabPath $tabOutPath " - . "-outQcTabPath $qcOutPath -refColumn $refColumnName " - . 
"-altColumn $alleleColumnName -homozygotesColumn $homozygotesColumnName " - . "-heterozygotesColumn $heterozygotesColumnName -siteTypeColumn $siteTypeColumnName " - . $dbSNPpart - . "-emptyField '$emptyFieldString' " - . "-exonicAlleleFunctionColumn $exonicAlleleFuncColumnName " - . "-primaryDelimiter '$valueDelimiter' -fieldSeparator '$fieldSeparator' " ); -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks.pm b/perl/lib/Seq/Tracks.pm deleted file mode 100644 index 4afef62b7..000000000 --- a/perl/lib/Seq/Tracks.pm +++ /dev/null @@ -1,471 +0,0 @@ -# ABSTRACT: A base class for track classes - -# Used to simplify process of detecting tracks -# Tracks.pm knows very little about each track, just enough to instantiate them -# This is a singleton class; it will not instantiate multiple of each track -# Finally, it is worth noting that track order matters -# By default, the tracks are stord in the database in the order specified in the -# tracks object (from the YAML config) -# Since the database stores each track's data at each position in the genome -# in an array, and arrays can only be consequtively indexed, it is best -# to place sparse tracks at the end of the array of tracks -# such that undef values are not written for these sparse tracks for as many positions -# Ex: If we place refSeq track before cadd track, at every position where cadd exists -# and refSeq doesn't, an undef value will be written, such the cadd value can be written -# to the appropriate place; -# ie: [refSeq, cadd] yields [undef, caddData] at every position that cadd exists -# but refSeq doesn't -# However, placing refSeq after cadd maens that we can simply store [cadd] and notice -# that there is no refSeq index - -package Seq::Tracks; -use 5.10.0; -use strict; -use warnings; - -use Clone 'clone'; - -use Mouse 2; -with 'Seq::Role::Message'; - -use Seq::Tracks::Reference; -use Seq::Tracks::Score; -use Seq::Tracks::Sparse; -use Seq::Tracks::Region; -use Seq::Tracks::Gene; -use Seq::Tracks::Cadd; -use Seq::Tracks::Vcf; -use Seq::Tracks::Nearest; - -use Seq::Tracks::Reference::Build; -use Seq::Tracks::Score::Build; -use Seq::Tracks::Sparse::Build; -use Seq::Tracks::Region::Build; -use Seq::Tracks::Gene::Build; -use Seq::Tracks::Nearest::Build; -use Seq::Tracks::Cadd::Build; -use Seq::Tracks::Vcf::Build; - -use Seq::Tracks::Base::Types; -########################### Configuration ################################## -# This only matters the first time this class is called -# All other calls will ignore this property -has gettersOnly => ( is => 'ro', isa => 'Bool', default => 0 ); - -# @param tracks: track configuration -# expects: { -# typeName : { -# name: someName (optional), -# data: { -# feature1: -#} } } -# This is used to check whether this package has been initialized -has tracks => ( - is => 'ro', - isa => 'ArrayRef[HashRef]' -); - -has outputOrder => ( - is => 'ro', - isa => 'Maybe[ArrayRef[Str]]' -); -########################### Public Methods ################################# - -# @param trackBuilders : ordered track builders -state $orderedTrackBuildersAref = []; -has trackBuilders => ( - is => 'ro', - isa => 'ArrayRef', - init_arg => undef, - lazy => 1, - traits => ['Array'], - handles => { allTrackBuilders => 'elements' }, - default => sub { $orderedTrackBuildersAref } -); - -state $trackBuildersByName = {}; - -sub getTrackBuilderByName { - # my ($self, $name) = @_; #$_[1] == $name - return $trackBuildersByName->{ $_[1] }; -} - -state $trackBuildersByType = {}; - -sub 
getTrackBuildersByType { - #my ($self, $type) = @_; #$_[1] == $type - return $trackBuildersByType->{ $_[1] }; -} - -# @param trackGetters : ordered track getters -state $orderedTrackGettersAref = []; -has trackGetters => ( - is => 'ro', - isa => 'ArrayRef', - init_arg => undef, - lazy => 1, - traits => ['Array'], - handles => { allTrackGetters => 'elements' }, - default => sub { $orderedTrackGettersAref } -); - -state $trackGettersByName = {}; - -sub getTrackGetterByName { - #my ($self, $name) = @_; #$_[1] == $name - return $trackGettersByName->{ $_[1] }; -} - -state $trackGettersByType = {}; - -sub getTrackGettersByType { - # my ($self, $type) = @_; # $_[1] == $type - return $trackGettersByType->{ $_[1] }; -} - -################### Individual track getters ################## - -my $types = Seq::Tracks::Base::Types->new(); - -#Returns 1st reference track -sub getRefTrackGetter { - my $self = shift; - return $trackGettersByType->{ $types->refType }[0]; -} - -sub getTrackGettersExceptReference { - my $self = shift; - - my @trackGettersExceptReference; - for my $trackGetter ( @{ $self->trackGetters } ) { - if ( $trackGetter->type ne $types->refType ) { - push @trackGettersExceptReference, $trackGetter; - } - } - - return \@trackGettersExceptReference; -} - -sub allRegionTrackBuilders { - my $self = shift; - return $trackBuildersByType->{ $types->regionType }; -} - -sub allScoreTrackBuilders { - my $self = shift; - return $trackBuildersByType->{ $types->scoreType }; -} - -sub allSparseTrackBuilders { - my $self = shift; - return $trackBuildersByType->{ $types->sparseType }; -} - -sub allGeneTrackBuilders { - my $self = shift; - return $trackBuildersByType->{ $types->geneType }; -} - -#only one ref track allowed, so we return the first -sub getRefTrackBuilder { - my $self = shift; - return $trackBuildersByType->{ $types->refType }[0]; -} - -# Used solely for clarity, keep with the interface used in other singleton classes -sub initialize { - _clearStaticGetters(); - _clearStaticBuilders(); -} - -sub BUILD { - my $self = shift; - - # The goal of this class is to allow one consumer to configure the tracks - # for the rest of the program - # i.e Seq.pm passes { tracks => $someTrackConfiguration } and Seq::Tracks::Gene - # can call Seq::Tracks::getRefTrackGetter and receive a configured ref track getter - - # However it is important that in long-running parent processes, which may - # instantiate this program more than once, we do not re-use old configurations - # So every time the parent passes a tracks object, we re-configure this class - - if ( !$self->tracks ) { - if ( !_hasTrackGetters() ) { - $self->log( 'fatal', - 'First time Seq::Tracks is run tracks configuration must be passed' ); - return; - } - - #If we do have trackGetters, this necessarily means we've run this builder before - #So just return, since Seq::Tracks is properly configured - return; - } - - # If we're only requesting - if ( $self->gettersOnly ) { - $self->_buildTrackGetters( $self->tracks ); - return; - } - - # If both getters and builders requested, don't mutate the tracks object - # so that builders get their own distinct configuration - my $getTracks = clone( $self->tracks ); - - # TODO: Lazy, or side-effect free initialization? 
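- # (Editor's sketch, not in the original source) The intended configure-once
- # pattern, assuming $trackConfigs is the YAML-derived array of track hashrefs:
- #
- #   my $tracks = Seq::Tracks->new( { tracks => $trackConfigs } ); # configures state
- #   my $ref    = $tracks->getRefTrackGetter();                    # configured getter
- #   my $same   = Seq::Tracks->new();                              # reuses that state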
- # Builders may have side effects; they may configure
- # the db features available, including any private features
- # If so, create those first
-
- $self->_buildTrackBuilders( $self->tracks );
-
- # This ordering necessarily means that Builders cannot have getters in their
- # BUILD step / initialization
- # Need to wait for this to run, after BUILD
- $self->_buildTrackGetters($getTracks);
-}
-
-################### Private builders #####################
-sub _clearStaticGetters {
- $trackGettersByName = {};
- $orderedTrackGettersAref = [];
- $trackGettersByType = {};
-}
-
-sub _clearStaticBuilders {
- $trackBuildersByName = {};
- $orderedTrackBuildersAref = [];
- $trackBuildersByType = {};
-}
-
-sub _hasTrackGetters {
- return
- %{$trackGettersByName}
- && @{$orderedTrackGettersAref}
- && %{$trackGettersByType};
-}
-
-sub _hasTrackBuilders {
- return
- %{$trackBuildersByName}
- && @{$orderedTrackBuildersAref}
- && %{$trackBuildersByType};
-}
-
-sub _buildTrackGetters {
- my $self = shift;
- my $trackConfigurationAref = shift;
-
- if ( !$trackConfigurationAref ) {
- $self->log( 'fatal', '_buildTrackGetters requires trackConfiguration object' );
- }
-
- my %seenTrackNames;
- my $seenRef = 0;
- # We may have previously configured this class in a long running process
- # If so, remove the tracks, free the memory
- _clearStaticGetters();
-
- my %trackOrder;
- if ( defined $self->outputOrder ) {
- my %tracks = map { $_->{name} => $_ } @$trackConfigurationAref;
- my $i = 0;
- for my $name ( @{ $self->outputOrder } ) {
- if ( !defined $tracks{$name} ) {
- $self->log( 'fatal', "Unknown track $name specified in `outputOrder`" );
- }
- elsif ( $tracks{$name}{no_build} ) {
- $self->log( 'fatal',
- "Track $name specified in `outputOrder` has `no_build` set, which means this track cannot be built, and is likely used only as a 'join' track, joined onto another track."
- );
- }
-
- $trackOrder{$name} = $i;
- $i++;
- }
-
- if ( $i < @$trackConfigurationAref ) {
- my @notSeen =
- map { exists $trackOrder{ $_->{name} } || $_->{no_build} ? () : $_->{name} }
- @$trackConfigurationAref;
-
- if ( @notSeen > 0 ) {
- $self->log( 'fatal',
- "When using `outputOrder`, specify all tracks, unless they have `no_build: true`, missing: "
- . join( ',', @notSeen ) );
- }
- }
- }
-
- # Iterate over the original order
- # This is important, because otherwise we may accidentally set the
- # tracks database order based on the output order
- # if _buildTrackGetters is called before _buildTrackBuilders
- my $i = 0;
- for my $trackHref (@$trackConfigurationAref) {
- if ( $trackHref->{ref} ) {
- $trackHref->{ref} = $trackBuildersByName->{ $trackHref->{ref} };
- }
-
- if ( !$seenRef ) {
- $seenRef = $trackHref->{type} eq $types->refType;
-
- if ( $seenRef && $trackHref->{no_build} ) {
- $self->log( 'fatal', "Reference track cannot have `no_build` set" );
- }
- }
- elsif ( $trackHref->{type} eq $types->refType ) {
- $self->log( 'fatal', "Only one reference track allowed, found at least 2" );
- }
-
- # If we don't build the track, we also can't fetch data from the track
- # In the rest of the body of this loop we define the track getters
- if ( $trackHref->{no_build} ) {
- next;
- }
-
- my $className = $self->_toTrackGetterClass( $trackHref->{type} );
-
- my $track = $className->new($trackHref);
-
- if ( exists $seenTrackNames{ $track->{name} } ) {
- $self->log(
- 'fatal', "More than one track with the same name
- exists: $trackHref->{name}. Each track name must be unique
- . 
Overriding the last object for this name, with the new"
- );
- }
-
- $seenTrackNames{ $track->{name} } = 1;
-
- #we use the track name rather than the trackHref name
- #because at the moment, users are allowed to rename their tracks
- #by name :
- # something : someOtherName
- $trackGettersByName->{ $track->{name} } = $track;
-
- #allows us to preserve order when iterating over all track getters
- if ( %trackOrder && $track->{name} ) {
- $orderedTrackGettersAref->[ $trackOrder{ $track->{name} } ] = $track;
- }
- else {
- $orderedTrackGettersAref->[$i] = $track;
- }
-
- $i++;
- }
-
- for my $track (@$orderedTrackGettersAref) {
- $track->setHeaders();
- push @{ $trackGettersByType->{ $track->type } }, $track;
- }
-
- if ( !$seenRef ) {
- $self->log( 'fatal', "One reference track required, found none" );
- }
-}
-
-#different from Seq::Tracks in that we store class instances hashed on track type
-#this is to allow us to more easily build tracks of one type in a certain order
-sub _buildTrackBuilders {
- my $self = shift;
- my $trackConfigurationAref = shift;
-
- if ( !$trackConfigurationAref ) {
- $self->log( 'fatal', '_buildTrackBuilders requires trackConfiguration object' );
- }
-
- my %seenTrackNames;
- my $seenRef;
- # We may have previously configured this class in a long running process
- # If so, remove the tracks, free the memory
- _clearStaticBuilders();
-
- for my $trackHref (@$trackConfigurationAref) {
- if ( $trackHref->{ref} ) {
- $trackHref->{ref} = $trackBuildersByName->{ $trackHref->{ref} };
- }
-
- my $className = $self->_toTrackBuilderClass( $trackHref->{type} );
-
- my $track = $className->new($trackHref);
-
- if ( !$seenRef ) {
- $seenRef = $track->{type} eq $types->refType;
-
- # Guard on $seenRef, as in _buildTrackGetters: only the reference track
- # itself may not set `no_build`
- if ( $seenRef && $track->{no_build} ) {
- $self->log( 'fatal', "Reference track cannot have `no_build` set" );
- }
- }
- elsif ( $track->{type} eq $types->refType ) {
- $self->log( 'fatal', "Only one reference track allowed, found at least 2" );
- }
-
- #we use the track name rather than the trackHref name
- #because at the moment, users are allowed to rename their tracks
- #by name :
- # something : someOtherName
- if ( exists $seenTrackNames{ $track->{name} } ) {
- $self->log(
- 'fatal', "More than one track with the same name
- exists: $trackHref->{name}. Each track name must be unique
- . Overriding the last object for this name, with the new"
- );
- }
-
- $seenTrackNames{ $track->{name} } = 1;
-
- #we use the track name rather than the trackHref name
- #because at the moment, users are allowed to rename their tracks
- #by name :
- # something : someOtherName
- #TODO: make this go away by automating track name conversion/storing in db
- $trackBuildersByName->{ $track->{name} } = $track;
-
- push @{$orderedTrackBuildersAref}, $track;
-
- push @{ $trackBuildersByType->{ $trackHref->{type} } }, $track;
- }
-
- if ( !$seenRef ) {
- $self->log( 'fatal', "One reference track required, found none" );
- }
-}
-
-####### Helper methods for _buildTrackBuilders & _buildTrackGetters methods ########
-
-sub _toTitleCase {
- my $self = shift;
- my $name = shift;
-
- return uc( substr( $name, 0, 1 ) ) . substr( $name, 1, length($name) - 1 );
-}
-
-sub _toTrackGetterClass {
- my $self = shift, my $type = shift;
-
- # TODO: this right now won't pass $self->type TrackType constraints
- if ( $type =~ /\w+\:+\w+/ ) {
- my @types = split /\:+/, $type;
- my $part1 = $self->_toTitleCase( $types[0] );
- my $part2 = $self->_toTitleCase( $types[1] );
-
- return "Seq::Tracks::" . $part1 . "::" . $part2;
- }
-
- return "Seq::Tracks::" . 
$self->_toTitleCase($type); -} - -sub _toTrackBuilderClass { - my $self = shift, my $type = shift; - - # TODO: this right now won't pass $self->type TrackType constraints - if ( $type =~ /\w+\:+\w+/ ) { - my @types = split /\:+/, $type; - my $part1 = $self->_toTitleCase( $types[0] ); - my $part2 = $self->_toTitleCase( $types[1] ); - - return "Seq::Tracks::" . $part1 . "::" . $part2 . "::Build"; - } - - return "Seq::Tracks::" . $self->_toTitleCase($type) . "::Build"; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Base.pm b/perl/lib/Seq/Tracks/Base.pm deleted file mode 100644 index 372f49fbc..000000000 --- a/perl/lib/Seq/Tracks/Base.pm +++ /dev/null @@ -1,310 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Base; -#Every track class extends this. The attributes detailed within are used -#regardless of whether we're building or annotating -#and nothing else ends up here (for instance, required_fields goes to Tracks::Build) - -our $VERSION = '0.001'; - -use Mouse 2; -use MouseX::NativeTraits; - -use List::Util qw/first/; - -use Seq::Tracks::Base::MapTrackNames; -use Seq::Tracks::Base::MapFieldNames; -use Seq::DBManager; -# use String::Util qw/trim/; - -# automatically imports TrackType -use Seq::Tracks::Base::Types; - -with 'Seq::Role::Message'; - -# state $indexOfThisTrack = 0; -################# Public Exports ########################## -# Not lazy because every track will use this 100%, and rest are trivial values -# Not worth complexity of Maybe[Type], default => undef, -has dbName => ( is => 'ro', init_arg => undef, writer => '_setDbName' ); - -# Don't actually build, used if we want a track to serve as a join track -# Where the track must exist in the tracks list, so that the "join" property of a different track can reference it -# but we don't want to build it -# An example of this is clinvar. We don't want to build the actual track, because it only allows for position-wise -# matching, rather than allele-wise, but it contains large structural variations which we want to capture -# where they overlap with genes. 
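-# A hypothetical YAML sketch of that arrangement (track names and features are
-# illustrative, not taken from the shipped assembly configs):
-#
-# tracks:
-#   - name: clinvar
-#     type: sparse
-#     no_build: true
-#   - name: refSeq
-#     type: gene
-#     join:
-#       track: clinvar
-#       features:
-#         - alleleID
-#         - phenotypeList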
-# So we join this clinvar data on refSeq data, capturing larger variation, but don't build a clinvar track -# using this data; instead we build a clinvar track using the VCF dataset, and call that clinvarVcf -has no_build => ( is => 'ro', isa => 'Bool', default => 0 ); - -# TODO: Evaluate removing joinTracks in favor of utilities -# or otherwise make them more flexible (array of them) -has joinTrackFeatures => ( - is => 'ro', - isa => 'ArrayRef', - init_arg => undef, - writer => '_setJoinTrackFeatures' -); - -has joinTrackName => - ( is => 'ro', isa => 'Str', init_arg => undef, writer => '_setJoinTrackName' ); - -###################### Required Arguments ############################ -# the track name -has name => ( is => 'ro', isa => 'Str', required => 1 ); - -has type => ( is => 'ro', isa => 'TrackType', required => 1 ); - -has assembly => ( is => 'ro', isa => 'Str', required => 1 ); -#anything with an underscore comes from the config format -#anything config keys that can be set in YAML but that only need to be used -#during building should be defined here -has chromosomes => ( - is => 'ro', - isa => 'HashRef', - traits => ['Hash'], - handles => { - allWantedChrs => 'keys', - chrIsWanted => 'exists', - }, - required => 1, -); - -# Memoized normalizer of chromosome names -# Handles chromosomes with or without 'chr' prefix -# And interconverts between MT and M, such that the requested -# (MT or M) is returned for the other of the pair -# Ex: for "1" we'll accept chr1 or 1, and both will map to "1" -# Ex: for "chrM" we'll accept chrMT, MT, chrM, M, all mapping to "chrM" -has normalizedWantedChr => ( - is => 'ro', - isa => 'HashRef', - init_arg => undef, - lazy => 1, - default => sub { - my $self = shift; - - # Includes the track chromosomes, with and without "chr" prefixes - # and if MT or M is provided, the other of MT or M - my %chromosomes = map { $_ => $_ } keys %{ $self->chromosomes }; - - # Add if not already present - if ( $chromosomes{'MT'} ) { - $chromosomes{'chrM'} //= 'MT'; - $chromosomes{'chrMT'} //= 'MT'; - $chromosomes{'M'} //= 'MT'; - } - elsif ( $chromosomes{'chrMT'} ) { - $chromosomes{'chrM'} //= 'chrMT'; - $chromosomes{'MT'} //= 'chrMT'; - $chromosomes{'M'} //= 'chrMT'; - } - elsif ( $chromosomes{'chrM'} ) { - $chromosomes{'MT'} //= 'chrM'; - $chromosomes{'chrMT'} //= 'chrM'; - $chromosomes{'M'} //= 'chrM'; - } - elsif ( $chromosomes{'M'} ) { - $chromosomes{'MT'} //= 'M'; - $chromosomes{'chrMT'} //= 'M'; - $chromosomes{'chrM'} //= 'M'; - } - - # If provide 'chr' prefixes, map the same chromosomes without those prefixes - # to the 'chr'-prefix name - # And vice versa - for my $chr ( keys %chromosomes ) { - if ( substr( $chr, 0, 3 ) eq 'chr' ) { - # Add if not already present, in case user for some reason wants to - # have chr1 and 1 point to distinct databases. 
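- # Worked example (editor's illustration): with
- # chromosomes => { chr1 => 'chr1', chrM => 'chrM' }, the finished map also
- # contains 1 => 'chr1', M => 'chrM', MT => 'chrM', and chrMT => 'chrM',
- # so either naming style in a source file resolves to this track's own names.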
- my $part = substr( $chr, 3 ); - - $chromosomes{$part} //= $chr; - } - else { - # Modify only if not already present - $chromosomes{"chr$chr"} //= $chr; - } - } - - return \%chromosomes; - } -); - -has fieldNames => ( - is => 'ro', - init_arg => undef, - default => sub { - my $self = shift; - return Seq::Tracks::Base::MapFieldNames->new( - { - name => $self->name, - assembly => $self->assembly, - debug => $self->debug - } - ); - }, - handles => [ 'getFieldDbName', 'getFieldName' ] -); - -################# Optional arguments #################### -has wantedChr => ( is => 'ro', isa => 'Maybe[Str]', lazy => 1, default => undef ); - -# Using lazy here lets us avoid memory penalties of initializing -# The features defined in the config file, not all tracks need features -# We allow people to set a feature type for each feature #- feature : int -# We store feature types separately since those are optional as well -# Cannot use predicate with this, because it ALWAYS has a default non-empty value -# As required by the 'Array' trait -has features => ( - is => 'ro', - isa => 'ArrayRef', - lazy => 1, - traits => ['Array'], - default => sub { [] }, - handles => { noFeatures => 'is_empty', }, -); - -# Public, but not expected to be set by calling class, derived from features -# in BUILDARG -has featureDataTypes => ( - is => 'ro', - isa => 'HashRef[DataType]', - lazy => 1, - traits => ['Hash'], - default => sub { {} }, - handles => { getFeatureType => 'get', }, -); - -has join => ( - is => 'ro', - isa => 'Maybe[HashRef]', - predicate => 'hasJoin', - lazy => 1, - default => undef -); - -has debug => ( is => 'ro', isa => 'Bool', lazy => 1, default => 0 ); - -# has index => (is => 'ro', init_arg => undef, default => sub { ++indexOfThisTrack; }); -#### Initialize / make dbnames for features and tracks before forking occurs ### -sub BUILD { - my $self = shift; - # getFieldDbNames is not a pure function; sideEffect of setting auto-generated dbNames in the - # database the first time (ever) that it is run for a track - # We could change this effect; for now, initialize here so that each thread - # gets the same name - for my $featureName ( @{ $self->features } ) { - $self->getFieldDbName($featureName); - } - - my $trackNameMapper = Seq::Tracks::Base::MapTrackNames->new(); - - $self->_setDbName( $trackNameMapper->getOrMakeDbName( $self->name ) ); - - $self->log( 'debug', "Track " . $self->name . " dbName is " . $self->dbName ); - - if ( $self->hasJoin ) { - if ( !defined $self->join->{track} ) { - $self->log( 'fatal', "'join' requires track key" ); - } - - $self->_setJoinTrackName( $self->join->{track} ); - $self->_setJoinTrackFeatures( $self->join->{features} ); - - #Each track gets its own private naming of join features - #Since the track may choose to store these features as arrays - #Again, needs to happen outside of thread, first time it's ever called - if ( $self->joinTrackFeatures ) { - for my $feature ( @{ $self->joinTrackFeatures } ) { - $self->getFieldDbName($feature); - } - } - } - - # Commit, sync, and remove any databases opened - # This is useful because locking may occur if there is an open transaction - # before fork(), and to make sure that any database meta data is properly - # committed before tracks begin to use that data. 
- Seq::DBManager::cleanUp();
-}
-
-############ Argument configuration to meet YAML config spec ###################
-
-# Expects a hash, will crash and burn if it doesn't
-around BUILDARGS => sub {
- my ( $orig, $class, $data ) = @_;
-
- # #don't mutate the input data
- # my %data = %$dataHref;
- if ( defined $data->{chromosomes} && ref $data->{chromosomes} eq 'ARRAY' ) {
- my %chromosomes = map { $_ => $_ } @{ $data->{chromosomes} };
- $data->{chromosomes} = \%chromosomes;
- }
-
- if ( defined $data->{wantedChr} ) {
- my @chrs = split( ',', $data->{wantedChr} );
-
- my $wantedChrs = {};
- for my $chr (@chrs) {
- if ( exists $data->{chromosomes}->{$chr} ) {
- $wantedChrs->{$chr} = $chr;
- }
- else {
- $class->log( 'fatal',
- "Wanted chromosome $chr not listed in chromosomes in YAML config" );
- }
- }
-
- $data->{chromosomes} = $wantedChrs;
- }
-
- if ( !defined $data->{features} ) {
- return $class->$orig($data);
- }
-
- if ( defined $data->{features} && ref $data->{features} ne 'ARRAY' ) {
- #This actually works :)
- $class->log( 'fatal', 'features must be array' );
- }
-
- # If features are passed as hashes (to accommodate their data type), get back to an array
- my @featureLabels;
- my %seenFeatures;
- for my $origFeature ( @{ $data->{features} } ) {
- if ( ref $origFeature eq 'HASH' ) {
- my ( $name, $type ) = %$origFeature; #Thomas Wingo method
-
- push @featureLabels, $name;
- $data->{featureDataTypes}{$name} = $type;
-
- next;
- }
-
- push @featureLabels, $origFeature;
- }
-
- # Splicing @featureLabels while iterating it skips elements; collect the
- # unique labels into a new array instead
- my @uniqueLabels;
- for my $feat (@featureLabels) {
- if ( $seenFeatures{$feat} ) {
- $class->log( 'warn',
- "$feat is listed twice under " . $data->{name} . " features, removing" );
- next;
- }
-
- $seenFeatures{$feat} = 1;
-
- push @uniqueLabels, $feat;
- }
-
- $data->{features} = \@uniqueLabels;
-
- return $class->$orig($data);
-};
-
-__PACKAGE__->meta->make_immutable;
-
-1;
diff --git a/perl/lib/Seq/Tracks/Base/MapFieldNames.pm b/perl/lib/Seq/Tracks/Base/MapFieldNames.pm
deleted file mode 100644
index 5234b73b4..000000000
--- a/perl/lib/Seq/Tracks/Base/MapFieldNames.pm
+++ /dev/null
@@ -1,144 +0,0 @@
-# The database stores, for each position, an array of values, either scalars, arrays,
-# or hashes
-# 1e6 => [ { 0 => someValue, 1 => otherValue }, {}, someOtherValue, [20,30,40] ]
-# This class translates all 'feature' names into integers, which can be used
-# to either store a value in the database keyed on that space efficient integer,
-# store a value at that integer's index in some array, or translate any
-# integer back to its 'feature' name
-
-use 5.10.0;
-use strict;
-use warnings;
-
-package Seq::Tracks::Base::MapFieldNames;
-
-use Mouse 2;
-use List::Util qw/max/;
-use Seq::DBManager;
-
-with 'Seq::Role::Message';
-
-################### Required attributes at construction #######################
-
-# The name is required to identify which track's fields we're making dbNames for
-# Since, to improve performance, we memoize, keyed on track name
-# This makes it important, in long-running processes, where many jobs are executed,
-# spanning multiple config files, across which track names may not be unique,
-# to call "initialize" and clear an old config/job's data
-has name => ( is => 'ro', isa => 'Str', required => 1 );
-
-has fieldNamesMap =>
- ( is => 'ro', init_arg => undef, lazy => 1, default => sub { {} } );
-has fieldDbNamesMap =>
- ( is => 'ro', init_arg => undef, lazy => 1, default => sub { {} } );
-
-################### Private ##########################
-#Under which key fields are 
mapped in the meta database belonging to the -#consuming class' $self->name -#in roles that extend this role, this key's default can be overloaded -state $metaKey = 'fields'; - -# The db cannot be held in a static variable, because in a long-running/daemon env. -# multiple databases may be called for; the Seq::DBManager package is configured -# before this class is instantiated, on each run. -has _db => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { - return Seq::DBManager->new(); - } -); - -sub getFieldDbName { - #my ($self, $fieldName) = @_; - - #$self = $_[0] - #$fieldName = $_[1] - if ( !exists $_[0]->fieldNamesMap->{ $_[0]->name } ) { - $_[0]->_fetchMetaFields(); - } - - if ( !exists $_[0]->fieldNamesMap->{ $_[0]->name }{ $_[1] } ) { - $_[0]->addMetaField( $_[1] ); - } - - if ( !defined $_[0]->fieldNamesMap->{ $_[0]->name }->{ $_[1] } ) { - $_[0]->log( 'warn', "getFieldDbName failed to find or make a dbName for $_[1]" ); - return; - } - - return $_[0]->fieldNamesMap->{ $_[0]->name }->{ $_[1] }; -} - -#this function returns the human readable name -#expected to be used during database reading operations -#like annotation -#@param $fieldNumber : the database name -sub getFieldName { - #my ($self, $fieldNumber) = @_; - - #$self = $_[0] - #$fieldNumber = $_[1] - if ( !exists $_[0]->fieldNamesMap->{ $_[0]->name } ) { - $_[0]->_fetchMetaFields(); - } - - if ( !defined $_[0]->fieldDbNamesMap->{ $_[0]->name }{ $_[1] } ) { - $_[0]->log( 'warn', "getFieldName failed to find a name for $_[1]" ); - return; - } - - return $_[0]->fieldDbNamesMap->{ $_[0]->name }{ $_[1] }; -} - -sub _fetchMetaFields { - my $self = shift; - - my $dataHref = $self->_db->dbReadMeta( $self->name, $metaKey ); - - #if we don't find anything, just store a new hash reference - #to keep a consistent data type - if ( !$dataHref ) { - $self->fieldNamesMap->{ $self->name } = {}; - $self->fieldDbNamesMap->{ $self->name } = {}; - return; - } - - $self->fieldNamesMap->{ $self->name } = $dataHref; - #fieldNames map is name => dbName; dbNamesMap is the inverse - for my $fieldName ( keys %$dataHref ) { - $self->fieldDbNamesMap->{ $self->name }{ $dataHref->{$fieldName} } = $fieldName; - } -} - -sub addMetaField { - my $self = shift; - my $fieldName = shift; - - my @fieldKeys = keys %{ $self->fieldDbNamesMap->{ $self->name } }; - - my $fieldNumber; - if ( !@fieldKeys ) { - $fieldNumber = 0; - } - else { - #https://ideone.com/eX3dOh - $fieldNumber = max(@fieldKeys) + 1; - } - - #need a way of checking if the insertion actually worked - #but that may be difficult with the currrent LMDB_File API - #I've had very bad performance returning errors from transactions - #which are exposed in the C api - #but I may have mistook one issue for another - #passing 1 to overwrite existing fields - #since the below mapping ends up relying on our new values - $self->_db->dbPatchMeta( $self->name, $metaKey, { $fieldName => $fieldNumber }, 1 ); - - $self->fieldNamesMap->{ $self->name }{$fieldName} = $fieldNumber; - $self->fieldDbNamesMap->{ $self->name }{$fieldNumber} = $fieldName; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Base/MapTrackNames.pm b/perl/lib/Seq/Tracks/Base/MapTrackNames.pm deleted file mode 100644 index 0b4bd40f1..000000000 --- a/perl/lib/Seq/Tracks/Base/MapTrackNames.pm +++ /dev/null @@ -1,110 +0,0 @@ -#This package stores track names as some integer -#if the user gives us a database name, we can store that as well -#they would do that by : -# name: -# someName : someValue - -use 
5.10.0;
-use strict;
-use warnings;
-
-package Seq::Tracks::Base::MapTrackNames;
-use Mouse 2;
-use List::Util qw/max/;
-
-use Seq::DBManager;
-
-with 'Seq::Role::Message';
-
-has dryRun => ( is => 'ro', default => 0 );
-
-############## Private variables ##############
-#_db shouldn't be static, because in long running environment, can lead to
-# the wrong db config being used in a run
-has _db => (
- is => 'ro',
- init_arg => undef,
- lazy => 1,
- default => sub {
- my $self = shift;
- return Seq::DBManager->new( { dryRun => $self->dryRun } );
- }
-);
-
-# Track names are stored in a meta database ('table') named by $metaDb below
-my $metaDb = 'trackNames';
-
-####################### Public methods ################
-# Look in the $trackName meta database (create if it does not exist) for a "name" => dbNameInt
-# pair. If none found, create one (by incrementing the max found)
-# @param $trackName: Some name that we call a track name
-sub getOrMakeDbName {
- my $self = shift;
- my $trackName = shift;
-
- my $trackNumber = $self->_db->dbReadMeta( $metaDb, $trackName );
-
- #if we don't find a stored number, create one
- if ( !defined $trackNumber ) {
- $self->log( 'debug', "Creating new trackNumber for $trackName" );
-
- $trackNumber = $self->_addTrackNameMeta($trackName);
-
- $self->log( 'debug', "Created new max trackNumber $trackNumber" );
- }
-
- return $trackNumber;
-}
-
-sub renameTrack {
- my ( $self, $trackName, $newTrackName ) = @_;
-
- my $trackNumber = $self->_db->dbReadMeta( $metaDb, $trackName );
-
- if ( !defined $trackNumber ) {
- $self->log( 'warn',
- "trackName not found in tracknames meta database, skipping rename" );
- return "trackName not found in tracknames meta database";
- }
-
- # TODO: handle errors from dbManager
- $self->_db->dbDeleteMeta( $metaDb, $trackName );
-
- $self->_db->dbPatchMeta( $metaDb, $newTrackName, $trackNumber );
-
- # 0 indicates success
- return 0;
-}
-
-################### Private Methods ###################
-sub _addTrackNameMeta {
- my $self = shift;
- my $trackName = shift;
-
- state $largestTrackNumberKey = '_largestTrackNumber';
-
- my $maxNumber = $self->_db->dbReadMeta( $metaDb, $largestTrackNumberKey );
-
- my $trackNumber;
- if ( !defined $maxNumber ) {
- $trackNumber = 0;
- }
- else {
- $trackNumber = $maxNumber + 1;
- }
-
- #need a way of checking if the insertion actually worked
- #but that may be difficult with the current LMDB_File API
- #I've had very bad performance returning errors from transactions
- #which are exposed in the C api
- #but I may have mistook one issue for another
- $self->_db->dbPatchMeta( $metaDb, $trackName, $trackNumber );
- $self->_db->dbPatchMeta( $metaDb, $largestTrackNumberKey, $trackNumber );
-
- return $trackNumber;
-}
-
-__PACKAGE__->meta->make_immutable;
-1;
diff --git a/perl/lib/Seq/Tracks/Base/Types.pm b/perl/lib/Seq/Tracks/Base/Types.pm
deleted file mode 100644
index 5488312ba..000000000
--- a/perl/lib/Seq/Tracks/Base/Types.pm
+++ /dev/null
@@ -1,155 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package Seq::Tracks::Base::Types;
-
-our $VERSION = '0.001';
-
-# ABSTRACT: Defines general track information: valid track "types",
-# track casting (data) types
-# VERSION
-
-use Mouse 2;
-use Mouse::Util::TypeConstraints;
-use namespace::autoclean;
-use Scalar::Util qw/looks_like_number/;
-use Math::SigFigs qw(:all);
-
-#What the types must be called in the config file
-# TODO: build these track maps 
automatically -# by title casing the "type" field -# And therefore maybe don't use these at all. -state $refType = 'reference'; -has refType => - ( is => 'ro', init_arg => undef, lazy => 1, default => sub { $refType } ); - -state $scoreType = 'score'; -has scoreType => - ( is => 'ro', init_arg => undef, lazy => 1, default => sub { $scoreType } ); - -state $sparseType = 'sparse'; -has sparseType => - ( is => 'ro', init_arg => undef, lazy => 1, default => sub { $sparseType } ); - -state $regionType = 'region'; -has regionType => - ( is => 'ro', init_arg => undef, lazy => 1, default => sub { $regionType } ); - -state $geneType = 'gene'; -has geneType => - ( is => 'ro', init_arg => undef, lazy => 1, default => sub { $geneType } ); - -state $caddType = 'cadd'; -has caddType => - ( is => 'ro', init_arg => undef, lazy => 1, default => sub { $caddType } ); - -state $vcfType = 'vcf'; -has vcfType => - ( is => 'ro', init_arg => undef, lazy => 1, default => sub { $vcfType } ); - -state $nearestType = 'nearest'; -has nearestType => - ( is => 'ro', init_arg => undef, lazy => 1, default => sub { $nearestType } ); - -has trackTypes => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { - return [ $refType, $scoreType, $sparseType, $regionType, $geneType, $caddType, - $vcfType ]; - } -); - -enum TrackType => [ - $refType, $scoreType, $sparseType, $regionType, - $geneType, $caddType, $vcfType, $nearestType -]; - -#Convert types; Could move the conversion code elsewehre, -#but I wanted types definition close to implementation - -subtype DataType => as 'Str' => - where { $_ =~ /number|number\(\d+\)/ }; #['float', 'int', 'number', 'number(2)']; - -# float / number / int can give a precision in the form number(2) -state $precision = {}; -state $typeFunc = {}; -#idiomatic way to re-use a stack, gain some efficiency -#expects ->convert('string or number', 'type') -sub convert { - #my ($self, $value, $type) - # $_[0], $_[1], $_[2] - if ( !$typeFunc->{ $_[2] } ) { - my $idx = index( $_[2], '(' ); - - # We're given number(N) where N is sig figs - if ( $idx > -1 ) { - my $type = substr( $_[2], 0, $idx ); - - $typeFunc->{ $_[2] } = \&{$type}; - # if number(2) take "2" + 0 == 2 - $precision->{ $_[2] } = substr( $_[2], $idx + 1, index( $_[2], ')' ) - $idx - 1 ) +0; - } - else { - # We're given just "number", no precision, so use the type itself ($_[2]) - $typeFunc->{ $_[2] } = \&{ $_[2] }; - $precision->{ $_[2] } = -1; - } - } - - return $typeFunc->{ $_[2] }->( $_[1], $precision->{ $_[2] } ) - ; #2nd argument, with $self == $_[0] -} - -# Truncate a number -sub int { - #my ($value, $precision) = @_; - # $_[0], $_[1], - if ( !looks_like_number( $_[0] ) ) { - return $_[0]; - } - - return CORE::int( $_[0] ); -} - -# This is useful, because will convert a string like "1.000000" to an int -# And this will be interpreted in msgpack as an int, rather than a long string -# Similarly, all numbers *should* be storable within 9 bytes (float64), -# whereas if we sprintf, they will be stored as strings -# Will always take the smallest possible value, so will only be stored as float -# if needed -sub number { - #my ($value, $precision) = @_; - # $_[0], $_[1], - if ( !looks_like_number( $_[0] ) ) { - return $_[0]; - } - - #Saves us up to 8 bytes, because otherwise msgpack will store everything - #as a 9 byte double - if ( CORE::int( $_[0] ) == $_[0] ) { - return CORE::int( $_[0] ); - } - - # Add 0 to prevent from being treated as string by serializers - if ( $_[1] > 0 ) { - return 0+ FormatSigFigs( $_[0], $_[1] ); - } 
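- # Worked examples (editor's illustration, assuming Math::SigFigs semantics):
- #   number( 1.23456, 2 )  returns 1.2   (two significant figures kept)
- #   number( '7.000', -1 ) returns 7     (numeric string stored as a true int)
- #   number( 'foo', -1 )   returns 'foo' (non-numeric values pass through)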
-
- # No precision given, just duck-type into a number
- return 0+ $_[0];
-}
-
-#moved away from this; the base build class shouldn't need to know
-#what types are allowed, that info is kept in the various track modules
-#this is a simple-minded way to enforce a bed-only format
-#this should not be used for things with single-field headers
-#like wig or multi-fasta (or fasta)
-# enum BedFieldType => ['chrom', 'chromStart', 'chromEnd'];
-
-no Mouse::Util::TypeConstraints;
-__PACKAGE__->meta->make_immutable;
-
-1;
diff --git a/perl/lib/Seq/Tracks/Build.pm b/perl/lib/Seq/Tracks/Build.pm
deleted file mode 100644
index 15491e667..000000000
--- a/perl/lib/Seq/Tracks/Build.pm
+++ /dev/null
@@ -1,611 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package Seq::Tracks::Build;
-
-our $VERSION = '0.001';
-
-# ABSTRACT: A base class for Tracks::*::Build classes
-# VERSION
-
-use Mouse 2;
-use MouseX::NativeTraits;
-use namespace::autoclean;
-use Scalar::Util qw/looks_like_number/;
-
-use Seq::DBManager;
-use Seq::Tracks::Build::CompletionMeta;
-use Seq::Tracks::Base::Types;
-use Seq::Tracks::Build::LocalFilesPaths;
-use Seq::Output::Delimiters;
-
-# Faster than regex trim
-use String::Strip qw/StripLTSpace/;
-
-extends 'Seq::Tracks::Base';
-# All builders need getReadFh
-with 'Seq::Role::IO';
-
-#################### Instance Variables #######################################
-############################# Public Exports ##################################
-has skipCompletionCheck => ( is => 'ro' );
-
-# Every builder needs access to the database
-# Don't specify types because we do not allow consumers to set this attribute
-has db => (
- is => 'ro',
- init_arg => undef,
- default => sub {
- return Seq::DBManager->new();
- }
-);
-
-# Allows consumers to record track completion, skipping chromosomes that have
-# already been built
-has completionMeta => (
- is => 'ro',
- init_arg => undef,
- default => sub {
- my $self = shift;
- return Seq::Tracks::Build::CompletionMeta->new(
- {
- name => $self->name,
- db => $self->db,
- skipCompletionCheck => $self->skipCompletionCheck
- }
- );
- }
-);
-
-# Transaction size. 
If large, re-use of pages may be inefficient
-# https://github.com/LMDB/lmdb/blob/mdb.master/libraries/liblmdb/lmdb.h
-has commitEvery => ( is => 'rw', isa => 'Int', lazy => 1, default => 2e4 );
-
-# All tracks want to know whether we have 1 chromosome per file or not
-# If this flag is set, then the consumer can choose to skip entire files
-# if an unexpected chr is found, or if the expected chr is recorded completed
-# Change from b9: this now needs to be manually set, opt-in
-has chrPerFile => ( is => 'ro', isa => 'Bool', default => 0 );
-
-has maxThreads => ( is => 'ro', isa => 'Int', lazy => 1, default => 8 );
-########## Arguments taken from YAML config file or passed some other way ##############
-
-#################################### Required ###################################
-has local_files => (
- is => 'ro',
- isa => 'ArrayRef',
- traits => ['Array'],
- handles => {
- noLocalFiles => 'is_empty',
- allLocalFiles => 'elements',
- },
- required => 1,
-);
-
-########################### Optional arguments ################################
-#called based because that's what UCSC calls it
-#most things are 0 based, including anything in bed format from UCSC, fasta files
-has based => ( is => 'ro', isa => 'Int', default => 0, lazy => 1 );
-
-# If a row has a field that doesn't pass this filter, skip it
-has build_row_filters => (
- is => 'ro',
- isa => 'HashRef',
- traits => ['Hash'],
- handles => {
- hasFilter => 'exists',
- allFieldsToFilterOn => 'keys',
- },
- lazy => 1,
- default => sub { {} },
-);
-
-# Transform a field in some way
-has build_field_transformations => (
- is => 'ro',
- isa => 'HashRef',
- traits => ['Hash'],
- handles => {
- hasTransform => 'exists',
- allFieldsToTransform => 'keys',
- },
- lazy => 1,
- default => sub { {} },
-);
-
-# The user can rename any input field, this will be used for the feature name
-# This makes it possible to store any name in the db, output file, in place
-# of the field name in the source file used to make the db
-# if fieldMap isn't specified, this property will be filled with featureName => featureName
-has fieldMap => (
- is => 'ro',
- isa => 'HashRef',
- lazy => 1,
- default => sub {
- my $self = shift;
- my %data = map { $_ => $_ } @{ $self->features };
-
- return \%data;
- }
-);
-
-################################ Constructor ################################
-sub BUILD {
- my $self = shift;
-
- my @allLocalFiles = $self->allLocalFiles;
-
- #exported by Seq::Tracks::Base
- my @allWantedChrs = $self->allWantedChrs;
-
- if ( @allWantedChrs > @allLocalFiles && @allLocalFiles > 1 ) {
- $self->log( "warn",
- "You have specified "
- . scalar @allLocalFiles
- . " files for "
- . $self->name
- . ", but "
- . scalar @allWantedChrs
- . " chromosomes. We will "
- . "assume there is only one chromosome per file, and that some chromosomes are not accounted for."
- );
- }
-
- my $d = Seq::Output::Delimiters->new();
- $self->{_cleanDelims} = $d->cleanDelims;
- $self->{_missChar} = $d->emptyFieldChar;
- # Commit, sync, and remove any databases opened
- # This is useful because locking may occur if there is an open transaction
- # before fork(), and to make sure that any database meta data is properly
- # committed before tracks begin to use that data.
- Seq::DBManager::cleanUp();
-}
-
-# Configure local_files as abs path, and configure required field (*_field_name)
-# *_field_name is a computed attribute that the consumer may choose to implement
-# Example. 
In config: -# required_field_map: -## chrom : Chromosome -# We pass on to classes that extend this: -# chrom_field_name with value "Chromosome" -my $localFilesHandler = Seq::Tracks::Build::LocalFilesPaths->new(); -around BUILDARGS => sub { - my ( $orig, $class, $href ) = @_; - - my %data = %$href; - - if ( !$href->{files_dir} ) { - $class->log( 'fatal', "files_dir required for track builders" ); - } - - $data{local_files} = $localFilesHandler->makeAbsolutePaths( $href->{files_dir}, - $href->{name}, $href->{local_files} ); - - return $class->$orig( \%data ); -}; - -#########################Type Conversion, Input Field Filtering ######################### -#type conversion; try to limit performance impact by avoiding unnec assignments -#@params {String} $_[1] : feature the user wants to check -#@params {String} $_[2] : data for that feature -#@returns {String} : coerced type - -# This is stored in Build.pm because this only needs to happen during insertion into db -state $converter = Seq::Tracks::Base::Types->new(); - -sub coerceFeatureType { - #my ($self, $feature, $data) = @_; - # $self == $_[0] , $feature == $_[1], $data == $_[2] - - my $type = $_[0]->getFeatureType( $_[1] ); - - #### All values sent to coerceFeatureType at least get an undefined check #### - # modifying the value here actually modifies the value in the array - # http://stackoverflow.com/questions/2059817/why-is-perl-foreach-variable-assignment-modifying-the-values-in-the-array - # https://ideone.com/gjWQeS - for my $val ( ref $_[2] ? @{ $_[2] } : $_[2] ) { - if ( !defined $val ) { - next; - } - - if ( !looks_like_number($val) ) { - $_[0]->{_cleanDelims}->($val); - $_[0]->_stripAndCoerceUndef($val); - } - - if ( defined $type && defined $val ) { - $val = $converter->convert( $val, $type ); - } - } - - # In order to allow fields to be well-indexed by ElasticSearch or other engines - # and to normalize delimiters in the output, anything that has a comma - # (or whatever multi_delim set to), return as an array reference - return $_[2]; -} - -sub passesFilter { - state $cachedFilters; - - if ( $cachedFilters->{ $_[1] } ) { - return &{ $cachedFilters->{ $_[1] } }( $_[2] ); - } - - # $_[0], $_[1], $_[2] - my ( $self, $featureName, $featureValue ) = @_; - - my $command = $self->build_row_filters->{$featureName}; - - my ( $infix, $value ) = split( ' ', $command ); - - if ( $infix eq '==' ) { - if ( looks_like_number($value) ) { - $cachedFilters->{$featureName} = sub { - my $fieldValue = shift; - - return $fieldValue == $value; - } - } - else { - $cachedFilters->{$featureName} = sub { - my $fieldValue = shift; - - return $fieldValue eq $value; - } - } - } - elsif ( $infix eq '!=' ) { - if ( looks_like_number($value) ) { - $cachedFilters->{$featureName} = sub { - my $fieldValue = shift; - - return $fieldValue != $value; - } - } - else { - $cachedFilters->{$featureName} = sub { - my $fieldValue = shift; - - return $fieldValue ne $value; - } - } - } - elsif ( $infix eq '>' ) { - $cachedFilters->{$featureName} = sub { - my $fieldValue = shift; - return $fieldValue > $value; - } - } - elsif ( $infix eq '>=' ) { - $cachedFilters->{$featureName} = sub { - my $fieldValue = shift; - return $fieldValue >= $value; - } - } - elsif ( $infix eq '<' ) { - $cachedFilters->{$featureName} = sub { - my $fieldValue = shift; - return $fieldValue < $value; - } - } - elsif ( $infix eq '<=' ) { - $cachedFilters->{$featureName} = sub { - my $fieldValue = shift; - return $fieldValue <= $value; - } - } - else { - $self->log( - 'warn', - "This filter, " - . 
$self->build_row_filters->{$featureName} - . ", uses an operator $infix that isn\'t supported. - Therefore this filter won\'t be run, and all values for $featureName will be allowed" - ); - #allow all - $cachedFilters->{$featureName} = sub { return 1; }; - } - - return &{ $cachedFilters->{$featureName} }($featureValue); -} - -######################### Field Transformations ########################### -#TODO: taint check the modifying value -state $transformOperators = [ '.', 'split', '-', '+', 'replace' ]; - -sub transformField { - state $cachedTransform; - - # $_[0], $_[1], $_[2] - #my ($self, $featureName, $featureValue) = @_; - - if ( defined $cachedTransform->{ $_[0]->name }{ $_[1] } ) { - return &{ $cachedTransform->{ $_[0]->name }{ $_[1] } }( $_[2] ); - } - - # $_[0], $_[1], $_[2] - my ( $self, $featureName, $featureValue ) = @_; - - my $command = $self->build_field_transformations->{$featureName}; - - my $leftHand = substr( $command, 0, index( $command, ' ' ) ); - my $rightHand = substr( $command, index( $command, ' ' ) + 1 ); - - # modifies in place - StripLTSpace($leftHand); - StripLTSpace($rightHand); - - my $codeRef; - - if ( $self->_isTransformOperator($leftHand) ) { - if ( $leftHand eq '.' ) { - $codeRef = sub { - # my $fieldValue = shift; - # same as $_[0]; - - return $_[0] . $rightHand; - } - } - elsif ( $leftHand eq '-' ) { - $codeRef = sub { - # my $fieldValue = shift; - # same as $_[0]; - - return $_[0] - $rightHand; - } - } - elsif ( $leftHand eq '+' ) { - $codeRef = sub { - # my $fieldValue = shift; - # same as $_[0]; - - return $_[0] + $rightHand; - } - } - elsif ( $leftHand eq 'split' ) { - $codeRef = sub { - # my $fieldValue = shift; - # same as $_[0]; - my @out; - - # if trailing ,; or whichever specified delimiter - # remove so that no trailing undef value remains - $_[0] =~ s/\s*$rightHand\s*$//; - - # Some fields may contain no data after the delimiter, - # which will lead to blank data, don't keep that - # TODO: skipping empty fields is dangerous; may lead to data that is - # ordered to fall out of order - # evalute the choice on line 349 - foreach ( split( /$rightHand/, $_[0] ) ) { - # Remove trailing/leading whitespace - $_ =~ s/^\s+//; - $_ =~ s/\s+$//; - - if ( defined $_ && $_ ne '' ) { - push @out, $_; - } - } - - return @out == 1 ? $out[0] : \@out; - } - } - elsif ( $leftHand eq 'replace' ) { - if ( substr( $rightHand, 0, 1 ) ne '/' || substr( $rightHand, -1, 1 ) ne '/' ) { - $self->log( 'fatal', - $self->name - . ": build_field_transformation 'replace' expects /from/to/, found $rightHand" ); - } - - my @parts = split '/', $rightHand; - - my $from = $parts[1]; - my $to = $parts[2]; - - $codeRef = sub { - # my $fieldValue = shift; - # $_[0] - $_[0] =~ s/$from/$to/gs; - - return $_[0]; - } - } - } - elsif ( $self->_isTransformOperator($rightHand) ) { - # Append text in the other direction - if ( $rightHand eq '.' ) { - $codeRef = sub { - # my $fieldValue = shift; - # same as $_[0]; - return $leftHand . $_[0]; - } - } - - # Don't allow +/- as right hand operator, pointless and a little silly - } - - if ( !defined $codeRef ) { - $self->log( 'warn', - "Requested transformation, $command, for $featureName, not understood" ); - return $featureValue; - } - - $cachedTransform->{ $self->name }{$featureName} = $codeRef; - - return &{$codeRef}($featureValue); -} - -# Merge [featuresOld...] with [featuresNew...] -# Expects 2 arrays of equal length -# //Won't merge when [featuresNew...] 
previously merged (duplicate) -# Note: it is completely unsafe to dbReadOne and not commit here -# If the user relies on a non DB->Txn transactions, LMDB_File will complain -# that the transaction should be a sub-transaction -# may screw up the parent transaction, since we currently use a single -# transaction per database per thread. We should move away from this. -# TODO: allow 2 scalars, or growing one array to match the lenght of the other -# TODO: dupsort, dupfixed to optimize storage -# TODO: get reliable de-duping algorithm of deep structures -sub makeMergeFunc { - my $self = shift; - - my $name = $self->name; - my $madeIntoArray = {}; - - my $tempDbName = "$name\_merge_temp"; - return ( - sub { - my ( $chr, $pos, $oldTrackVal, $newTrackVal ) = @_; - - if ( !ref $newTrackVal || @$newTrackVal != @$oldTrackVal ) { - return ( "makeMergeFunc accepts only array values of equal length", undef ); - } - - # commits automatically, so that we are ensured that overlaps - # called from different threads succeed - my $seen = $self->db->dbReadOne( "$tempDbName/$chr", $pos ); - - my @updated; - $#updated = $#$oldTrackVal; - - # oldTrackVal and $newTrackVal should both be arrays, with at least one index - for ( my $i = 0; $i < @$newTrackVal; $i++ ) { - if ( !$seen ) { - $updated[$i] = [ $oldTrackVal->[$i], $newTrackVal->[$i] ]; - next; - } - - $updated[$i] = [ @{ $oldTrackVal->[$i] }, $newTrackVal->[$i] ]; - } - - if ( !$seen ) { - # commits automatically, so that we are ensured that overlaps - # called from different threads succeed - $self->db->dbPut( "$tempDbName/$chr", $pos, 1 ); - } - - return ( undef, \@updated ); - }, - - sub { - my $chr = shift; - $self->db->dbDropDatabase( "$tempDbName/$chr", 1 ); - - say STDERR "Cleaned up $tempDbName/$chr"; - } - ); -} - -# TODO: Allow to be configured on per-track basis -sub _stripAndCoerceUndef { - #my ($self, $dataStr) = @_; - - # TODO: This will be configurable, per-track - state $cl = { - 'no assertion provided' => 1, - 'no_assertion_provided' => 1, - 'no assertion criteria provided' => 1, - 'no_assertion_criteria_provided' => 1, - 'no interpretation for the single variant' => 1, - 'no assertion for the individual variant' => 1, - 'no_assertion_for_the_individual_variant' => 1, - 'not provided' => 1, - 'not_provided' => 1, - 'not specified' => 1, - 'not_specified' => 1, - 'see cases' => 1, - 'see_cases' => 1, - 'unknown' => 1 - }; - - # STripLTSpace modifies passed string by stripping space from it - # This modifies the caller's version - StripLTSpace( $_[1] ); - - if ( $_[1] eq '' ) { - $_[1] = undef; - return $_[1]; - } - - my $v = lc( $_[1] ); - - # These will always get coerced to undef - # TODO: we may want to force only the missChar comparison - if ( $v eq '.' 
|| $v eq 'na' || $v eq $_[0]->{_missChar} ) { - $_[1] = undef; - return $_[1]; - } - - # This will be configurable - if ( exists $cl->{$v} ) { - $_[1] = undef; - return $_[1]; - } - - return $_[1]; -} - -sub chrWantedAndIncomplete { - my ( $self, $chr ) = @_; - - # Allow users to pass 0 as a valid chromosome, in case coding is odder than we expect - if ( !defined $chr || ( !$chr && "$chr" eq '' ) ) { - return undef; - } - - if ( $self->chrIsWanted($chr) && $self->completionMeta->okToBuild($chr) ) { - return $chr; - } - - return undef; -} - -sub safeCloseBuilderFh { - my ( $self, $fh, $fileName, $errCode, $strict ) = @_; - - if ( !$errCode ) { - $errCode = 'fatal'; - } - - #From Seq::Role::IO - my $err = $self->safeClose($fh); - - if ($err) { - #Can happen when closing immediately after opening - if ( $? != 13 ) { - $self->log( $errCode, $self->name . ": Failed to close $fileName: $err ($?)" ); - return $err; - } - - # We make a choice to ignored exit code 13... it happens a lot - # 13 is sigpipe, occurs if closing pipe before cat/pigz finishes - $self->log( 'warn', $self->name . ": Failed to close $fileName: $err ($?)" ); - - # Make it optional to return a sigpipe error, since controlling - # program likely wants to die on error, and sigpipe may not be worth it - if ($strict) { - return $err; - } - - return; - } - - $self->log( 'info', $self->name . ": closed $fileName with $?" ); - return; -} - -sub _isTransformOperator { - my ( $self, $value ) = @_; - - for my $operator (@$transformOperators) { - if ( index( $value, $operator ) > -1 ) { - return 1; - } - } - return 0; -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Build/CompletionMeta.pm b/perl/lib/Seq/Tracks/Build/CompletionMeta.pm deleted file mode 100644 index fb59efaeb..000000000 --- a/perl/lib/Seq/Tracks/Build/CompletionMeta.pm +++ /dev/null @@ -1,92 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Build::CompletionMeta; - -# Keeps track of track build completion -# TODO: better error handling, not sure how w/ present LMDB API without perf loss -use Mouse 2; -use namespace::autoclean; - -with 'Seq::Role::Message'; - -has name => ( is => 'ro', isa => 'Str', required => 1 ); -has db => ( is => 'ro', isa => 'Seq::DBManager', required => 1 ); -has skipCompletionCheck => ( is => 'ro', isa => 'Bool' ); - -############################ Private attributes ######################## -# Instance variable holding completion status for this $self->name db -has _completed => ( is => 'ro', init_arg => undef, default => sub { {} } ); - -state $metaKey = 'completed'; -###################### Public Methods ###################### -sub okToBuild { - my ( $self, $chr ) = @_; - - if ( $self->_isCompleted($chr) ) { - if ( !$self->skipCompletionCheck ) { - $self->log( 'debug', "$chr recorded completed for " . $self->name . ". Skipping" ); - return 0; - } - } - - return 1; -} - -sub recordCompletion { - my ( $self, $chr ) = @_; - - # overwrite any existing entry for $chr - # dbPatchMeta syncs the meta db automatically - my $err = $self->db->dbPatchMeta( $self->name, $metaKey, { $chr => 1 }, 1 ); - - if ($err) { - $self->log( 'fatal', $err ); - return; - } - - $self->_completed->{$chr} = 1; - - $self->log( 'debug', - "Recorded completion of $chr (set to 1) for " . $self->name . 
" db" ); -} - -########################### Private Methods ############################ -sub _eraseCompletionMeta { - my ( $self, $chr ) = @_; - - # Overwrite any existing entry for $chr - my $err = $self->db->dbPatchMeta( $self->name, $metaKey, { $chr => 0 }, 1 ); - - if ($err) { - return $self->log( 'fatal', $err ); - } - - $self->_completed->{$chr} = 0; - - $self->log( 'debug', - "Erased completion of $chr (set to 0) for " . $self->name . " db" ); -} - -sub _isCompleted { - my ( $self, $chr ) = @_; - - if ( defined $self->_completed->{$chr} ) { - return $self->_completed->{$chr}; - } - - my $allCompleted = $self->db->dbReadMeta( $self->name, $metaKey ); - - if ( $allCompleted && defined $allCompleted->{$chr} && $allCompleted->{$chr} == 1 ) { - $self->_completed->{$chr} = 1; - } - else { - $self->_completed->{$chr} = 0; - } - - return $self->_completed->{$chr}; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Build/LocalFilesPaths.pm b/perl/lib/Seq/Tracks/Build/LocalFilesPaths.pm deleted file mode 100644 index 0d24d298b..000000000 --- a/perl/lib/Seq/Tracks/Build/LocalFilesPaths.pm +++ /dev/null @@ -1,31 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Build::LocalFilesPaths; - -use Mouse 2; - -use Path::Tiny qw/path/; -use File::Glob ':bsd_glob'; - -sub makeAbsolutePaths { - my ( $self, $filesDir, $trackName, $localFilesAref ) = @_; - - my @localFiles; - for my $localFile (@$localFilesAref) { - if ( path($localFile)->is_absolute ) { - push @localFiles, bsd_glob($localFile); - next; - } - - push @localFiles, - bsd_glob( - path($filesDir)->child($trackName)->child($localFile)->absolute->stringify ); - } - - return \@localFiles; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Cadd.pm b/perl/lib/Seq/Tracks/Cadd.pm deleted file mode 100644 index 77d0b7d91..000000000 --- a/perl/lib/Seq/Tracks/Cadd.pm +++ /dev/null @@ -1,98 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -# TODO: refactor to allow mutliple alleles, and multiple posiitions -package Seq::Tracks::Cadd; - -#A track whose features are only reported if they match the minor allele -#present in the sample -#Called cadd because at the time of writing it's the -use Mouse 2; -use namespace::autoclean; -use Seq::Tracks::Cadd::Order; - -# This esesntially is a score track, just needs to lookup the value in the array -extends 'Seq::Tracks::Get'; - -has scalingFactor => ( is => 'ro', isa => 'Int', default => 10 ); - -sub BUILD { - my $self = shift; - - # purely to save accessor time - $self->{_s} = $self->scalingFactor; - - #Provided by Seq::Tracks::Get - #$self->{_dbName} = $self->dbName; -} - -state $order = Seq::Tracks::Cadd::Order->new(); -$order = $order->order; - -sub get { - #my ($self, $href, $chr, $refBase, $allele, $outAccum, $alleleNumber) = @_ - # $_[0] == $self - # $_[1] == $href : the database data, with each top-level index corresponding to a track - # $_[2] == $chr : the chromosome - # $_[3] == $refBase : ACTG - # $_[4] == $allele : the allele (ACTG or - / +ACTG) - # $_[5] == $positionIdx : the position in the indel, if any - # $_[6] == $outAccum : a reference to the output, which we mutate - - # We may have stored an empty array at this position, in case - # the CADD scores read were not guaranteed to be sorted - # Alternatively the CADD data for this position may be missing (not defined) - # It's slightly faster to check for truthiness, rather than definition - # Since we always either store undef or an array, truthiness is sufficient - - # 
Similar to VCF tracks, we only return exact matches - # So we do not tile across the entire deleted / inserted region - # We return only the first result - # In the case of a SNP there is only 1 result - # In the case of an indel, we return undef only once - if ( $_[5] > 0 ) { - return $_[6]; - } - - if ( !defined $_[1]->[ $_[0]->{_dbName} ] ) { - $_[6][ $_[5] ] = undef; - - return $_[6]; - } - - # For indels, we return undef - # Bystro always left-normalizes alleles, so the ref is always length 1 - # and alleles are always longer than 1 for indels - # In the case of deletions, it is - - # In the case of insertions, it is + - if ( length( $_[4] ) > 1 ) { - $_[6][ $_[5] ] = undef; - - return $_[6]; - } - - if ( !defined $order->{ $_[3] } ) { - $_[0]->log( 'warn', "reference base $_[3] doesn't look valid, in Cadd.pm" ); - - $_[6][ $_[5] ] = undef; - - return $_[6]; - } - - #if (defined $order->{ $refBase }{ $altAlleles } ) { - if ( defined $order->{ $_[3] }{ $_[4] } ) { - $_[6][ $_[5] ] = - $_[1]->[ $_[0]->{_dbName} ][ $order->{ $_[3] }{ $_[4] } ] / $_[0]->{_s}; - - return $_[6]; - } - - # We match on exact allele - $_[6][ $_[5] ] = undef; - - return $_[6]; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Cadd/Build.pm b/perl/lib/Seq/Tracks/Cadd/Build.pm deleted file mode 100644 index aacd89db2..000000000 --- a/perl/lib/Seq/Tracks/Cadd/Build.pm +++ /dev/null @@ -1,625 +0,0 @@ -use 5.10.0; -use strict; -use warnings; -# Adds cadd data to our main database -# Reads CADD's bed-like format -package Seq::Tracks::Cadd::Build; - -use Mouse 2; -extends 'Seq::Tracks::Build'; - -use Seq::Tracks::Cadd::Order; -use Seq::Tracks::Score::Build::Round; -use Seq::Tracks; - -use Scalar::Util qw/looks_like_number/; - -# Cadd tracks seem to be 1 based (not well documented) -has '+based' => ( default => 1 ); - -# CADD files may not be sorted, -has sorted => ( is => 'ro', isa => 'Bool', lazy => 1, default => 0 ); - -has scalingFactor => ( is => 'ro', isa => 'Int', default => 10 ); - -my $order = Seq::Tracks::Cadd::Order->new(); -$order = $order->order; - -my $refTrack; - -sub BUILD { - my $self = shift; - - $self->{_rounder} = - Seq::Tracks::Score::Build::Round->new( { scalingFactor => $self->scalingFactor } ); -} -############## Version that does not assume positions in order ################ -############## Will optimize for cases when sorted_guranteed truthy ########### -# TODO: refactor so that one function handles both main build, and the tail end -sub buildTrack { - my $self = shift; - - # TODO: Remove side effects, or think about another initialization method - # Unfortunately, it is better to call track getters here - # Because builders may have side effects, like updating - # the meta database - # So we want to call builders BUILD methods first - my $tracks = Seq::Tracks->new(); - $refTrack = $tracks->getRefTrackGetter(); - - my $pm = Parallel::ForkManager->new( $self->maxThreads ); - - ######Record completion status only if the process completed unimpeded ######## - my %completedChrs; - $pm->run_on_finish( - sub { - my ( $pid, $exitCode, $fileName, $exitSignal, $coreDump, $errOrChrs ) = @_; - - if ( $exitCode != 0 ) { - my $err = $self->name - . ": got exitCode $exitCode for $fileName: $exitSignal . 
Dump: $coreDump"; - - $self->log( 'fatal', $err ); - } - - if ( $errOrChrs && ref $errOrChrs eq 'HASH' ) { - for my $chr ( keys %$errOrChrs ) { - if ( !$completedChrs{$chr} ) { - $completedChrs{$chr} = [$fileName]; - } - else { - push @{ $completedChrs{$chr} }, $fileName; - } - } - } - - #Only message that is different, in that we don't pass the $fileName - $self->log( 'info', $self->name . ": completed building from $fileName" ); - } - ); - - #Perl is dramatically faster when splitting on a constant, so we assume '\t' - if ( $self->delimiter ne '\t' && $self->delimiter ne "\t" ) { - $self->log( "fatal", $self->name . ": requires delimiter to be \\t" ); - } - - my $missingValue = undef; - - # If we cannot rely on the cadd sorting order, we must use a defined - # value for those bases that we skip, because we'll ned mergeFunc - # to know when data was found for a position, and when it is truly missing - # Because when CADD scores are not sorted, each chromosome-containing file - # can potentially have any other chromosome's scores, meaning we may get - # 6-mers or greater for a single position; when that happens the only - # sensible solution is to store a missing value; undef would be nice, - # but that will never get triggered, unless our database is configured to store - # hashes instead of arrays; since a sparse array will contain undef/nil - # for any track at that position that has not yet been inserted into the db - # For now we require sorting to be guaranteed to simplify this code - if ( !$self->sorted ) { - $self->log( "fatal", $self->name . ": requires sorted to be true" ); - } - - for my $file ( @{ $self->local_files } ) { - $self->log( 'info', $self->name . ": beginning building from $file" ); - - # Although this should be unnecessary, environments must be created - # within the process that uses them - # This provides a measure of safety - $self->db->cleanUp(); - - $pm->start($file) and next; - my ( $err, undef, $fh ) = $self->getReadFh($file); - - if ($err) { - $self->log( 'fatal', $self->name . ": $err" ); - } - - my $versionLine = <$fh>; - - if ( !$versionLine ) { - $self->log( 'fatal', $self->name . ": couldn't read version line of $file" ); - } - - chomp $versionLine; - - $self->log( 'debug', $self->name . ": read version line of $file: $versionLine" ); - - if ( index( $versionLine, '## CADD' ) == -1 ) { - $self->log( 'fatal', - $self->name . ": first line of $file is not CADD formatted: $_" ); - } - - # Cadd's columns descriptor is on the 2nd line - my $headerLine = <$fh>; - - if ( !$headerLine ) { - $self->log( 'fatal', $self->name . ": couldn't read header line of $file" ); - } - - chomp $headerLine; - - $self->log( 'debug', $self->name . ": read header of $file: $headerLine" ); - - # We may have converted the CADD file to a BED-like format, which has - # chrom chromStart chromEnd instead of #Chrom Pos - # and which is 0-based instead of 1 based - # Moving $phastIdx to the last column - my @headerFields = split '\t', $headerLine; - - # Get the last index, that's where the phast column lives https://ideone.com/zgtKuf - # Can be 5th or 6th column idx. 5th for CADD file, 6th for BED-like file - my $phastIdx = $#headerFields; - - my $altAlleleIdx = $#headerFields - 2; - my $refBaseIdx = $#headerFields - 3; - - my $based = $self->based; - my $isBed; - - if ( @headerFields == 7 ) { - # It's the bed-like format - $based = 0; - $isBed = 1; - } - - $self->log( 'debug', - $self->name - . ": input file is " - . ( $isBed ? 
"a bed-like file" : "a CADD (non-bed) file" ) ); - $self->log( 'debug', $self->name . ": input file is $based\-based" ); - - # We accumulate information about why a record is bad - my %skipSites; - - # Track which fields we recorded, to record in $self->completionMeta - my %visitedChrs = (); - - # We keep track of posiitonal changes, to know when we're done accumulating scores - # (provided that sorting is guaranteed) - my $lastPosition; - - # the reference may not match up, report how many of these positions exist - # this is very expected for lifted over files - my $changedRefPositions = 0; - my $multiRefPositions = 0; - my $multiScorePositions = 0; - my $nonACTGrefPositions = 0; - my $nonACTGaltPositions = 0; - my $missingScorePositions = 0; - - # File does not need to be sorted by chromosome, but each position - # must be in a block of 3 (one for each possible allele) - my @fields; - - my ( $chr, $wantedChr, $dbPosition ); - my ( @caddData, $caddRef, $dbData, $assemblyRefBase, $altAllele, $refBase, - $phredScoresAref ); - - # Manage our own cursors, to improve performance - my $cursor; - my $count = 0; - FH_LOOP: while ( my $line = $fh->getline() ) { - chomp $line; - - @fields = split '\t', $line; - - # If this is a regular CADD file, it will not have "chr" prepended - # Else it "should" be found in the beginning of the string - # If not, it will be caught in our if( $self->chrIsWanted($chr) ) check - # http://ideone.com/Y5PiUa - - # Normalizes the $chr representation to one we may want but did not specify - # Example: 1 becomes chr1, and is checked against our list of wanted chromosomes - # Avoids having to use a field transformation, since this may be very common - # and Bystro typical use is with UCSC-style chromosomes - # If the chromosome isn't wanted, $chr will be undefined - $chr = $self->normalizedWantedChr->{ $fields[0] }; - - #If the chromosome is new, write any data we have & see if we want new one - if ( !defined $wantedChr || ( !defined $chr || $wantedChr ne $chr ) ) { - # We switched chromosomes - if ( defined $wantedChr ) { - #Clean up the database, commit & close any cursors, free memory; - $self->db->cleanUp(); - undef $cursor; - - #Reset our transaction counter - $count = 0; - - if ( @caddData || defined $caddRef ) { - my $err = $self->name . 
": changed chromosomes, but unwritten data remained"; - - $self->log( 'fatal', $err ); - } - } - - # Completion meta checks to see whether this track is already recorded - # as complete for the chromosome, for this track - $wantedChr = $self->chrWantedAndIncomplete($chr); - undef @caddData; - undef $caddRef; - } - - # We expect either one chr per file, or all in one file - # However, chr-split CADD files may have multiple chromosomes after liftover - # TODO: rethink chrPerFile handling - if ( !defined $wantedChr ) { - next FH_LOOP; - } - - ### Record that we visited the chr, to enable recordCompletion later ### - # //= is equivalent to checking for defined before assigning - $visitedChrs{$wantedChr} //= 1; - - # CADD uses a number of IUPAC codes for multi reference sites, skip these - if ( $fields[$refBaseIdx] ne 'A' - && $fields[$refBaseIdx] ne 'C' - && $fields[$refBaseIdx] ne 'G' - && $fields[$refBaseIdx] ne 'T' ) - { - $nonACTGrefPositions++; - next FH_LOOP; - } - - $dbPosition = $fields[1] - $based; - - ######## If we've changed position, we should have a 3 mer ######## - ####################### If so, write that ############################## - ####################### If not, wait until we do ####################### - # Note, each call to dbPatch has a boolean !!($count >= $self->commitEvery) - # because we delay commits to increase performance - if ( defined $lastPosition && $lastPosition != $dbPosition ) { - if ( defined $skipSites{"$wantedChr\_$lastPosition"} ) { - #use debug because this logs hundreds of MB of data for lifted over hg38, and never seen a mistake - #can have billions of messages, avoid string copy by checking if would log/print anything - #From Seq::Role::Message - if ( $self->hasDebugLevel ) { - $self->log( 'debug', - $self->name - . ": $wantedChr\:$lastPosition: " - . $skipSites{"$wantedChr\_$lastPosition"} - . ". Skipping" ); - } - - if ( @caddData || $caddRef ) { - my $err = - $self->name . ": skipSites and score accumulation should be mutually exclusive"; - $self->log( 'fatal', $err ); - } - - #Can delete because order guaranteed - delete $skipSites{"$wantedChr\_$lastPosition"}; - # There is nothing to write in this case since sorting is guaranteed - } - elsif ( !@caddData ) { - # Could occur if we skipped the lastPosition because refBase didn't match - # assemblyRefBase - $self->log( 'warn', - $self->name . ": $wantedChr\:$lastPosition: No scores or warnings accumulated." ); - undef $caddRef; - } - else { - $cursor //= $self->db->dbStartCursorTxn($wantedChr); - - ########### Check refBase against the assembly's reference ############# - # We read using our cursor; since in LMDB, cursors are isolated - # and therefore don't want to use our helper dbRead class, as inconsistencies may arise - $dbData = $self->db->dbReadOneCursorUnsafe( $cursor, $lastPosition ); - $assemblyRefBase = $refTrack->get($dbData); - - if ( !defined $assemblyRefBase ) { - my $err = $self->name . 
": no assembly ref base found for $wantedChr:$lastPosition"; - $self->log( 'fatal', $err ); - } - - # When lifted over, reference base is not lifted, can cause mismatch - # In these cases it makes no sense to store this position's CADD data - if ( $assemblyRefBase ne $caddRef ) { - # Don't log, in hg38 case there will be much of the genome logged - $changedRefPositions++; - - #As long as sorting is guaranteed, there is no reason to write - #anything in these cases - undef @caddData; - undef $caddRef; - } - else { - $phredScoresAref = - $self->_accumulateScores( $wantedChr, \@caddData, $caddRef, $lastPosition ); - - if ( !defined $phredScoresAref ) { - # Sorted guaranteed, but no score found - # This can actually happen as a result of liftover - # chr22 20668231 has 6 scores, because liftOver mapped 2 different positions - # to 20668231, when lifting over from hg19 - if ( $self->hasDebugLevel ) { - $self->log( 'debug', - $self->name - . ": $wantedChr\:$lastPosition: Instead of 3 scores got: " - . ( @caddData || 0 ) - . ". Skipping" ); - } - - if ( @caddData > 3 ) { - $multiScorePositions++; - } - else { - my $err = $self->name - . ": $wantedChr\:$lastPosition: Didn't accumulate 3 phredScores, and not because > 3 scores, which should be impossible."; - $self->log( 'fatal', $err ); - die $err; - } - - #Since sorting is guaranteed, there is nothing to write here - } - else { - #Args: $cursor $chr, $trackIndex, $pos, $trackValue - $self->db->dbPatchCursorUnsafe( $cursor, $wantedChr, $self->dbName, $lastPosition, - $phredScoresAref ); - - if ( $count > $self->commitEvery ) { - $self->db->dbEndCursorTxn($wantedChr); - undef $cursor; - - $count = 0; - } - - $count++; - - undef $phredScoresAref; - } - - undef @caddData; - undef $caddRef; - } - } - } - - ##### Build up the scores into 3-mer (or 4-mer if ambiguous base) ##### - - # This site will be next in 1 iteration - $lastPosition = $dbPosition; - - if ( defined $skipSites{"$wantedChr\_$lastPosition"} ) { - next; - } - - $altAllele = $fields[$altAlleleIdx]; - $refBase = $fields[$refBaseIdx]; - - if ( $altAllele ne 'A' - && $altAllele ne 'C' - && $altAllele ne 'G' - && $altAllele ne 'T' ) - { - $skipSites{"$wantedChr\_$lastPosition"} = "non_actg_alt"; - $nonACTGaltPositions++; - - # No need to keep this in memory, since we never will use this value - undef @caddData; - undef $caddRef; - next; - } - - if ( $dbPosition < 0 ) { - my $err = $self->name . ": found dbPosition < 0: $line. 
This is impossible."; - $self->log( 'fatal', $err ); - die $err; - } - - #if !defined $caddRef assign caddRef - $caddRef //= $refBase; - - # If we find a position that has multiple bases, that is undefined behavior - # so we will store a nil (undef on perl side, nil in msgpack) for cadd at that position - # This can happen as result of liftover - # This is NOT the same thing as the multiple base call that CADD sometimes - # uses for the reference (e.g and "M", or "R") - if ( $caddRef ne $refBase ) { - # Mark for $missingValue insertion - $skipSites{"$wantedChr\_$lastPosition"} = "multi_ref"; - $multiRefPositions++; - - # No need to keep this in memory, since we never will use this value - undef @caddData; - undef $caddRef; - next; - } - - # If no phastIdx found for this site, there cannot be 3 scores accumulated - # so mark it as for skipping; important because when out of order - # we may have cryptic 3-mers, which we don't want to insert - if ( !defined $fields[$phastIdx] || !looks_like_number( $fields[$phastIdx] ) ) { - # Mark for undef insertion - $skipSites{"$wantedChr\_$lastPosition"} = "missing_score"; - $missingScorePositions++; - - # No need to keep this in memory, since we never will use this value - undef @caddData; - undef $caddRef; - next; - } - - push @caddData, [ $altAllele, $self->{_rounder}->round( $fields[$phastIdx] ) ]; - } - - ######################### Finished reading file ########################## - ######### Collect any scores that were accumulated out of order ########## - if (@caddData) { - if ( !( defined $wantedChr && defined $lastPosition && defined $caddRef ) ) { - my $err = $self->name - . ": at end of file, if have cadd data, expect lastPosition, wantedChr, and cadRef"; - $self->log( 'fatal', $err ); - die $err; - } - - if ( defined $cursor ) { - $self->db->dbEndCursorTxn($wantedChr); - undef $cursor; - } - - if ( defined $skipSites{"$wantedChr\_$lastPosition"} ) { - $self->log( 'debug', - $self->name - . ": $wantedChr\:$lastPosition: " - . $skipSites{"$chr\_$lastPosition"} - . ". Skipping." ); - - # always safe to delete here; last time we'll check it - delete $skipSites{"$wantedChr\_$lastPosition"}; - #Since sorting is guaranteed, no need to write anything - } - else { - $dbData = $self->db->dbReadOne( $wantedChr, $lastPosition ); - $assemblyRefBase = $refTrack->get($dbData); - - if ( $assemblyRefBase ne $caddRef ) { - $changedRefPositions++; - - $self->log( 'debug', - $self->name . ": $wantedChr\:$lastPosition: Ref doesn't match. Skipping." ); - #Since sorting is guaranteed, no need to write anything - } - else { - $phredScoresAref = - $self->_accumulateScores( $wantedChr, \@caddData, $caddRef, $lastPosition ); - - # We want to keep missing values consistent - # Because when sorting not guaranteed, we may want non-nil/undef - # values to prevent cryptic 3-mers - if ( !defined $phredScoresAref ) { - $self->log( 'debug', - $self->name - . ": $wantedChr\:$lastPosition: Instead of 3 scores got: " - . ( @caddData || 0 ) - . ". Skipping" ); - - if ( @caddData > 3 ) { - $multiScorePositions++; - } - else { - my $err = $self->name - . 
": $wantedChr\:$lastPosition: Didn't accumulate 3 phredScores, and not because > 3 scores, which should be impossible."; - $self->log( 'fatal', $err ); - } - } - else { - #We commit here, because we don't expect any more scores - $self->db->dbPatch( $wantedChr, $self->dbName, $lastPosition, $phredScoresAref ); - } - } - } - } - - undef @caddData; - undef $lastPosition; - undef $wantedChr; - undef $phredScoresAref; - - #Commit any remaining transactions, commit & close cursors, sync all environments, free memory - $self->db->cleanUp(); - undef $cursor; - - $self->safeCloseBuilderFh( $fh, $file, 'fatal' ); - - if ( $changedRefPositions > 0 ) { - $self->log( 'warn', - $self->name - . ": skipped $changedRefPositions positions because CADD Ref didn't match ours: in $file." - ); - } - - if ( $multiRefPositions > 0 ) { - $self->log( 'warn', - $self->name - . ": skipped $multiRefPositions positions because CADD Ref had multiple Ref at that position: in $file." - ); - } - - if ( $multiScorePositions > 0 ) { - $self->log( 'warn', - $self->name - . ": skipped $multiScorePositions positions because found multiple scores: in $file" - ); - } - - if ( $nonACTGrefPositions > 0 ) { - $self->log( 'warn', - $self->name - . ": skipped $nonACTGrefPositions positions because found non-ACTG CADD Ref: in $file" - ); - } - - if ( $nonACTGaltPositions > 0 ) { - $self->log( 'warn', - $self->name - . ": skipped $nonACTGaltPositions positions because found non-ACTG CADD Alt: in $file" - ); - } - - if ( $missingScorePositions > 0 ) { - $self->log( 'warn', - $self->name - . ": skipped $missingScorePositions positions because has missing Phred scores: in $file" - ); - } - - $pm->finish( 0, \%visitedChrs ); - } - - $pm->wait_all_children(); - - # Now, regardless of whether chromosomes were sorted, or spread across many files - # we can record true completion state - for my $chr ( keys %completedChrs ) { - $self->completionMeta->recordCompletion($chr); - - $self->log( 'info', - $self->name - . ": recorded $chr completed, from " - . ( join( ",", @{ $completedChrs{$chr} } ) ) ); - } - - #TODO: figure out why this is necessary, even with DEMOLISH - $self->db->cleanUp(); - - #TODO: Implement actual error return codes instead of dying - return; -} - -sub _accumulateScores { - #my ($self, $chr, $dataAref, $caddRef, $lastPosition) = @_; - # $_[0] , $_[1], $_[2], $_[3], $_[4] - - # (@{$dataAref} != 3) - if ( @{ $_[2] } != 3 ) { - # May be called before 3 scores accumulated - return undef; - } - - # Found 3 scores - # Make sure we place them in the correct order - my @phredScores; - my $index; - # ( @{$dataAref} ) - for my $aref ( @{ $_[2] } ) { - #In the aref, first position is the altAllele, 2nd is the phred score - # {$caddRef} - $index = $order->{ $_[3] }{ $aref->[0] }; - - # checks whether ref and alt allele are ACTG - if ( !defined $index ) { - my $err = $_[0]->name . 
": $_[1]\:$_[4]: no score possible for altAllele $aref->[0]"; - # $self->log - $_[0]->log( 'fatal', $err ); - } - - $phredScores[$index] = $aref->[1]; - } - - return \@phredScores; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Cadd/Order.pm b/perl/lib/Seq/Tracks/Cadd/Order.pm deleted file mode 100644 index dbb4d75f5..000000000 --- a/perl/lib/Seq/Tracks/Cadd/Order.pm +++ /dev/null @@ -1,40 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Cadd::Order; -use Mouse 2; - -state $order = { - A => { - C => 0, - G => 1, - T => 2, - }, - C => { - A => 0, - G => 1, - T => 2, - }, - G => { - A => 0, - C => 1, - T => 2, - }, - T => { - A => 0, - C => 1, - G => 2, - }, - N => { - A => 0, - C => 1, - G => 2, - T => 3, - } -}; - -has order => ( is => 'ro', init_arg => undef, default => sub { $order } ); - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Gene.pm b/perl/lib/Seq/Tracks/Gene.pm deleted file mode 100644 index e877cb0cb..000000000 --- a/perl/lib/Seq/Tracks/Gene.pm +++ /dev/null @@ -1,485 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Gene; - -our $VERSION = '0.001'; - -=head1 DESCRIPTION - - @class B - - Note: unlike previous Bystro, there is no longer a genomic_type - Just a siteType, which is Intronic, Coding, 5UTR, etc - This is just like Annovar's - We add "Intergenic" if not covered by any gene - This class not longer handles intergenic sites - -=cut - -use Mouse 2; - -use namespace::autoclean; - -use Seq::Tracks::Gene::Site; -use Seq::Tracks::Gene::Site::SiteTypeMap; -use Seq::Tracks::Gene::Site::CodonMap; -use Seq::Tracks::Gene::Definition; -use Seq::DBManager; -use Seq::Headers; - -# Doesn't extend Seq::Tracks::Get to reduce inheritance depth, since most -# of that class is overriden anyhow (leaving only the headers property inheritance -# which isn't necessary since Seq::Headers is a singleton class) -extends 'Seq::Tracks::Base'; - -my $geneDef = Seq::Tracks::Gene::Definition->new(); - -### Set the features default that we get from the Gene track region database ### -has '+features' => ( - default => sub { - return [ @{ $geneDef->ucscGeneAref }, $geneDef->txErrorName ]; - }, -); - -with 'Seq::Tracks::Region::RegionTrackPath'; - -########### @Public attributes########## -########### Additional "features" that we will add to our output ############## -### Users may configure these; they are not stored, but computed #### - -# These are features defined by Gene::Site, but we name them in Seq::Tracks::Gene -# Because it gets really confusing to track down the features defined in Seq::Tracks::Gene::Site -# TODO: rename these siteTypeField to match the interface used by Seq.pm (TODO: and Seq::Tracks::Sparse::Build) -has siteTypeKey => ( is => 'ro', default => 'siteType' ); -has strandKey => ( is => 'ro', default => 'strand' ); -has codonNumberKey => ( is => 'ro', default => 'codonNumber' ); -has codonPositionKey => ( is => 'ro', default => 'codonPosition' ); - -has codonSequenceKey => ( is => 'ro', default => 'refCodon' ); -has refAminoAcidKey => ( is => 'ro', default => 'refAminoAcid' ); -has newCodonKey => ( is => 'ro', default => 'altCodon' ); -has newAminoAcidKey => ( is => 'ro', default => 'altAminoAcid' ); -has exonicAlleleFunctionKey => ( is => 'ro', default => 'exonicAlleleFunction' ); - -# This is actually a stored value, added automatically -# has txErrorKey => (is => 'ro', default => 'txError'); - -# This is just the index corresponding to the transcript in the region db -# Not 
a real feature; we need to ask for "txNumberKey" -# In a future API, it may be worthwhile just storing a 'txNumber' -# property in the region database, to allow a user to just write 'txNumber' -# as a feature -# This is needed, because, as insane as it is, the transcript 'name' -# is not a unique key, and therefore is not a primary key, which makes it -# useless for lookup -has txNumberKey => ( is => 'ro', lazy => 1, default => 'txNumber' ); -has reportTxNumber => ( is => 'ro', isa => 'Bool', default => 0 ); - -# has hgvsPkey => (is => 'ro', default => 'hgvsP'); -# has hgvsCkey => (is => 'ro', default => 'hgvsC'); - -########################## Private Attributes ################################## -########## The names of various features. These cannot be configured ########## -### Positions that aren't covered by a refSeq record are intergenic ### -### TODO: We don't output anything for these sites ### -### Because "intergenic" is just a label for missing , they are 100% equal ### - -### txEffect possible values ### -# TODO: export these, and make them configurable -state $silent = 'synonymous'; -state $replacement = 'nonSynonymous'; -state $frameshift = 'indel-frameshift'; -state $inFrame = 'indel-nonFrameshift'; -state $indelBoundary = 'indel-exonBoundary'; -state $startLoss = 'startLoss'; -state $stopLoss = 'stopLoss'; -state $stopGain = 'stopGain'; - -has siteTypes => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { - return { - replacement => $replacement, - silent => $silent, - frameshift => $frameshift, - inFrame => $inFrame, - indelBoundary => $indelBoundary, - startLoss => $startLoss, - stopLoss => $stopLoss, - stopGain => $stopGain, - }; - } -); - -# TODO: implement the truncated annotation -state $truncated = 'truncatedCodon'; - -### objects that get used by multiple subs, but shouldn't be public attributes ### -# All of these instantiated classes cannot be configured at instantiation time -# so safe to use in static context -state $siteUnpacker = Seq::Tracks::Gene::Site->new(); -state $siteTypeMap = Seq::Tracks::Gene::Site::SiteTypeMap->new(); -state $codonMap = Seq::Tracks::Gene::Site::CodonMap->new(); - -state $strandIdx = $siteUnpacker->strandIdx; -state $siteTypeIdx = $siteUnpacker->siteTypeIdx; -state $codonSequenceIdx = $siteUnpacker->codonSequenceIdx; -state $codonPositionIdx = $siteUnpacker->codonPositionIdx; -state $codonNumberIdx = $siteUnpacker->codonNumberIdx; - -state $negativeStrandTranslation = { A => 'T', C => 'G', G => 'C', T => 'A' }; - -#### Add our other "features", everything we find for this site #### -sub BUILD { - my $self = shift; - - # Private variables, meant to cache often used data - $self->{_allCachedDbNames} = {}; - $self->{_regionDb} = {}; - - $self->{_features} = $self->features; - $self->{_dbName} = $self->dbName; - $self->{_db} = Seq::DBManager->new(); - - # Not including the txNumberKey; this is separate from the annotations, which is - # what these keys represent - - if ( $self->hasJoin ) { - my $joinTrackName = $self->joinTrackName; - - $self->{_hasJoin} = 1; - - $self->{_flatJoinFeatures} = - [ map { "$joinTrackName.$_" } @{ $self->joinTrackFeatures } ]; - - my $i = 0; - for my $fName ( @{ $self->joinTrackFeatures } ) { - $self->{_allCachedDbNames}{ $self->{_flatJoinFeatures}[$i] } = - $self->getFieldDbName($fName); - $i++; - } - } - - for my $fName ( @{ $self->{_features} } ) { - $self->{_allCachedDbNames}{$fName} = $self->getFieldDbName($fName); - } -} - -sub setHeaders { - my $self = shift; - my @features = @{ 
$self->features }; - - my $headers = Seq::Headers->new(); - - ########################## Create the header ################################# - # don't mutate $self->features or flatJoinFeatures - # those features should be keys in the region db - # whereas siteType, exonicAlleleFunction, etc may be computed features - # TODO: have clearer separation between computed and stored features - ############################################################################## - unshift @features, $self->siteTypeKey, $self->exonicAlleleFunctionKey, - $self->codonSequenceKey, $self->newCodonKey, $self->refAminoAcidKey, - $self->newAminoAcidKey, $self->codonPositionKey, - $self->codonNumberKey, $self->strandKey; #$self->hgvsCkey, $self->hgvsPkey - - if ( $self->{_flatJoinFeatures} ) { - push @features, @{ $self->{_flatJoinFeatures} }; - } - - if ( $self->reportTxNumber ) { - push @features, $self->txNumberKey; - } - - # Prepend some custom features - # Providing 1 as the last argument means "prepend" instead of append - # So these features will come before any other refSeq.* features - $headers->addFeaturesToHeader( \@features, $self->name ); - - my @allGeneTrackFeatures = @{ $headers->getParentFeatures( $self->name ) }; - - # Get the output index of each feature we added to the header - # This is the index in this tracks output array - # and includes features added to header, using addFeatureToHeader - # such as the join track feature names - # and siteType, strand, codonNumber, etc. - for my $i ( 0 .. $#allGeneTrackFeatures ) { - $self->{_featureIdxMap}{ $allGeneTrackFeatures[$i] } = $i; - } - - ############### Store the output indices for computed features ############### - # so that we can manually include them in the output - my $lastFeatIdx = $#allGeneTrackFeatures; - - $self->{_featIdx} = [ 0 .. $lastFeatIdx ]; - - $self->{_strandFidx} = $self->{_featureIdxMap}{ $self->strandKey }; - $self->{_siteFidx} = $self->{_featureIdxMap}{ $self->siteTypeKey }; - # Avoid accessor penalties by aliasing to the $self hash - # These correspond to all of the sites held in Gene::Site - $self->{_codonSidx} = $self->{_featureIdxMap}{ $self->codonSequenceKey }; - $self->{_codonPosFidx} = $self->{_featureIdxMap}{ $self->codonPositionKey }; - $self->{_codonNumFidx} = $self->{_featureIdxMap}{ $self->codonNumberKey }; - - # The values for these keys we calculate at get() time. - $self->{_refAaFidx} = $self->{_featureIdxMap}{ $self->refAminoAcidKey }; - $self->{_altCodonSidx} = $self->{_featureIdxMap}{ $self->newCodonKey }; - $self->{_altAaFidx} = $self->{_featureIdxMap}{ $self->newAminoAcidKey }; - $self->{_alleleFuncFidx} = $self->{_featureIdxMap}{ $self->exonicAlleleFunctionKey }; - - $self->{_txNumberFidx} = $self->{_featureIdxMap}{ $self->txNumberKey }; - - # $self->{_hgvsCidx} = $self->{_featureIdxMap}{$self->hgvsCkey}; - # $self->{_hgvsPidx} = $self->{_featureIdxMap}{$self->hgvsPkey}; -} - -sub get { - #my ($self, $dbData, $chr, $refBase, $alt, $posIdx, $outAccum) = @_; - # $_[0], $_[1], $_[1], $_[3], $_[4], $_[5] $_[6] - # WARNING: If $_[1]->[$_[0]->{_dbName} isn't defined, will be treated as the 0 index!!! 
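
# A quick illustration of that hazard (hypothetical values, not part of
# this module): Perl silently treats an undef array index as 0, so a
# missing dbName would alias this track to whatever track sits at index 0:
#
#   my @trackData = ( 'refTrackValue', 'geneTrackValue' );
#   my $dbName;                      # imagine this was never initialized
#   my $got = $trackData[$dbName];   # 'refTrackValue' (index 0), plus only a warning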
- # therefore return here if that is the case - # ~1/2 of sites will have no gene track entry (including all non-coding, 2% coding) - if ( !defined $_[1]->[ $_[0]->{_dbName} ] ) { - for my $i ( @{ $_[0]->{_featIdx} } ) { - $_[6]->[$i][ $_[5] ] = undef; - } - - return $_[6]; - } - - my ( $self, $dbData, $chr, $ref, $alt, $posIdx, $outAccum ) = @_; - - # my @out; - # # Set the out array to the size we need; undef for any indices we don't add here - - # Cached field names to make things easier to read - my $cachedDbNames = $self->{_allCachedDbNames}; - my $idxMap = $self->{_featureIdxMap}; - - ################# Cache track's region data ############## - # returns an array - $self->{_regionDb}{$chr} //= $self->{_db}->dbReadAll( $self->regionTrackPath($chr) ); - - my $geneDb = $self->{_regionDb}{$chr}; - - ####### Get all transcript numbers, and site data for this position ######### - - # $unpackedSites ; $txNumbers - my ( $siteData, $txNumbers, $multiple ); - - #Reads: - # ( $dbData->[$self->{_dbName}] ) { - ( $txNumbers, $siteData ) = $siteUnpacker->unpack( $dbData->[ $self->{_dbName} ] ); - $multiple = ref $txNumbers ? $#$txNumbers : 0; - - if ( $self->{_hasJoin} ) { - # For join tracks, use only the entry for the first of multiple transcripts - # Because the data stored is always identical at one position - my $num = $multiple ? $txNumbers->[0] : $txNumbers; - # http://ideone.com/jlImGA - for my $fName ( @{ $self->{_flatJoinFeatures} } ) { - $outAccum->[ $idxMap->{$fName} ][$posIdx] = - $geneDb->[$num]{ $cachedDbNames->{$fName} }; - } - } - - # Needs to be done early, in case !hasCodon - if ( defined $self->{_txNumberFidx} ) { - $outAccum->[ $self->{_txNumberFidx} ][$posIdx] = $txNumbers; - } - - ################## Populate site information ######################## - # save unpacked sites, for use in txEffectsKey population ##### - # moose attrs are very slow, cache - # Push, because we'll use the indexes in calculating alleles - # TODO: Better handling of truncated codons - # Avoid a bunch of \;\ for non-coding sites - # By not setting _codonNumberKey, _codonPositionKey, _codonSequenceKey if !hasCodon - my $hasCodon; - if ($multiple) { - for my $site (@$siteData) { - push @{ $outAccum->[ $self->{_strandFidx} ][$posIdx] }, $site->[$strandIdx]; - push @{ $outAccum->[ $self->{_siteFidx} ][$posIdx] }, $site->[$siteTypeIdx]; - - if ( !$hasCodon && defined $site->[$codonSequenceIdx] ) { - $hasCodon = 1; - } - } - } - else { - # This is a bit messy; we should just push anyway, or be consistent - # and never store an array unless multiple, through Gene.pm - # Currently we rely on the output package to do the right thing - $outAccum->[ $self->{_strandFidx} ][$posIdx] = $siteData->[$strandIdx]; - $outAccum->[ $self->{_siteFidx} ][$posIdx] = $siteData->[$siteTypeIdx]; - - if ( defined $siteData->[$codonSequenceIdx] ) { - $hasCodon = 1; - } - } - # ################# Populate geneTrack's user-defined features ##################### - #Reads: $self->{_features} - if ($multiple) { - for my $feature ( @{ $self->{_features} } ) { - $outAccum->[ $idxMap->{$feature} ][$posIdx] = - [ map { $geneDb->[$_]{ $cachedDbNames->{$feature} } } @$txNumbers ]; - } - } - else { - for my $feature ( @{ $self->{_features} } ) { - $outAccum->[ $idxMap->{$feature} ][$posIdx] = - $geneDb->[$txNumbers]{ $cachedDbNames->{$feature} }; - } - } - - if ( !$hasCodon ) { - $outAccum->[ $self->{_codonPosFidx} ][$posIdx] = undef; - $outAccum->[ $self->{_codonNumFidx} ][$posIdx] = undef; - $outAccum->[ $self->{_alleleFuncFidx} ][$posIdx] 
= undef; - $outAccum->[ $self->{_refAaFidx} ][$posIdx] = undef; - $outAccum->[ $self->{_altAaFidx} ][$posIdx] = undef; - $outAccum->[ $self->{_codonSidx} ][$posIdx] = undef; - $outAccum->[ $self->{_altCodonSidx} ][$posIdx] = undef; - - return; - } - - ######Populate _codon*Key, exonicAlleleFunction, amion acids keys ############ - - my ( $i, @funcAccum, @codonNum, @codonSeq, @codonPos, @refAA, @newAA, @newCodon ); - # Set undefs for every position, other than the ones we need - # So that we don't need to push undef's to keep transcript order - $#funcAccum = $#codonNum = $#codonSeq = $#codonPos = $#refAA = $#newAA = - $#newCodon = $multiple; - - $i = 0; - - if ( length($alt) > 1 ) { - # Indels get everything besides the _*AminoAcidKey and _newCodonKey - my $indelAllele = - substr( $alt, 0, 1 ) eq '+' - ? length( substr( $alt, 1 ) ) % 3 - ? $frameshift - : $inFrame - : int($alt) % 3 ? $frameshift - : $inFrame; - - for my $site ( $multiple ? @$siteData : $siteData ) { - $codonNum[$i] = $site->[$codonNumberIdx]; - $codonSeq[$i] = $site->[$codonSequenceIdx]; - - if ( defined $site->[$codonSequenceIdx] ) { - $funcAccum[$i] = $indelAllele; - - # Codon position only exists (and always does) when codonSequence does - # We store codonPosition as 0-based, users probably expect 1 based - $codonPos[$i] = $site->[$codonPositionIdx] + 1; - - if ( length( $site->[$codonSequenceIdx] ) == 3 ) { - $refAA[$i] = $codonMap->codon2aa( $site->[$codonSequenceIdx] ); - } - - # For indels we don't store newAA or newCodon - # or hgvs notation - } - - $i++; - } - } - else { - my $altCodonSequence; - - SNP_LOOP: for my $site ( $multiple ? @$siteData : $siteData ) { - $codonNum[$i] = $site->[$codonNumberIdx]; - $codonSeq[$i] = $site->[$codonSequenceIdx]; - - if ( !defined $site->[$codonSequenceIdx] ) { - $i++; - next SNP_LOOP; - } - - # We store as 0-based, users probably expect 1 based - $codonPos[$i] = $site->[$codonPositionIdx] + 1; - - if ( length( $site->[$codonSequenceIdx] ) != 3 ) { - $i++; - next SNP_LOOP; - } - - #make a codon where the reference base is swapped for the allele - $altCodonSequence = $site->[$codonSequenceIdx]; - - # If codon is on the opposite strand, invert the allele - # Note that $site->[$codonPositionIdx] MUST be 0-based for this to work - if ( $site->[$strandIdx] eq '-' ) { - substr( $altCodonSequence, $site->[$codonPositionIdx], 1 ) = - $negativeStrandTranslation->{$alt}; - } - else { - substr( $altCodonSequence, $site->[$codonPositionIdx], 1 ) = $alt; - } - - $newCodon[$i] = $altCodonSequence; - - $newAA[$i] = $codonMap->codon2aa($altCodonSequence); - $refAA[$i] = $codonMap->codon2aa( $site->[$codonSequenceIdx] ); - - if ( !defined $newAA[$i] ) { - $i++; - next SNP_LOOP; - } - - if ( $refAA[$i] eq $newAA[$i] ) { - $funcAccum[$i] = $silent; - } - elsif ( $newAA[$i] eq '*' ) { - $funcAccum[$i] = $stopGain; - } - elsif ( $refAA[$i] eq '*' ) { - $funcAccum[$i] = $stopLoss; - } - elsif ( $codonNum[$i] == 1 ) { - $funcAccum[$i] = $startLoss; - } - else { - $funcAccum[$i] = $replacement; - } - - # $hgvsC[$i] = 'c.' . $ref . ($codonNum[$i] * 3 - (3 - $codonPos[$i])) . $alt; - # $hgvsP[$i] = 'p.' . $refAA[$i] . $codonNum[$i] . 
$newAA[$i]; - - $i++; - } - } - - if ($multiple) { - $outAccum->[ $self->{_codonPosFidx} ][$posIdx] = \@codonPos; - $outAccum->[ $self->{_codonNumFidx} ][$posIdx] = \@codonNum; - $outAccum->[ $self->{_alleleFuncFidx} ][$posIdx] = \@funcAccum; - $outAccum->[ $self->{_refAaFidx} ][$posIdx] = \@refAA; - $outAccum->[ $self->{_altAaFidx} ][$posIdx] = \@newAA; - $outAccum->[ $self->{_codonSidx} ][$posIdx] = \@codonSeq; - $outAccum->[ $self->{_altCodonSidx} ][$posIdx] = \@newCodon; - - return $outAccum; - } - - $outAccum->[ $self->{_codonPosFidx} ][$posIdx] = $codonPos[0]; - $outAccum->[ $self->{_codonNumFidx} ][$posIdx] = $codonNum[0]; - $outAccum->[ $self->{_alleleFuncFidx} ][$posIdx] = $funcAccum[0]; - $outAccum->[ $self->{_refAaFidx} ][$posIdx] = $refAA[0]; - $outAccum->[ $self->{_altAaFidx} ][$posIdx] = $newAA[0]; - $outAccum->[ $self->{_codonSidx} ][$posIdx] = $codonSeq[0]; - $outAccum->[ $self->{_altCodonSidx} ][$posIdx] = $newCodon[0]; - - # $outAccum->[$self->{_hgvsCidx}][$posIdx] = \@hgvsC; - # $outAccum->[$self->{_hgvsPidx}][$posIdx] = \@hgvsP; - return $outAccum; -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Gene/Build.pm b/perl/lib/Seq/Tracks/Gene/Build.pm deleted file mode 100644 index 094bd4c36..000000000 --- a/perl/lib/Seq/Tracks/Gene/Build.pm +++ /dev/null @@ -1,647 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Gene::Build; - -our $VERSION = '0.001'; - -# ABSTRACT: Builds Gene Tracks -# Stores refSeq data, and generates in-silico transcribed transcripts (and associated fields) - -use Mouse 2; -use namespace::autoclean; - -use Parallel::ForkManager; - -use Seq::Tracks::Gene::Build::TX; -use Seq::Tracks::Gene::Definition; -use Seq::Tracks; - -extends 'Seq::Tracks::Build'; -#exports regionTrackPath -with 'Seq::Tracks::Region::RegionTrackPath'; - -use List::Util qw/first/; - -my $geneDef = Seq::Tracks::Gene::Definition->new(); - -# We don't remap field names -# It's easier to remember the real names than real names + our domain-specific names - -# Required, or must be mapped using 'fieldMap' in YAML -my @coordinateFields = ( - 'chrom', 'txStart', 'txEnd', 'cdsStart', - 'cdsEnd', 'exonStarts', 'exonEnds', 'strand' -); - -my %coordinateFields = map { $_ => 1 } @coordinateFields; - -my %coordinateTransforms = ( - exonStarts => sub { - return [ split ',', $_[0] ]; - }, - exonEnds => sub { - return [ split ',', $_[0] ]; - } -); - -# has chrom_field_name => (is => 'ro', lazy => 1, default => 'chrom' ); -# has txStart_field_name => (is => 'ro', lazy => 1, default => 'txStart' ); -# has txEnd_field_name => (is => 'ro', lazy => 1, default => 'txEnd' ); -# has cdsStart_field_name => (is => 'ro', lazy => 1, default => 'txStart' ); -# has cdsEnd_field_name => (is => 'ro', lazy => 1, default => 'txEnd' ); -# has exonStarts_field_name => (is => 'ro', lazy => 1, default => 'txStart' ); -# has txEnd_field_name => (is => 'ro', lazy => 1, default => 'txEnd' ); - -has build_region_track_only => - ( is => 'ro', isa => 'Bool', coerce => 1, lazy => 1, default => 0 ); -has join => ( is => 'ro', isa => 'HashRef' ); - -# These are the features stored in the Gene track's region database -# Does not include $geneDef->txErrorName here, because that is something -# that is not actually present in UCSC refSeq or knownGene records, we add ourselves -has '+features' => ( default => sub { $geneDef->ucscGeneAref; } ); - -my $txNumberKey = 'txNumber'; -my $joinTrack; - -sub BUILD { - my $self = shift; - - $self->getFieldDbName( $geneDef->txErrorName ); - # 
Do this before we build, to avoid threading-related issues - for my $f (@coordinateFields) { - $self->getFieldDbName($f); - } -} - -# 1) Store a reference to the corresponding entry in the gene database (region database) -# 2) Store this codon information at some key, which the Tracks::Region::Gene -# 3) Store transcript errors, if any -# 4) Write region data -# 5) Write gene track data in main db -# 6) Write nearest genes if user wants those -sub buildTrack { - my $self = shift; - # txErrorName isn't a default feature, initializing here to make sure - # we store this value (if calling for first time) before any threads get to it - my @allFiles = $self->allLocalFiles; - - # Only allow 1 thread because perl eats memory like candy - my $pm = Parallel::ForkManager->new( $self->maxThreads ); - - my %completedChrs; - $pm->run_on_finish( - sub { - my ( $pid, $exitCode, $fileName, $exitSignal, $coreDump, $errOrChrs ) = @_; - - if ( $exitCode != 0 ) { - my $err = $self->name - . ": got exitCode $exitCode for $fileName: $exitSignal . Dump: $coreDump"; - - $self->log( 'fatal', $err ); - } - - if ( $errOrChrs && ref $errOrChrs eq 'HASH' ) { - for my $chr ( keys %$errOrChrs ) { - if ( !$completedChrs{$chr} ) { - $completedChrs{$chr} = [$fileName]; - } - else { - push @{ $completedChrs{$chr} }, $fileName; - } - } - } - - #Only message that is different, in that we don't pass the $fileName - $self->log( 'info', $self->name . ": completed building from $fileName" ); - } - ); - - # Assume one file per loop, or all sites in one file. Tracks::Build warns if not - for my $file (@allFiles) { - # Although this should be unnecessary, environments must be created - # within the process that uses them - # This provides a measure of safety - $self->db->cleanUp(); - - $pm->start($file) and next; - my ( $seenChrs, $data ); - - my ( $err, undef, $fh ) = $self->getReadFh($file); - - if ($err) { - $self->log( 'fatal', $self->name . ": $err" ); - } - - ( $err, $seenChrs, $data ) = $self->_readTxData($fh); - - if ($err) { - $self->log( 'fatal', $self->name . ": $err" ); - } - - # If we fork a process in order to read (example zcat) prevent that process - # from becoming defunct - $self->safeCloseBuilderFh( $fh, $file, 'fatal' ); - - my %all = %{ $data->{all} }; - my %txStarts = %{ $data->{txStart} }; - my %regions = %{ $data->{region} }; - - # If we skipped the $file, will never get here, so this is an error - # Can happen if only 1 file, with only 1 chromosome (not building all chrs) - if ( !%all && $seenChrs == 1 ) { - # This returns to parent, in $pm->run_on_finish - $pm->finish(0); - } - - if ( !%all ) { - $self->log( 'fatal', $self->name . ": no transcript data accumulated" ); - } - - ############################### Make transcripts ######################### - my %visitedChrs; - - my @allChrs = keys %all; - - my $txErrorDbname = $self->getFieldDbName( $geneDef->txErrorName ); - - TX_LOOP: for my $chr (@allChrs) { - $visitedChrs{$chr} //= 1; - # We may want to just update the region track, - # TODO: Note that this won't store any txErrors - if ( $self->build_region_track_only ) { - $self->_writeRegionData( $chr, $regions{$chr} ); - - if ( $self->join ) { - $self->_joinTracksToGeneTrackRegionDb( $chr, $txStarts{$chr} ); - } - - next TX_LOOP; - } - - $self->log( 'info', $self->name . 
": starting to build transcripts for $chr" ); - - my @allTxStartsAscending = sort { $a <=> $b } keys %{ $all{$chr} }; - - for my $txStart (@allTxStartsAscending) { - for my $txData ( @{ $all{$chr}->{$txStart} } ) { - my $txNumber = $txData->{$txNumberKey}; - - # Important that anything this class needs to commit to db happens - # before we open our cursor - my $txInfo = Seq::Tracks::Gene::Build::TX->new($txData); - - if ( @{ $txInfo->transcriptSites } % 2 != 0 ) { - $err = $self->name - . ": expected txSiteDataAndPos to contain (position1, value1, position2, value2) data"; - $self->log( 'fatal', $err ); - } - - # Note that is Build::TX doesn't close it txn, this will be - # impossible; may be an LMDB_File bug - # If that is problematic, placing this before Build::TX will work - # for some reason, order of SubTxn's matters - # https://github.com/salortiz/LMDB_File/issues/30 - my $cursor = $self->db->dbStartCursorTxn($chr); - - INNER: for ( my $i = 0; $i < @{ $txInfo->transcriptSites }; $i += 2 ) { - # $txInfo->transcriptSites->[$i] corresponds to $pos, $i + 1 to value - # Commit for every position - # This also ensures that $mainMergeFunc will always be called with fresh data - $self->db->dbPatchCursorUnsafe( - $cursor, - $chr, - $self->dbName, - #pos - $txInfo->transcriptSites->[$i], - #value - $txInfo->transcriptSites->[ $i + 1 ], - #how we handle cases where multiple overlap - \&_dbMergeFunc - ); - } - - # Commits, closes cursor every transcript - $self->db->dbEndCursorTxn($chr); - - if ( @{ $txInfo->transcriptErrors } ) { - my @errors = @{ $txInfo->transcriptErrors }; - $regions{$chr}->{$txNumber}{$txErrorDbname} = \@errors; - } - } - - delete $all{$chr}->{$txStart}; - } - - delete $all{$chr}; - - $self->_writeRegionData( $chr, $regions{$chr} ); - - delete $regions{$chr}; - - if ( $self->join ) { - $self->_joinTracksToGeneTrackRegionDb( $chr, $txStarts{$chr} ); - } - - delete $txStarts{$chr}; - - $self->log( 'info', - $self->name . ": finished building transcripts for $chr from $file" ); - - $self->db->cleanUp(); - } - - #Commit, sync everything, including completion status, and release mmap - $self->db->cleanUp(); - - $pm->finish( 0, \%visitedChrs ); - } - - $pm->wait_all_children(); - - for my $chr ( keys %completedChrs ) { - $self->completionMeta->recordCompletion($chr); - - $self->log( 'info', - $self->name - . ": recorded $chr completed, from " - . ( join( ",", @{ $completedChrs{$chr} } ) ) ); - } - - #TODO: figure out why this is necessary, even with DEMOLISH - $self->db->cleanUp(); - - return; -} - -sub _dbMergeFunc { - # Only when called when there is a defined $oldVal - my ( $chr, $pos, $oldVal, $newVal ) = @_; - # make it an array of arrays (array of geneTrack site records) - if ( !ref $oldVal->[0] ) { - return ( undef, [ $oldVal, $newVal ] ); - } - - #oldVal is an array of arrays, push on to it - my @updatedVal = @$oldVal; - - push @updatedVal, $newVal; - - #TODO: Should we throw any errors? - return ( undef, \@updatedVal ); -} - -sub _getIdx { - my ( $self, $firstLine ) = @_; - - chomp $firstLine; - - my ( %allIdx, %regionIdx ); - # If the user wanted to transform the input field names, do, so source field names match - # those expected by the track - my @fields = map { $self->fieldMap->{$_} || $_ } split( '\t', $firstLine ); - - # Store all features we can find, for Seq::Build::Gene::TX. Avoid autocracy, - # don't need to know what Gene::TX requires. 
- my $fieldIdx = 0; - for my $field (@fields) { - $allIdx{$field} = $fieldIdx; - $fieldIdx++; - } - - my $err; - - # Some featuers are core to transcript building - for my $reqField (@coordinateFields) { - if ( !defined $allIdx{$reqField} ) { - $err = 'Must provide, or map via "fieldMap" in the YAML config - the following fields: ' . join( ',', @coordinateFields ); - - return ( $err, undef ); - } - } - - # Region database features; as defined by user in the YAML config, or our default - REGION_FEATS: for my $field ( @{ $self->features } ) { - if ( exists $allIdx{$field} ) { - $regionIdx{$field} = $allIdx{$field}; - next REGION_FEATS; - } - - $err = $self->name . ": required $field missing in header: $firstLine"; - last; - } - - return ( - $err, - { - all => \%allIdx, - region => \%regionIdx, - } - ); -} - -sub _readTxData { - my ( $self, $fh ) = @_; - - my $firstLine = <$fh>; - - # Fatal/exit will only affect that process, won't affect others - if ( !defined $firstLine ) { - return "Couldn't read header"; - } - - # support non-Unix line endings - my $err = $self->setLineEndings($firstLine); - - if ($err) { - return $err; - } - - ( $err, my $idx ) = $self->_getIdx($firstLine); - - my %allIdx = %{ $idx->{all} }; - my %regionIdx = %{ $idx->{region} }; - - if ($err) { - return $err; - } - - my ( $wantedChr, $txNumber, $txStart, $txEnd, $chr, @fields ); - - my ( %seenChrsInFile, %txNumbers, %regionData, %txStartData, %allData ); - - FH_LOOP: while (<$fh>) { - chomp; - @fields = split( '\t', $_ ); - - $chr = $fields[ $allIdx{'chrom'} ]; - $txStart = $fields[ $allIdx{'chrom'} ]; - - $seenChrsInFile{$chr} //= 1; - - # Normalize the representation, such that having or missing 'chr' - # or using MT instead of M won't matter - $chr = $self->normalizedWantedChr->{$chr}; - - # chr may be undefined - if ( !defined $wantedChr || ( !defined $chr || $wantedChr ne $chr ) ) { - $wantedChr = $self->chrWantedAndIncomplete($chr); - } - - if ( !defined $wantedChr ) { - next FH_LOOP; - } - - # Keep track of our 0-indexed transcript reference numbers - if ( !$txNumbers{$wantedChr} ) { - $txNumbers{$wantedChr} = 0; - } - - $txNumber = $txNumbers{$wantedChr}; - - my %rowData; - - my $fieldDbName; - ACCUM_VALUES: for my $fieldName ( keys %allIdx ) { - if ( $self->hasTransform($fieldName) ) { - $fields[ $allIdx{$fieldName} ] = - $self->transformField( $fieldName, $fields[ $allIdx{$fieldName} ] ); - } - - # TODO: This could lead to surprising behavior - # User can modify this behavior by passing a transform above - # But the expectation of $coordinateTransforms{$fieldName} should be - # documented, since not all transforms will work - if ( $coordinateTransforms{$fieldName} ) { - $fields[ $allIdx{$fieldName} ] = - $coordinateTransforms{$fieldName}->( $fields[ $allIdx{$fieldName} ] ); - } - - my $data = $self->coerceFeatureType( $fieldName, $fields[ $allIdx{$fieldName} ] ); - - if ( !defined $data ) { - next ACCUM_VALUES; - } - - # if this is a field that we need to store in the region db - # create a shortened field name - $fieldDbName = $self->getFieldDbName($fieldName); - - $rowData{$fieldName} = $data; - - if ( !( defined $regionIdx{$fieldName} || defined $coordinateFields{$fieldName} ) ) { - next ACCUM_VALUES; - } - - #store under a shortened fieldName to save space in the db - $regionData{$wantedChr}->{$txNumber}{$fieldDbName} = $data; - } - - $txStart = $rowData{'txStart'}; - - if ( !defined $txStart ) { - return ': missing transcript start ( we expected a value @ ' . 'txStart' . 
')'; - } - - $txEnd = $rowData{'txEnd'}; - - if ( !defined $txEnd ) { - return 'missing transcript start ( we expected a value @ ' . 'txEnd' . ')'; - } - - #a field added by Bystro - # $regionData{$wantedChr}->{$txNumber}{$self->getFieldDbName($geneDef->txSizeName)} = $txEnd + 1 - $txStart; - - if ( defined $txStartData{$wantedChr}{$txStart} ) { - push @{ $txStartData{$wantedChr}{$txStart} }, [ $txNumber, $txEnd ]; - } - else { - $txStartData{$wantedChr}{$txStart} = [ [ $txNumber, $txEnd ] ]; - } - - $rowData{$txNumberKey} = $txNumber; - - push @{ $allData{$wantedChr}{$txStart} }, \%rowData; - - $txNumbers{$wantedChr} += 1; - } - - my $seenChrs = keys %seenChrsInFile; - - my %out; - - $out{txStart} = \%txStartData; - $out{all} = \%allData; - $out{region} = \%regionData; - - return ( $err, $seenChrs, \%out ); -} - -sub _writeRegionData { - my ( $self, $chr, $regionsHref ) = @_; - - $self->log( 'info', $self->name . ": starting _writeRegionData for $chr" ); - - my $dbName = $self->regionTrackPath($chr); - - my @txNums = sort { $a <=> $b } keys %$regionsHref; - - for my $txNumber (@txNums) { - # Patch one at a time, because we assume performance isn't an issue - # And neither is size, so hash keys are fine - # TODO: move away from this; don't store any hashes, use arrays - $self->db->dbPatchHash( $dbName, $txNumber, $regionsHref->{$txNumber} ); - } - - $self->log( 'info', $self->name . ": finished _writeRegionData for $chr" ); - return; -} - -############ Joining some other track to Gene track's region db ################ - -# TODO: Add check to see if values have already been entered -sub _joinTrackMergeFunc { - my ( $chr, $pos, $oldVal, $newVal ) = @_; - - my @updated; - - #If the old value is an array, push the new values on to the old values - if ( ref $oldVal ) { - @updated = @$oldVal; - - for my $val ( ref $newVal ? @$newVal : $newVal ) { - if ( !defined $val ) { - next; - } - - push @updated, $val; - } - } - else { - if ( defined $oldVal ) { - @updated = ($oldVal); - } - - for my $val ( ref $newVal ? @$newVal : $newVal ) { - if ( !defined $val ) { - next; - } - - # If not array I want to see an error - push @updated, $val; - } - } - - # Try to add as little junk as possible - if ( @updated == 0 ) { - return ( undef, $oldVal ); - } - - if ( @updated == 1 ) { - return ( undef, $updated[0] ); - } - - return ( undef, \@updated ); -} - -sub _joinTracksToGeneTrackRegionDb { - my ( $self, $chr, $txStarts ) = @_; - - if ( !$self->join ) { - return $self->name . ": join not set in _joinTracksToGeneTrackRegionDb"; - } - - my $tracks = Seq::Tracks->new(); - $joinTrack = $tracks->getTrackBuilderByName( $self->joinTrackName ); - - if ( !$joinTrack ) { - return - $self->name - . ': join track ' - . $self->joinTrackName - . ' has no "tracks" .yml entry'; - } - - $self->log( 'info', - $self->name - . ": starting _joinTracksToGeneTrackRegionDb " - . "for $chr using: " - . 
$self->joinTrackName ); - - # Gene tracks cover certain positions, record the start and stop - my @positionRanges; - my @txNums; - - for my $txStart ( keys %$txStarts ) { - foreach ( @{ $txStarts->{$txStart} } ) { - my $txNumber = $_->[0]; - my $txEnd = $_->[1]; - push @positionRanges, [ $txStart, $txEnd ]; - push @txNums, $txNumber; - } - } - - my $dbName = $self->regionTrackPath($chr); - - # For each txNumber, run dbPatchHash on any joining data - $joinTrack->joinTrack( - $chr, - \@positionRanges, - $self->joinTrackFeatures, - sub { - # Called every time a match is found - # Index is the index of @ranges that this update belongs to - my ( $hrefToAdd, $index ) = @_; - - my %out; - foreach ( keys %$hrefToAdd ) { - if ( !defined $hrefToAdd->{$_} ) { - next; - } - - if ( ref $hrefToAdd->{$_} eq 'ARRAY' ) { - my @arr; - my %uniq; - for my $entry ( @{ $hrefToAdd->{$_} } ) { - if ( !defined $entry ) { - next; - } - - if ( $uniq{$entry} ) { - next; - } - - push @arr, $entry; - $uniq{$entry} = 1; - } - - # Don't add empty arrays to the database - $hrefToAdd->{$_} = @arr ? \@arr : undef; - } - - if ( defined $hrefToAdd->{$_} ) { - # Our LMDB writer requires a value, so only add to our list of db entries - # to update if we have a value - #$self->getFieldDbName generates a name for the field we're joining, named $_ - $self->db->dbPatchHash( $dbName, $txNums[$index], - { $self->getFieldDbName($_) => $hrefToAdd->{$_} }, - \&_joinTrackMergeFunc ); - } - } - - # Free memory as soon as possible - undef $hrefToAdd; - undef %out; - } - ); - - $self->log( 'info', - $self->name . ": finished _joinTracksToGeneTrackRegionDb for $chr" ); - return; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Gene/Build/TX.pm b/perl/lib/Seq/Tracks/Gene/Build/TX.pm deleted file mode 100644 index 78ab6f0b7..000000000 --- a/perl/lib/Seq/Tracks/Gene/Build/TX.pm +++ /dev/null @@ -1,663 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -# ABSTRACT: Creates a hash reference of sites with information in them - -=head1 DESCRIPTION - # A consuming build class stores this info at some key in the main database -=cut - -# Inspired by Thomas Wingo's similar in-silico transcrpion Seqant 3 package - -#TODO: Thomas. I may have misunderstood your intentions here. I spoke with -#Dave, and we decided, I believe, not to write peptides, because those -#were already reconstructed at annotation time (codon2AA) -#If I made significant mistakes here, 1000 apologies. 
I did my best -#to follow your code base, and the changes made were done for a few reasons: -#1) Make it easier for me to understand -#2) Make it clearer to future developers exactly what this class is responsible for -#3) Reduce the amount of code to the minimum needed to output a transcriptSite -#4) Related to #3, don't use moose attributes unless need $self or exposed as public API - -package Seq::Tracks::Gene::Build::TX; - -our $VERSION = '0.001'; - -use Mouse 2; -use MouseX::NativeTraits; -# We need pre-initialized tracks -use Seq::Tracks; -use Seq::Tracks::Gene::Site; -use Seq::DBManager; - -with 'Seq::Role::Message'; - -use namespace::autoclean; -use DDP; - -#how many bases away from exon bound we will call spliceAc or spliceDon site -my $spliceSiteLength = 2; -#placeholder for annotation in string -my $annBase = '0'; - -# stores all of our individual sites -# these can be used by the consumer to write per-reference-position -# codon information -# The only public variable other than transcriptErrors, which we may discard -has transcriptSites => ( - is => 'ro', - isa => 'ArrayRef', - lazy => 1, - default => sub { [] }, - init_arg => undef, -); - -# Also public (for now) -# uses transcriptAnnotations to figure out if anything went wrong -has transcriptErrors => ( - is => 'rw', - isa => 'ArrayRef', - writer => '_writeTranscriptErrors', - traits => ['Array'], - handles => { - noTranscriptErrors => 'is_empty', - allTranscriptErrors => 'elements', - }, - lazy => 1, - default => sub { [] }, - init_arg => undef, -); - -###All required arguments -has chrom => ( - is => 'ro', - isa => 'Str', - required => 1, -); - -has exonStarts => ( - is => 'ro', - isa => 'ArrayRef', - traits => ['Array'], - handles => { allExonStarts => 'elements', }, - required => 1, -); - -has exonEnds => ( - is => 'ro', - isa => 'ArrayRef', - traits => ['Array'], - handles => { allExonEnds => 'elements', }, - required => 1, -); - -has cdsStart => ( - is => 'ro', - isa => 'Int', - required => 1, -); - -has cdsEnd => ( - is => 'ro', - isa => 'Int', - required => 1, -); - -has strand => ( - is => 'ro', - isa => 'Str', - required => 1, -); - -has txNumber => ( - is => 'ro', - isa => 'Int', - required => 1, -); - -##End required arguments -#purely for debug -#not the same as the Track name -has name => ( - is => 'ro', - isa => 'Str', - required => 1, -); - -##End requ -###private -has debug => ( - is => 'ro', - isa => 'Int', - default => 0, - lazy => 1, -); - -#@private -state $codonPacker = Seq::Tracks::Gene::Site->new(); - -#coerce our exon starts and ends into an array -sub BUILDARGS { - my ( $orig, $href ) = @_; - - # The original data is a comma-delimited string - # But since Bystro often coerces delimited things into arrays, - # We may be given an array instead; If not, coerce into an array - if ( !ref $href->{exonStarts} ) { - $href->{exonStarts} = [ split( ',', $href->{exonStarts} ) ]; - } - - if ( !ref $href->{exonEnds} ) { - $href->{exonEnds} = [ split( ',', $href->{exonEnds} ) ]; - } - - return $href; -} - -sub BUILD { - my $self = shift; - - $self->log( 'debug', "Beginning to build tx: " . 
$self->name ); - - #seeds transcriptSequence and transcriptPositions - my ( $seq, $seqPosMapAref ) = $self->_buildTranscript(); - - my $txAnnotationHref = $self->_buildTranscriptAnnotation(); - - my $errorsAref = - $self->_buildTranscriptErrors( $seq, $seqPosMapAref, $txAnnotationHref ); - #if errors warn; some transcripts will be malformed - #we could pass an array reference to log, but let's give some additional - #context - if (@$errorsAref) { - my $error = 'For tx ' . $self->name . ' : ' . join( '; ', @$errorsAref ); - $self->log( 'warn', $error ); - } - - #We no longer strictly need to set 'abs_pos' for each exon_end - #We *could* subtract $based from each position, but skipping for now because - #it seems unlikely that we'll start accepting 1-based gene tracks - #maybe if UCSC loses it's dominance - - # transcriptSites holds all of our site annotations - $self->_buildTranscriptSites( $seq, $seqPosMapAref, $txAnnotationHref ); - - #this is now handled as part of the buildTranscript process - #$self->_build_flanking_sites; -} - -# give the sequence with respect to the direction of transcription / coding -# at the same time, creates an array of the positions to which that transcript -# belongs, and assigns what we call that thing - -#this combines _build_transcript_db, _build_flanking_sites, _build_transcript_abs_position -#since those mostly do the exact same thing, except in what they store -#and flanking sites is really adding data to the transcript_annotation - -#note that we no longer have absolute positions, so we don't call them absolute -#(because we're storing position by chr in the kv database) -sub _buildTranscript { - my $self = shift; - my @exonStarts = $self->allExonStarts; - my @exonEnds = $self->allExonEnds; - my $codingStart = $self->cdsStart; - my $codingEnd = $self->cdsEnd; - - my ( @sequencePositions, $txSequence ); - - my $tracks = Seq::Tracks->new(); - - my $refTrack = $tracks->getRefTrackGetter(); - - # Expects DBManager to have been previously configured - my $db = Seq::DBManager->new(); - - #in scalar, as in less than, @array gives length - for ( my $i = 0; $i < @exonStarts; $i++ ) { - if ( $exonStarts[$i] >= $exonEnds[$i] ) { - $self->log( 'fatal', "exon start $exonStarts[$i] >= end $exonEnds[$i]" ); - } - - #exonEnds is closed, so the actual exonEnd is - 1 - #exonStarts are open - #http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms - - #transcript starts are 0-based and ends are 1-based - #http://genome.ucsc.edu/FAQ/FAQtracks#tracks1 - #perl range is always closed - #https://ideone.com/AKKpfC - # my @fragmentPositions; - - my $exonPosHref = [ $exonStarts[$i] .. $exonEnds[$i] - 1 ]; - - #limitation of the below API; we need to copy $posHref - #thankfully, we needed to do this anyway. 
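Since the half-open coordinate conventions above are easy to trip over, here is a minimal self-contained sketch of the conversion the loop performs, assuming UCSC-style 0-based, half-open [start, end) exon intervals (coordinates invented for illustration):

use strict;
use warnings;

my @exonStarts = ( 100, 200 );
my @exonEnds   = ( 110, 205 );

# Perl ranges are closed on both ends, so the last base of a half-open
# interval [start, end) is end - 1.
my @positions;
for my $i ( 0 .. $#exonStarts ) {
  push @positions, $exonStarts[$i] .. $exonEnds[$i] - 1;
}

# 10 bases from the first exon + 5 from the second = 15
print scalar @positions, "\n";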
-  #we push them in order, so the first position in the array corresponds to
-  #the first transcript base on the sense strand
-  #https://ideone.com/wq0YJO (dereferencing not strictly necessary, but I think clearer)
-  push @sequencePositions, @$exonPosHref;
-
-  # As a result of modifying the reference, each position in exonPosHref
-  # now has database data, or undefined
-  # last argument means don't commit this, saves io overhead,
-  # committing here to avoid interfering with transactions of the consuming class
-  # TODO: allow dbRead to use read-only transactions
-  $db->dbRead( $self->chrom, $exonPosHref );
-
-  #Now get the base for each item found in $exonPosHref
-  #This is handled by the refTrack of course
-  #Each track has its own "get" method, which fetches its data
-  #That can be a scalar or a hashRef
-  #Ref tracks always return a scalar, a single base, since that's the only
-  #thing that they could return
-
-  for ( my $y = 0; $y < scalar @$exonPosHref; $y++ ) {
-    my $refBase = $refTrack->get( $exonPosHref->[$y] );
-
-    if ( !$refBase ) {
-      my @positionsWeHad = ( $exonStarts[$i] .. $exonEnds[$i] - 1 );
-      $self->log( 'fatal',
-        "Position $positionsWeHad[$y] doesn't exist in our "
-          . $self->chrom
-          . " database."
-          . "\nWe've either selected the wrong assembly,"
-          . "\nor haven't built the reference database for this chromosome" );
-    }
-
-    $txSequence .= $refBase;
-  }
-}
-
-if ( $self->strand eq "-" ) {
-  #reverse the sequence, just as in _build_transcript_db
-  $txSequence = reverse $txSequence;
-  # get the complement, just as in _build_transcript_db
-  $txSequence =~ tr/ACGT/TGCA/;
-  #reverse the positions, just as done in _build_transcript_abs_position
-  @sequencePositions = reverse @sequencePositions;
-}
-
-return ( $txSequence, \@sequencePositions );
-
-#now in buildTranscriptAnnotation
-#my $errorsAref = $self->_buildTranscriptErrors($txSequence, \@sequencePositions);
-}
-
-sub _buildTranscriptAnnotation {
-  my $self = shift;
-
-  my @exonStarts  = $self->allExonStarts;
-  my @exonEnds    = $self->allExonEnds;
-  my $codingStart = $self->cdsStart;
-  my $codingEnd   = $self->cdsEnd;
-
-  my $posStrand = $self->strand eq '+';
-  #https://ideone.com/B3ygW6
-  #is this a bug? 
isn't cdsEnd open, so shouldn't it be cdsStart == cdsEnd - 1 - #nope: http://genome.soe.ucsc.narkive.com/NHHMnfwF/cdsstart-cdsend-definition - my $nonCoding = $self->cdsStart == $self->cdsEnd; - - my $txAnnotationHref; - - # Store intron site type if that's what this is - # Note that the splice loop below this will ovewrite any positions that are - # Called Splice Sites - INTRON_LOOP: for ( my $i = 0; $i < @exonEnds; $i++ ) { - my $thisExonEnd = $exonEnds[$i]; - my $nextExonStart = $exonStarts[ $i + 1 ]; - - if ( !$nextExonStart ) { - last INTRON_LOOP; - } - #exon Ends are open, so the exon actually ends $exonEnds - 1 - for ( my $intronPos = $thisExonEnd; $intronPos < $nextExonStart; $intronPos++ ) { - $txAnnotationHref->{$intronPos} = $codonPacker->siteTypeMap->intronicSiteType; - } - } - - #Then store non-coding, 5'UTR, 3'UTR annotations - for ( my $i = 0; $i < @exonStarts; $i++ ) { - # Annotate splice donor/acceptor bp - # - i.e., bp within $spliceSiteLength bp of exon start / stop - # - what we want to capture is the bp that are within $spliceSiteLength bp of the start or end of - # an exon start/stop; whether this is only within the bounds of coding exons does - # not particularly matter to me - # - # From the gDNA: - # - # EStart CStart EEnd EStart EEnd EStart CEnd EEnd - # +-----------+---------------+-----------+--------+---------+--------+---------+ - # Exons 111111111111111111111111111 22222222 333333333333333333 - # Code ******************************************************* - # APR ### ### - # DNR %%% %%% - # - - #Compared to the "Seq" codebase written completely by Dr. Wingo: - #The only change to the logic that I have made, is to add a bit of logic - #to check for cases when our splice donor / acceptor sites - #as calculated by a the use of $spliceSiteLength - #actually overlap either the previous exonEnd, or the next exonStart - #and are therefore inside of a coding sequence. - #Dr. Wingo's original solution was completely correct, because it also - #assumed that downstream someone was smart enough to intersect - #coding sequences and splice site annotations, and keep the coding sequence - #in any overlap - - # CDS End is also half-open (end-open): - # https://genome.ucsc.edu/cgi-bin/hgTracks?position=chr1:909955-909955&hgsid=697700417_1ZakBvh4gGAEbl8WSM0gnBFTpnKa&ncbiRefSeqView=full - # NM_001160184 is + strand, cdsEnd: 909955 (chr1) in the mysql db - # And 909955 corresponds to exon 16 (last base) in ucsc genome browser - # And since the browser is 1-based fully-closed interval - # cdsEnd must be half-open, end-excluded - - #TODO: should we check if start + n is past end? or >= end - $n - - #If $i == 0, make sure we don't accidentally call the last exonEnd the first one - my $previousExonEnd = $i > 0 ? 
$exonEnds[ $i - 1 ] : undef; - my $nextExonStart = $exonStarts[ $i + 1 ]; - - # We cannot have a spliceAcceptor site unless there was an upstream exon - if ( defined $previousExonEnd ) { - # And if the length of the intron is smaller than 2*spliceSiteLength - # we would have overlap between our spliceAcceptor and donor, which doesn't make sense - #exonEnd is open, so previousExonEnd is the first base of the intron - #and #exonStart is closed, so it is +1 the end of the intron - #Also, exonStarts, ends always relative to sense strand - #So ends always larger than starts - if ( $exonStarts[$i] - $previousExonEnd >= 2 * $spliceSiteLength ) { - for ( my $n = 1; $n <= $spliceSiteLength; $n++ ) { - - # The first base of the spliceAcceptor is up to $spliceSiteLength bases - # away from the downstream exonStart - # exonStart is closed, meaning the exonStart is the first base of the exon - my $exonPos = $exonStarts[$i] - $n; - - #This last condition to prevent splice acceptors for being called in - #coding sites for weirdly tight transcripts - # >= because EEnd (exonEnds) are open range, aka their actual number is not - #to be included, it's 1 past the last base of that exon - #(and exonEnds are open, so exonEnd is the first base of the downstream intron) - $txAnnotationHref->{$exonPos} = - $posStrand - ? $codonPacker->siteTypeMap->spliceAcSiteType - : $codonPacker->siteTypeMap->spliceDonSiteType; - } - } - else { - $self->log( 'warn', - "No spliceAcceptor (neg donor) possible: intron between exons $i and " - . ( $i - 1 ) . " of " - . $self->name - . " is only " - . ( $exonStarts[$i] - $previousExonEnd ) - . " bp long" ); - } - } - - if ( defined $nextExonStart ) { - #exonEnd is the first intron, exonStart is +1 of the intron - if ( $nextExonStart - $exonEnds[$i] >= 2 * $spliceSiteLength ) { - for ( my $n = 1; $n <= $spliceSiteLength; $n++ ) { - # The exonEnd is already the first intron, which means the first spliceDonor site - # so we subtract 1 and add however many bases of spliceSiteLength wanted - my $exonPos = $exonEnds[$i] - 1 + $n; - - $txAnnotationHref->{$exonPos} = - $posStrand - ? $codonPacker->siteTypeMap->spliceDonSiteType - : $codonPacker->siteTypeMap->spliceAcSiteType; - } - } - else { - $self->log( 'warn', - "No spliceDonor (neg acceptor) possible: intron between exons $i and " - . ( $i + 1 ) . " of " - . $self->name - . " is only " - . ( $nextExonStart - $exonEnds[$i] ) - . " bp long" ); - } - } - - #I give UTR regions the lowest precedence at the moment; - #Meaning if the site is already marked spliceDonor/Acceptor or intronic that won't - #be overriden - #In the future, we could define UTR5-intronic, UTR5-spliceDonor, etc - #To do this we would just iterate pos between cdsStart and cdsEnd - UTR_NCRNA_LOOP: - for ( my $exonPos = $exonStarts[$i]; $exonPos < $exonEnds[$i]; $exonPos++ ) { - if ($nonCoding) { - $txAnnotationHref->{$exonPos} = $codonPacker->siteTypeMap->ncRNAsiteType; - - next UTR_NCRNA_LOOP; - } - - # Don't overwrite intronic or spliceDonor/Acceptor - if ( $txAnnotationHref->{$exonPos} ) { - next UTR_NCRNA_LOOP; - } - - #Next calculate UTR regions. 
We call utr regions from cdsStart to cdsEnd (cdsEnd is oddly closed) - # $exonPos must be < $codingEnd, because $exonPos is always < $exonEnds[$i] - # meaning it is in effect part of a fully-closed niterval, - # while $codingEnd is 0-based, half-open, end-excluded - if ( $exonPos < $codingEnd ) { - if ( $exonPos >= $codingStart ) { - #It's in the body of the translated region - next UTR_NCRNA_LOOP; - } - - $txAnnotationHref->{$exonPos} = - $posStrand - ? $codonPacker->siteTypeMap->fivePrimeSiteType - : $codonPacker->siteTypeMap->threePrimeSiteType; - } - else { - #if we're after cds end, but in an exon we must be in the 3' UTR - $txAnnotationHref->{$exonPos} = - $posStrand - ? $codonPacker->siteTypeMap->threePrimeSiteType - : $codonPacker->siteTypeMap->fivePrimeSiteType; - } - } - } - - #my $errorsAref = $self->_buildTranscriptErrors($txSequence, $txAnnotationHref); - - return $txAnnotationHref; - #return ($txAnnotationHref, $errorsAref); -} - -#TODO: double check that this works properly -#TODO: think about whether we want to pack anything if no info -#the problem with not packing something is we won't know how to unpack it apriori -sub _buildTranscriptSites { - my ( $self, $txSequence, $seqPosMapAref, $txAnnotationHref ) = @_; - my @exonStarts = $self->allExonStarts; - my @exonEnds = $self->allExonEnds; - - # we build up our site annotations in 2 steps - # 1st record everything as if it were a coding sequence - # then we overwrite those entries if there were other annotations associated - # advantage is we can keep 3-mer info for those annotated sites - # and don't need to modify the $txSequence string in _buildTranscrtipAnnotation - # which may be (for me) a bit easier to reason about - my %tempTXsites; - - #First add the annotations; note that if for some reason a codon overlaps - for my $pos ( keys %$txAnnotationHref ) { - my $siteType = $txAnnotationHref->{$pos}; - - #storing strand for now, could remove it later if we decided to - #just get it from the region database entry for the transcript - $tempTXsites{$pos} = - [ $self->txNumber, $siteType, $self->strand, undef, undef, undef ]; - } - - my $codingBaseCount = 0; - #Then, make all of the codons in locations that aren't in the $tempTXsites - - #Example (informal test): #https://ideone.com/a9NYhb - CODING_LOOP: for ( my $i = 0; $i < length($txSequence); $i++ ) { - #get the genomic position - my $pos = $seqPosMapAref->[$i]; - - if ( defined $tempTXsites{$pos} ) { - next CODING_LOOP; - } - - my ( $siteType, $codonNumber, $codonPosition, $codonSeq ); - - # Next check any remaining sites. These should be coding sites. 
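A short worked sketch of the codon bookkeeping the coding branch below relies on: a running count of coding bases yields the 1-based codon number, the 0-based position within the codon, and the codon as a 3-base substring. The toy transcript assumes every base is coding, which is what makes the plain substr valid here:

use strict;
use warnings;

my $txSequence      = 'ATGGCCTAA';    # hypothetical in-frame transcript
my $codingBaseCount = 0;

for my $i ( 0 .. length($txSequence) - 1 ) {
  my $codonNumber   = 1 + int( $codingBaseCount / 3 );    # int truncates, as intended
  my $codonPosition = $codingBaseCount % 3;
  my $codonSeq      = substr( $txSequence, $i - $codonPosition, 3 );

  print "base $i: codon $codonNumber, position $codonPosition, seq $codonSeq\n";
  $codingBaseCount++;
}
# base 0: codon 1, position 0, seq ATG ... base 8: codon 3, position 2, seq TAA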
- # We check the base composition (ATCG) to make sure no oddiities slipped by - # At this point, any remaining sites should be in the coding region - # Since we've accounted for non-coding, UTR, and ~ splice sites - - if ( substr( $txSequence, $i, 1 ) =~ m/[ACGT]/ ) { - #the codon number ; POSIX::floor safer than casting int for rounding - #but we just want to truncate; http://perldoc.perl.org/functions/int.html - $codonNumber = 1 + int( $codingBaseCount / 3 ); - - $codonPosition = $codingBaseCount % 3; - - my $codonStart = $i - $codonPosition; - - # Replaces: #for ( my $j = $codonStart; $j <= $codonEnd; $j++ ) { #$referenceCodonSeq .= $self->getTranscriptBases( $j, 1 ); #} - $codonSeq = substr( $txSequence, $codonStart, 3 ); - - $siteType = $codonPacker->siteTypeMap->codingSiteType; - - $tempTXsites{$pos} = [ - $self->txNumber, $siteType, $self->strand, - $codonNumber, $codonPosition, $codonSeq - ]; - - $codingBaseCount++; - next CODING_LOOP; - } - - $self->log( 'warn', - substr( $txSequence, $i, 1 ) - . " at $pos in transcript " - . $self->name - . " not A,T,C, or G" ); - } - - #At this point, we have all of the codon information stored. - #However, some sites won't be coding, and those are in our annotation href - - #Now compact the site details - for my $pos ( sort { $a <=> $b } keys %tempTXsites ) { - #stores the codon information as binary - #this was "$self->add_transcript_site($site)" - # passing args in list context - # https://ideone.com/By1GDW - my $site = $codonPacker->pack( @{ $tempTXsites{$pos} } ); - - #this transcript sites are keyed on reference position - #this is similar to what was done with Seq::Site::Gene before - push @{ $self->transcriptSites }, $pos, $site; - } -} - -# check coding sequence is -# 1. divisible by 3 -# 2. starts with ATG -# 3. 
Ends with stop codon
-
-# Note that it is appropriate, if surprising, to check that the full coding
-# sequence is divisible by 3, rather than just looking for ($exonEnd - $exonStart) % 3 == 0
-# because splice sites may not be in frame, meaning base 1 of codon N may be on one
-# side of an intron, in one exon, and bases 2 & 3 on the other side, in another exon
-# with the sequence being in-frame only after splicing
-# https://www.biostars.org/p/7019/
-sub _buildTranscriptErrors {
-  my $self                     = shift;
-  my $seq                      = shift;
-  my $seqPosAref               = shift;
-  my $transcriptAnnotationHref = shift;
-
-  state $atgRe       = qr/\AATG/;           #starts with ATG
-  state $stopCodonRe = qr/(TAA|TAG|TGA)\Z/; #ends with a stop codon
-
-  my @errors = ();
-
-  if ( $self->cdsStart == $self->cdsEnd ) {
-    #it's a non-coding site, so it has no sequence information stored at all
-    return \@errors;
-  }
-
-  #I now see why Thomas replaced bases in the exon seq with 5 and 3
-  my $codingSeq;
-  for ( my $i = 0; $i < length($seq); $i++ ) {
-    if ( defined $transcriptAnnotationHref->{ $seqPosAref->[$i] } ) {
-      next;
-    }
-    $codingSeq .= substr( $seq, $i, 1 );
-  }
-
-  my $codingSeq2;
-  for ( my $i = 0; $i < length($seq); $i++ ) {
-    if ( $seqPosAref->[$i] >= $self->cdsStart && $seqPosAref->[$i] < $self->cdsEnd ) {
-      $codingSeq2 .= substr( $seq, $i, 1 );
-    }
-  }
-
-  if ( $codingSeq ne $codingSeq2 ) {
-    if ( $self->debug ) {
-      say STDERR "codingSeq ne codingSeq2";
-      say STDERR "coding seq is: ";
-      p $codingSeq;
-      say STDERR "coding seq length: ";
-      my $length = length($codingSeq);
-      p $length;
-      say STDERR "coding seq 2 is: ";
-      p $codingSeq2;
-      say STDERR "coding seq 2 length: ";
-      $length = length($codingSeq2);
-      p $length;
-      say STDERR "name of transcript:";
-      p $self->name;
-      say STDERR "strand is";
-      p $self->strand;
-
-      my $numSpliceStuff = 0;
-      for my $pos ( keys %$transcriptAnnotationHref ) {
-        if ( $transcriptAnnotationHref->{$pos} eq $codonPacker->siteTypeMap->spliceAcSiteType
-          || $transcriptAnnotationHref->{$pos} eq
-          $codonPacker->siteTypeMap->spliceDonSiteType )
-        {
-          $numSpliceStuff++;
-        }
-      }
-
-      say STDERR "difference in length: " . ( length($codingSeq) - length($codingSeq2) );
-      say STDERR "number of splice things: $numSpliceStuff";
-      say STDERR "can be explained by splice things?: "
-        . ( ( length($codingSeq) - length($codingSeq2) ) == $numSpliceStuff ? "Yes" : "No" );
-    }
-
-    push @errors,
-      'coding sequence calculated by exclusion of annotated sites not equal to the one built from exon position intersection with coding sequence';
-  }
-
-  if ( length($codingSeq) % 3 ) {
-    push @errors, 'coding sequence not divisible by 3';
-  }
-
-  if ( $codingSeq !~ m/$atgRe/ ) {
-    push @errors, 'coding sequence doesn\'t start with ATG';
-  }
-
-  if ( $codingSeq !~ m/$stopCodonRe/ ) {
-    push @errors, 'coding sequence doesn\'t end with TAA, TAG, or TGA';
-  }
-
-  $self->_writeTranscriptErrors( \@errors );
-  return \@errors;
-}
-
-__PACKAGE__->meta->make_immutable;
-
-1;
diff --git a/perl/lib/Seq/Tracks/Gene/Definition.pm b/perl/lib/Seq/Tracks/Gene/Definition.pm
deleted file mode 100644
index 269b9a471..000000000
--- a/perl/lib/Seq/Tracks/Gene/Definition.pm
+++ /dev/null
@@ -1,39 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package Seq::Tracks::Gene::Definition;
-use Mouse 2;
-#Defines a few keys common to the build and get functions of Tracks::Gene
-
-#these are features used in the region database
-has txErrorName =>
-  ( is => 'ro', init_arg => undef, lazy => 1, default => 'txError' );
-
-#these are features used in the region database
-has txSizeName => ( is => 'ro', init_arg => undef, lazy => 1, default => 'txSize' );
-
-#some default fields, some of which are required
-#TODO: allow people to remap the names of required fields if their source
-#file doesn't match (a bigger issue for sparse track than gene track)
-state $ucscGeneAref = [
-  'chrom',    'strand',      'txStart',    'txEnd',
-  'cdsStart', 'cdsEnd',      'exonCount',  'exonStarts',
-  'exonEnds', 'name',        'kgID',       'mRNA',
-  'spID',     'spDisplayID', 'geneSymbol', 'refseq',
-  'protAcc',  'description', 'rfamAcc',
-];
-
-has ucscGeneAref => (
-  is       => 'ro',
-  init_arg => undef,
-  lazy     => 1,
-  isa      => 'ArrayRef[Str]',
-  default  => sub {
-    return [ grep { $_ ne 'chrom' && $_ ne 'exonStarts' && $_ ne 'exonEnds' }
-        @$ucscGeneAref ];
-  }
-);
-
-__PACKAGE__->meta->make_immutable;
-1;
diff --git a/perl/lib/Seq/Tracks/Gene/Site.pm b/perl/lib/Seq/Tracks/Gene/Site.pm
deleted file mode 100644
index 461bb3b8e..000000000
--- a/perl/lib/Seq/Tracks/Gene/Site.pm
+++ /dev/null
@@ -1,249 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-#The goal of this is to both set and retrieve the information for a single
-#position requested by the user
-#So what is stored here is for the main database, and not the region database
-#Breaking this thing down to fit in the new context
-#based on Seq::Gene in (kyoto-based) seq branch
-#except _get_gene_data moved to Seq::Tracks::GeneTrack::Build
-
-# This package is meant for use in a static variable; it has no settable
-# instance attributes
-
-# TODO: Is Seq::Role::Message use safe in a threaded environment during building?
-package Seq::Tracks::Gene::Site;
-
-use Mouse 2;
-use Scalar::Util qw/looks_like_number/;
-
-use Seq::Tracks::Gene::Site::SiteTypeMap;
-use Seq::Tracks::Gene::Site::CodonMap;
-
-#exports log method to $self
-with 'Seq::Role::Message';
-
-#since the two Site:: packages are tightly coupled to packedCodon and
-#unpackCodon, I am making them public
-#internally using the variables directly, because when called tens of millions
-#of times, $self->codonMap may cost noticeable performance
-#TODO: test that theory
-#TODO: Remove these, consumers should just call Site::Class directly? 
-#Maybe not, because used to make combinedMap below -state $siteTypeMap = Seq::Tracks::Gene::Site::SiteTypeMap->new(); -has siteTypeMap => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { return $siteTypeMap }, -); - -state $codonMap = Seq::Tracks::Gene::Site::CodonMap->new(); -has codonMap => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { return $codonMap }, -); - -#These describe the site. -#Note that unpack will return both the transcript number and the site information -#So these indices refer only to the second value on, which represent -#The site information other than the transcript number (which is a reference to the region db) -#These must remain 0 - 4, used as constants in unpack -#They are simply exported for the use in consumers here -#Ex: 1-txSite : [txNubmer1, combinedStrandSitType, codonNumber, codonPosition, codonSequence] -#Expanded to: [txNumber1, strand, siteType, codonNumber, codonPosition, codonSequence] (in unpack) -has strandIdx => ( is => 'ro', init_arg => undef, lazy => 1, default => 0 ); -has siteTypeIdx => ( is => 'ro', init_arg => undef, lazy => 1, default => 1 ); -has codonNumberIdx => ( is => 'ro', init_arg => undef, lazy => 1, default => 2 ); -has codonPositionIdx => ( is => 'ro', init_arg => undef, lazy => 1, default => 3 ); -has codonSequenceIdx => ( is => 'ro', init_arg => undef, lazy => 1, default => 4 ); - -#pack strands as small integers, save a byte in messagepack -state $strandMap = { '-' => 0, '+' => 1, }; - -#combine strands with siteTypes #'-' will be a 0, '+' will be a 1; -state $combinedMap; -if ( !$combinedMap ) { - foreach ( keys %{ $siteTypeMap->siteTypeMap } ) { - for my $num ( 0, 1 ) { - $combinedMap->{ $_ - $num } = [ $num ? '+' : '-', $siteTypeMap->siteTypeMap->{$_} ]; - } - } -} - -# Cost to pack an array using messagePack (which happens by default) -# Should be the same as the overhead for messagePack storing a string -# Unless the Perl messagePack implementation isn't good -# So store as array to save pack / unpack overhead -sub pack { - my ( $self, $txNumber, $siteType, $strand, $codonNumber, $codonPosition, $codonSeq ) - = @_; - - my @outArray; - - if ( !defined $txNumber || !looks_like_number($txNumber) ) { - $self->log( 'fatal', 'packCodon requires txNumber' ); - } - - push @outArray, $txNumber; - - my $siteTypeNum = $siteTypeMap->getSiteTypeNum($siteType); - - if ( !defined $siteTypeNum ) { - $self->log( 'fatal', "site type $siteType not recognized" ); - } - - if ( !defined $strandMap->{$strand} ) { - $self->log( 'fatal', "Strand strand should be a + or -, got $strand" ); - } - - #combines the strand and site type - push @outArray, $siteTypeNum - $strandMap->{$strand}; - - if ( defined $codonNumber || defined $codonPosition || defined $codonSeq ) { - if ( !defined $codonNumber && !defined $codonPosition && !defined $codonSeq ) { - $self->log( 'fatal', - "Codons must be given codonNumber, codonPosition, and codonSeq" ); - } - - if ( !( looks_like_number($codonPosition) && looks_like_number($codonNumber) ) ) { - $self->log( 'fatal', - "codonPosition && codonNumber must be numeric, got $codonPosition && $codonNumber" ); - } - - push @outArray, $codonNumber; - push @outArray, $codonPosition; - - my $codonSeqNumber = $codonMap->codon2Num($codonSeq); - - if ( length($codonSeq) != 3 ) { - $self->log( 'debug', "codonSeqNumber for truncated is $codonSeqNumber" ); - } - - #warning for now, this mimics the original codebase - #TODO: do we want to store this as an error in the TX? 
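For clarity, a self-contained sketch of the combined strand/site-type encoding used above. Site-type numbers are all odd (the value 13 assumes SiteTypeMap's intronicBase constant), so subtracting the strand flag (0 for '-', 1 for '+') produces a unique, reversible key, which is exactly what $combinedMap inverts:

use strict;
use warnings;

my %strandMap   = ( '-' => 0, '+' => 1 );    # mirrors the state variable above
my $intronicNum = 13;                        # SiteTypeMap->intronicBase

my $minusKey = $intronicNum - $strandMap{'-'};    # 13: odd  => '-', intronic
my $plusKey  = $intronicNum - $strandMap{'+'};    # 12: even => '+', intronic

print "intronic on '-': $minusKey\nintronic on '+': $plusKey\n";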
- if ( !$codonSeqNumber ) { - $self->log( 'warn', "couldn\'t convert codon sequence $codonSeq to a number" ); - } - else { - push @outArray, $codonSeqNumber; - } - } - - return \@outArray; -} -#@param $self -#@param $codon -# This function assumes that the first value in any site array is the txNumber -# And the rest of values contain strandSiteTypeCombined, codonNumber, codonPosition, codonSequence -# The first value of that array is a combined siteType and strand -# Note also that we store codonPosition as 0 index (to try to store as 1/2 byte) -sub unpack { - # my $self, $codon - # $_[0], $_[1] - - # Sites are stored in the form [ [ $txNumber1, codon1Val1, codon1Val2, ... codon1ValY ], [ txNumber2, ...], ...] - # and for sites with only 1 transcript: # In the form [ $txNumber1, codon1Val1, codon1Val2,... ] - # So if the first value isn't an array, we have a single transcript - #! ref $codon->[0] - if ( !ref $_[1]->[0] ) { - # If the length of our only codon is 2, which happens in intergenic cases - # Then we return just the transcript number, and [strand, siteType] - # #@{$codon} == 2 - if ( @{ $_[1] } == 2 ) { - #returns: transcriptNum, [$strand, $siteType ] - # ( $codon->[0]),[( @{ $combinedMap->{ $codon->[1]} }) ] - return ( $_[1]->[0], [ ( @{ $combinedMap->{ $_[1]->[1] } } ) ] ); - } - - # The first value in the return list is the transcript number - #returns: transcriptNum, [$strand, $siteType ] - #return ( $codon->[0]),[( @{ $combinedMap->{ $codon->[1]} }) , $codon->[$codonNumberIdx], $codon->[$codonPositionIdx] + 1, - # $codonMap->num2Codon( $codon->[$codonSequenceIdx] ) ] ); - return ( - $_[1]->[0], - [ - ( @{ $combinedMap->{ $_[1]->[1] } } ), $_[1]->[2], - $_[1]->[3], $codonMap->num2Codon( $_[1]->[4] ) - ] - ); - } - - my ( @site, @txNumbers ); - foreach ( @{ $_[1] } ) { - # The first value is txNumber, and is always present - push @txNumbers, $_->[0]; - - if ( @{$_} == 2 ) { - # [ ( @{ $combinedMap->{ $_->[1] } } ) ] - push @site, [ ( @{ $combinedMap->{ $_->[1] } } ) ]; - next; - } - #push @site,[ ( @{ $combinedMap->{ $_->[1] } } ), $_->[$codonNumberIdx], $_->[$codonPositionIdx] + 1, - # $codonMap->num2Codon( $_->[$codonSequenceIdx] ) ]; - push @site, - [ - ( @{ $combinedMap->{ $_->[1] } } ), $_->[2], - $_->[3], $codonMap->num2Codon( $_->[4] ) - ]; - } - - return ( \@txNumbers, \@site ); -} - -#Future API - -# sub _unpackCodonBulk { -# #my ($self, $codoAref) = @_; -# #$codonStr == $_[1] -# #may be called a lot, so not using arg assignment -# #Old version relied on pack/unpack, here are some informal tests: -# #https://ideone.com/TFGjte -# #https://ideone.com/dVy6WL -# #my @unpackedCodon = $_[1] ? unpack('cAlcAAA', $_[1]) : (); -# #etc - -# for(my $i) - -# return { -# $siteTypeKey => defined $_[1]->[0] ? $_[0]->getSiteTypeFromNum($_[1]->[0]) : undef, -# $strandKey => $_[1]->[1], -# $codonNumberKey => $_[1]->[2], -# $codonPositionKey => $_[1]->[3], -# $peptideKey => defined $_[1]->[4] ? 
$_[0]->codon2aa( $_[0]->num2Codon($_[1]->[4]) ) : undef -# } -# } - -# sub getCodonStrand { -# return $unpackedCodonHref->{$_[0]->strandKey}; -# } - -# sub getCodonNumber { -# return $unpackedCodonHref->{$_[0]->codonNumberKey}; -# } - -# sub getCodonPosition { -# return $unpackedCodonHref->{$_[0]->codonPositionKey}; -# } - -# #https://ideone.com/cNQfwv -# sub getCodonSequence { -# return $unpackedCodonHref->{$_[0]->codonSequenceKey}; -# } - -# sub getCodonAAresidue { -# return $unpackedCodonHref->{$_[0]->peptideKey}; -# } - -# not in use yet -# sub hasCodon { -# my ($self, $href) = @_; - -# return !!$href->{ $invFeatureMap->{refCodonSequence} }; -# } - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Gene/Site/CodonMap.pm b/perl/lib/Seq/Tracks/Gene/Site/CodonMap.pm deleted file mode 100644 index 93bd96159..000000000 --- a/perl/lib/Seq/Tracks/Gene/Site/CodonMap.pm +++ /dev/null @@ -1,194 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -#Stores all 64 possible codons in a numerical map -#Stores all 64 possible codons mapped to single-leter amino-acids - -# Safe for use when instantiated to static variable; no set - able properties -package Seq::Tracks::Gene::Site::CodonMap; - -use Mouse 2; -use namespace::autoclean; - -# Store both normal and truncated AA, to remove ambiguity for consumers -# 0 to 64 and 0 to 32 take 1 byte in msgpack -state $codonMap = { - AAA => 1, - AAC => 2, - AAG => 3, - AAT => 4, - ACA => 5, - ACC => 6, - ACG => 7, - ACT => 8, - AGA => 9, - AGC => 10, - AGG => 11, - AGT => 12, - ATA => 13, - ATC => 14, - ATG => 15, - ATT => 16, - CAA => 17, - CAC => 18, - CAG => 19, - CAT => 20, - CCA => 21, - CCC => 22, - CCG => 23, - CCT => 24, - CGA => 25, - CGC => 26, - CGG => 27, - CGT => 28, - CTA => 29, - CTC => 30, - CTG => 31, - CTT => 32, - GAA => 33, - GAC => 34, - GAG => 35, - GAT => 36, - GCA => 37, - GCC => 38, - GCG => 39, - GCT => 40, - GGA => 41, - GGC => 42, - GGG => 43, - GGT => 44, - GTA => 45, - GTC => 46, - GTG => 47, - GTT => 48, - TAA => 49, - TAC => 50, - TAG => 51, - TAT => 52, - TCA => 53, - TCC => 54, - TCG => 55, - TCT => 56, - TGA => 57, - TGC => 58, - TGG => 59, - TGT => 60, - TTA => 61, - TTC => 62, - TTG => 63, - TTT => 64, - - #truncated - AA => -1, - AC => -2, - AG => -3, - AT => -4, - CA => -5, - CC => -6, - CG => -7, - CT => -8, - GA => -9, - GC => -10, - GG => -11, - GT => -12, - TA => -13, - TC => -14, - TG => -15, - TT => -16, - A => -17, - C => -18, - G => -19, - T => -20, -}; - -sub codon2Num { - #my ( $self, $codon ) = @_; - #will return undefined if not found - return $codonMap->{ $_[1] }; -} - -state $codonInverseMap = { map { $codonMap->{$_} => $_ } keys %$codonMap }; - -sub num2Codon { - #my ( $self, $codon ) = @_; - #will return undefined if not found - - return $codonInverseMap->{ $_[1] }; -} - -state $codonAAmap = { - "AAA" => "K", - "AAC" => "N", - "AAG" => "K", - "AAT" => "N", - "ACA" => "T", - "ACC" => "T", - "ACG" => "T", - "ACT" => "T", - "AGA" => "R", - "AGC" => "S", - "AGG" => "R", - "AGT" => "S", - "ATA" => "I", - "ATC" => "I", - "ATG" => "M", - "ATT" => "I", - "CAA" => "Q", - "CAC" => "H", - "CAG" => "Q", - "CAT" => "H", - "CCA" => "P", - "CCC" => "P", - "CCG" => "P", - "CCT" => "P", - "CGA" => "R", - "CGC" => "R", - "CGG" => "R", - "CGT" => "R", - "CTA" => "L", - "CTC" => "L", - "CTG" => "L", - "CTT" => "L", - "GAA" => "E", - "GAC" => "D", - "GAG" => "E", - "GAT" => "D", - "GCA" => "A", - "GCC" => "A", - "GCG" => "A", - "GCT" => "A", - "GGA" => "G", - "GGC" => "G", - "GGG" => "G", - "GGT" => "G", - 
"GTA" => "V", - "GTC" => "V", - "GTG" => "V", - "GTT" => "V", - "TAA" => "*", - "TAC" => "Y", - "TAG" => "*", - "TAT" => "Y", - "TCA" => "S", - "TCC" => "S", - "TCG" => "S", - "TCT" => "S", - "TGA" => "*", - "TGC" => "C", - "TGG" => "W", - "TGT" => "C", - "TTA" => "L", - "TTC" => "F", - "TTG" => "L", - "TTT" => "F" -}; - -sub codon2aa { - #my ( $self, $codon ) = @_; - #will return undefined if not found - return $codonAAmap->{ $_[1] }; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Gene/Site/SiteTypeMap.pm b/perl/lib/Seq/Tracks/Gene/Site/SiteTypeMap.pm deleted file mode 100644 index d644e1134..000000000 --- a/perl/lib/Seq/Tracks/Gene/Site/SiteTypeMap.pm +++ /dev/null @@ -1,175 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Gene::Site::SiteTypeMap; - -use Mouse 2; -use Mouse::Util::TypeConstraints; -# Define allowable types - -# Safe for use when instantiated to static variable; no set - able properties -state $codingSite = 'exonic'; -has codingSiteType => - ( is => 'ro', lazy => 1, init_arg => undef, default => sub { $codingSite } ); -state $fivePrimeSite = 'UTR5'; -has fivePrimeSiteType => - ( is => 'ro', lazy => 1, init_arg => undef, default => sub { $fivePrimeSite } ); -state $threePrimeSite = 'UTR3'; -has threePrimeSiteType => ( - is => 'ro', - lazy => 1, - init_arg => undef, - default => sub { $threePrimeSite } -); -state $spliceAcSite = 'spliceAcceptor'; -has spliceAcSiteType => - ( is => 'ro', lazy => 1, init_arg => undef, default => sub { $spliceAcSite } ); -state $spliceDonSite = 'spliceDonor'; -has spliceDonSiteType => - ( is => 'ro', lazy => 1, init_arg => undef, default => sub { $spliceDonSite } ); -state $ncRNAsite = 'ncRNA'; -has ncRNAsiteType => - ( is => 'ro', lazy => 1, init_arg => undef, default => sub { $ncRNAsite } ); -state $intronicSite = 'intronic'; -has intronicSiteType => - ( is => 'ro', lazy => 1, init_arg => undef, default => sub { $intronicSite } ); - -# #Coding type always first; order of interest -state $siteTypes = [ - $codingSite, $fivePrimeSite, $threePrimeSite, $spliceAcSite, - $spliceDonSite, $ncRNAsite, $intronicSite -]; - -# #public -has siteTypes => ( - is => 'ro', - isa => 'ArrayRef', - traits => ['Array'], - handles => { - allSiteTypes => 'elements', - getSiteType => 'get', - }, - lazy => 1, - init_arg => undef, - default => sub { $siteTypes }, -); - -has nonCodingBase => ( - is => 'ro', - isa => 'Int', - init_arg => undef, - lazy => 1, - default => 1, -); - -has codingBase => ( - is => 'ro', - isa => 'Int', - init_arg => undef, - lazy => 1, - default => 3, -); - -has fivePrimeBase => ( - is => 'ro', - isa => 'Int', - init_arg => undef, - lazy => 1, - default => 5, -); - -has threePrimeBase => ( - is => 'ro', - isa => 'Int', - init_arg => undef, - lazy => 1, - default => 7, -); - -has spliceAcBase => ( - is => 'ro', - isa => 'Int', - init_arg => undef, - lazy => 1, - default => 9, -); - -has spliceDonBase => ( - is => 'ro', - isa => 'Int', - init_arg => undef, - lazy => 1, - default => 11, -); - -has intronicBase => ( - is => 'ro', - isa => 'Int', - init_arg => undef, - lazy => 1, - default => 13, -); - -#TODO: should constrain values to GeneSiteType -has siteTypeMap => ( - is => 'ro', - isa => 'HashRef', - traits => ['Hash'], - handles => { getSiteTypeFromNum => 'get', }, - lazy => 1, - init_arg => undef, - builder => '_buildSiteTypeMap', -); - -sub _buildSiteTypeMap { - my $self = shift; - - state $mapHref = { - $self->nonCodingBase => $ncRNAsite, - $self->codingBase => $codingSite, - 
$self->fivePrimeBase => $fivePrimeSite, - $self->threePrimeBase => $threePrimeSite, - $self->spliceAcBase => $spliceAcSite, - $self->spliceDonBase => $spliceDonSite, - $self->intronicBase => $intronicSite, - }; - - return $mapHref; -} - -#takes a GeneSite value and returns a number, matching the _siteTypeMap key -has siteTypeMapInverse => ( - is => 'ro', - isa => 'HashRef', - traits => ['Hash'], - handles => { getSiteTypeNum => 'get', }, - lazy => 1, - init_arg => undef, - builder => '_buildSiteTypeMapInverse', -); - -sub _buildSiteTypeMapInverse { - my $self = shift; - - state $inverse = - { map { $self->siteTypeMap->{$_} => $_ } keys %{ $self->siteTypeMap } }; - - return $inverse; -} - -has exonicSites => ( - is => 'ro', - init_arg => undef, - lazy => 1, - isa => 'HashRef', - traits => ['Hash'], - handles => { isExonicSite => 'exists', }, - default => sub { - return { map { $_ => 1 } - ( $codingSite, $ncRNAsite, $fivePrimeSite, $threePrimeSite ) }; - }, -); - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Get.pm b/perl/lib/Seq/Tracks/Get.pm deleted file mode 100644 index d4b685189..000000000 --- a/perl/lib/Seq/Tracks/Get.pm +++ /dev/null @@ -1,114 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Get; -# Synopsis: For fetching data -# TODO: make this a role? -our $VERSION = '0.001'; - -use Mouse 2; - -extends 'Seq::Tracks::Base'; - -use Seq::Headers; - -# Note that Seq::Headeres is a singleton class -# The headers property is exported only to allow easier overriding of setHeaders -has headers => ( - is => 'ro', - init_arg => undef, - lazy => 1, - default => sub { Seq::Headers->new() }, -); - -sub BUILD { - my $self = shift; - - # Skip accesor penalty, the get function in this package may be called - # hundreds of millions of times - $self->{_dbName} = $self->dbName; - - #register all features for this track - #@params $parent, $child - #if this class has no features, then the track's name is also its only feature - if ( $self->noFeatures ) { - return; - } - - $self->{_fDb} = [ map { $self->getFieldDbName($_) } @{ $self->features } ]; - $self->{_fIdx} = [ 0 .. 
$#{ $self->features } ]; -} - -# Decouple from build to allow decoupling from dbName / build order -sub setHeaders { - my $self = shift; - - if ( $self->noFeatures ) { - $self->headers->addFeaturesToHeader( $self->name ); - return; - } - - $self->headers->addFeaturesToHeader( $self->features, $self->name ); -} - -# Take an array reference containing (that is passed to this function), and get back all features -# that belong to thie Track -# @param $self -# @param $href : The raw data (presumably from the database); -# @return : A hash ref of featureName => featureValue pairs for -# all features the user specified for this Track in their config file -sub get { - #my ($self, $href, $chr, $refBase, $allele, $outAccum, $alleleNumber) = @_ - # $_[0] == $self - # $_[1] == $href : the database data, with each top-level index corresponding to a track - # $_[2] == $chr : the chromosome - # $_[3] == $refBase : a base, one of ACTG - # $_[4] == $allele : a base, the allele (ACTG or a deletion in the form of "-{number}", or insertion in the form of "+" followed by a sequence of nucleotides) - # $_[5] == $posIdx : the position in the indel, if any - # $_[6] == $outAccum : a reference to the output, which we mutate - - #internally the data is store keyed on the dbName not name, to save space - # 'some dbName' => someData - #dbName is simply the track name as stored in the database - - #some features simply don't have any features, and for those just return - #the value they stored - if ( !$_[0]->{_fIdx} ) { - #$outAccum->[$posIdx] = $href->[ $self->{_dbName} ] - # $_[6]->[$_[5]] = $_[1]->[ $_[0]->{_dbName} ]; - - #return #$outAccum; - return $_[6]; - } - - # TODO: decide whether we want to revert to old system of returning a bunch of ! - # one for each feature - # This is clunky, to have _i and fieldDbNames - if ( !defined $_[1]->[ $_[0]->{_dbName} ] ) { - for my $i ( @{ $_[0]->{_fIdx} } ) { - $_[6]->[$i][ $_[5] ] = undef; - } - - return $_[6]; - } - - # We have features, so let's find those and return them - # Since all features are stored in some shortened form in the db, we also - # will first need to get their dbNames ($self->getFieldDbName) - # and these dbNames will be found as a value of $href->{$self->dbName} - # #http://ideone.com/WD3Ele - # return [ map { $_[1]->[$_[0]->{_dbName}][$_] } @{$_[0]->{_fieldDbNames}} ]; - my $idx = 0; - for my $fieldDbName ( @{ $_[0]->{_fDb} } ) { - #$outAccum->[$idx][$posIdx] = $href->[$self->{_dbName}][$fieldDbName] } - $_[6]->[$idx][ $_[5] ] = $_[1]->[ $_[0]->{_dbName} ][$fieldDbName]; - $idx++; - } - - #return #$outAccum; - return $_[6]; -} -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Nearest.pm b/perl/lib/Seq/Tracks/Nearest.pm deleted file mode 100644 index 389c925be..000000000 --- a/perl/lib/Seq/Tracks/Nearest.pm +++ /dev/null @@ -1,161 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Nearest; - -our $VERSION = '0.001'; - -=head1 DESCRIPTION - - @class B - - A Region track that also calculates distance if the user wishes - IE it pulls any requested features by a region reference (an in 0 to N-1) - And then if "dist" is specified, calculates that based on the "from" and "to" features - -=cut - -## TODO: remove the if(nearestGeneNumber) check. 
Right now needed because -## we have no chrM refSeq stuff -use Mouse 2; - -use namespace::autoclean; - -extends 'Seq::Tracks::Base'; -with 'Seq::Tracks::Region::RegionTrackPath'; - -use Seq::Headers; -use Seq::DBManager; - -# Coordinate we start looking from -# Typically used for gene tracks, would be 'txStart' -has from => ( is => 'ro', isa => 'Str', required => 1 ); - -# Coordinate we look to -# Typically used for gene tracks, would 'txEnd' or nothing -has to => ( is => 'ro', isa => 'Maybe[Str]' ); - -has dist => ( is => 'ro', isa => 'Bool', default => 1 ); - -#### Add our other "features", everything we find for this site #### -sub BUILD { - my $self = shift; - - # Avoid accessor penalties in Mouse/Moose; - - # We only have 1 end - $self->{_eq} = !$self->to || $self->from eq $self->to; - # We expect to ALWAYS have a from field - $self->{_fromD} = $self->getFieldDbName( $self->from ); - # But "to" is optional - $self->{_toD} = !$self->{_eq} ? $self->getFieldDbName( $self->to ) : undef; - - $self->{_db} = Seq::DBManager->new(); - $self->{_dbName} = $self->dbName; - - # We may or may not want to calculate distance - $self->{_dist} = !!$self->dist; - $self->{_fieldDbNames} = [ map { $self->getFieldDbName($_) } @{ $self->features } ]; -} - -sub setHeaders { - my $self = shift; - - my $headers = Seq::Headers->new(); - my @features = @{ $self->features }; - - if ( $self->dist ) { - push @features, 'dist'; - } - - $headers->addFeaturesToHeader( \@features, $self->name ); - - # If we have dist, it comes as our last feature - # We don't have a field db name for dist...this is a calculated feature - # so we just want to get into the header, and into the output - $self->{_fIdx} = [ 0 .. $#{ $self->features } ]; -} - -sub get { - #my ($self, $href, $chr, $refBase, $allele, $positionIdx, $outAccum, $position) = @_ - # $_[0] == $self - # $_[1] == $href : the database data, with each top-level index corresponding to a track - # $_[2] == $chr : the chromosome - # $_[3] == $refBase : ACTG - # $_[4] == $allele : the allele (ACTG or -N / +ACTG) - # $_[5] == $positionIdx : the position in the indel, if any - # $_[6] == $outAccum : a reference to the output, which we mutate - # $_[7] == $zeroPos : the 0-based genomic position - ################# Cache track's region data ############## - #$self->{_regionData}{$chr} //= $self->{_db}->dbReadAll( $self->regionTrackPath($_[2]) ); - - # If the position idx isn't 0 we're in an indel - # We should make a decision whether to tile across the genom - # My opinion atm is its substantially easier to just consider the indel - # from the starting position w.r.t nearest data - # However, this also removes useful information when an indel spans - # multiple regions (in our use case mostly genes) - # if($_[5] != 0) { - # return $_[6]; - # } - - # WARNING: If $_[1]->[$_[0]->{_dbName} isn't defined, will be treated as the 0 index!!! 
- # therefore return here if that is the case - if ( !defined $_[1]->[ $_[0]->{_dbName} ] ) { - for my $i ( @{ $_[0]->{_fIdx} } ) { - $_[6]->[$i][ $_[5] ] = undef; - } - - return $_[6]; - } - - $_[0]->{_regionData}{ $_[2] } //= - $_[0]->{_db}->dbReadAll( $_[0]->regionTrackPath( $_[2] ) ); - - my $geneDb = $_[0]->{_regionData}{ $_[2] }[ $_[1]->[ $_[0]->{_dbName} ] ]; - - # exit; - # We have features, so let's find those and return them - # Since all features are stored in some shortened form in the db, we also - # will first need to get their dbNames ($self->getFieldDbName) - # and these dbNames will be found as a value of $href->{$self->dbName} - # #http://ideone.com/WD3Ele - - # All features from overlapping are already combined into arrays, unlike - # what gene tracks used to do - # Here we accumulate all features, except for the dist (not included in _fieldDbNames) - my $i = 0; - for my $fieldDbName ( @{ $_[0]->{_fieldDbNames} } ) { - #$outAccum->[$i][$positionIdx] = $href->[$self->{_dbName}][$self->{_fieldDbNames}[$i]] } - $_[6]->[$i][ $_[5] ] = $geneDb->[$fieldDbName]; - $i++; - } - - # Calculate distance if requested - # We always expect from and to fields to be scalars - # Notice that dist is our last feature, because $i incremented +1 here - if ( $_[0]->{_dist} ) { - if ( $_[0]->{_eq} || $_[7] < $geneDb->[ $_[0]->{_fromD} ] ) { - # We're before the starting position of the nearest region - # Or we're only checking one boundary (the from boundary) - $_[6]->[$i][ $_[5] ] = $geneDb->[ $_[0]->{_fromD} ] - $_[7]; - } - elsif ( $_[7] <= $geneDb->[ $_[0]->{_toD} ] ) { - # We already know $zeroPos >= $geneDb->[$_[0]->{_fromD}] - # so if we're here, we are within the range of the requested region at this position - # ie == 0 distance to the region - $_[6]->[$i][ $_[5] ] = 0; - } - else { - # occurs after the 'to' position - $_[6]->[$i][ $_[5] ] = $geneDb->[ $_[0]->{_toD} ] - $_[7]; - } - } - - return $_[6]; -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Nearest/Build.pm b/perl/lib/Seq/Tracks/Nearest/Build.pm deleted file mode 100644 index 0e75ad240..000000000 --- a/perl/lib/Seq/Tracks/Nearest/Build.pm +++ /dev/null @@ -1,864 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Nearest::Build; - -our $VERSION = '0.001'; - -# ABSTRACT: Builds region-based tracks , from some coordinate to some other coordinate -# It then writes a reference to some index in a region database -# And then creates that region database -# The reference # to the region database actually is a unique combination -# of overlapping data based on those coordinates -# So if 3 transcripts overlap completely, there will be a single reference -# that references the combined information of those 3 transcripts (whatever features requested by user) -# Notably, this data is also unique; namely, if the transcript features are all -# redundant, only 1 set of data will be written to the region database at that reference -# If the combinations are not totally redundant, all N unique combinatins will be written -# Finally, within any feature, if all values are completely redundant, only 1 such value -# will be written -# This is distinct from the gene track, which does no de-duplication -# Therefore it makes more sense to store information that is say only available at the gene -# rather than the transcript level, as this kind of track, rather than a type:gene -# It will make that data far more human readable. 
- -use Mouse 2; -use namespace::autoclean; - -use Parallel::ForkManager; -use Scalar::Util qw/looks_like_number/; - -use List::Util qw/max min/; -# http://fastcompression.blogspot.com/2014/07/xxhash-wider-64-bits.html -# much faster than md5, no cryptographic guarantee should suffice for our use -# Unfortunately, that turned out not to be true, failed tests -# use Digest::xxHash qw/xxhash64/; -use Digest::MD5 qw/md5/; -use Data::MessagePack; - -use Seq::Tracks; - -extends 'Seq::Tracks::Build'; -#exports regionTrackPath -with 'Seq::Tracks::Region::RegionTrackPath'; - -# Use to evaluate whether we have duplicates, together with md5 -# canonical needed to prevent pseudo-random ordering of hash keys -# although in our instance, shouldn't have any impact, as we use only arrays -my $mp = Data::MessagePack->new()->canonical(); - -# TODO: Currently we mutate the 'from' and 'to' properties -# such that these may not keep 1:1 correspondance in case of overlapping transcripts -# To get around this we could store shadow properties of these -# for distance calculation purposes - -# Coordinate we start looking from -# Typically used for gene tracks, would be 'txStart' -# This, at the moment, must be a column in which each value is a number -# So for instance, exonEnds, which have multiple comma-separated values -# Wouldn't work, since they would either be treated as a string, -# or through build_field_transformations: exonEnds: split(,) -# would appear to this program as an array of numbers, which this program -# doesn't currently know what to do with -has from => ( is => 'ro', isa => 'Str', required => 1 ); - -# Coordinate we look to -# Typically used for gene tracks, would 'txEnd' or nothing -# Similarly, should be a column of numbers -# Not required, we may opt to check against a single point, like txStart, for -# a "nearestTss" track -# To simplify the funcitons, we default to the "from" attribute -# In our BUILDARGS -has to => ( - is => 'ro', - isa => 'Str', - lazy => 1, - default => sub { - my $self = shift; - return $self->from; - } -); - -# If we're not given local_files, get them from a track reference -has ref => ( is => 'ro', isa => 'Seq::Tracks::Build' ); - -# So that we know which db to store within (db is segregated on chromosome) -has chromField => ( is => 'ro', isa => 'Str', default => 'chrom' ); - -# Should we store not just the stuff that is intergenic (or between defined regions) -# but also that within the regions themselves -# This is Cutler/Wingo's preferred solution, so that we have data -# for every position in genome -# This I think is reasonable, and from my perspective provides a nice search advantage -# We can search for nearest.dist < 5000 for instance, and include things that are 0 distance away -has storeOverlap => ( is => 'ro', isa => 'Bool', default => 1 ); -has storeNearest => ( is => 'ro', isa => 'Bool', default => 1 ); -my $txNumberKey = 'txNumber'; - -around BUILDARGS => sub { - my ( $orig, $self, $data ) = @_; - - if ( !defined $data->{local_files} ) { - # Careful with the reference - $data->{local_files} = $data->{ref}->local_files; - } - - # We require non-empty 'from' - if ( !$data->{from} ) { - # A nicer exit - $self->log( 'fatal', "'from' property must be specified for 'nearest' tracks" ); - } - - ############# Add "from" and "to" to "features" if not present ############### - # To allow distance calculations in the getter (Nearest.pm) - $data->{features} //= []; - - my $hasFrom; - my $hasTo; - for my $feat ( @{ $data->{features} } ) { - if ( $feat eq 
$data->{from} ) { - $hasFrom = 1; - } - - if ( $data->{to} && $feat eq $data->{to} ) { - $hasTo = 1; - } - } - - if ( $data->{to} && !$hasTo ) { - unshift @{ $data->{features} }, $data->{to}; - } - - if ( !$hasFrom ) { - unshift @{ $data->{features} }, $data->{from}; - } - - ##### Ensure that we get the exact 0-based, half-open coordinates correct ###### - # allows us to consider from .. to rather than from .. to - 1, from - 1 .. to, etc - # TODO: don't assume UCSC-style genes, and require a build_field_transformation - # For anything other than 'from' as 0-based closed and 'to' as 0-based open (+1 of true) - if ( $data->{build_field_transformations}{ $data->{from} } - && ( !$data->{to} || $data->{build_field_transformations}{ $data->{to} } ) ) - { - return $self->$orig($data); - } - - # We softly enforce build_field_transformations for the comm - if ( $data->{from} eq 'txEnd' || $data->{from} eq 'cdsEnd' ) { - $data->{build_field_transformations}{ $data->{from} } = '- 1'; - } - - if ( $data->{to} && ( $data->{to} eq 'txEnd' || $data->{to} eq 'cdsEnd' ) ) { - $data->{build_field_transformations}{ $data->{to} } = '- 1'; - } - - return $self->$orig($data); -}; - -sub BUILD { - my $self = shift; - - # We require these two fields, so make sure we make db names for them - # These are stored in the region database, allowing us to calculate distnace, - # Should that be needed - # We'll store these regardless of the 'dist' property - # Won't add many bytes (<18 per transcript), and add flexibility for the user - $self->getFieldDbName( $self->from ); - $self->getFieldDbName( $self->to ); - - #Check that -} - -# Find all of the nearest genes, for any intergenic regions -# Genic regions by our definition are nearest to themselves -# All UCSC refGene data is 0-based -# http://www.noncode.org/cgi-bin/hgTables?db=hg19&hgta_group=genes&hgta_track=refGene&hgta_table=refGene&hgta_doSchema=describe+table+schema -sub buildTrack { - my $self = shift; - - my @allFiles = $self->allLocalFiles; - - # Only allow 1 thread because perl eats memory like candy - my $pm = Parallel::ForkManager->new( $self->maxThreads ); - - my %allIdx; # a map { featureName => columnIndexInFile} - my %regionIdx; #like allIdx, but only for features going into the region databae - # Every row (besides header) describes a transcript - my %regionData; - - my $wantedChr; - - my $txNumber; - - my @fieldDbNames = - sort { $a <=> $b } map { $self->getFieldDbName($_) } @{ $self->features }; - - my %completedChrs; - $pm->run_on_finish( - sub { - my ( $pid, $exitCode, $fileName, $exitSignal, $coreDump, $errOrChrs ) = @_; - - if ( $exitCode != 0 ) { - my $err = $self->name - . ": got exitCode $exitCode for $fileName: $exitSignal . Dump: $coreDump"; - - $self->log( 'fatal', $err ); - } - - if ( $errOrChrs && ref $errOrChrs eq 'HASH' ) { - for my $chr ( keys %$errOrChrs ) { - if ( !$completedChrs{$chr} ) { - $completedChrs{$chr} = [$fileName]; - } - else { - push @{ $completedChrs{$chr} }, $fileName; - } - } - } - - #Only message that is different, in that we don't pass the $fileName - $self->log( 'info', $self->name . ": completed building from $fileName" ); - } - ); - - # Assume one file per loop, or all sites in one file. 
Tracks::Build warns if not - for my $file (@allFiles) { - # Although this should be unnecessary, environments must be created - # within the process that uses them - # This provides a measure of safety - $self->db->cleanUp(); - - $pm->start($file) and next; - my ( $err, undef, $fh ) = $self->getReadFh($file); - - if ($err) { - $self->log( 'fatal', $self->name . ": $err" ); - } - - my $firstLine = <$fh>; - - # Fatal/exit will only affect that process, won't affect others - if ( !defined $firstLine ) { - $self->log( 'fatal', $self->name . ": Failed to read header of $file" ); - } - - chomp $firstLine; - - # If the user wanted to transform the input field names, do, so source field names match - # those expected by the track - my @fields = map { $self->fieldMap->{$_} || $_ } split( '\t', $firstLine ); - - # Store all features we can find, for Seq::Build::Gene::TX. Avoid autocracy, - # don't need to know what Gene::TX requires. - my $fieldIdx = 0; - for my $field (@fields) { - $allIdx{$field} = $fieldIdx; - $fieldIdx++; - } - - my $fromIdx = $allIdx{ $self->from }; - my $toIdx = $allIdx{ $self->to }; - - # Except w.r.t the chromosome field, txStart, txEnd, txNumber definitely need these - if ( - !( defined $allIdx{ $self->chromField } && defined $fromIdx && defined $toIdx ) ) - { - $self->log( 'fatal', $self->name . ': must provide chrom, from, to fields' ); - } - - # Region database features; as defined by user in the YAML config, or our default - REGION_FEATS: for my $field ( @{ $self->features } ) { - if ( exists $allIdx{$field} ) { - $regionIdx{$field} = $allIdx{$field}; - next REGION_FEATS; - } - - $self->log( 'fatal', - $self->name . ": required $field missing in $file header: $firstLine" ); - } - - # We add the "from" and "to" fields to allow distance calculations - - # Read the file. 
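A minimal sketch of the header handling just performed: apply the user's fieldMap renames, then record each column's index by field name so rows can be addressed by feature. The header line and rename map here are hypothetical:

use strict;
use warnings;

my %fieldMap  = ( '#chrom' => 'chrom' );
my $firstLine = "#chrom\ttxStart\ttxEnd";
chomp $firstLine;

# Rename any source columns the user remapped, keep the rest as-is
my @fields = map { $fieldMap{$_} // $_ } split( '\t', $firstLine );

my %allIdx;
my $fieldIdx = 0;
for my $field (@fields) {
  $allIdx{$field} = $fieldIdx;
  $fieldIdx++;
}

# chrom => 0, txStart => 1, txEnd => 2
print "$_ => $allIdx{$_}\n" for sort { $allIdx{$a} <=> $allIdx{$b} } keys %allIdx;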
- # We store everything on the basis of chr, so that we can accept - # Either a file that contains multiple chromosomes - # Or multiples file that contains a single chromosome each - my $fromDbName = $self->getFieldDbName( $self->from ); - my $toDbName = $self->getFieldDbName( $self->to ); - my $rowIdx = 0; - - my %visitedChrs; - - #TODO: ADD check if we have any wanted chrs - FH_LOOP: while (<$fh>) { - chomp; - my @fields = split( '\t', $_ ); - - # Normalize the representation, such that having or missing 'chr' - # or using MT instead of M won't matter - my $chr = $self->normalizedWantedChr->{ $fields[ $allIdx{ $self->chromField } ] }; - - # falsy value is '' - if ( !defined $wantedChr || ( !defined $chr || $wantedChr ne $chr ) ) { - $wantedChr = $self->chrWantedAndIncomplete($chr); - } - - # We no longer care if we have multiple chromosomes in a single file - # because memory usage is well controlled despite our use of pre-calculated - # overlaps - # We also no longer exit if finding an unwanted chromosome, for safety (ensure all sites found) - # at expense of runtime - # TODO: think about handling chrPerFile - if ( !defined $wantedChr ) { - next FH_LOOP; - } - - $visitedChrs{$wantedChr} //= 1; - - my @rowData; - - # Field db names are numerical, from 0 to N - 1 - # Assign the last one as the last index, rather than by $#fieldDbNames - # so that if we have sparse feature names, rowData still can accomodate them - $#rowData = $fieldDbNames[-1]; - ACCUM_VALUES: for my $fieldName ( keys %regionIdx ) { - my $data = $fields[ $regionIdx{$fieldName} ]; - - # say split, etc; comes first so that each individual value - # in an array (if split) can be coerced - if ( $self->hasTransform($fieldName) ) { - $data = $self->transformField( $fieldName, $data ); - } - - # convert the value into some type, typically number(N) - $data = $self->coerceFeatureType( $fieldName, $data ); - - # if this is a field that we need to store in the region db - # create a shortened field name - my $fieldDbName = $self->getFieldDbName($fieldName); - - #store under a shortened fieldName to save space in the db - $rowData[$fieldDbName] = $data; - } - - my $from = $rowData[$fromDbName]; - my $to = $rowData[$toDbName]; - - if ( - !( - defined $from && defined $to && looks_like_number($from) && looks_like_number($to) - ) - ) - { - $self->log( 'fatal', - "Expected numeric 'from' and 'to' fields, found: $from and $to" ); - } - - $regionData{$wantedChr}{$rowIdx} = [ $rowIdx, \@rowData ]; - - $rowIdx++; - } - - #Commit, sync everything, including completion status, and release mmap - $self->db->cleanUp(); - - # If we fork a process in order to read (example zcat) prevent that process - # from becoming defunct - $self->safeCloseBuilderFh( $fh, $file, 'fatal' ); - - # We've now accumulated everything from this file - # So write it. LMDB will serialize writes, so this is fine, even - # if the file is not properly organized by chromosome - # Requires that each file contains only one kind of chromosome - for my $chr ( keys %regionData ) { - $self->_writeNearestData( $chr, $regionData{$chr}, \@fieldDbNames ); - - # We've finished with 1 chromosome, so write that to meta to disk - $self->completionMeta->recordCompletion($chr); - } - - #Commit, sync everything, including completion status, and release mmap - $self->db->cleanUp(); - $pm->finish( 0, \%visitedChrs ); - } - - $pm->wait_all_children(); - - for my $chr ( keys %completedChrs ) { - $self->completionMeta->recordCompletion($chr); - - $self->log( 'info', - $self->name - . 
": recorded $chr completed, from " - . ( join( ",", @{ $completedChrs{$chr} } ) ) ); - } - - #TODO: figure out why this is necessary, even with DEMOLISH - $self->db->cleanUp(); - - return; -} - -# We tile in the following way -#---previousLongestEnd##########midpoint#########currentStart-----currentLongestEnd -#everything before midpoint is assigned to the previous region -#everything midpoint on is assigned to the transcripts/regions overlapping currentStart -#everything between currentStart and currentEnd (closed interval) -#is assigned on a base by base basis - -# For all transcript/region records that overlap we take the smallest start -# and the largest end, to come up with the largest contiguous region -# And when (in getter) calculating distance, consider anything within -# such an interval as having distance 0 - -# Similarly, for any transcripts/regions sharing a start with multiple ends -# take the largest end -# And for any ends sharing a start, take the smallest start -sub _writeNearestData { - my ( $self, $chr, $regionDataHref, $fieldDbNames ) = @_; - - my $fromDbName = $self->getFieldDbName( $self->from ); - my $toDbName = $self->getFieldDbName( $self->to ); - - my $regionDbName = $self->regionTrackPath($chr); - - my $uniqNumMaker = _getTxNumber(); - my $uniqRegionEntryMaker = - _makeUniqueRegionData( $fromDbName, $toDbName, $fieldDbNames ); - - # First sort by to position, ascending (smallest to largest) - # then by from position (smallest to largest) - my @sorted = sort { $a->[1][$fromDbName] <=> $b->[1][$fromDbName] } - sort { $a->[1][$toDbName] <=> $b->[1][$toDbName] } values %{$regionDataHref}; - - # the tx starts ($self->from key) - my %startData; - - for my $data (@sorted) { - my $start = $data->[1][$fromDbName]; - push @{ $startData{$start} }, $data; - } - - # the tx ends ($self->to key) - my %endData; - - for my $data (@sorted) { - my $end = $data->[1][$toDbName]; - push @{ $endData{$end} }, $data; - } - - $self->log( 'info', $self->name . ": starting for $chr" ); - - # Get database length : assumes reference track already in the db - my $genomeNumberOfEntries = $self->db->dbGetNumberOfEntries($chr); - - if ( !$genomeNumberOfEntries ) { - $self->log( 'fatal', - $self->name - . 
" requires at least the reference track, to know how many bases in $chr" ); - } - - # Track the longest (further in db toward end of genome) txEnd, because - # in case of overlapping transcripts, want the points that ARENT - # covered by a gene (since those have apriori nearest records: themselves) - # This also acts as our starting position - # my $longestPreviousTxEnd = 0; - # my $longestPreviousTxNumber; - - my $midPoint; - - # We will combine overlapping transcripts here, and generate a unique txNumber - # for each combination - # This saves us having to walk transcript arrays to gather features at run time - # And also saves us having to write arrays to the main database, per genome position - # A big net win - my @globalTxData; - - my $previousLongestEnd; - my $previousTxNumber; - - my $cursor; - - my $count = 0; - - TXSTART_LOOP: for ( my $n = 0; $n < @sorted; $n++ ) { - my $start = $sorted[$n][1][$fromDbName]; - - # We are no longer tracking; see line ~ 506 - # my %completed; - - # if(ref $startData{$start}) { - # $longestEnd = max ( map { $_->[1][$toDbName]} ) @{$startData{$start}}; - # } - - # If > 1 transcript shares a start, txNumber will be an array of numbers - # of length 1 or more - # else will be a scalar, save some space in db, and reduce Perl memory growth - - # Assign a unique txNumber based on the overlap of transcripts - # Idempotent - my $txNumber = $uniqNumMaker->( $startData{$start} ); - - # If we're 1 short of the new txNumber (index), we have some unique data - # add the new item - if ( @globalTxData == $txNumber ) { - my $combinedValues = $uniqRegionEntryMaker->( $startData{$start} ); - - # write the region database, storing the region data at our sequential txNumber, allow us to release the data - $self->db->dbPut( $regionDbName, $txNumber, $combinedValues ); - - # we only need to store the longest end; only value that is needed below from combinedValues - push @globalTxData, $combinedValues->[$toDbName]; - } - - if ( defined $previousLongestEnd ) { - # Here we can assume that both $start and $longestPreviousEnd are both 0-based, closed - # and so the first intergenic base is + 1 of the longestPreviousTxEnd and - 1 of the current start - $midPoint = $previousLongestEnd + ( ( $start - $previousLongestEnd ) / 2 ); - } - - #### Accumulate txNumber or longestPreviousTxNumber for positions between transcripts #### - - # If we have no previous end or midpoint, we're starting from 0 index in db - # and moving until the $start - $previousLongestEnd //= -1; - $midPoint //= -1; - - # Consider/store intergenic things (note: if previousLongestEnd > $start, last tx overlapped this one) - if ( $self->storeNearest && $previousLongestEnd < $start ) { - # we force both the end and start to be 0-based closed, so start from +1 of previous end - # and - 1 of the start - POS_LOOP: for my $pos ( $previousLongestEnd + 1 .. 
$start - 1 ) { - $cursor //= $self->db->dbStartCursorTxn($chr); - - if ( $pos >= $midPoint ) { - #Args: $cursor, $chr, $dbName, $pos, $newValue - $self->db->dbPatchCursorUnsafe( $cursor, $chr, $self->dbName, $pos, $txNumber ); - } - else { - #Args: $cursor, $chr, $dbName, $pos, $newValue - $self->db->dbPatchCursorUnsafe( $cursor, $chr, $self->dbName, $pos, - $previousTxNumber ); - } - - if ( $count > $self->commitEvery ) { - $self->db->dbEndCursorTxn($chr); - undef $cursor; - - $count = 0; - } - - $count++; - } - } - - my $longestEnd = $globalTxData[$txNumber]; - - # If we want to store the stuff in the regions themselves, do that - if ( $self->storeOverlap ) { - # Remember, here longest end is 0-based, closed (last pos is the last 0-based - # position in the transcript) - for my $pos ( $start .. $longestEnd ) { - # We may clear this at any point, to prpevent db from over-growing - $cursor //= $self->db->dbStartCursorTxn($chr); - - # There may be overlaps between adjacent groups of transcripts - # Since we search for all overlapping transcripts for every position - # once we've visited one position, we need never visit it again - # However, let's say we got this wrong... it wouldn't actually matter - # except it would cost us a bit of performance - # Because the txNumber we get for the overlap simply wouldn't be unique. - # And we would overwrite it in the database, or more likely skip it - # if the overwrite flag isn't set. - # So don't check for $completed{$pos}, which requires setting global state - # and leaks/uses a tremendous amount of memory - # if($completed{$pos}) { - # next; - # } - - my @overlap; - - # We investigate everything from the present tx down; - I_LOOP: for ( my $i = $n; $i < @sorted; $i++ ) { - my $iFrom = $sorted[$i]->[1][$fromDbName]; - my $iTo = $sorted[$i]->[1][$toDbName]; - - if ( $pos >= $iFrom && $pos <= $iTo ) { - push @overlap, $sorted[$i]; - } - elsif ( $iFrom > $pos ) { - last I_LOOP; - } - } - - if ( !@overlap ) { - say STDERR "no overlap for the $chr\:$pos"; - next; - } - - # Make a unique overlap combination - my $txNumber = $uniqNumMaker->( \@overlap ); - - # If we're 1 short of the new txNumber (index), we have some unique data - # add the new item - if ( @globalTxData == $txNumber ) { - my $combinedValues = $uniqRegionEntryMaker->( \@overlap ); - - # write the region database, storing the region data at our sequential txNumber, allow us to release the data - $self->db->dbPut( $regionDbName, $txNumber, $combinedValues ); - - # we only need to store the longest end; only value that is needed from combinedValues - push @globalTxData, $combinedValues->[$toDbName]; - } - - # Assign the transcript number; args: $cursor, $chr, $dbName, $pos, $newValue) - $self->db->dbPatchCursorUnsafe( $cursor, $chr, $self->dbName, $pos, $txNumber ); - - if ( $count > $self->commitEvery ) { - $self->db->dbEndCursorTxn($chr); - undef $cursor; - - $count = 0; - } - - $count++; - } - } - - ###### Store the previous values for the next loop's midpoint calc ###### - my $longestEndTxNumber = $uniqNumMaker->( $endData{$longestEnd} ); - - if ( @globalTxData == $longestEndTxNumber ) { - my $combinedValues = $uniqRegionEntryMaker->( $endData{$longestEnd} ); - - # write the region database, storing the region data at our sequential txNumber, allow us to release the data - $self->db->dbPut( $regionDbName, $longestEndTxNumber, $combinedValues ); - - # we only need to store the longest end; only value that is needed from combinedValues - # TODO: Should this be $longestEnd? 
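- # NOTE (editorial illustration, not part of the original source): a worked
- # example of the midpoint rule used above, assuming $previousLongestEnd == 100
- # and the next cluster's $start == 111:
- #   $midPoint = 100 + (111 - 100) / 2 == 105.5
- #   intergenic positions 101..105 (< 105.5)  are assigned $previousTxNumber
- #   intergenic positions 106..110 (>= 105.5) are assigned the new $txNumber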
- push @globalTxData, $combinedValues->[$toDbName];
- }
-
- $previousTxNumber = $longestEndTxNumber;
- $previousLongestEnd = $longestEnd;
- }
-
- if ( $self->storeNearest ) {
- # Once we've reached the last transcript, we still likely have some data remaining
- END_LOOP: for my $pos ( $previousLongestEnd + 1 .. $genomeNumberOfEntries - 1 ) {
- # We may clear this at any point, to prevent the db from over-growing
- $cursor //= $self->db->dbStartCursorTxn($chr);
-
- #Args: $cursor, $chr, $dbName, $pos, $newValue
- $self->db->dbPatchCursorUnsafe( $cursor, $chr, $self->dbName, $pos,
- $previousTxNumber );
-
- if ( $count > $self->commitEvery ) {
- $self->db->dbEndCursorTxn($chr);
- undef $cursor;
-
- $count = 0;
- }
-
- $count++;
- }
- }
-
- $self->db->dbEndCursorTxn($chr);
- undef $cursor;
-
- $self->db->cleanUp();
-
- $self->log( 'info', $self->name . ": finished for $chr" );
-}
-
-sub _getTxNumber {
- my $txNumber = 0;
- my %uniqCombos;
-
- return sub {
- my $numAref = shift;
-
- # not as clean as passing the data we actually want, but maybe uses less mem
- # Use a joined string as a compact key
- # sort isn't necessary unless we don't trust the caller to give us
- # pre-sorted data
- # so...to reduce the bug burden from future changes, sort here.
- my $hash = join( '_', sort { $a <=> $b } map { $_->[0] } @$numAref );
-
- if ( defined $uniqCombos{$hash} ) {
- return $uniqCombos{$hash};
- }
-
- $uniqCombos{$hash} = $txNumber;
-
- $txNumber++;
-
- return $uniqCombos{$hash};
- }
-}
-
-# Takes region data from multiple regions,
-# sets it so that each feature contains all regions' data for that feature
-# then compresses each feature to a single value if all regions' values are duplicates
-# then removes all duplicate regions
-# then, if only one region remains,
-# represents that region in a single-depth array, rather than depth N
-# for N regions
-# Ex:
-# if we have features name and name2
-# where name = [val1, val2], and name2 = [someName2, someName2]
-# name2 = [someName2], and not someName2 (scalar), and
-# name2 = [someName2, someOtherName2] when > 1 unique value
-# This is done because features aren't guaranteed to be scalar
-# Let's say we have a tissue expression feature
-# ie [[kidney, spleen, pons, medulla], [kidney, spleen, pons, medulla]]
-# we can, without
-# loss of information, compress to
-# expression = [[kidney, spleen, pons, medulla]]
-# but not expression = [kidney, spleen, pons, medulla],
-# as this would lose the order with respect to transcript
-# However, we can flatten the arrays in some cases
-# ex: name = [someName] , name2 = [someName2], expression = [[kidney, pons]]
-# then we can say: name = someName , name2 = someName2, expression = [kidney, pons]
-# again without loss of information
-# TODO: This does make parsing more difficult
-# So we may want to re-evaluate the flattening to scalars
-# Since our primary use of annotation data is to serialize it
-# and we currently do that using a simple tab-delimited schema
-# and ALWAYS expect no more than array depth 3, I don't think it presents much of a
-# parsing challenge
-sub _makeUniqueRegionData {
- my ( $fromDbName, $toDbName, $featureKeysAref ) = @_;
-
- my @featureKeys = @$featureKeysAref;
- my @nonFromToFeatures;
-
- for my $feat (@featureKeys) {
- if ( $feat != $fromDbName && $feat != $toDbName ) {
- push @nonFromToFeatures, $feat;
- }
- }
-
- return sub {
- my $aRef = shift;
-
- my %dup;
-
- my @out;
-
- my $minFrom;
- my $maxTo;
- # Assumes arrays of equal length
- #Expects val to have:
- for my $val (@$aRef) {
- # Because
we calculate uniqueness based on all keys but from and to
- # it is important to calculate the min and max bounds here
- # as the consumer expects the maximum overlapping regions
- if ( !defined $minFrom || $minFrom > $val->[1][$fromDbName] ) {
- $minFrom = $val->[1][$fromDbName];
- }
-
- if ( !defined $maxTo || $maxTo < $val->[1][$toDbName] ) {
- $maxTo = $val->[1][$toDbName];
- }
-
- # Figure out what is unique by building an array that does not include
- # the to and from positions
- # Since the hashing step will complain about sparse arrays,
- # fill missing values with "" during the uniqueness check
- # However, in the final, unique output, undefined values will remain undefined
- my @nonFromTo;
-
- for my $i (@nonFromToFeatures) {
- #not as clean as having $aRef contain only the [1] values, but maybe less mem
- push @nonFromTo, defined $val->[1][$i] ? $val->[1][$i] : "";
- }
-
- # not sure if using md5 would be of any benefit, except to potentially shorten the key
- # however, this requires that the binary string, if stringified to its utf8 representation,
- # doesn't lose information (for instance, printing such strings isn't guaranteed
- # to be handled well)
- # in tests, looks ok
- my $hash = $mp->pack( \@nonFromTo );
-
- if ( $dup{$hash} ) {
- next;
- }
-
- $dup{$hash} = 1;
-
- for my $intKey (@featureKeys) {
- push @{ $out[$intKey] }, $val->[1][$intKey];
- }
- }
-
- for my $intKey (@featureKeys) {
- if ( !ref $out[$intKey] ) {
- next;
- }
-
- my %seen;
- my @uniqInner;
-
- # If the first element is a reference, then we have an array of arrays
- # Let's generate hashes of everything and see if they're unique
- if ( @{ $out[$intKey] } > 1 ) {
- for my $val ( @{ $out[$intKey] } ) {
- my $hash = ref $val ? $mp->pack($val) : ( $val || '' );
-
- if ( $seen{$hash} ) {
- next;
- }
-
- push @uniqInner, $val;
- $seen{$hash} = 1;
- }
-
- # Only if all vals are duplicates can we compress and retain all information
- # ie order with respect to the element with the greatest amount of entropy
- # ex: if we have 3 genes [gene1, gene2, gene3], we need to make sure that
- # we either have [val1, val2, val3] for the Nth feature, or [val1] in case
- # val1, val2, and val3 are duplicates
- # If only val1 and val2 are duplicates, then we must keep [val1, val2, val3]
- if ( @uniqInner == 1 ) {
- # Cannot use $uniqInner[0] because we may not always be able to distinguish
- # multiple scalar values from N overlapping regions
- # or a single region with N values
- $out[$intKey] = \@uniqInner;
- }
- }
- }
-
- # We keep all features in [[val1, .., valN]] form
- # If all features have only 1 val, no need to store arrays, scalars will suffice
- my $maxKeys = 1;
- for my $i (@nonFromToFeatures) {
- if ( @{ $out[$i] } != 1 ) {
- $maxKeys = 100;
- last;
- }
- }
-
- if ( $maxKeys == 1 ) {
- for my $i (@nonFromToFeatures) {
- $out[$i] = $out[$i][0];
- }
- }
-
- # The from and to fields are reduced to min/max, i.e. the widest interval they occupy
- # Since these don't nec.
share a start, choose the min start for the overlap - # Because we hash/check uniqueness of all values but these values, important to - # calculate the minFrom and maxTo based on this input data, not on the - # non-redundant data - $out[$fromDbName] = $minFrom; - $out[$toDbName] = $maxTo; - - return \@out; - } -} -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/README.md b/perl/lib/Seq/Tracks/README.md deleted file mode 100644 index 4da3fdc12..000000000 --- a/perl/lib/Seq/Tracks/README.md +++ /dev/null @@ -1,394 +0,0 @@ -Tracks ---- ---- -## Track Types - -We expose 3 kinds of general purpose tracks -### General Tracks - -1. Sparse (sparse) - - These are any tracks that have unique info per base - - These must be .bed - like format, containing the following fields (no other fields are required) - + chrom - + chromStart - + chromEnd - - Ex: snp146 - -2. Score (score) - - Any fixed wiggle format track - + ex: PhyloP, PhastCons - -We also have several "private" track types. These are still defined in the config file, but are just our special implementations of the above 3. - -### Special Tracks -These are special cases of the above tracks - -1. Reference: the genome assembly (*Only 1 per configuration file*) - - Accepts: multi-fasta file - -2. Gene: - - Accepts: UCSC gene track, such as refGene - - Stores: any features defined in the track configuration, that are present in the source file\ - -3. CADD - - Accepts: - - CADD format file (1-based) (http://krishna.gs.washington.edu/download/CADD/v1.3/whole_genome_SNVs.tsv.gz) - - Bed-like format, where first 3 header columns (after version line) are chrom, chromStart, chromEnd (0-based, half-open format) - -# Building tracks -#### Tracks are stored in a YAML configuration file, such as the file below -```yaml ---- -assembly: hg19 -build_author: ec2-user -build_date: 2017-02-08T03:01:00 -chromosomes: -- chr1 -- chr2 -- chr3 -- chr4 -- chr5 -- chr6 -- chr7 -- chr8 -- chr9 -- chr10 -- chr11 -- chr12 -- chr13 -- chr14 -- chr15 -- chr16 -- chr17 -- chr18 -- chr19 -- chr20 -- chr21 -- chr22 -- chrM -- chrX -- chrY -database_dir: /path/to/somewhere/ -files_dir: /path/to/somewhere/ -statistics: - dbSNPnameField: dbSNP.name - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tab - tab: .statistics.tab - refTrackField: ref - siteTypeField: refSeq.siteType -temp_dir: ~ #Optional -tracks: -- build_author: ec2-user - build_date: 2017-02-08T03:01:00 - fetch_date: 2017-02-04T22:36:00 - local_files: - - chr*.fa.gz - name: ref - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/chromosomes/ - remote_files: - - chr1.fa.gz - - chr2.fa.gz - - chr3.fa.gz - - chr4.fa.gz - - chr5.fa.gz - - chr6.fa.gz - - chr7.fa.gz - - chr8.fa.gz - - chr9.fa.gz - - chr10.fa.gz - - chr11.fa.gz - - chr12.fa.gz - - chr13.fa.gz - - chr14.fa.gz - - chr15.fa.gz - - chr16.fa.gz - - chr17.fa.gz - - chr18.fa.gz - - chr19.fa.gz - - chr20.fa.gz - - chr21.fa.gz - - chr22.fa.gz - - chrM.fa.gz - - chrX.fa.gz - - chrY.fa.gz - type: reference - version: 2 -- build_author: ec2-user - build_date: 2017-02-08T03:01:00 - features: - - kgID - - mRNA - - spID - - spDisplayID - - geneSymbol - - refseq - - protAcc - - description - - rfamAcc - - name - fetch_date: 2017-02-04T17:06:00 - join: - features: - - PhenotypeIDS - - OtherIDs - track: clinvar - local_files: - - hg19.refGene.chr*.gz - name: refSeq - nearest: - - name - - geneSymbol - sql_statement: SELECT * FROM hg19.refGene LEFT JOIN hg19.kgXref ON 
hg19.kgXref.refseq - = hg19.refGene.name - type: gene - version: 2 -- build_author: ec2-user - build_date: 2017-02-08T03:01:00 - fetch_date: 2017-02-04T16:52:00 - local_files: - - chr*.phastCons100way.wigFix.gz - name: phastCons - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/phastCons100way/hg19.100way.phastCons/ - remote_files: - - chr1.phastCons100way.wigFix.gz - - chr2.phastCons100way.wigFix.gz - - chr3.phastCons100way.wigFix.gz - - chr4.phastCons100way.wigFix.gz - - chr5.phastCons100way.wigFix.gz - - chr6.phastCons100way.wigFix.gz - - chr7.phastCons100way.wigFix.gz - - chr8.phastCons100way.wigFix.gz - - chr9.phastCons100way.wigFix.gz - - chr10.phastCons100way.wigFix.gz - - chr11.phastCons100way.wigFix.gz - - chr12.phastCons100way.wigFix.gz - - chr13.phastCons100way.wigFix.gz - - chr14.phastCons100way.wigFix.gz - - chr15.phastCons100way.wigFix.gz - - chr16.phastCons100way.wigFix.gz - - chr17.phastCons100way.wigFix.gz - - chr18.phastCons100way.wigFix.gz - - chr19.phastCons100way.wigFix.gz - - chr20.phastCons100way.wigFix.gz - - chr21.phastCons100way.wigFix.gz - - chr22.phastCons100way.wigFix.gz - - chrX.phastCons100way.wigFix.gz - - chrY.phastCons100way.wigFix.gz - - chrM.phastCons100way.wigFix.gz - type: score - version: 2 -- build_author: ec2-user - build_date: 2017-02-08T03:01:00 - fetch_date: 2017-02-03T20:57:00 - local_files: - - chr*.phyloP100way.wigFix.gz - name: phyloP - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way/ - remote_files: - - chr1.phyloP100way.wigFix.gz - - chr2.phyloP100way.wigFix.gz - - chr3.phyloP100way.wigFix.gz - - chr4.phyloP100way.wigFix.gz - - chr5.phyloP100way.wigFix.gz - - chr6.phyloP100way.wigFix.gz - - chr7.phyloP100way.wigFix.gz - - chr8.phyloP100way.wigFix.gz - - chr9.phyloP100way.wigFix.gz - - chr10.phyloP100way.wigFix.gz - - chr11.phyloP100way.wigFix.gz - - chr12.phyloP100way.wigFix.gz - - chr13.phyloP100way.wigFix.gz - - chr14.phyloP100way.wigFix.gz - - chr15.phyloP100way.wigFix.gz - - chr16.phyloP100way.wigFix.gz - - chr17.phyloP100way.wigFix.gz - - chr18.phyloP100way.wigFix.gz - - chr19.phyloP100way.wigFix.gz - - chr20.phyloP100way.wigFix.gz - - chr21.phyloP100way.wigFix.gz - - chr22.phyloP100way.wigFix.gz - - chrX.phyloP100way.wigFix.gz - - chrY.phyloP100way.wigFix.gz - - chrM.phyloP100way.wigFix.gz - type: score - version: 2 -- build_author: ec2-user - build_date: 2017-02-08T03:01:00 - local_files: - - whole_genome_SNVs.tsv.bed.chr*.organized-by-chr.txt.sorted.txt.gz - name: cadd - sort_date: 2017-01-20T16:06:00 - sorted: 1 - type: cadd - version: 2 -- build_author: ec2-user - build_date: 2017-02-08T03:01:00 - build_field_transformations: - alleleFreqs: split [,] - alleleNs: split [,] - alleles: split [,] - func: split [,] - observed: split [\/] - features: - - name - - strand - - observed - - class - - func - - alleles - - alleleNs: number - - alleleFreqs: number - fetch_date: 2017-02-04T21:35:00 - local_files: - - hg19.snp147.chr*.gz - name: dbSNP - sql_statement: SELECT * FROM hg19.snp147 - type: sparse - version: 2 -- based: 1 - build_author: ec2-user - build_date: 2017-02-08T03:01:00 - build_field_transformations: - Chromosome: chr . 
- OtherIDs: split [;,]
- PhenotypeIDS: split [;,]
- build_row_filters:
- Assembly: == GRCh37
- Chromosome: != MT
- build_field_filters:
- Assembly: == GRCh37
- Chromosome: != MT
- fieldMap:
- '#AlleleID': alleleID
- AlternateAllele: alternateAllele
- Chromosome: chrom
- ClinicalSignificance: clinicalSignificance
- Origin: origin
- OtherIDs: otherIDs
- PhenotypeIDS: phenotypeIDs
- PhenotypeList: phenotypeList
- ReferenceAllele: referenceAllele
- ReviewStatus: reviewStatus
- Start: chromStart
- Stop: chromEnd
- Type: type
- features:
- - alleleID: number
- - phenotypeList
- - clinicalSignificance
- - type
- - origin
- - reviewStatus
- - otherIDs
- - phenotypeIDs
- - referenceAllele
- - alternateAllele
- fetch_date: 2017-02-04T16:51:00
- local_files:
- - variant_summary.txt.gz
- name: clinvar
- remote_files:
- - ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
- required_fields_map:
- chrom: Chromosome
- chromEnd: Stop
- chromStart: Start
- type: sparse
- version: 2
-version: 2
-```
-
-A genome build can be run by executing the ./bin/build_genome_assembly.pl program using the configuration file.
-
-# YAML properties
-## features
-#### A list of field names, either the original found in the source file, or the renamed value given in fieldMap
-
-##### Ex1: original field name
-```yaml
- features:
- - ClinicalSignificance #Must exist in the source file
-```
-
-##### Ex2: renamed field
-```yaml
-fieldMap:
- '#AlleleID': alleleID
- AlternateAllele: alternateAllele
- Chromosome: chrom
- ClinicalSignificance: clinicalSignificance
- Origin: origin
- OtherIDs: otherIDs
- PhenotypeIDS: phenotypeIDs
- PhenotypeList: phenotypeList
- ReferenceAllele: referenceAllele
- ReviewStatus: reviewStatus
- Start: chromStart
- Stop: chromEnd
- Type: type
- features:
- # If fieldMap is defined, the features specified here should be the renamed values
- # Since '#AlleleID' was renamed alleleID, use that name
- - alleleID: number #Note that a field data type can be specified after the colon
-```
-# General YAML build configuration properties
-## build_row_filters
-#### Simple boolean operations that determine whether or not a row will be included in the database
-##### Warning: Does not accept renamed fields, unlike features
-
-##### Ex:
-```yaml
-# Keep only rows in the source file whose Assembly equals GRCh37 and whose Chromosome is not MT
-build_row_filters:
- Assembly: == GRCh37
- Chromosome: != MT
-```
-## build_field_transformations
-#### Modify the value of any field
-##### Current operations:
- * split
- * split the field, on any regular expression, in the form of "split rePattern"
- * "."
- * concatenate some string with the value of any field, in the form of "somestring ." to prepend or ". somestring" to append (the "." marks where the field value goes)
-
-#### Ex:
-```yaml
-build_field_transformations:
- # If fieldMap is used for this track, these field names should be the renamed field names
- chrom: chr .
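- # (Editorial illustration, not part of the original example: the "chrom: chr ."
- # transform above prepends the literal "chr", so a source value of "7" is
- # stored as "chr7"; the split transforms below turn a value like "id1;id2,id3"
- # into the array [id1, id2, id3])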
- clinicalSignificance: split [;] - otherIDs: split [;,] - phenotypeIDs: split [;,] - phenotypeList: split [;] -``` - -# Gene track-specific YAML configuration properties -## join -#### Allows you to add any track to the gene track -##### (currently joined track must define features, so sparse or gene) - -##### Ex: -```yaml -join: - # These should match features defined in the clinvar track "features" property - features: - - phenotypeIDs - - otherIDs - - alleleID - track: clinvar -``` - -## nearest -#### If no transcript exists at a given position store the nearest transcript -##### When transcripts are equidistant, the downstream one is chosen -##### If the nearest transcripts are overlapping (multiple transcripts at one location), all of them will be stored - -##### Ex: -```yaml -nearest: - - name - - geneSymbol -``` diff --git a/perl/lib/Seq/Tracks/Reference.pm b/perl/lib/Seq/Tracks/Reference.pm deleted file mode 100644 index dcd822361..000000000 --- a/perl/lib/Seq/Tracks/Reference.pm +++ /dev/null @@ -1,33 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Reference; - -our $VERSION = '0.001'; - -# ABSTRACT: The getter for the reference track -# VERSION - -use Mouse 2; - -use namespace::autoclean; - -use Seq::Tracks::Reference::MapBases; - -state $baseMapper = Seq::Tracks::Reference::MapBases->new(); -state $baseMapInverse = $baseMapper->baseMapInverse; - -extends 'Seq::Tracks::Get'; - -sub get { - # $_[0] == $self; $_[1] = dbDataAref - # $self->{_dbName} inherited from Seq::Tracks::Get - # not declared here because putting in a builder here results in - # "Oops Destroying Active Enviroment in LMDB_File - return $baseMapInverse->[ $_[1]->[ $_[0]->{_dbName} ] ]; -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Reference/Build.pm b/perl/lib/Seq/Tracks/Reference/Build.pm deleted file mode 100644 index 00e4ad664..000000000 --- a/perl/lib/Seq/Tracks/Reference/Build.pm +++ /dev/null @@ -1,192 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Reference::Build; - -our $VERSION = '0.001'; - -# ABSTRACT: Builds a plain text genome used for binary genome creation -# VERSION - -use Mouse 2; -use namespace::autoclean; -extends 'Seq::Tracks::Build'; - -use Seq::Tracks::Reference::MapBases; - -use Parallel::ForkManager; - -my $baseMapper = Seq::Tracks::Reference::MapBases->new(); - -sub buildTrack { - my $self = shift; - - my $headerRegex = qr/\A>([\w\d]+)/; - my $dataRegex = qr/(\A[ATCGNatcgn]+)\z/xms; - - my $pm = Parallel::ForkManager->new( $self->maxThreads ); - - my %completedChrs; - $pm->run_on_finish( - sub { - my ( $pid, $exitCode, $fileName, $exitSignal, $coreDump, $errOrChrs ) = @_; - - if ( $exitCode != 0 ) { - my $err = $self->name - . ": got exitCode $exitCode for $fileName: $exitSignal . Dump: $coreDump"; - - $self->log( 'fatal', $err ); - } - - if ( $errOrChrs && ref $errOrChrs eq 'HASH' ) { - for my $chr ( keys %$errOrChrs ) { - if ( !$completedChrs{$chr} ) { - $completedChrs{$chr} = [$fileName]; - } - else { - push @{ $completedChrs{$chr} }, $fileName; - } - } - } - - #Only message that is different, in that we don't pass the $fileName - $self->log( 'info', $self->name . ": completed building from $fileName" ); - } - ); - - for my $file ( $self->allLocalFiles ) { - # Expects 1 chr per file for n+1 files, or all chr in 1 file - # Single writer to reduce copy-on-write db inflation - $self->log( 'info', $self->name . 
": Beginning building from $file" ); - - # Although this should be unnecessary, environments must be created - # within the process that uses them - # This provides a measure of safety - $self->db->cleanUp(); - - $pm->start($file) and next; - my ( $err, undef, $fh ) = $self->getReadFh($file); - - if ($err) { - $self->log( 'fatal', $self->name . ": $err" ); - } - - my $wantedChr; - - my $chrPosition = $self->based; - - my $count = 0; - # Record which chromosomes we've worked on - my %visitedChrs; - - my $cursor; - - FH_LOOP: while ( my $line = $fh->getline() ) { - #super chomp; also helps us avoid weird characters in the fasta data string - $line =~ s/^\s+|\s+$//g; #trim both ends, but not what's in between - - #could do check here for cadd default format - #for now, let's assume that we put the CADD file into a wigfix format - if ( $line =~ m/$headerRegex/ ) { #we found a wig header - my $chr = $1; - - if ( !$chr ) { - $self->log( 'fatal', $self->name . ": Require chr in fasta file headers" ); - die $self->name . ": Require chr in fasta file headers"; - } - - # Transforms $chr if it's not prepended with a 'chr' or is 'chrMT' or 'MT' - # and checks against our list of wanted chromosomes - $chr = $self->normalizedWantedChr->{$chr}; - - # falsy value is '' - if ( !defined $wantedChr || ( !defined $chr || $wantedChr ne $chr ) ) { - # We switched chromosomes - if ( defined $wantedChr ) { - # cleans up entire environment, commits/closes all cursors, syncs - $self->db->cleanUp(); - undef $cursor; - - $count = 0; - } - - $wantedChr = $self->chrWantedAndIncomplete($chr); - } - - # We expect either one chr per file, or a multi-fasta file that is sorted and contiguous - # TODO: Handle chrPerFile - if ( !defined $wantedChr ) { - next FH_LOOP; - } - - $visitedChrs{$wantedChr} //= 1; - - # Restart chrPosition count at 0, since assemblies are zero-based ($self->based defaults to 0) - # (or something else if the user based: allows non-reference fasta-formatted sources) - $chrPosition = $self->based; - - #don't store the header line - next; - } - - # If !$wantedChr we're likely in a mult-fasta file; could warn, but that spoils multi-threaded reads - if ( !defined $wantedChr ) { - next; - } - - if ( $line =~ $dataRegex ) { - # Store the uppercase bases; how UCSC does it, how people likely expect it - for my $char ( split '', uc($1) ) { - $cursor //= $self->db->dbStartCursorTxn($wantedChr); - - #Args: $cursor, $chr, $trackIndex, $pos, $newValue - $self->db->dbPatchCursorUnsafe( $cursor, $wantedChr, $self->dbName, $chrPosition, - $baseMapper->baseMap->{$char} ); - - if ( $count > $self->commitEvery ) { - $self->db->dbEndCursorTxn($wantedChr); - undef $cursor; - - $count = 0; - } - - $count++; - - #must come after, to not be 1 off; assumes fasta file is sorted ascending contiguous - $chrPosition++; - } - } - } - - #Commit, sync everything, including completion status, commit cursors, and release mmap - $self->db->cleanUp(); - undef $cursor; - - #13 is sigpipe, occurs if closing pipe before cat/pigz finishes - $self->safeCloseBuilderFh( $fh, $file, 'fatal' ); - - #exit with exit code 0; this only happens if successfully completed - $pm->finish( 0, \%visitedChrs ); - } - - $pm->wait_all_children(); - - for my $chr ( keys %completedChrs ) { - $self->completionMeta->recordCompletion($chr); - - $self->log( 'info', - $self->name - . ": recorded $chr completed, from " - . 
( join( ",", @{ $completedChrs{$chr} } ) ) ); - } - - #TODO: figure out why this is necessary, even with DEMOLISH - $self->db->cleanUp(); - - return; -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Reference/MapBases.pm b/perl/lib/Seq/Tracks/Reference/MapBases.pm deleted file mode 100644 index fb3e63c5e..000000000 --- a/perl/lib/Seq/Tracks/Reference/MapBases.pm +++ /dev/null @@ -1,32 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Reference::MapBases; -use Mouse 2; -use namespace::autoclean; - -# Abstract: Maps bases to integers, which saves space in the db -# Encodes lowercase or uppercase letts into 0 - 4 byte, returns only uppercase letters -state $baseMap = - { N => 0, A => 1, C => 2, G => 3, T => 4, n => 0, a => 1, c => 2, g => 3, t => 4 }; -has baseMap => ( - is => 'ro', - isa => 'HashRef', - init_arg => undef, - lazy => 1, - default => sub { $baseMap } -); - -has baseMapInverse => ( - is => 'ro', - isa => 'ArrayRef', - init_arg => undef, - lazy => 1, - default => sub { - return [ 'N', 'A', 'C', 'G', 'T' ]; - } -); - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Region.pm b/perl/lib/Seq/Tracks/Region.pm deleted file mode 100644 index ae1981ae8..000000000 --- a/perl/lib/Seq/Tracks/Region.pm +++ /dev/null @@ -1,13 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Region; -use Mouse 2; - -our $VERSION = '0.001'; - -#Finish if needed -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Region/Build.pm b/perl/lib/Seq/Tracks/Region/Build.pm deleted file mode 100644 index a4c396191..000000000 --- a/perl/lib/Seq/Tracks/Region/Build.pm +++ /dev/null @@ -1,15 +0,0 @@ -#TODO: Write the general version, using the work done on the GeneTrack build method -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Region::Build; -use Mouse 2; - -our $VERSION = '0.001'; - -#Finish if needed - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Region/RegionTrackPath.pm b/perl/lib/Seq/Tracks/Region/RegionTrackPath.pm deleted file mode 100644 index 44fee4f9a..000000000 --- a/perl/lib/Seq/Tracks/Region/RegionTrackPath.pm +++ /dev/null @@ -1,18 +0,0 @@ -# Handles items common to Region tracks -package Seq::Tracks::Region::RegionTrackPath; -use 5.16.0; -use strict; -use warnings; - -use Mouse::Role; - -requires 'name'; - -sub regionTrackPath { - my ( $self, $chr ) = @_; - - return $self->name . 
"/$chr"; -} - -no Mouse::Role; -1; diff --git a/perl/lib/Seq/Tracks/Score.pm b/perl/lib/Seq/Tracks/Score.pm deleted file mode 100644 index 8696e55cc..000000000 --- a/perl/lib/Seq/Tracks/Score.pm +++ /dev/null @@ -1,48 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Score; - -our $VERSION = '0.001'; - -# ABSTRACT: The getter for any score track -# VERSION - -use Mouse 2; -use namespace::autoclean; - -extends 'Seq::Tracks::Get'; - -has scalingFactor => ( is => 'ro', isa => 'Int', default => 100 ); - -sub BUILD { - my $self = shift; - - # purely to save accessor time - $self->{_s} = $self->scalingFactor; - - #Provided by Seq::Tracks::Get - #$self->{_dbName} = $self->dbName; -} - -sub get { - #my ($self, $href, $chr, $refBase, $allele, $outAccum, $alleleNumber) = @_ - # $_[0] == $self - # $_[1] == $href : the database data, with each top-level index corresponding to a track - # $_[2] == $chr : the chromosome - # $_[3] == $refBase : ACTG - # $_[4] == $allele : the allele (ACTG or -N / +ACTG) - # $_[5] == $positionIdx : the position in the indel, if any - # $_[6] == $outAccum : a reference to the output, which we mutate - - $_[6][ $_[5] ] = - defined $_[1]->[ $_[0]->{_dbName} ] - ? $_[1]->[ $_[0]->{_dbName} ] / $_[0]->{_s} - : undef; - - return $_[6]; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Score/Build.pm b/perl/lib/Seq/Tracks/Score/Build.pm deleted file mode 100644 index a7c361204..000000000 --- a/perl/lib/Seq/Tracks/Score/Build.pm +++ /dev/null @@ -1,214 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Score::Build; - -our $VERSION = '0.001'; - -# ABSTRACT: Build a sparse track file -# VERSION - -use Mouse 2; - -use namespace::autoclean; -use Parallel::ForkManager; - -extends 'Seq::Tracks::Build'; - -use Seq::Tracks::Score::Build::Round; - -# score track could potentially be 0 based -# http://www1.bioinf.uni-leipzig.de/UCSC/goldenPath/help/wiggle.html -# if it is the BED format version of the WIG format. -has '+based' => ( default => 1, ); - -has scalingFactor => ( is => 'ro', isa => 'Int', default => 100 ); - -sub BUILD { - my $self = shift; - - $self->{_rounder} = - Seq::Tracks::Score::Build::Round->new( { scalingFactor => $self->scalingFactor } ); -} - -sub buildTrack { - my $self = shift; - - my $fStep = 'fixedStep'; - my $vStep = 'variableStep'; - my $headerRegex = qr/^($fStep|$vStep)\s+chrom=(\S+)\s+start=(\d+)\s+step=(\d+)/; - - my @allChrs = $self->allLocalFiles; - - #Can't just set to 0, because then the completion code in run_on_finish won't run - my $pm = Parallel::ForkManager->new( $self->maxThreads ); - - my %completedChrs; - $pm->run_on_finish( - sub { - my ( $pid, $exitCode, $fileName, $exitSignal, $coreDump, $errOrChrs ) = @_; - - if ( $exitCode != 0 ) { - my $err = $self->name - . ": got exitCode $exitCode for $fileName: $exitSignal . Dump: $coreDump"; - - $self->log( 'fatal', $err ); - } - - if ( $errOrChrs && ref $errOrChrs eq 'HASH' ) { - for my $chr ( keys %$errOrChrs ) { - if ( !$completedChrs{$chr} ) { - $completedChrs{$chr} = [$fileName]; - } - else { - push @{ $completedChrs{$chr} }, $fileName; - } - } - } - - #Only message that is different, in that we don't pass the $fileName - $self->log( 'info', $self->name . ": completed building from $fileName" ); - } - ); - - for my $file ( $self->allLocalFiles ) { - $self->log( 'info', $self->name . 
": beginning to build from $file" ); - - # Although this should be unnecessary, environments must be created - # within the process that uses them - # This provides a measure of safety - $self->db->cleanUp(); - - $pm->start($file) and next; - my ( $err, undef, $fh ) = $self->getReadFh($file); - - if ($err) { - $self->log( 'fatal', $self->name . ": $err" ); - } - - my $wantedChr; - my $chrPosition; # absolute by default, 0 index - - my $step; - my $stepType; - - my $based = $self->based; - - # Which chromosomes we've seen, for recording completionMeta - my %visitedChrs; - - # We use "unsafe" writers, whose active cursors we need to track - my $cursor; - my $count = 0; - - FH_LOOP: while (<$fh>) { - #super chomp; #trim both ends, but not what's in between - $_ =~ s/^\s+|\s+$//g; - - if ( $_ =~ m/$headerRegex/ ) { - my $chr = $2; - - $step = $4; - $stepType = $1; - - my $start = $3; - - if ( !$chr && $step && $start && $stepType ) { - $self->log( 'fatal', - $self->name . ": require chr, step, start, and step type fields in wig header" ); - die $self->name . ": require chr, step, start, and step type fields in wig header"; - } - - if ( $stepType eq $vStep ) { - $self->log( 'fatal', $self->name . ": variable step not currently supported" ); - die $self->name . ": variable step not currently supported"; - } - - # Transforms $chr if it's not prepended with a 'chr' or is 'chrMT' or 'MT' - # and checks against our list of wanted chromosomes - $chr = $self->normalizedWantedChr->{$chr}; - - # falsy value is '' - if ( !defined $wantedChr || ( !defined $chr || $wantedChr ne $chr ) ) { - if ( defined $wantedChr ) { - #Commit any remaining transactions, remove the db map from memory - #this also has the effect of closing all cursors - $self->db->cleanUp(); - undef $cursor; - - $count = 0; - } - - $wantedChr = $self->chrWantedAndIncomplete($chr); - } - - # TODO: handle chrPerFile - if ( !defined $wantedChr ) { - next FH_LOOP; - } - - # take the offset into account - $chrPosition = $start - $based; - - # Record what we've seen - $visitedChrs{$wantedChr} //= 1; - - #don't store the header in the database - next; - } - - # there could be more than one chr defined per file, just skip - # until we get to what we want - if ( !defined $wantedChr ) { - next; - } - - $cursor //= $self->db->dbStartCursorTxn($wantedChr); - - #Args: $cursor, $chr, $trackIndex, $pos, $trackValue - $self->db->dbPatchCursorUnsafe( $cursor, $wantedChr, $self->dbName, $chrPosition, - $self->{_rounder}->round($_) ); - - if ( $count > $self->commitEvery ) { - $self->db->dbEndCursorTxn($wantedChr); - undef $cursor; - - $count = 0; - } - - $count++; - - #this must come AFTER we store the position's data in db, since we have a starting pos - $chrPosition += $step; - } - - #Commit, sync everything, including completion status, and release mmap - $self->db->cleanUp(); - undef $cursor; - - $self->safeCloseBuilderFh( $fh, $file, 'fatal' ); - - $pm->finish( 0, \%visitedChrs ); - } - - $pm->wait_all_children(); - - for my $chr ( keys %completedChrs ) { - $self->completionMeta->recordCompletion($chr); - - $self->log( 'info', - $self->name - . ": recorded $chr completed, from " - . 
( join( ",", @{ $completedChrs{$chr} } ) ) ); - } - - #TODO: figure out why this is necessary, even with DEMOLISH - $self->db->cleanUp(); - - return; -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Score/Build/Round.pm b/perl/lib/Seq/Tracks/Score/Build/Round.pm deleted file mode 100644 index 8db8ed8a2..000000000 --- a/perl/lib/Seq/Tracks/Score/Build/Round.pm +++ /dev/null @@ -1,23 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Score::Build::Round; -use Mouse 2; -use POSIX qw/lround/; - -#TODO: Allow configuratino through YAML - -has scalingFactor => ( is => 'ro', isa => 'Int', required => 1 ); - -sub round { - #my ($self, $value) = @_; - # ($_[0], $_[1] ) = @_; - - #We have updated Data::MessagePack to support, enforce single-precision floats - #So 5 bytes at most when prefer_float32() enabled - return lround( $_[1] * $_[0]->scalingFactor ); -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Seq/Tracks/Sparse.pm b/perl/lib/Seq/Tracks/Sparse.pm deleted file mode 100644 index 1b8affdf6..000000000 --- a/perl/lib/Seq/Tracks/Sparse.pm +++ /dev/null @@ -1,19 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Sparse; - -our $VERSION = '0.001'; - -# ABSTRACT: The getter for any sparse track -# VERSION - -use Mouse 2; -use namespace::autoclean; - -extends 'Seq::Tracks::Get'; - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Sparse/Build.pm b/perl/lib/Seq/Tracks/Sparse/Build.pm deleted file mode 100644 index cc3dba137..000000000 --- a/perl/lib/Seq/Tracks/Sparse/Build.pm +++ /dev/null @@ -1,560 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Sparse::Build; - -our $VERSION = '0.001'; - -=head1 DESCRIPTION - - @class Seq::Tracks::SparseTrack::Build - Builds any sparse track - -=cut - -# TODO: better error handling. Check for # completed, rather than # failed -# in some cases, errors thrown don't actually get caught with exitCode other than 0 -use Mouse 2; - -use namespace::autoclean; -use List::MoreUtils qw/firstidx/; -use Parallel::ForkManager; -use Scalar::Util qw/looks_like_number/; - -extends 'Seq::Tracks::Build'; - -# We assume sparse tracks have at least one feature; can remove this requirement -# But will need to update makeMergeFunc to not assume an array of values (at least one key => value) -has '+features' => ( required => 1 ); - -# Sparse tracks are typically quite small, with potentiall quite large values -# so to make optimal use of pages -# lets set a smaller default commitEvery -has '+commitEvery' => ( default => 1e3 ); - -#These cannot be overriden (change in api), but users can use fieldMap to rename any field -#in the input file -has chromField => - ( is => 'ro', isa => 'Str', init_arg => undef, lazy => 1, default => 'chrom' ); -has chromStartField => ( - is => 'ro', - isa => 'Str', - init_arg => undef, - lazy => 1, - default => 'chromStart' -); -has chromEndField => ( - is => 'ro', - isa => 'Str', - init_arg => undef, - lazy => 1, - default => 'chromEnd' -); - -# We skip entries that span more than this number of bases -has maxVariantSize => ( is => 'ro', isa => 'Int', lazy => 1, default => 32 ); - -################# Private ################ -# Only 0 based files should be half closed -has _halfClosedOffset => - ( is => 'ro', init_arg => undef, writer => '_setHalfClosedOffset' ); - -sub BUILD { - my $self = shift; - - $self->_setHalfClosedOffset( $self->based == 0 ? 
1 : 0 ); - - if ( $self->based != 1 && $self->based != 0 ) { - $self->log( 'fatal', $self->name . ": SparseTracks expect based to be 0 or 1" ); - die $self->name . ": SparseTracks expect based to be 0 or 1"; - } -} - -sub buildTrack { - my $self = shift; - - my $pm = Parallel::ForkManager->new( $self->maxThreads ); - - # Get an instance of the merge function that closes over $self - # Note that tracking which positinos have been over-written will only work - # if there is one chromosome per file, or if all chromosomes are in one file - # At least until we share $madeIntoArray (in makeMergeFunc) between threads - # Won't be an issue in Go - my ( $mergeFunc, $cleanUpMerge ) = $self->makeMergeFunc(); - - my %completedDetails; - $pm->run_on_finish( - sub { - my ( $pid, $exitCode, $fileName, undef, undef, $errOrChrs ) = @_; - - if ( $exitCode != 0 ) { - my $err = $errOrChrs ? "due to: $$errOrChrs" : "due to an untimely demise"; - - $self->log( 'fatal', $self->name . ": Failed to build $fileName $err" ); - die $self->name . ": Failed to build $fileName $err"; - } - - for my $chr ( keys %$errOrChrs ) { - if ( !$completedDetails{$chr} ) { - $completedDetails{$chr} = [$fileName]; - } - else { - push @{ $completedDetails{$chr} }, $fileName; - } - } - - $self->log( 'info', $self->name . ": completed building from $fileName" ); - } - ); - - for my $file ( @{ $self->local_files } ) { - $self->log( 'info', $self->name . ": beginning building from $file" ); - - # Although this should be unnecessary, environments must be created - # within the process that uses them - # This provides a measure of safety - $self->db->cleanUp(); - - $pm->start($file) and next; - my ( $err, undef, $fh ) = $self->getReadFh($file); - - if ($err) { - $self->log( 'fatal', $self->name . ": $err" ); - } - - ############# Get Headers ############## - my $firstLine = <$fh>; - - # Support non-unix line endings - $err = $self->setLineEndings($firstLine); - - if ($err) { - $self->log( 'fatal', $self->name . ": $err" ); - } - - if ( !$firstLine ) { - $self->log( 'fatal', $self->name . ': failed to read header line' ); - } - - my ( $featureIdxHref, $reqIdxHref, $fieldsToTransformIdx, $fieldsToFilterOnIdx, - $numColumns ) - = $self->_getHeaderFields( $file, $firstLine, $self->features ); - ############## Read file and insert data into main database ############# - my $wantedChr; - - # Record which chromosomes were recorded for completionMeta - my %visitedChrs; - - my %fieldDbNames; - - my ( $invalid, $failedFilters, $tooLong ) = ( 0, 0, 0 ); - - # Faster insert: track cursors - my $cursor; - my $count = 0; - - my ( $chr, @fields, @sparseData, $start, $end ); - FH_LOOP: while ( my $line = $fh->getline() ) { - chomp $line; - - @fields = split( '\t', $line ); - - if ( !$self->_validLine( \@fields, $., $reqIdxHref, $numColumns ) ) { - $invalid++; - next FH_LOOP; - } - - if ( !$self->_passesFilter( $fieldsToFilterOnIdx, \@fields, $. 
) ) { - $failedFilters++; - next FH_LOOP; - } - - $self->_transform( $fieldsToTransformIdx, \@fields ); - - # Normalizes the $chr representation to one we may want but did not specify - # Example: 1 becomes chr1, and is checked against our list of wanted chromosomes - # Avoids having to use a field transformation, since this may be very common - # and Bystro typical use is with UCSC-style chromosomes - # If the chromosome isn't wanted, $chr will be undefined - $chr = $self->normalizedWantedChr->{ $fields[ $reqIdxHref->{ $self->chromField } ] }; - - #If the chromosome is new, write any data we have & see if we want new one - if ( !defined $wantedChr || ( !defined $chr || $wantedChr ne $chr ) ) { - if ( defined $wantedChr ) { - #Commit, commit & close cursors, flush anything remaining to disk, release mapped memory - $self->db->cleanUp(); - undef $cursor; - - $count = 0; - } - - $wantedChr = $self->chrWantedAndIncomplete($chr); - } - - if ( !defined $wantedChr ) { - next FH_LOOP; - } - - ( $start, $end ) = $self->_getPositions( \@fields, $reqIdxHref ); - - if ( $end + 1 - $start > $self->maxVariantSize ) { - # TODO: think about adding this back in; results in far too many log messages - # $self->log('debug', "Line spans > " . $self->maxVariantSize . " skipping: $line"); - $tooLong++; - next FH_LOOP; - } - - # Collect all of the feature data as an array - # Coerce the field into the type specified for $name, if coercion exists - # Perl arrays auto-grow https://www.safaribooksonline.com/library/view/perl-cookbook/1565922433/ch04s04.html - @sparseData = (); - - # Get the field values after transforming them to desired types - FNAMES_LOOP: for my $name ( keys %$featureIdxHref ) { - my $value = $self->coerceFeatureType( $name, $fields[ $featureIdxHref->{$name} ] ); - - $fieldDbNames{$name} //= $self->getFieldDbName($name); - - # getFieldDbName will croak if it can't make or find a dbName - $sparseData[ $fieldDbNames{$name} ] = $value; - } - - for my $pos ( ( $start .. $end ) ) { - $cursor //= $self->db->dbStartCursorTxn($wantedChr); - - #Args: $cursor, $chr, $trackIndex, $pos, $trackValue, $mergeFunction - $self->db->dbPatchCursorUnsafe( $cursor, $wantedChr, $self->dbName, $pos, - \@sparseData, $mergeFunc ); - - if ( $count > $self->commitEvery ) { - $self->db->dbEndCursorTxn($wantedChr); - undef $cursor; - - $count = 0; - } - - $count++; - } - - undef @sparseData; - # Track affected chromosomes for completion recording - $visitedChrs{$wantedChr} //= 1; - } - - #Commit, sync everything, including completion status, commit/close cursors, and release mmap - $self->db->cleanUp(); - undef $cursor; - - $self->safeCloseBuilderFh( $fh, $file, 'fatal' ); - - $self->log( 'info', $self->name . ": $file closed with $?" ); - $self->log( 'info', $self->name . ": invalid lines found in $file: $invalid" ); - $self->log( 'info', - $self->name . ": lines that didn't pass filters in $file: $failedFilters" ); - $self->log( 'info', - $self->name - . ": lines that were longer than " - . $self->maxVariantSize - . 
" found in $file: $tooLong" ); - - $pm->finish( 0, \%visitedChrs ); - } - - $pm->wait_all_children(); - - # Defer recording completion state until all requested files visited, to ensure - # that if chromosomes are mis-sorted, we still build all that is needed - for my $chr ( keys %completedDetails ) { - $self->completionMeta->recordCompletion($chr); - - # cleanUpMerge placed here so that only after all files are processed do we - # drop the temporary merge databases - # so that if we have out-of-order chromosomes, we do not mishandle - # overlapping sites - $cleanUpMerge->($chr); - - $self->log( 'info', - $self->name - . ": recorded $chr completed, from " - . ( join( ",", @{ $completedDetails{$chr} } ) ) ); - } - - #TODO: figure out why this is necessary, even with DEMOLISH - $self->db->cleanUp(); - - return; -} - -# Unlike buildTrack, joinTrack does not use a length filter; huge CNVs will -# be stored -# @param $wantedPositionsAref : expects all wanted positions -sub joinTrack { - my ( $self, $wantedChr, $wantedPositionsAref, $wantedFeaturesAref, $callback ) = @_; - - if ( !$self->chrIsWanted($wantedChr) ) { - $self->log( 'fatal', - $self->name - . " join track: called with $wantedChr which is not in our config list of chromosomes" - ); - die $self->name - . " join track: called with $wantedChr which is not in our config list of chromosomes"; - } - - $self->log( 'info', $self->name . " join track: called for $wantedChr" ); - - for my $file ( $self->allLocalFiles ) { - my ( $err, undef, $fh ) = $self->getReadFh($file); - - if ($err) { - $self->log( 'fatal', $err ); - } - - ############# Get Headers ############## - my $firstLine = <$fh>; - - if ( !$firstLine ) { - $self->log( 'fatal', $self->name . ": couldn't read first line of $file" ); - } - - my ( $featureIdxHref, $reqIdxHref, $fieldsToTransformIdx, $fieldsToFilterOnIdx, - $numColumns ) - = $self->_getHeaderFields( $file, $firstLine, $wantedFeaturesAref ); - - my @allWantedFeatureIdx = keys %$featureIdxHref; - - my ( $invalid, $failedFilters ) = ( 0, 0 ); - - my ( $chr, @fields, %wantedData, $start, $end, $wantedStart, $wantedEnd ); - FH_LOOP: while ( my $line = $fh->getline() ) { - @fields = split( '\t', $line ); - - if ( !$self->_validLine( \@fields, $., $reqIdxHref, $numColumns ) ) { - $invalid++; - next FH_LOOP; - } - - if ( !$self->_passesFilter( $fieldsToFilterOnIdx, \@fields, $. ) ) { - $failedFilters++; - next FH_LOOP; - } - - $self->_transform( $fieldsToTransformIdx, \@fields ); - - # Transforms $chr if it's not prepended with a 'chr' or is 'chrMT' or 'MT' - # and checks against our list of wanted chromosomes - $chr = $self->normalizedWantedChr->{ $fields[ $reqIdxHref->{ $self->chromField } ] }; - - if ( !defined $chr || $chr ne $wantedChr ) { - # TODO: Rethink chrPerFile handling - # This should be safe, provided that chrPerFile is a manually-set flag - # in the YAML track config - if ( $self->chrPerFile ) { - $self->log( 'warn', - $self->name . "join track: chrs in file $file not wanted . 
Skipping" ); - - last FH_LOOP; - } - - next FH_LOOP; - } - - ( $start, $end ) = $self->_getPositions( \@fields, $reqIdxHref ); - - %wantedData = (); - FNAMES_LOOP: for my $name ( keys %$featureIdxHref ) { - my $value = $self->coerceFeatureType( $name, $fields[ $featureIdxHref->{$name} ] ); - - $wantedData{$name} = $value; - } - - for ( my $i = 0; $i < @$wantedPositionsAref; $i++ ) { - $wantedStart = $wantedPositionsAref->[$i][0]; - $wantedEnd = $wantedPositionsAref->[$i][1]; - - # Report anything larger than maxVariantSize, which at least partially overlaps the wanted interval - # The join tracks accumulate a large amount of useless (for my current use case) information - # namely, all of the single nucleotide variants that are already reported for a given position - # The real use of the join track currently is to report all of the really large variants when they - # overlap a gene, so let's do just that, by check against our maxVariantSize - if ( ( $start > $wantedEnd && $end > $wantedEnd ) - || ( $start < $wantedStart && $end < $wantedStart ) - || ( $end + 1 - $start <= $self->maxVariantSize ) ) - { - next; - } - - &$callback( \%wantedData, $i ); - undef %wantedData; - } - } - - $self->safeCloseBuilderFh( $fh, $file, 'fatal' ); - - $self->log( 'info', - $self->name . " join track: invalid lines found while joining on $file: $invalid" ); - $self->log( 'info', - $self->name - . " join track: lines that didn't pass filters while joining on $file: $failedFilters" - ); - } - - $self->log( 'info', $self->name . " join track: finished for $wantedChr" ); -} - -sub _getHeaderFields { - my ( $self, $file, $firstLine, $wantedFeaturesAref ) = @_; - - my @requiredFields = - ( $self->chromField, $self->chromStartField, $self->chromEndField ); - - chomp $firstLine; - - # If the user wanted to transform the input field names, do, so source field names match - # those expected by the track - my @fields = map { $self->fieldMap->{$_} || $_ } split( '\t', $firstLine ); - - my $numColumns = @fields; - - my %featureIdx; - my %reqIdx; - my %fieldsToTransformIdx; - my %fieldsToFilterOnIdx; - - # Which fields are required (chrom, chromStart, chromEnd) - REQ_LOOP: for my $field (@requiredFields) { - my $idx = firstidx { $_ eq $field } @fields; #returns -1 if not found - - if ( $idx > -1 ) { #bitwise complement, makes -1 0 - $reqIdx{$field} = $idx; - next REQ_LOOP; #label for clarity - } - - $self->log( 'fatal', - $self->name . ": required field $field missing in $file header" ); - die $self->name . ": required field $field missing in $file header"; - } - - # Which fields the user specified under "features" key in config file - FEATURE_LOOP: for my $fname (@$wantedFeaturesAref) { - my $idx = firstidx { $_ eq $fname } @fields; - - if ( $idx > -1 ) { #only non-0 when non-negative, ~0 > 0 - $featureIdx{$fname} = $idx; - next FEATURE_LOOP; - } - - $self->log( 'fatal', $self->name . ": feature $fname missing in $file header" ); - die $self->name . ": feature $fname missing in $file header"; - } - - # Which fields user wants to filter the value of against some config-defined value - FILTER_LOOP: for my $fname ( $self->allFieldsToFilterOn ) { - my $idx = firstidx { $_ eq $fname } @fields; - - if ( $idx > -1 ) { #only non-0 when non-negative, ~0 > 0 - $fieldsToFilterOnIdx{$fname} = $idx; - next FILTER_LOOP; - } - - $self->log( 'fatal', $self->name . ": feature $fname missing in $file header" ); - die $self->name . 
": feature $fname missing in $file header"; - } - - # Which fields user wants to modify the values of in a config-defined way - TRANSFORM_LOOP: for my $fname ( $self->allFieldsToTransform ) { - my $idx = firstidx { $_ eq $fname } @fields; - - if ( $idx > -1 ) { #only non-0 when non-negative, ~0 > 0 - $fieldsToTransformIdx{$fname} = $idx; - next TRANSFORM_LOOP; - } - - $self->log( 'fatal', $self->name . ": feature $fname missing in $file header" ); - die $self->name . ": feature $fname missing in $file header"; - } - - return ( \%featureIdx, \%reqIdx, \%fieldsToTransformIdx, \%fieldsToFilterOnIdx, - $numColumns ); -} - -# TODO: think about adding back dubg logging for _validLine, _passesFilter -sub _validLine { - my ( $self, $fieldAref, $lineNumber, $reqIdxHref, $numColumns ) = @_; - - if ( @$fieldAref != $numColumns ) { - # $self->log('debug', "Line $lineNumber has fewer columns than expected, skipping"); - return; - } - - # Some files are misformatted, ex: clinvar's tab delimited - if ( !looks_like_number( $fieldAref->[ $reqIdxHref->{ $self->chromStartField } ] ) - || !looks_like_number( $fieldAref->[ $reqIdxHref->{ $self->chromEndField } ] ) ) - { - # $self->log('debug', "Line $lineNumber Start or stop doesn't look like a number, skipping"); - return; - } - - return 1; -} - -sub _transform { - my ( $self, $fieldsToTransformIdx, $fieldsAref ) = @_; - #If the user wants to modify the values of any fields, do that first - for my $fieldName ( $self->allFieldsToTransform ) { - $fieldsAref->[ $fieldsToTransformIdx->{$fieldName} ] = - $self->transformField( $fieldName, - $fieldsAref->[ $fieldsToTransformIdx->{$fieldName} ] ); - } -} - -sub _passesFilter { - my ( $self, $fieldsToFilterOnIdx, $fieldsAref, $lineNumber ) = @_; - # Then, if the user wants to exclude rows that don't pass some criteria - # that they defined in the YAML file, allow that. - for my $fieldName ( $self->allFieldsToFilterOn ) { - if ( - !$self->passesFilter( - $fieldName, $fieldsAref->[ $fieldsToFilterOnIdx->{$fieldName} ] - ) - ) - { - # $self->log('debug', "Line $lineNumber $fieldName doesn't pass filter: $fieldsAref->[ $fieldsToFilterOnIdx->{$fieldName} ]"); - return; - } - } - return 1; -} - -sub _getPositions { - my ( $self, $fieldsAref, $reqIdxHref ) = @_; - - my $start = $fieldsAref->[ $reqIdxHref->{ $self->chromStartField } ]; - my $end = $fieldsAref->[ $reqIdxHref->{ $self->chromEndField } ]; - - #From UCSC clinvar to bed http://genecats.cse.ucsc.edu/git-reports-history/v311/review/user/max/full/src/hg/utils/otto/clinvar/clinVarToBed.32cc9617debc808eb02eeaba28a8be3b705cd0dc.html - # https://ideone.com/IOYReQ - if ( $start > $end ) { - my $warn = $self->name - . ": $reqIdxHref->{$self->chromField} ] $start > $end. 
Flipping chromStart and chromEnd."; - $self->log( 'warn', $warn ); - - ( $start, $end ) = ( $end, $start ); - } - - # This is an insertion; the only case when start should == stop (for 0-based coordinates) - if ( $start == $end ) { - $start = $end = $start - $self->based; - } - else { - #it's a normal change, or a deletion - #0-based files are expected to be half-closed format, so subtract 1 from end - $start = $start - $self->based; - $end = $end - $self->based - $self->_halfClosedOffset; - } - - return ( $start, $end ); -} -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Vcf.pm b/perl/lib/Seq/Tracks/Vcf.pm deleted file mode 100644 index 776b3df6a..000000000 --- a/perl/lib/Seq/Tracks/Vcf.pm +++ /dev/null @@ -1,107 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Vcf; - -our $VERSION = '0.001'; - -# ABSTRACT: The getter for any type == vcf track -# VERSION - -use Mouse 2; -use namespace::autoclean; - -extends 'Seq::Tracks::Get'; - -sub BUILD { - my $self = shift; - $self->{_altIdx} = $self->getFieldDbName('alt'); - - if ( !defined $self->{_altIdx} ) { - $self->log( 'fatal', - $self->name . ": couldn't find 'alt' feature, required for vcf tracks" ); - } - - # Skip accessor penalty; the get function in this package may be called - # hundreds of millions of times - # Provided by Seq::Tracks::Get - # $self->{_dbName} = $self->dbName; - # $self->{_fDb} = [map { $self->getFieldDbName($_) } @{$self->features}]; - # $self->{_fIdx} = [0 .. $#{$self->features}]; -} - -sub get { - # Avoid assignments, save overhead - #my ($self, $href, $chr, $refBase, $allele, $outAccum, $alleleNumber) = @_ - # $_[0] == $self - # $_[1] == $href : the database data, with each top-level index corresponding to a track - # $_[2] == $chr : the chromosome - # $_[3] == $refBase : ACTG - # $_[4] == $allele : the allele (ACTG or -N / +ACTG) - # $_[5] == $posIdx : the position in the indel, if any - # $_[6] == $outAccum : a reference to the output, which we mutate - - # Unlike other tracks, for Vcf, we only return exact matches - # So tiling across the entire deleted region isn't appropriate - # Could result in false positives during search - if ( $_[5] > 0 ) { - return $_[6]; - } - - my $data = $_[1]->[ $_[0]->{_dbName} ]; - - if ( !$data ) { - return $_[6]; - } - - my $alt = $data->[ $_[0]->{_altIdx} ]; - - # To save CPU time, only enter the for loop when necessary - # Almost all VCF sites (~99%) will not be multiallelic, and so the alt stored - # in the db will be a scalar - # Handle this as a special, fast path - if ( !ref $alt ) { - if ( $alt eq $_[4] ) { - # Alt is a scalar, which means there were no overlapping database values - # at this position, and all fields represent a single value - for my $i ( @{ $_[0]->{_fIdx} } ) { - #$outAccum->[$idx][$alleleIdx][$posIdx] = $data->[$self->{_fDb}[$i]] } - $_[6]->[$i][ $_[5] ] = $data->[ $_[0]->{_fDb}[$i] ]; - } - } - - return $_[6]; - } - - # If $alt is a reference (we expect an array; if not, this is a programmatic error - # which we allow to crash the program), - # then find the matching alt if any, record its index in the database array, - # and look up all YAML-defined fields at this index in the same db data arrayref - # All fields are required to have the same depth, during building - my $dataIdx = 0; - - # Linear search; slow if many alleles, but we expect that every site has <= 10 alleles - for my $alt (@$alt) { - if ( $alt eq $_[4] ) { - for my $i ( @{ $_[0]->{_fIdx} } ) { - #$outAccum->[$i][$posIdx] = 
$data->[$self->{_fDb}[$dataIdx]] } - $_[6]->[$i][ $_[5] ] = $data->[ $_[0]->{_fDb}[$i] ][$dataIdx]; - } - - #return $outAccum; - return $_[6]; - } - - $dataIdx++; - } - - # If we got to this point, we found nothing. - # Note that unlike other tracks that tile across indels, we return a single - # undef, rather than per-alt or per-position in indel - return $_[6]; -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Seq/Tracks/Vcf/Build.pm b/perl/lib/Seq/Tracks/Vcf/Build.pm deleted file mode 100644 index 675fba759..000000000 --- a/perl/lib/Seq/Tracks/Vcf/Build.pm +++ /dev/null @@ -1,670 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Seq::Tracks::Vcf::Build; - -our $VERSION = '0.001'; - -=head1 DESCRIPTION - - @class Seq::Tracks::Vcf::Build - Takes a VCF file, runs it through a vcf pre-processor to get it into - our internal annotation format, and then uses the info field to build a database - Will skip discordant sites, logging them -=cut - -# TODO: allow users to specify info.field to avoid a name clash - -# TODO: better error handling in the vcf pre-processor - -# TODO: Support fields delimited by something other than just , for Number=A - -# Note: the alt field is required; if not found, it will be appended -use Mouse 2; - -use namespace::autoclean; -use List::MoreUtils qw/firstidx/; -use Parallel::ForkManager; -use Scalar::Util qw/looks_like_number/; -use Seq::Output::Delimiters; -use Seq::Tracks::Base::Types; - -use Seq::Tracks; -extends 'Seq::Tracks::Build'; - -with 'Seq::Output::Fields'; - -# We assume sparse tracks have at least one feature; can remove this requirement - # But will need to update makeMergeFunc to not assume an array of values (at least one key => value) -has '+features' => ( required => 1 ); - -# Like Sparse tracks, these are typically quite small, with potentially quite large values - # so to make optimal use of pages - # let's set a smaller default commitEvery -has '+commitEvery' => ( default => 1e3 ); - -has vcfProcessor => - ( is => 'ro', isa => 'Str', required => 1, default => 'bystro-vcf' ); - -state $converter = Seq::Tracks::Base::Types->new(); - -# Defines the indices expected in the intermediate vcf output of -# $self->vcfProcessor -# These fields are required, because we -# 1) allow any of them to be used under "features" -# 2) if their names clash with fields in INFO, will ignore the INFO fields -# TODO: allow users to specify info.field to avoid a name clash -state $requiredHeader = - [ 'chrom', 'pos', 'ref', 'alt', 'trTv', 'id', 'alleleIdx', 'info' ]; - -sub BUILD { - my $self = shift; - - if ( !@{ $self->features } ) { - $self->log( 'fatal', 'VCF tracks require features to be specified' ); - } - - my ( $err, $feats ) = $self->_findExpectedFeatures($requiredHeader); - - if ($err) { - $self->log( 'fatal', $self->name . 
": $err" ); - } - - $self->{_vcfFeatures} = $feats; - - $self->{_headerFeatures} = $self->_getHeaderFeatures(); - - # TODO: prevent header features from overriding info fields by using info.field - # notation as a feature - $self->{_infoFeatureNames} = $self->_getInfoFeatureNames(); - $self->{_numFilters} = scalar keys %{ $self->build_row_filters } || 0; - - # Precalculate the field db names, for faster accesss - # TODO: think about moving away from storing the "db name" in the database - # We may just want to enforce no changs to the order of fields once - # The db is created - # It fails in too many ways; for instance if you remove a feature, - # Then try to build again, it will crash, because expected array length - # shorter than some of the remaining field indices stored in db, potentially - my %fieldDbNames; - - for my $feature ( @{ $self->features } ) { - $fieldDbNames{$feature} = $self->getFieldDbName($feature); - } - - $self->{_fieldDbNames} = \%fieldDbNames; - - # TODO: Read bystro-vcf header, and configure $vcfFeatures based on that - # will require either reading the first file in the list, or giving - # bystro-vcf a "output only the header" feature (but scope creep) -} - -# has altName => (is => '') -sub buildTrack { - my $self = shift; - - # TODO: Remove side effects, or think about another initialization method - # Unfortunately, it is better to call track getters here - # Because builders may have side effects, like updating - # the meta database - # So we want to call builders BUILD methods first - my $tracks = Seq::Tracks->new(); - $self->{_refTrack} = $tracks->getRefTrackGetter(); - - my $pm = Parallel::ForkManager->new( $self->maxThreads ); - - # my $altIdx = $self->headerFeatures->{ALT}; - # my $idIdx = $self->headerFeatures->{ID}; - - my $lastIdx = $#{ $self->features }; - - # location of these features in input file (intermediate annotation) - my $refIdx = $self->{_vcfFeatures}->{ref}; - my $posIdx = $self->{_vcfFeatures}->{pos}; - my $chrIdx = $self->{_vcfFeatures}->{chrom}; - my $altIdx = $self->{_vcfFeatures}->{alt}; - my $alleleIdx = $self->{_vcfFeatures}->{alleleIdx}; - my $infoIdx = $self->{_vcfFeatures}->{info}; - - # Track over-written positions - # Hashes all values passed in, to make sure that duplicate values aren't written - my ( $mergeFunc, $cleanUpMerge ) = $self->makeMergeFunc(); - - my %completedDetails; - $pm->run_on_finish( - sub { - my ( $pid, $exitCode, $fileName, undef, undef, $errOrChrs ) = @_; - - if ( $exitCode != 0 ) { - my $err = $errOrChrs ? "due to: $$errOrChrs" : "due to an untimely demise"; - - $self->log( 'fatal', $self->name . ": Failed to build $fileName $err" ); - } - - # TODO: check for hash ref - for my $chr ( keys %$errOrChrs ) { - if ( !$completedDetails{$chr} ) { - $completedDetails{$chr} = [$fileName]; - } - else { - push @{ $completedDetails{$chr} }, $fileName; - } - } - - $self->log( 'info', $self->name . ": completed building from $fileName" ); - } - ); - - for my $file ( @{ $self->local_files } ) { - $self->log( 'info', $self->name . 
": beginning building from $file" ); - - # Although this should be unnecessary, environments must be created - # within the process that uses them - # This provides a measure of safety - $self->db->cleanUp(); - - $pm->start($file) and next; - my ( $err, $vcfNameMap, $vcfFilterMap ) = $self->_extractHeader($file); - - if ($err) { - # DB not open yet, no need to commit - $pm->finish( 255, \$err ); - } - - # Record which chromosomes were recorded for completionMeta - my %visitedChrs; - my $chr; - my @fields; - my $dbData; - my $wantedChr; - my $refExpected; - my $dbPos; - - # We use "unsafe" writers, whose active count we need to track - my $cursor; - my $count = 0; - - my $fh; - ( $err, $fh ) = $self->_openVcfPipe($file); - - if ($err) { - $self->log( 'fatal', $self->name . ": $err" ); - } - - # TODO: Read header, and configure vcf header feature indices based on that - my $header = <$fh>; - - FH_LOOP: while ( my $line = $fh->getline() ) { - chomp $line; - # This is the annotation input first 7 lines, plus id, info - @fields = split '\t', $line; - - # Transforms $chr if it's not prepended with a 'chr' or is 'chrMT' or 'MT' - # and checks against our list of wanted chromosomes - $chr = $self->normalizedWantedChr->{ $fields[$chrIdx] }; - - # falsy value is '' - if ( !defined $wantedChr || ( !defined $chr || $wantedChr ne $chr ) ) { - # We have a new chromosome - if ( defined $wantedChr ) { - #Commit any remaining transactions, remove the db map from memory - #this also has the effect of closing all cursors - $self->db->cleanUp(); - undef $cursor; - - $count = 0; - } - - $wantedChr = $self->chrWantedAndIncomplete($chr); - } - - # TODO: rethink chPerFile handling - if ( !defined $wantedChr ) { - next FH_LOOP; - } - - $visitedChrs{$wantedChr} //= 1; - - # 0-based position: VCF is 1-based - $dbPos = $fields[$posIdx] - 1; - - if ( !looks_like_number($dbPos) ) { - $self->db->cleanUp(); - - $pm->finish( 255, \"Invalid position @ $chr\: $dbPos" ); - } - - $cursor //= $self->db->dbStartCursorTxn($wantedChr); - - # We want to keep a consistent view of our universe, so use one transaction - # during read/modify/write - $dbData = $self->db->dbReadOneCursorUnsafe( $cursor, $dbPos ); - - $refExpected = $self->{_refTrack}->get($dbData); - if ( $fields[$refIdx] ne $refExpected ) { - $self->log( 'warn', - $self->name - . " $chr\:$fields[$posIdx]: " - . " Discordant. Expected ref: $refExpected, found: ref: $fields[$refIdx], alt:$fields[$altIdx]. 
Skipping" - ); - next; - } - - ( $err, my $data ) = - $self->_extractFeatures( \@fields, $infoIdx, $alleleIdx, $vcfNameMap, - $vcfFilterMap ); - - if ($err) { - #Commit, sync everything, including completion status, and release mmap - $self->db->cleanUp(); - - $pm->finish( 255, \$err ); - } - - # If the row didn't pass filters, $data will be undefined - # In all other cases it will be an array - if ( !defined $data ) { - next; - } - - #Args: $cursor, $chr, $trackIndex, $pos, $trackValue, $mergeFunction - $self->db->dbPatchCursorUnsafe( $cursor, $wantedChr, $self->dbName, $dbPos, $data, - $mergeFunc ); - - if ( $count > $self->commitEvery ) { - $self->db->dbEndCursorTxn($wantedChr); - undef $cursor; - - $count = 0; - } - - $count++; - } - - #Commit, sync everything, including completion status, and release mmap - $self->db->cleanUp(); - - $self->safeCloseBuilderFh( $fh, $file, 'fatal' ); - - $pm->finish( 0, \%visitedChrs ); - } - - $pm->wait_all_children(); - - for my $chr ( keys %completedDetails ) { - $self->completionMeta->recordCompletion($chr); - - # cleanUpMerge placed here so that only after all files are processed do we - # drop the temporary merge databases - # so that if we have out-of-order chromosomes, we do not mishandle - # overlapping sites - $cleanUpMerge->($chr); - - $self->log( 'info', - $self->name - . ": recorded $chr completed, from " - . ( join( ",", @{ $completedDetails{$chr} } ) ) ); - } - - #TODO: figure out why this is necessary, even with DEMOLISH - $self->db->cleanUp(); - return; -} - -sub _findExpectedFeatures { - my ( $self, $expFeaturesAref ) = @_; - - my $file = $self->local_files->[0]; - - if ( !$file ) { - return ( "Require at least one file in local_file", undef ); - } - - my ( $err, $fh ) = $self->_openVcfPipe( $self->local_files->[0] ); - - if ($err) { - return ( $err, undef ); - } - - my $header = <$fh>; - chomp $header; - - if ( !$header ) { - return ( "Couldn't read intermediate header of $file", undef ); - } - - my @head = split '\t', $header; - - # TODO: expose all features found in the header of the intermediate output - ####### we currently don't because results in name clashes with INFO values - ####### i.e bystro-vcf now outputs and "ac" field but we may want "info.ac" - my %found = map { $_ => -1 } @$expFeaturesAref; #@head; - - my $idx = -1; - for my $f (@head) { - $idx++; - - if ( defined $found{$f} && $found{$f} != -1 ) { - return ( "Found $f at least twice in header of $file", undef ); - } - - $found{$f} = $idx; - } - - my @missing = grep { $found{$_} == -1 } @$expFeaturesAref; - - if (@missing) { - return ( - "Couldn't find all expected header fields, missing: " . join( ',', @missing ), - undef ); - } - - return ( undef, \%found ); -} - -sub _getInfoFeatureNames { - my $self = shift; - - my %reverseFieldMap = map { $self->fieldMap->{$_} => $_ } keys %{ $self->fieldMap }; - - my %infoFeatureNames; - for my $feature ( @{ $self->features } ) { - my $originalName = $reverseFieldMap{$feature} || $feature; - - # If we encounter a name clash, choose the intermediate header output - # rather than the field in INFO - if ( defined $self->{_vcfFeatures}{$originalName} ) { - $self->log( 'info', - $self->name . 
": taking $feature from intermediate header, not INFO" ); - next; - } - - $infoFeatureNames{$feature} = $originalName; - } - - return \%infoFeatureNames; -} - -# Gets the feature names, indices that will be taken from the -# vcf pre-processor's header, rather than info -# @return [][]{featureName: string, headerIdx: int, dbFeatureIdx: int} -sub _getHeaderFeatures { - my $self = shift; - - my %featuresMap; - my $features = $self->features; - - for ( my $i = 0; $i < @{$features}; $i++ ) { - $featuresMap{ $features->[$i] } = $i; - } - - if ( !defined $featuresMap{alt} ) { - $self->log( 'fatal', - $self->name . ": 'alt' feature not specified, required for vcf tracks" ); - } - - my %fieldMap = map { $_ => $self->fieldMap->{$_} } keys %{ $self->fieldMap }; - - my @headerFeatures; - for my $fName ( keys %{ $self->{_vcfFeatures} } ) { - my $idx; - - # Because VCF files are so flexible with feature definitions, it will be - # difficult to tell if a certain feature just isn't present in a vcf file - # Easier to make feature definition flexible, especially since one - # may correctly surmise that we read the VCF after transformation to intermediate - # annotated format - - if ( defined $featuresMap{$fName} ) { - $idx = $featuresMap{$fName}; - } - elsif ( defined $fieldMap{$fName} && defined $featuresMap{ $fieldMap{$fName} } ) { - $idx = $featuresMap{ $fieldMap{$fName} }; - } - - # This $fName isn't requested by the user - if ( !defined $idx ) { - next; - } - - #Stores: - #1) The feature name (post-transformation) - #2) The feature's index in the pre-processor's header - #3) The index in the database - push @headerFeatures, - [ - $self->features->[$idx], $self->{_vcfFeatures}{$fName}, - $self->getFieldDbName( $self->features->[$idx] ) - ]; - } - - return \@headerFeatures; -} - -sub _openVcfPipe { - my ( $self, $file ) = @_; - - my $outputter = Seq::Output::Delimiters->new(); - - my $delim = $outputter->emptyFieldChar; - my $prog = - $self->isCompressedSingle($file) - ? $self->gzip . ' ' . $self->decompressArgs - : 'cat'; - - my $errPath = $file . ".build." . localtime() . ".log"; - - my $op = - "$prog $file | " - . $self->vcfProcessor - . " --emptyField $delim" - . " --keepId --keepInfo"; - - my $fh; - my $err = $self->safeOpen( $fh, '-|', $op ); - - return ( $err, $fh ); -} - -sub _extractHeader { - my $self = shift; - my $file = shift; - my $dieIfNotFound = shift; - - my ( $err, undef, $fh ) = $self->getReadFh($file); - - if ($err) { - return ( $err, undef, undef ); - } - - my @header; - while (<$fh>) { - chomp; - - if ( substr( $_, 0, 1 ) eq '#' ) { - push @header, $_; - next; - } - - last; - } - - $err = $self->safeCloseBuilderFh( $fh, $file, 'error' ); - - if ($err) { - return ( $err, undef, undef ); - } - - my $idxOfInfo = -9; - my $idx = -1; - - my %nameMap; - my %filterMap; - - # Flags may or may not be in the info field - # To speed search, store these, and walk back to find our value - my $flagCount = 0; - for my $h (@header) { - $idx++; - - if ( $h !~ /\#\#INFO=/ ) { - next; - } - - if ( $idxOfInfo == -9 ) { - $idxOfInfo = $idx; - } - - $h =~ /Number=([\w.]+)/; - - my $number = $1; - - $h =~ /Type=(\w+)/; - - my $type = $1; - - # Keep track of things that look like they could mess up INFO string order - # Flag in particular seems often missing, so we'll do a linear search - # From $idx - $idxOfInfo to +$flagCount - if ( looks_like_number($number) ) { - if ( $number == 0 ) { - $flagCount++; - } - } - elsif ( $number eq '.' 
) { - $flagCount++; - } - - my $featIdx = -1; - - # TODO: if the flag item is the feature we're searching for, do something - # Not critical, but we will have a less efficient search - # Requires precise spelling of the vcf feature - # TODO: Die if we don't find a header for any requested feature - FEATURE_LOOP: for my $feature ( @{ $self->features } ) { - if ( !defined $self->{_infoFeatureNames}{$feature} ) { - next; - } - - my $infoName = $self->{_infoFeatureNames}{$feature}; - - if ( index( $h, "INFO\=\<ID\=$infoName," ) > 0 ) { - # my $vcfName = "$feature="; - # In case Number and Type aren't adjacent to each other - # $return[$featIdx] = [$number, $type]; - $nameMap{$infoName} = [ $feature, $number, $type, $idx, ]; - last FEATURE_LOOP; - } - } - - # Filters on INFO fields - FEATURE_LOOP: for my $feature ( keys %{ $self->build_row_filters } ) { - my $infoName = $self->{_infoFeatureNames}{$feature} || $feature; - - if ( index( $h, "INFO\=\<ID\=$infoName," ) > 0 ) { - # my $vcfName = "$feature="; - # In case Number and Type aren't adjacent to each other - # $return[$featIdx] = [$number, $type]; - $filterMap{$infoName} = [ $feature, $number, $type, $idx, ]; - last FEATURE_LOOP; - } - } - } - - return ( undef, \%nameMap, \%filterMap ); -} - -sub _extractFeatures { - my ( $self, $fieldsAref, $infoIdx, $multiAlleleIdx, $vcfNameMap, $vcfFilterMap ) = - @_; - - # vcfProcessor will split multiallelics, store the alleleIdx - # my @infoFields = ; - - my @returnData; - $#returnData = $#{ $self->features }; - - my $firstChars; - - my $warned; - - my $entry; - my $found = 0; - my $name; - my $val; - - my $totalNeeded = @returnData + $self->{_numFilters}; - # $arr holds - # 1) field name - # 2) index in intermediate annotation - # 3) index in database - for my $arr ( @{ $self->{_headerFeatures} } ) { - # $arr->[0] is the fieldName - # $arr->[1] is the field idx - if ( $self->hasTransform( $arr->[0] ) ) { - $fieldsAref->[ $arr->[1] ] = - $self->transformField( $arr->[0], $fieldsAref->[ $arr->[1] ] ); - } - - $returnData[ $arr->[2] ] = - $self->coerceFeatureType( $arr->[0], $fieldsAref->[ $arr->[1] ] ); - } - - my $alleleIdx = $fieldsAref->[$multiAlleleIdx]; - - for my $info ( split ';', $fieldsAref->[$infoIdx] ) { - # If # found == scalar @{$self->features} - if ( $found == $totalNeeded ) { - last; - } - - $name = substr( $info, 0, index( $info, '=' ) ); - - $entry = $vcfNameMap->{$name} || $vcfFilterMap->{$name}; - if ( !$entry ) { - next; - } - - $found++; - - $val = substr( $info, index( $info, '=' ) + 1 ); - - # A types have a value per allele - if ( $entry->[1] eq 'A' ) { - my @vals = split ',', $val; - - if ( @vals - 1 < $alleleIdx ) { - return ( "Err: Type=A field has fewer values than alleles", undef ); - } - - $val = $vals[$alleleIdx]; - } - - # Using $entry->[0] allows us to map the name of the property to be filtered - if ( $self->hasFilter( $entry->[0] ) ) { - if ( !$self->passesFilter( $entry->[0], $val ) ) { - return ( undef, undef ); - } - - next; - } - - # Allow the field to be split if the user requests that in the YAML config - # $entry->[0] is the fieldName - if ( $self->hasTransform( $entry->[0] ) ) { - $val = $self->transformField( $entry->[0], $val ); - } - - # TODO: support non-scalar values - # TODO: configure from either type specified in YAML, or from VCF Type= - $returnData[ $self->{_fieldDbNames}{ $entry->[0] } ] = - $self->coerceFeatureType( $entry->[0], $val ); - } - - return ( undef, \@returnData ); -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Utils/Base.pm b/perl/lib/Utils/Base.pm deleted file mode 100644 
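The per-allele handling in _extractFeatures above leans on the VCF convention that an INFO field declared Number=A carries one comma-separated value per ALT allele; after the pre-processor splits multiallelics, alleleIdx selects the value belonging to the current row. A minimal, self-contained sketch of that lookup (the INFO string and index below are invented purely for illustration):

use 5.10.0;
use strict;
use warnings;

my $info      = 'AC=3,1;AF=0.25,0.125;DB'; # toy INFO; AF is declared Number=A
my $alleleIdx = 1;                         # this row came from the second ALT allele

for my $pair ( split ';', $info ) {
  my $eq = index( $pair, '=' );
  next if $eq == -1;                       # flag entries (Number=0) carry no value

  next unless substr( $pair, 0, $eq ) eq 'AF';

  my @vals = split ',', substr( $pair, $eq + 1 );
  die 'Type=A field has fewer values than alleles' if @vals - 1 < $alleleIdx;

  say "AF for allele $alleleIdx: $vals[$alleleIdx]"; # prints 0.125
}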
index c3fd9a737..000000000 --- a/perl/lib/Utils/Base.pm +++ /dev/null @@ -1,242 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Utils::Base; - -# A base class for utilities. Just a place to store common attributes - -use Mouse 2; - -with 'Seq::Role::Message'; -with 'Seq::Role::IO'; - -use Types::Path::Tiny qw/AbsFile/; -use List::MoreUtils qw/first_index/; -use YAML::XS qw/LoadFile Dump/; -use Path::Tiny qw/path/; -use Time::localtime; - -############## Arguments accepted ############# -# The track name that they want to use -has name => ( is => 'ro', isa => 'Str', required => 1 ); - -has use_absolute_path => ( is => 'ro', isa => 'Bool', default => 0 ); - -# The YAML config file path -has config => ( - is => 'ro', - isa => AbsFile, - coerce => 1, - required => 1, - handles => { configPath => 'stringify' } -); - -# Logging -has logPath => ( - is => 'ro', - lazy => 1, - default => sub { - my $self = shift; - my $path = - path( $self->_decodedConfig->{files_dir} )->child( $self->_wantedTrack->{name} ) - ->child( $self->name . "." . $self->_dateOfRun . ".log" ); - - return $path->stringify(); - } -); - -# In the new API, the producer passed the index of the utility configuration -# aka -# utils: -# - name: liftOverCadd -# args: -# something: 1 -# So that here liftOverCadd is index 0 -# This allows this class to write a "completed" property aka: -# # utils: -# - name: liftOverCadd -# args: -# something: 1 -# completed: Date() -has utilIdx => ( is => 'ro', isa => 'Int' ); - -has utilName => ( is => 'ro', isa => 'Str', required => 1 ); - -# Debug log level? -has debug => ( is => 'ro' ); - -# Compress the output? -has compress => ( is => 'ro' ); - -# Overwrite files if they exist? -has overwrite => ( is => 'ro' ); - -has publisher => ( is => 'ro' ); - -has verbose => ( is => 'ro' ); - -has dryRun => ( is => 'ro', isa => 'Bool', default => 0 ); - -has maxThreads => ( is => 'ro', isa => 'Int', default => 8 ); - -#########'Protected' vars (Meant to be used by child class only) ############ -has _wantedTrack => ( is => 'ro', init_arg => undef, writer => '_setWantedTrack' ); - -has _decodedConfig => ( - is => 'ro', - isa => 'HashRef', - lazy => 1, - default => sub { - my $self = shift; - return LoadFile( $self->configPath ); - } -); - -# Where any downloaded or created files should be saved -has _localFilesDir => ( - is => 'ro', - isa => 'Str', - lazy => 1, - default => sub { - my $self = shift; - my $dir = - path( $self->_decodedConfig->{files_dir} )->child( $self->_wantedTrack->{name} ); - - return $dir->stringify; - } -); - -has _newConfigPath => ( - is => 'ro', - isa => 'Str', - lazy => 1, - default => sub { - my $self = shift; - - return - substr( $self->configPath, 0, rindex( $self->config, '.' ) ) . "." - . $self->_dateOfRun - . substr( $self->config, rindex( $self->config, '.' ) ); - } -); - -# Memoized date, because we want backupAndWrite to give same date as fetch_date, sort_date, etc -has _dateOfRun => ( - is => 'ro', - lazy => 1, - init_arg => undef, - default => sub { my $self = shift; $self->getDate(); } -); - -sub BUILD { - my $self = shift; - # Must happen here, because we need to account for the case where track isn't found - # And you cannot throw an error from within a default, and I think it is - # More clear to throw a fatal error from the BUILD method than a builder=> method - my $trackIndex = first_index { $_->{name} eq $self->name } - @{ $self->_decodedConfig->{tracks}{tracks} }; - - if ( $trackIndex == -1 ) { - $self->log( 'fatal', "Desired track " . $self->name . 
" wasn't found" ); - return; - } - - $self->_setWantedTrack( $self->_decodedConfig->{tracks}{tracks}[$trackIndex] ); - - my $dir = path( $self->_localFilesDir ); - - $dir->mkpath; - - # If in long-running process, clear singleton state - Seq::Role::Message::initialize(); - - # Seq::Role::Message settings - # We manually set the publisher, logPath, verbosity, and debug, because - # Seq::Role::Message is meant to be consumed globally, but configured once - # Treating publisher, logPath, verbose, debug as instance variables - # would result in having to configure this class in every consuming class - if ( defined $self->publisher ) { - $self->setPublisher( $self->publisher ); - } - - if ( defined $self->logPath ) { - $self->setLogPath( $self->logPath ); - } - - if ( defined $self->verbose ) { - $self->setVerbosity( $self->verbose ); - } - else { - # 1 == "info" level - $self->setVerbosity(1); - } - - #todo: finisih ;for now we have only one level - if ( $self->debug ) { - $self->setLogLevel('DEBUG'); - } - else { - $self->setLogLevel('INFO'); - } -} - -sub _writeCompletedDate { - my $self = shift; - - if ( defined $self->utilIdx ) { - $self->_wantedTrack->{utils}[ $self->utilIdx ]{completed} = $self->_dateOfRun; - } - else { - $self->_wantedTrack->{ $self->utilName . '_completed' } = $self->_dateOfRun; - } - - return; -} - -sub _backupAndWriteConfig { - my $self = shift; - - $self->_writeCompletedDate(); - - my $backPath = $self->configPath . ".utils-bak." . $self->_dateOfRun; - - if ( -e $backPath ) { - unlink $backPath; - } - # If this is already a symlink, remove it - if ( -l $self->configPath ) { - unlink $self->configPath; - } - else { - if ( system( "mv " . $self->configPath . " $backPath" ) != 0 ) { - $self->log( 'fatal', "Failed to back up " . $self->configPath ); - } - } - - open( my $fh, '>', $self->_newConfigPath ) - or $self->log( 'fatal', "Couldn't open" . $self->_newConfigPath . " for writing" ); - - say $fh Dump( $self->_decodedConfig ); - - # -f forces hard link / overwrite - if ( system( "ln -f " . $self->_newConfigPath . " " . $self->configPath ) != 0 ) { - $self->log( 'fatal', - "Failed to hard link " . $self->configPath . " to " . $self->_newConfigPath ); - } - - $self->log( 'info', 'Finished ' . 
$self->utilName ); -} - -sub getDate { - my $tm = localtime; - return sprintf( - "%04d-%02d-%02dT%02d:%02d:00", - $tm->year + 1900, - ( $tm->mon ) + 1, - $tm->mday, $tm->hour, $tm->min - ); -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Utils/CaddToBed.pm b/perl/lib/Utils/CaddToBed.pm deleted file mode 100644 index 9c60aca8d..000000000 --- a/perl/lib/Utils/CaddToBed.pm +++ /dev/null @@ -1,148 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -# Takes a CADD file and makes it into a bed-like file, retaining the property - # that each base has 3 (or 4 for ambiguous) lines -package Utils::CaddToBed; - -our $VERSION = '0.001'; - -use Mouse 2; -use namespace::autoclean; -use Path::Tiny qw/path/; - -use Seq::Tracks::Build::LocalFilesPaths; - -# Exports: _localFilesDir, _decodedConfig, compress, _wantedTrack, _setConfig, logPath, use_absolute_path -extends 'Utils::Base'; - -# ########## Arguments accepted ############## -# Expects a tab-delimited file; not allowing this to be set because it probably won't ever be anything -# other than tab, and because split('\t') is faster -# has delimiter => (is => 'ro', lazy => 1, default => "\t"); - -sub BUILD { - my $self = shift; - - my $localFilesHandler = Seq::Tracks::Build::LocalFilesPaths->new(); - - my $localFilesAref = $localFilesHandler->makeAbsolutePaths( - $self->_decodedConfig->{files_dir}, - $self->_wantedTrack->{name}, - $self->_wantedTrack->{local_files} - ); - - if ( @$localFilesAref != 1 ) { - $self->log( 'fatal', - "Expect a single cadd file, found " . ( scalar @$localFilesAref ) ); - } - - $self->{_localFile} = $localFilesAref->[0]; -} - -# TODO: error check opening of file handles, write tests -sub go { - my $self = shift; - - my %wantedChrs = map { $_ => 1 } @{ $self->_decodedConfig->{chromosomes} }; - - my $inFilePath = $self->{_localFile}; - - if ( !-e $inFilePath ) { - $self->log( 'fatal', "input file path $inFilePath doesn't exist" ); - return; - } - # Store output handles by chromosome, so we can write even if the input file is - # out of order - my %outFhs; - my %skippedBecauseExists; - - # We'll update this list of files in the config file - $self->_wantedTrack->{local_files} = []; - - my $inFh = $self->getReadFh($inFilePath); - - $self->log( 'info', "Reading $inFilePath" ); - - my $versionLine = <$inFh>; - chomp $versionLine; - - $self->log( 'info', "Cadd version line: $versionLine" ); - - my $headerLine = <$inFh>; - chomp $headerLine; - - $self->log( 'info', "Cadd header line: $headerLine" ); - - my @headerFields = split( '\t', $headerLine ); - - # CADD seems to be 1-based; this is not documented, however. - my $based = 1; - - my $outPathBase = path($inFilePath)->basename(); - - my $outExt = 'bed' - . ( - $self->compress ? '.gz' : substr( $outPathBase, rindex( $outPathBase, '.' ) ) ); - - $outPathBase = substr( $outPathBase, 0, rindex( $outPathBase, '.' ) ); - - my $outPath = - path( $self->_localFilesDir )->child("$outPathBase.$outExt")->stringify(); - - if ( -e $outPath && !$self->overwrite ) { - $self->log( 'fatal', "File $outPath exists, and overwrite is not set" ); - return; - } - - my $outFh = $self->getWriteFh($outPath); - - $self->log( 'info', "Writing to $outPath" ); - - say $outFh $versionLine; - say $outFh join( "\t", - 'chrom', 'chromStart', 'chromEnd', @headerFields[ 2 .. 
$#headerFields ] ); - - while ( my $l = $inFh->getline() ) { - chomp $l; - - my @line = split( '\t', $l ); - - # The part that actually has the id, ex: in chrX "X" is the id - my $chrIdPart; - # Get the chromosome - # It could be stored as a number/single character or "chr" - # Grab the chr part, and normalize it to our case format (chr) - if ( $line[0] =~ /chr/i ) { - $chrIdPart = substr( $line[0], 3 ); - } - else { - $chrIdPart = $line[0]; - } - - # Don't forget to convert the NCBI to the UCSC-style mitochondrial chr name - if ( $chrIdPart eq 'MT' ) { - $chrIdPart = 'M'; - } - - my $chr = "chr$chrIdPart"; - - if ( !exists $wantedChrs{$chr} ) { - $self->log( 'warn', - "Chromosome $chr not recognized (from $chrIdPart), skipping: $l" ); - next; - } - - my $start = $line[1] - $based; - my $end = $start + 1; - say $outFh join( "\t", $chr, $start, $end, @line[ 2 .. $#line ] ); - } - - $self->_wantedTrack->{local_files} = [$outPath]; - - $self->_backupAndWriteConfig(); -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Utils/DbSnp2FormatInfo.pm b/perl/lib/Utils/DbSnp2FormatInfo.pm deleted file mode 100644 index 24458df84..000000000 --- a/perl/lib/Utils/DbSnp2FormatInfo.pm +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/perl -use 5.10.0; -use strict; -use warnings; - -# Take a dbSNP 2 VCF file, and for each row, split the INFO field's FREQ data into separate INFO fields for each population -# NOTE: dbSNP VCF spec: https://www.ncbi.nlm.nih.gov/snp/docs/products/vcf/redesign/ -# NOTE: dbSNP uses a '.' to represent a missing value, and the first allele is the reference, which is not the standard use. - -package Utils::DbSnp2FormatInfo; - -our $VERSION = '0.001'; - -use File::Basename qw/basename/; - -use Mouse 2; -use namespace::autoclean; -use Path::Tiny qw/path/; - -use Seq::Tracks::Build::LocalFilesPaths; - -# Exports: _localFilesDir, _decodedConfig, compress, _wantedTrack, _setConfig, logPath, use_absolute_path -extends 'Utils::Base'; - -my $INFO_INDEX = 7; - -sub BUILD { - my $self = shift; - - my $localFilesHandler = Seq::Tracks::Build::LocalFilesPaths->new(); - - my $localFilesAref = $localFilesHandler->makeAbsolutePaths( - $self->_decodedConfig->{files_dir}, - $self->_wantedTrack->{name}, - $self->_wantedTrack->{local_files} - ); - - $self->{_localFiles} = $localFilesAref; -} - -sub _get_fh_paths { - my ( $self, $input_vcf ) = @_; - - if ( !-e $input_vcf ) { - $self->log( 'fatal', "input file path $input_vcf doesn't exist" ); - return; - } - - my ( $err, $isCompressed, $in_fh ) = $self->getReadFh($input_vcf); - - $isCompressed ||= $self->compress; - - if ($err) { - $self->log( 'fatal', $err ); - return; - } - - my $base_name = basename($input_vcf); - $base_name =~ s/\.[^.]+$//; # Remove last file extension (if present) - $base_name - =~ s/\.[^.]+$//; # Remove another file extension if it's something like .vcf.gz - - my $output_vcf_data = $base_name . "_vcf_data.vcf" . ( $isCompressed ? ".gz" : "" ); - my $output_vcf_header = - $base_name . "_vcf_header.vcf" . ( $isCompressed ? ".gz" : "" ); - my $output_vcf = $base_name . "_processed.vcf" . ( $isCompressed ? 
".gz" : "" ); - - $self->log( 'info', "Reading $input_vcf" ); - - my $output_header_path = - path( $self->_localFilesDir )->child($output_vcf_header)->stringify(); - my $output_data_path = - path( $self->_localFilesDir )->child($output_vcf_data)->stringify(); - my $output_path = path( $self->_localFilesDir )->child($output_vcf)->stringify(); - - if ( ( -e $output_data_path || -e $output_header_path || -e $output_path ) - && !$self->overwrite ) - { - $self->log( 'fatal', - "Temp files $output_data_path, $output_header_path, or final output path $output_path exist, and overwrite is not set" - ); - return; - } - - return ( $in_fh, $output_data_path, $output_header_path, $output_path ); -} - -sub go { - my $self = shift; - - my @output_paths; - - for my $input_vcf ( @{ $self->{_localFiles} } ) { - my ( $in_fh, $output_data_path, $output_header_path, $output_path ) = - $self->_get_fh_paths($input_vcf); - - my $output_data_fh = $self->getWriteFh($output_data_path); - - $self->log( 'info', "Writing to $output_data_path" ); - - my %populations; - my @ordered_populations; - - my @header_lines; - while (<$in_fh>) { - chomp; - - # If it's a header line - if (/^#/) { - push @header_lines, $_; - next; - } - - my @fields = split( "\t", $_ ); - - if ( !@fields ) { - $self->log( "fatal", "No fields found in row: $_" ); - return; - } - - my @info_fields = split( ";", $fields[$INFO_INDEX] ); - - my @ordered_info_freqs; - my %seen_info_pops; - - my $seen_freq = 0; - foreach my $info (@info_fields) { - if ( $info =~ /FREQ=(.+)/ ) { - if ( $seen_freq == 1 ) { - $self->log( "fatal", "FREQ seen twice in INFO field. Row: $_" ); - return; - } - - $seen_freq = 1; - - my $freq_data = $1; - my @pops = split( /\|/, $freq_data ); - - foreach my $pop (@pops) { - if ( $pop =~ /([^:]+):(.+)/ ) { - my $pop_name = $1; - - if ( exists $seen_info_pops{$pop_name} ) { - self->log( "fatal", "Population $pop_name seen twice in INFO field. 
Row: $_" ); - return; - } - - my @freq_vals = split( /,/, $2 ); - shift @freq_vals; # Remove the reference allele freq - - push @ordered_info_freqs, [ $pop_name, join( ",", @freq_vals ) ]; - - if ( !exists $populations{$pop_name} ) { - push @ordered_populations, $pop_name; - $populations{$pop_name} = 1; - } - } - } - - # Append the new frequency data to the INFO field - my @new_info_fields; - for my $res (@ordered_info_freqs) { - my $name = $res->[0]; - my $freq = $res->[1]; - push @new_info_fields, "$name=$freq"; - } - - $info = join( ";", @new_info_fields ); - } - } - - $fields[$INFO_INDEX] = join( ";", @info_fields ); - - say $output_data_fh join( "\t", @fields ); - } - - close($in_fh); - close($output_data_fh); - - # Update the VCF header with new populations - my @pop_lines; - foreach my $pop (@ordered_populations) { - push @pop_lines, - "##INFO="; - } - - splice( @header_lines, -1, 0, @pop_lines ); - - my $header_fh = $self->getWriteFh($output_header_path); - - # Write the updated header and VCF to output - say $header_fh join( "\n", @header_lines ); - close($header_fh); - - system("cat $output_header_path $output_data_path > $output_path") == 0 - or die "Failed to concatenate files: $?"; - system("rm $output_header_path $output_data_path") == 0 - or die "Failed to remove temporary files: $?"; - - $self->log( 'info', "$input_vcf processing complete" ); - - push @output_paths, $output_path; - } - - $self->_wantedTrack->{local_files} = \@output_paths; - - $self->_backupAndWriteConfig(); -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Utils/Fetch.pm b/perl/lib/Utils/Fetch.pm deleted file mode 100644 index 8f9f43189..000000000 --- a/perl/lib/Utils/Fetch.pm +++ /dev/null @@ -1,248 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Utils::Fetch; - -our $VERSION = '0.001'; - -# ABSTRACT: Fetch anything specified by remoteDir . / . remoteFiles -# or an sql statement - -use Mouse 2; - -# Exports: _localFilesDir, _decodedConfig, compress, _wantedTrack, _setConfig, logPath, use_absolute_path -extends 'Utils::Base'; - -use namespace::autoclean; -use File::Which qw(which); -use Path::Tiny; -use YAML::XS qw/Dump/; - -use Utils::SqlWriter; - -# The sql connection config -has sql => ( is => 'ro', isa => 'Maybe[Str]' ); -has remoteFiles => ( is => 'ro', isa => 'Maybe[ArrayRef]' ); -has remoteDir => ( is => 'ro', isa => 'Maybe[Str]' ); - -has connection => ( is => 'ro', isa => 'Maybe[HashRef]' ); - -# Choose whether to use wget or rsync program to fetch -has aws => ( is => 'ro', init_arg => undef, writer => '_setAws' ); -has wget => ( is => 'ro', init_arg => undef, writer => '_setWget' ); -has rsync => ( is => 'ro', init_arg => undef, writer => '_setRsync' ); - -sub BUILD { - my $self = shift; - - if ( defined $self->sql ) { - return; - } - - my $aws = which('aws'); - my $wget = which('wget'); - my $rsync = which('rsync'); - - if ( !$rsync || !$wget ) { - $self->log( 'fatal', 'Fetch.pm requires rsync and wget when fetching remoteFiles' ); - } - - $self->_setAws($aws); - $self->_setRsync($rsync); - $self->_setWget($wget); -} - -########################## The only public export ###################### -sub go { - my $self = shift; - - if ( defined $self->remoteFiles || defined $self->remoteDir ) { - return $self->_fetchFiles(); - } - - if ( defined $self->sql ) { - return $self->_fetchFromUCSCsql(); - } - - $self->log( 'fatal', - "Couldn't find either remoteFiles + remoteDir," - . " or an sql statement for this track " - . 
$self->name ); -} - -########################## Main methods, which do the work ###################### -# These are called depending on whether sql_statement or remoteFiles + remoteDir given -sub _fetchFromUCSCsql { - my $self = shift; - - my $sqlStatement = $self->sql; - - # What features are called according to our YAML config spec - my $featuresKey = '%features%'; - my $featuresIdx = index( $sqlStatement, $featuresKey ); - - if ( $featuresIdx > -1 ) { - if ( !@{ $self->_wantedTrack->{features} } ) { - $self->log( 'fatal', - "Requires features if sql_statement specifies SELECT %features%" ); - } - - my $trackFeatures; - foreach ( @{ $self->_wantedTrack->{features} } ) { - # YAML config spec defines optional type on feature names, so some features - # can be hashes. Take only the feature name, ignore type; UCSC doesn't use them - my $featureName; - - if ( ref $_ ) { - ($featureName) = %{$_}; - } - else { - $featureName = $_; - } - - $trackFeatures .= $featureName . ','; - } - - chop $trackFeatures; - - substr( $sqlStatement, $featuresIdx, length($featuresKey) ) = $trackFeatures; - } - - my $config = { - sql => $sqlStatement, - assembly => $self->_decodedConfig->{assembly}, - chromosomes => $self->_decodedConfig->{chromosomes}, - outputDir => $self->_localFilesDir, - compress => 1, - }; - - if ( defined $self->connection ) { - $config->{connection} = $self->connection; - } - - my $sqlWriter = Utils::SqlWriter->new($config); - - # Returns the relative file names - my @writtenFileNames = $sqlWriter->go(); - - $self->_wantedTrack->{local_files} = \@writtenFileNames; - - $self->_backupAndWriteConfig(); - - $self->log( 'info', "Finished fetching data from sql" ); -} - -sub _fetchFiles { - my $self = shift; - - my $pathRe = qr/([a-z]+:\/\/)(\S+)/; - my $remoteDir; - my $remoteProtocol; - - my $fetchProgram; - - my $isRsync = 0; - my $isS3 = 0; - - if ( $self->remoteDir ) { - # remove http:// (or whatever protocol) - $self->remoteDir =~ m/$pathRe/; - - if ($1) { - $remoteProtocol = $1; - } - elsif ( $self->remoteDir =~ 's3://' ) { - $isS3 = 1; - $remoteProtocol = 's3://'; - } - else { - $isRsync = 1; - $remoteProtocol = 'rsync://'; - } - - $remoteDir = $2; - } - - my $outDir = $self->_localFilesDir; - - $self->_wantedTrack->{local_files} = []; - - for my $file ( @{ $self->remoteFiles } ) { - my $remoteUrl; - - if ($remoteDir) { - $remoteUrl = $remoteProtocol . path($remoteDir)->child($file)->stringify; - } - else { - $file =~ m/$pathRe/; - - # This file is an absolute remote path - if ($1) { - $remoteUrl = $file; - } - elsif ( $file =~ 's3://' ) { - $remoteUrl = $file; - $isS3 = 1; - } - else { - $remoteUrl = "rsync://" . $2; - $isRsync = 1; - } - } - - # Always outputs verbose, capture the arguments - my $command; - - if ($isRsync) { - $command = $self->rsync . " -avPz $remoteUrl $outDir"; - } - elsif ($isS3) { - if ( !$self->aws ) { - $self->log( 'fatal', - "You requested an s3 remote file ($remoteUrl), but have no aws s3 cli installed. Please visit: https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html" - ); - return; - } - $command = $self->aws . " s3 cp $remoteUrl $outDir"; - } - else { - # -N option will clobber only if remote file is newer than local copy - # -S prints the server response headers - $command = $self->wget . 
" -N -S $remoteUrl -P $outDir"; - } - - $self->log( 'info', "Fetching: $command" ); - - # http://stackoverflow.com/questions/11514947/capture-the-output-of-perl-system - open( my $fh, "-|", "$command" ) or $self->log( 'fatal', "Couldn't fork: $!\n" ); - - my $progress; - while (<$fh>) { - if ( $self->debug ) { say $_ } # we may want to watch progress in stdout - $self->log( 'info', $_ ); - } - close($fh); - - my $exitStatus = $?; - - if ( $exitStatus != 0 ) { - $self->log( 'fatal', "Failed to fetch $file" ); - } - - my $outFileName = $remoteDir ? $file : substr( $file, rindex( $file, '/' ) + 1 ); - - push @{ $self->_wantedTrack->{local_files} }, $outFileName; - - # stagger requests to be kind to the remote server - sleep 3; - } - - $self->_backupAndWriteConfig(); - - $self->log( 'info', "Finished fetching all remote files" ); -} - -__PACKAGE__->meta->make_immutable; - -1; diff --git a/perl/lib/Utils/FilterCadd.pm b/perl/lib/Utils/FilterCadd.pm deleted file mode 100644 index 54e39715d..000000000 --- a/perl/lib/Utils/FilterCadd.pm +++ /dev/null @@ -1,331 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -# The Bystro db contains only QC'd CADD sites -# Use these to filter the source CADD sites and output a cleaned version -package Utils::FilterCadd; - -our $VERSION = '0.001'; - -use Mouse 2; -use namespace::autoclean; -use Types::Path::Tiny qw/AbsFile Path AbsDir/; -use Path::Tiny qw/path/; -use Scalar::Util qw/looks_like_number/; - -use Seq::Tracks::Build::LocalFilesPaths; - -use Parallel::ForkManager; - -use Seq::DBManager; -use Seq::Tracks::Cadd; -use Seq::Tracks::Score::Build; - -# Exports: _localFilesDir, _decodedConfig, compress, _wantedTrack, _setConfig, logPath, use_absolute_path -extends 'Utils::Base'; - -my $localFilesHandler = Seq::Tracks::Build::LocalFilesPaths->new(); - -sub BUILD { - my $self = shift; - - # DBManager acts as a singleton. 
It is configured once, and then consumed repeatedly - # However, in long-running processes, this can lead to misconfiguration issues - # and worse, environments created in one process, then copied during forking, to others - # To combat this, every time Seq::Base is called, we re-set/re-initialize the static - # properties that create this behavior - # Initialize it before BUILD, to make this class less dependent on inheritance order - Seq::DBManager::initialize( - { databaseDir => $self->_decodedConfig->{database_dir} } ); - - if ( $self->_wantedTrack->{sorted} != 1 ) { - die "CADD files must be sorted (sorted == 1), at least by chromosome"; - } -} - -sub go { - my $self = shift; - - my $gzip = $self->gzip; - - my ( $localFilesPathsAref, $has_absolute_files ) = - $localFilesHandler->makeAbsolutePaths( - $self->_decodedConfig->{files_dir}, - $self->_wantedTrack->{name}, - $self->_wantedTrack->{local_files} - ); - - my $outDir = - path( $self->_decodedConfig->{files_dir} )->child( $self->_wantedTrack->{name} ); - - my $pm = Parallel::ForkManager->new( $self->maxThreads ); - - if ( !@$localFilesPathsAref ) { - $self->log( 'fatal', "No local files found" ); - } - - my %trackConfig = %{ $self->_wantedTrack }; - - $trackConfig{assembly} = $self->_decodedConfig->{assembly}; - $trackConfig{chromosomes} = $self->_decodedConfig->{chromosomes}; - - my $caddGetter = Seq::Tracks::Cadd->new( \%trackConfig ); - - my $rounder = Seq::Tracks::Score::Build::Round->new( - { scalingFactor => $caddGetter->scalingFactor } ); - - my $db = Seq::DBManager->new(); - - my %wantedChrs = map { $_ => 1 } @{ $self->_decodedConfig->{chromosomes} }; - - my $dryRun = $self->dryRun; - - my $outExtPart = $self->compress ? '.txt.gz' : '.txt'; - my $outExt = '.filtered' . $outExtPart; - - my @outPaths; - - # If we don't call the run_on_finish here - # only 1 outPath will be stored for each fork, regardless of how many - # files that fork read, since this will only be run once per termination - # of a fork (rather than for each finish() call) - $pm->run_on_finish( - sub { - my ( $pid, $exitCode, $inPath, $exitSignal, $coreDump, $outOrErrRef ) = @_; - - if ( $exitCode != 0 ) { - $self->log( 'fatal', - "$inPath failed filtering due to: " - . ( $outOrErrRef ? $$outOrErrRef : 'unknown error' ) ); - return; - } - - $self->log( 'info', "PID $pid completed filtering $inPath to $$outOrErrRef \n" ); - push @outPaths, path($$outOrErrRef)->basename(); - } - ); - - for my $inPath (@$localFilesPathsAref) { - my $outPath; - - my $outPathBase; - # If this is a compressed file, strip the preceding extension - if ( $inPath =~ /.gz$/ ) { - $outPathBase = substr( $inPath, 0, rindex( $inPath, '.' ) ); - $outPathBase = substr( $outPathBase, 0, rindex( $outPathBase, '.' ) ); - } - else { - $outPathBase = substr( $inPath, 0, rindex( $inPath, '.' 
) ); - } - - $pm->start($inPath) and next; - my $readFh = $self->getReadFh($inPath); - - my $header = <$readFh>; - $header .= <$readFh>; - - my $based = 1; - my $phredIdx = -1; - my $altIdx = -3; - my $refIdx = -4; - - if ( $header =~ 'chromStart' && $header =~ 'chromEnd' ) { - $based = 0; - say STDERR "$inPath is 0-based BED-Like"; - } - - my $outFh; - - my @fields; - my $pos; - - my $dbVal; - my $caddDbVals; - my $caddDbScore = []; - my $score; - - my $ref; - my $alt; - my $lastChr; - my $chr; - - my $skipped = 0; - - # my $lastPosition; - # my $scoreCount; - - # my $nonACTGrefCount; - # my $nonACTGaltCount; - # my $missingScore; - while ( my $l = $readFh->getline() ) { - chomp $l; - @fields = split '\t', $l; - - #https://ideone.com/05wEAl - #Faster than split - #my $chr = substr($l, 0, index($l, "\t") ); - $chr = $fields[0]; - - # May be unwanted if coming from CADD directly - if ( !exists $wantedChrs{$chr} ) { - #https://ideone.com/JDtX3z - #CADD files don't use 'chr', but our cadd bed-like files do - if ( index( $chr, 'chr' ) == -1 ) { - $chr = 'chr' . $chr; - } - - #Our bed-like file will use 'chrM', but the original cadd file will have 'chrMT' - if ( $chr eq 'chrMT' ) { - $chr = 'chrM'; - } - - # Check again that this is unwanted - if ( !exists $wantedChrs{$chr} ) { - $self->log( 'warn', "Skipping unwanted: $chr" ); - - $skipped++; - next; - } - } - - if ( defined $lastChr ) { - if ( $lastChr ne $chr ) { - $db->cleanUp(); - - my $err = "Expected only a single chromosome, found $chr and $lastChr"; - $self->log( 'error', $err ); - - $pm->finish( 255, \$err ); - } - } - else { - $lastChr = $chr; - } - - my $pos = $fields[1]; - - if ( !defined $pos ) { - $db->cleanUp(); - - my $err = 'Undefined position'; - $self->log( 'error', $err ); - - $pm->finish( 255, \$err ); - } - - if ( !$outFh && !$dryRun ) { - $outPath = "$outPathBase.$chr$outExt"; - - $self->log( 'info', "Found $chr in $inPath; creating $outPath" ); - - if ( -e $outPath && !$self->overwrite ) { - $self->log( 'warn', - "outPath $outPath exists, skipping $inPath because overwrite not set" ); - last; - } - - $outFh = $self->getWriteFh($outPath); - - print $outFh $header; - } - - $dbVal = $db->dbReadOne( $chr, $pos - $based ); - - # We expect the db to have a value, but it's possible the CADD track gave us a nonsense pos - # This is unusual, so log - if ( !defined $dbVal ) { - $self->log( 'warn', "Couldn't find a value for $chr\:$pos ($based\-based)" ); - - $skipped++; - next; - } - - $caddDbVals = $dbVal->[ $caddGetter->dbName ]; - - # A value that failed QC, and wasn't included in the db - # Not unusual, don't log - if ( !defined $caddDbVals ) { - $skipped++; - next; - } - - if ( defined $caddDbVals && @$caddDbVals != 3 ) { - $db->cleanUp(); - - my $err = - "Couldn't find 3 cadd values for $chr\:$pos ($based\-based) ... Found " - . ( scalar @$caddDbVals ) - . 
" instead"; - $self->log( 'error', \$err ); - - $pm->finish( 255, \$err ); - } - - # Everything with $caddDbVals should be a well-qc'd base - # If not, die - $ref = $fields[$refIdx]; - $alt = $fields[$altIdx]; - - if ( $ref ne 'A' && $ref ne 'C' && $ref ne 'T' && $ref ne 'G' ) { - $db->cleanUp(); - - my $err = "$chr\:$pos ($based\-based) : Expected ACTG ref, found $ref"; - $self->log( 'error', \$err ); - - $pm->finish( 255, \$err ); - } - - if ( $alt ne 'A' && $alt ne 'C' && $alt ne 'T' && $alt ne 'G' ) { - $db->cleanUp(); - - my $err = "$chr\:$pos ($based\-based) : Expected ACTG alt, found $alt"; - $self->log( 'error', \$err ); - - $pm->finish( 255, \$err ); - } - - $score = $fields[$phredIdx]; - - if ( !looks_like_number($score) ) { - $db->cleanUp(); - - my $err = "$chr\:$pos ($based\-based) : Expected numerical PHRED, found $score"; - $self->log( 'error', \$err ); - - $pm->finish( 255, \$err ); - } - - $caddDbScore = $caddGetter->get( $dbVal, $chr, $ref, $alt, 0, $caddDbScore ); - - # We round the score to check against the db-held value, which is rounded - if ( $rounder->round($score) != $rounder->round( $caddDbScore->[0] ) ) { - $db->cleanUp(); - - my $err = "$chr\:$pos ($based\-based) : Expected PHRED $caddDbScore->[0], found: " - . $rounder->round($score); - $self->log( 'error', \$err ); - - $pm->finish( 255, \$err ); - } - - if ( !$dryRun ) { - say $outFh $l; - } - } - - $self->log( 'info', "Skipped $skipped sites in $inPath" ); - $pm->finish( 0, \$outPath ); - } - - $pm->wait_all_children(); - - $self->_wantedTrack->{local_files} = \@outPaths; - - $self->_backupAndWriteConfig(); - - return 1; -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Utils/LiftOverCadd.pm b/perl/lib/Utils/LiftOverCadd.pm deleted file mode 100644 index 8207683c1..000000000 --- a/perl/lib/Utils/LiftOverCadd.pm +++ /dev/null @@ -1,164 +0,0 @@ - -use 5.10.0; -use strict; -use warnings; - -# Takes a yaml file that defines one local file, and splits it on chromosome -# Only works for tab-delimitd files that have the c -package Utils::LiftOverCadd; - -our $VERSION = '0.001'; - -use Mouse 2; -use namespace::autoclean; -use Types::Path::Tiny qw/AbsFile Path AbsDir/; -use Path::Tiny qw/path/; - -use Seq::Tracks::Build::LocalFilesPaths; - -use Parallel::ForkManager; - -# Exports: _localFilesDir, _decodedConfig, compress, _wantedTrack, _setConfig, logPath, use_absolute_path -extends 'Utils::Base'; - -########## Arguments accepted ############## -# Take the CADD file and make it a bed file -# the liftOver path is not AbsFile, so that 'liftOver' is valid (provided in $PATH) -has liftOverPath => ( is => 'ro', isa => Path, coerce => 1, default => 'liftOver' ); -has liftOverChainPath => ( is => 'ro', isa => AbsFile, coerce => 1, required => 1 ); - -my $localFilesHandler = Seq::Tracks::Build::LocalFilesPaths->new(); - -sub go { - my $self = shift; - - my $liftOverExe = $self->liftOverPath; - my $chainPath = $self->liftOverChainPath; - - $self->log( 'info', "Liftover path is $liftOverExe and chainPath is $chainPath" ); - - my $gzip = $self->gzip; - - my $localFilesPathsAref = $localFilesHandler->makeAbsolutePaths( - $self->_decodedConfig->{files_dir}, - $self->_wantedTrack->{name}, - $self->_wantedTrack->{local_files} - ); - - my $outDir = - path( $self->_decodedConfig->{files_dir} )->child( $self->_wantedTrack->{name} ); - - my $pm = Parallel::ForkManager->new( $self->maxThreads ); - - if ( !@$localFilesPathsAref ) { - $self->log( 'fatal', "No local files found" ); - } - - my @finalOutPaths; - - # If we 
don't call the run_on_finish here - # only 1 outPath will be stored for each fork, regardless of how many - # files that fork read, since this will only be run once per termination - # of a fork (rather than for each finish() call) - $pm->run_on_finish( - sub { - my ( $pid, $exitCode, $liftedPath ) = @_; - if ( $exitCode != 0 ) { - $self->log( 'fatal', "$liftedPath failed liftOver" ); - return; - } - - push @finalOutPaths, path($liftedPath)->basename; - } - ); - - for my $inPath (@$localFilesPathsAref) { - $self->log( 'info', "Beginning to lift over $inPath" ); - - my ( undef, $isCompressed, $inFh ) = $self->getReadFh( $inPath, 'fatal' ); - - my $baseName = path($inPath)->basename; - - my $outPath = $outDir->child($baseName)->stringify; - - $outPath = $isCompressed ? substr( $outPath, 0, rindex( $outPath, "." ) ) : $outPath; - - my $compressOutput = $isCompressed || $self->compress; - - # It's a bit confusing how to compress stderr on the fly alongside stdout - # So just compress it (always) as a 2nd step - my $unmappedPath = $outPath . ".unmapped.txt"; - my $liftedPath = $outPath . ".mapped" . ( $compressOutput ? '.gz' : '' ); - - if ( -e $liftedPath && -e $unmappedPath && !$self->overwrite ) { - $self->log( 'info', - "$liftedPath and $unmappedPath exist, and overwrite not set. Skipping." ); - close $inFh; - - # Push so that we can update our local_files after the loop finishes - push @finalOutPaths, $liftedPath; - - next; - } - - $self->log( 'info', "Set mapped out path as: $liftedPath" ); - $self->log( 'info', "Set unmapped out path as: $unmappedPath" ); - - ################## Write the headers to the output file (prepend) ######## - my $versionLine = <$inFh>; - my $headerLine = <$inFh>; - chomp $versionLine; - chomp $headerLine; - - my $outFh = $self->getWriteFh($liftedPath); - say $outFh $versionLine; - say $outFh $headerLine; - close $outFh; - - $self->log( 'info', "Wrote version line: $versionLine" ); - $self->log( 'info', "Wrote header line: $headerLine" ); - - $pm->start($liftedPath) and next; - ################ Liftover ####################### - # Decompresses - my $command; - if ( !$isCompressed ) { - $command = - "$liftOverExe <(cat $inPath | tail -n +3) $chainPath /dev/stdout $unmappedPath -bedPlus=3 "; - if ($compressOutput) { - $command .= "| $gzip -c - >> $liftedPath"; - } - else { - $command .= "| cat - >> $liftedPath"; - } - } - else { - $command = - "$liftOverExe <($gzip -d -c $inPath | tail -n +3) $chainPath /dev/stdout $unmappedPath -bedPlus=3 | $gzip -c - >> $liftedPath; $gzip $unmappedPath"; - } - - $self->log( 'info', "Beginning to exec command: $command" ); - - # Can't open and stream; open supports only limited shell expressions, and - # process substitution is not among them, so run the command through bash - my $exitStatus = system( ( "bash", "-c", $command ) ); - - if ( $exitStatus != 0 ) { - $self->log( 'fatal', - "liftOver command for $inPath failed with exit status: $exitStatus" ); - } - else { - $self->log( 'info', - "Successfully completed liftOver with exit status: $exitStatus" ); - } - $pm->finish($exitStatus); - } - - $pm->wait_all_children; - - $self->_wantedTrack->{local_files} = \@finalOutPaths; - - $self->_backupAndWriteConfig(); -} - -__PACKAGE__->meta->make_immutable; -1 diff --git a/perl/lib/Utils/RefGeneXdbnsfp.pm b/perl/lib/Utils/RefGeneXdbnsfp.pm deleted file mode 100644 index 24ba78f6a..000000000 --- a/perl/lib/Utils/RefGeneXdbnsfp.pm +++ /dev/null @@ -1,278 +0,0 @@ -use 5.14.0; -use strict; -use warnings; - -# Adds dbnsfp to refGene -package Utils::RefGeneXdbnsfp; - -our $VERSION = '0.001'; - -use Mouse 2; -use 
namespace::autoclean; -use Path::Tiny qw/path/; -use Parallel::ForkManager; -use Seq::Role::IO; -use Seq::Output::Delimiters; -use Seq::Tracks::Build::LocalFilesPaths; - -use List::Util qw/uniq/; - -# Exports: _localFilesDir, _decodedConfig, compress, _wantedTrack, _setConfig, logPath, use_absolute_path -extends 'Utils::Base'; - -# ########## Arguments accepted ############## -# Expects tab delimited file; not allowing to be set because it probably won't ever be anything -# other than tab, and because split('\'t') is faster -# has delimiter => (is => 'ro', lazy => 1, default => "\t"); -has geneFile => ( is => 'ro', isa => 'Str', required => 1 ); - -sub BUILD { - my $self = shift; - - my $localFilesHandler = Seq::Tracks::Build::LocalFilesPaths->new(); - - $self->{_localFiles} = $localFilesHandler->makeAbsolutePaths( - $self->_decodedConfig->{files_dir}, - $self->_wantedTrack->{name}, - $self->_wantedTrack->{local_files} - ); - - if ( !@{ $self->{_localFiles} } ) { - $self->log( 'fatal', "Require some local files" ); - } -} - -# TODO: error check opening of file handles, write tests -sub go { - my $self = shift; - - $self->log( 'info', 'Beginning RefGeneXdbnsfp' ); - # Store output handles by chromosome, so we can write even if input file - # out of order - my %outFhs; - my %skippedBecauseExists; - - my $dbnsfpFh = $self->getReadFh( $self->geneFile ); - - my $header = <$dbnsfpFh>; - - #appropriate chomp - $self->setLineEndings($header); - chomp $header; - - my @dbNSFPheaderFields = split '\t', $header; - - # Unfortunately, dbnsfp has many errors, for instance, NM_207007 being associated - # with CCL4L1 (in neither hg19 or 38 is this true: SELECT * FROM refGene LEFT JOIN kgXref ON refGene.name = kgXref.refseq LEFT JOIN knownToEnsembl ON kgXref.kgID = knownToEnsembl.name WHERE refGene.name='NM_207007' ;) - # So wel'll get odd duplicates; - # A safer option is to lose transcript specificity, but use the unique list of genes - my @geneNameCols = qw/Gene_name/; - my @geneNameIdx; - - for my $col (@geneNameCols) { - my $idx = 0; - for my $dCol (@dbNSFPheaderFields) { - if ( $dCol eq $col ) { - push @geneNameIdx, $idx; - } - - $idx++; - } - } - - my $delims = Seq::Output::Delimiters->new(); - my $posDelim = $delims->positionDelimiter; - my $ovrDelim = $delims->overlapDelimiter; - my $valDelim = $delims->valueDelimiter; - - # namespace - @dbNSFPheaderFields = map { 'dbnsfp.' . $_ } @dbNSFPheaderFields; - push @dbNSFPheaderFields, 'dbnsfp.pubmedID'; - - # unfortunately uses a period as a multi-value delimiter... - my $funcIdx; - - my $i = -1; - for my $field (@dbNSFPheaderFields) { - $i++; - if ( $field eq 'dbnsfp.Function_description' ) { - $funcIdx = $i; - } - } - - my %dbNSFP; - while (<$dbnsfpFh>) { - #appropriate chomp based on line endings - chomp; - - # Strip redundant words - $_ =~ s/TISSUE SPECIFICITY:\s*|FUNCTION:\s*|DISEASE:\s*|PATHWAY:\s*//g; - - my @pmidMatch = $_ =~ m/PubMed:(\d+)/g; - if (@pmidMatch) { - @pmidMatch = uniq(@pmidMatch); - } - - $_ =~ s/\{[^\}]+\}//g; - - # say "length : " . 
(scalar @fields); - - # Uniprot / dbnsfp annoyingly inserts a bunch of compound values - # that aren't really meant to be split on - # it would require negative lookbehind to correctly split them - # While that isn't difficult in perl, it wastes performance - # Replace such values with commas - my @innerStuff = $_ - =~ m/(?<=[\(\[\{])([^\(\[\{\)\]\}]*[$posDelim$valDelim$ovrDelim\/]+[^\(\[\{\)\]\}]+)+(?=[\]\}\)])/g; - - for my $match (@innerStuff) { - my $cp = $match; - $cp =~ s/[$posDelim$valDelim$ovrDelim\/]/,/g; - substr( $_, index( $_, $match ), length($match) ) = $cp; - } - - $_ =~ s/[^\w\[\]\{\}\(\)\t\n\r]+(?=[^\w ])//g; - - my @fields = split '\t', $_; - - my $index = -1; - for my $field (@fields) { - $index++; - - my @unique; - if ( $index == $funcIdx ) { - @unique = uniq( split /[\.]/, $field ); - } - else { - # split on [;] more effective, will split in cases like ); which /;/ won't - @unique = uniq( split /[;]/, $field ); - } - - my @out; - - my $index = -1; - for my $f (@unique) { - $f =~ s/^\s+//; - $f =~ s/\s+$//; - - # shouldn't be necessary, just in case - $f =~ s/\s*[^\w\[\]\{\}\(\)]+\s*$//; - - $f =~ s/[$posDelim$valDelim$ovrDelim\/]+/,/g; - - if ( defined $f && $f ne '' ) { - push @out, $f; - } - } - - $field = @out ? join ";", @out : "."; - } - - if (@pmidMatch) { - push @fields, join( ';', @pmidMatch ); - } - else { - push @fields, '.'; - } - - if ( @fields != @dbNSFPheaderFields ) { - $self->log( 'fatal', "WTF: $_" ); - } - - my $i = -1; - for my $idx (@geneNameIdx) { - $i++; - - my @vals = split ';', $fields[$idx]; - - # sometimes dbNSFP gives duplicate values in the same string... - my %seenThisLoop; - for my $val (@vals) { - if ( $val eq '.' || $val !~ /^\w+/ ) { - $self->log( 'fatal', "WTF: missing gene?" ); - } - - $seenThisLoop{$val} = 1; - - if ( exists $dbNSFP{$val} ) { - $self->log( 'fatal', "Duplicate entry found: $val, skipping : $_" ); - next; - } - - $dbNSFP{$val} = \@fields; - } - } - } - - # We'll update this list of files in the config file - $self->_wantedTrack->{local_files} = []; - - my $pm = Parallel::ForkManager->new( $self->maxThreads ); - - $pm->run_on_finish( - sub { - my ( $pid, $exitCode, $startId, $exitSig, $coreDump, $outFileRef ) = @_; - - if ( $exitCode != 0 ) { - $self->log( 'fatal', - "Failed to add dbnsfp, with exit code $exitCode for file $$outFileRef" ); - } - - push @{ $self->_wantedTrack->{local_files} }, path($$outFileRef)->basename; - } - ); - - for my $file ( @{ $self->{_localFiles} } ) { - $pm->start($file) and next; - - # Need to reset line endings here, or getReadFh may not operate correctly - $self->setLineEndings("\n"); - - my $fh = $self->getReadFh($file); - my $outFh; - - $file =~ s/.gz$//; - my $outFile = $file . '.with_dbnsfp.gz'; - - $outFh = $self->getWriteFh($outFile); - - my $header = <$fh>; - - $self->setLineEndings($header); - - chomp $header; - - say $outFh join( "\t", $header, @dbNSFPheaderFields ); - - while (<$fh>) { - chomp; - - my @fields = split '\t', $_; - - my $foundDbNFSP; - for my $field (@fields) { - # Empirically determine - if ( $dbNSFP{$field} ) { - push @fields, @{ $dbNSFP{$field} }; - $foundDbNFSP = 1; - last; - } - } - - if ( !$foundDbNFSP ) { - push @fields, map { '.' 
} @dbNSFPheaderFields;
-      }
-
-      say $outFh join( "\t", @fields );
-    }
-
-    $pm->finish( 0, \$outFile );
-  }
-
-  $pm->wait_all_children();
-
-  $self->_backupAndWriteConfig();
-}
-
-__PACKAGE__->meta->make_immutable;
-1;
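To make the matching loop above concrete: each refGene output row is scanned field by field, and the first field that is a key of %dbNSFP pulls in that gene's dbNSFP columns; rows with no match get '.' placeholders. A minimal sketch, with an invented gene symbol and invented dbNSFP values:

    use strict;
    use warnings;

    # Illustrative join: values here are made up for the example
    my %dbNSFP = ( TP53 => [ 'TP53', '7157', 'tumor suppressor' ] );

    my @fields = ( 'NM_000546', 'chr17', 'TP53' );

    my $foundDbNFSP;
    for my $field (@fields) {
        if ( $dbNSFP{$field} ) {
            # Append that gene's dbNSFP columns to the row
            push @fields, @{ $dbNSFP{$field} };
            $foundDbNFSP = 1;
            last;
        }
    }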
diff --git a/perl/lib/Utils/RenameTrack.pm b/perl/lib/Utils/RenameTrack.pm
deleted file mode 100644
index bfda44817..000000000
--- a/perl/lib/Utils/RenameTrack.pm
+++ /dev/null
@@ -1,104 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-# Renames a track in the database and updates the assembly configuration file to match
-package Utils::RenameTrack;
-
-our $VERSION = '0.001';
-
-use Mouse 2;
-use namespace::autoclean;
-use Path::Tiny qw/path/;
-
-use Seq::Tracks::Build::LocalFilesPaths;
-use Seq::Tracks::Base::MapTrackNames;
-use List::MoreUtils qw/first_index/;
-
-# Exports: _localFilesDir, _decodedConfig, compress, _wantedTrack, _setConfig, logPath, use_absolute_path
-extends 'Utils::Base';
-
-########## Arguments accepted ##############
-# The new name to give the track
-has renameTo => ( is => 'ro', isa => 'Str', required => 1 );
-
-############# Private ############
-
-sub BUILD {
-  my $self = shift;
-
-  my $databaseDir = $self->_decodedConfig->{database_dir};
-
-  if ( !$databaseDir ) {
-    $self->log( 'fatal',
-      "database_dir required in config file for Utils::RenameTrack to work" );
-    return;
-  }
-
-  # DBManager acts as a singleton. It is configured once, and then consumed repeatedly.
-  # However, in long-running processes this can lead to misconfiguration issues
-  # and, worse, environments created in one process being copied to others during forking.
-  # To combat this, every time Seq::Base is called, we re-set/initialize the static
-  # properties that create this behavior
-  Seq::DBManager::initialize();
-
-  # Since we never have more than one database_dir, it's a global property we can set
-  # in this package, which Seq.pm and Seq::Build extend from
-  Seq::DBManager::setGlobalDatabaseDir($databaseDir);
-}
-
-sub go {
-  my $self = shift;
-
-  #TODO: use a consistent dryRun attribute name in the main package
-  my $trackNameMapper =
-    Seq::Tracks::Base::MapTrackNames->new( { dryRun => $self->dryRun } );
-
-  my $err = $trackNameMapper->renameTrack( $self->name, $self->renameTo );
-
-  if ( !$err ) {
-    $self->log( 'info', "Renamed track from " . $self->name . " to " . $self->renameTo );
-  }
-  else {
-    $self->log( 'error', "Failed to rename track " . $self->name . " because $err" );
-    return;
-  }
-
-  $self->_wantedTrack->{name} = $self->renameTo;
-
-  #TODO: support renaming for the other fields
-  if ( defined $self->_decodedConfig->{statistics}
-    && $self->_decodedConfig->{statistics}{dbSNPnameField} )
-  {
-    if ( $self->_decodedConfig->{statistics}{dbSNPnameField} eq $self->name ) {
-      $self->_decodedConfig->{statistics}{dbSNPnameField} = $self->renameTo;
-    }
-  }
-
-  if ( defined $self->_decodedConfig->{output}
-    && defined $self->_decodedConfig->{output}{order} )
-  {
-    my $trackOrderIdx =
-      first_index { $_ eq $self->name } @{ $self->_decodedConfig->{output}{order} };
-
-    if ( $trackOrderIdx > -1 ) {
-      $self->_decodedConfig->{output}{order}[$trackOrderIdx] = $self->renameTo;
-    }
-  }
-
-  my $metaPath =
-    path( $self->_decodedConfig->{database_dir} )->child( $self->name . '_meta' );
-
-  if ( -e $metaPath ) {
-    $metaPath->move(
-      path( $self->_decodedConfig->{database_dir} )->child( $self->renameTo . '_meta' ) );
-  }
-
-  $self->_backupAndWriteConfig();
-}
-
-__PACKAGE__->meta->make_immutable;
-1;
diff --git a/perl/lib/Utils/SortCadd.pm b/perl/lib/Utils/SortCadd.pm
deleted file mode 100644
index 691f20094..000000000
--- a/perl/lib/Utils/SortCadd.pm
+++ /dev/null
@@ -1,209 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-# Takes a YAML config that defines local CADD files, organizes them by chromosome,
-# and sorts each resulting tab-delimited file by position
-package Utils::SortCadd;
-
-our $VERSION = '0.001';
-
-use Mouse 2;
-use namespace::autoclean;
-use Path::Tiny qw/path/;
-
-use Seq::Tracks::Build::LocalFilesPaths;
-use Parallel::ForkManager;
-
-# _localFilesDir, _decodedConfig, compress, _wantedTrack, _setConfig, and logPath,
-extends 'Utils::Base';
-
-########## Arguments accepted ##############
-# Takes either a bed-like cadd file, or a cadd file, and sorts it
-# Works a bit differently from CaddToBed or LiftOverCadd: even if the input files
-# are uncompressed, we compress the output by default.
-# Otherwise the intermediate files generated before sort become enormous, and this run could take over 600GB
-has '+compress' => ( default => 1 );
-
-my $localFilesHandler = Seq::Tracks::Build::LocalFilesPaths->new();
-
-sub BUILD {
-  my $self = shift;
-
-  $self->_wantedTrack->{local_files} = $localFilesHandler->makeAbsolutePaths(
-    $self->_decodedConfig->{files_dir},
-    $self->_wantedTrack->{name},
-    $self->_wantedTrack->{local_files}
-  );
-}
-
-sub go {
-  my $self = shift;
-
-  my %wantedChrs = map { $_ => 1 } @{ $self->_decodedConfig->{chromosomes} };
-
-  # record out paths so that we can unix sort those files
-  my @outPaths;
-  my %outFhs;
-
-  $self->log( 'info', "Beginning organizing cadd files by chr (single threaded)" );
-
-  my $outExtPart = $self->compress ? '.txt.gz' : '.txt';
-
-  my $outExt = '.organized-by-chr' . $outExtPart;
-
-  for my $inFilePath ( @{ $self->_wantedTrack->{local_files} } ) {
-    my $outPathBase = substr( $inFilePath, 0, rindex( $inFilePath, '.' ) );
-
-    $outPathBase =~ s/\.(chr[\w_\-]+)//;
-
-    # Store output handles by chromosome, so we can write even if the input file
-    # is out of order
-    $self->log( 'info', "Out path base: $outPathBase" );
-    $self->log( 'info', "Reading input file: $inFilePath" );
-
-    my ( undef, $compressed, $readFh ) = $self->getReadFh( $inFilePath, 'fatal' );
-
-    my $versionLine = <$readFh>;
-    my $headerLine  = <$readFh>;
-
-    $self->log( 'info', "Read version line: $versionLine" );
-    $self->log( 'info', "Read header line: $headerLine" );
-
-    while ( my $l = $readFh->getline() ) {
-      #https://ideone.com/05wEAl
-      #Faster than split
-      my $chr = substr( $l, 0, index( $l, "\t" ) );
-
-      # May be unwanted if coming from CADD directly
-      if ( !exists $wantedChrs{$chr} ) {
-        #https://ideone.com/JDtX3z
-        #CADD files don't use 'chr', but our cadd bed-like files do
-        if ( substr( $chr, 0, 3 ) ne 'chr' ) {
-          $chr = 'chr' . 
$chr; - } - - #Our bed-like file will use 'chrM', but the original cadd file will have 'chrMT' - if ( $chr eq 'chrMT' ) { - $chr = 'chrM'; - } - - # Check again that this is unwanted - if ( !exists $wantedChrs{$chr} ) { - $self->log( 'warn', "Skipping unwanted: $chr" ); - next; - } - } - - my $fh = $outFhs{$chr}; - - if ( !$fh ) { - my $outPath = "$outPathBase.$chr$outExt"; - - $self->log( 'info', "Found $chr in $inFilePath; creating $outPath" ); - - push @outPaths, $outPath; - - if ( -e $outPath && !$self->overwrite ) { - $self->log( 'warn', - "outPath $outPath exists, skipping $inFilePath because overwrite not set" ); - last; - } - - $outFhs{$chr} = $self->getWriteFh($outPath); - - $fh = $outFhs{$chr}; - - print $fh $versionLine; - print $fh $headerLine; - } - - print $fh $l; - } - } - - for my $outFh ( values %outFhs ) { - close $outFh; - } - - $self->log( 'info', - "Finished organizing cadd files by chr, beginning sort (multi threaded)" ); - - # TODO: use max processors based on # of cores - my $pm = Parallel::ForkManager->new( $self->maxThreads ); - - my @finalOutPaths; - - # If we don't call the run_on_finish here - # only 1 outPath will be stored for each fork, regardless of how many - # files that fork read, since this will only be run once per termination - # of a fork (rather than for each finish() call) - $pm->run_on_finish( - sub { - my ( $pid, $exitCode, $finalOutPath ) = @_; - - if ( $exitCode != 0 ) { - return $self->log( 'fatal', - "$finalOutPath failed to sort, with exit code $exitCode" ); - } - push @finalOutPaths, path($finalOutPath)->basename; - } - ); - - for my $outPath (@outPaths) { - my $gzipPath = $self->gzip; - - my ( undef, $compressed, $fh ) = $self->getReadFh( $outPath, 'fatal' ); - - my $outExt = '.sorted' . $outExtPart; - - my $finalOutPathBase = substr( $outPath, 0, rindex( $outPath, '.' ) ); - - my $finalOutPath = $finalOutPathBase . $outExt; - - my $tempPath = path($finalOutPath)->parent()->stringify; - - $pm->start($finalOutPath) and next; - my $command; - - #k2,2 means sort only by column 2. column 2 is either chrStart or Pos - if ($compressed) { - $command = - "( head -n 2 <($gzipPath -d -c $outPath) && tail -n +3 <($gzipPath -d -c $outPath) | sort --compress-program $gzipPath -T $tempPath -k2,2 -n ) | $gzipPath -c > $finalOutPath"; - } - else { - $command = - "( head -n 2 $outPath && tail -n +3 $outPath | sort --compress-program $gzipPath -T $tempPath -k2,2 -n ) > $finalOutPath"; - } - - $self->log( 'info', "Running command: $command" ); - - my $exitStatus = system( ( "bash", "-c", $command ) ); - - if ( $exitStatus == 0 ) { - $self->log( 'info', - "Successfully finished sorting $outPath. Exit status: $exitStatus" ); - $exitStatus = system("rm $outPath"); - } - else { - $self->log( 'error', - "Failed to sort $outPath. Exit status: $exitStatus. Expect fatal message and program exit." 
- ); - } - # returns the exit status for run_on_finish to die - $pm->finish($exitStatus); - } - - $pm->wait_all_children(); - - $self->_wantedTrack->{local_files} = \@finalOutPaths; - - # Make sure that we indicate to the user that cadd is guaranteed to be sorted - # This speeds up cadd building - $self->_wantedTrack->{sorted} = 1; - - $self->_backupAndWriteConfig(); -} - -__PACKAGE__->meta->make_immutable; -1; diff --git a/perl/lib/Utils/SqlWriter.pm b/perl/lib/Utils/SqlWriter.pm deleted file mode 100644 index be4aab13f..000000000 --- a/perl/lib/Utils/SqlWriter.pm +++ /dev/null @@ -1,213 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Utils::SqlWriter; - -our $VERSION = '0.001'; - -# ABSTRACT: Fetch and write some data using UCSC's public SQL db -use Mouse 2; - -use namespace::autoclean; -use Time::localtime; -use Path::Tiny qw/path/; - -use Utils::SqlWriter::Connection; -with 'Seq::Role::IO', 'Seq::Role::Message'; - -# @param sql_statement : Valid SQL with fully qualified field names -has sql => ( is => 'ro', isa => 'Str', required => 1 ); - -# Where any downloaded or created files should be saved -has outputDir => ( is => 'ro', isa => 'Str', required => 1 ); - -has connection => ( is => 'ro', isa => 'Maybe[HashRef]' ); - -# @param chromosomes : All wanted chromosomes -has chromosomes => ( is => 'ro', isa => 'ArrayRef' ); - -# Compress the output? -has compress => ( is => 'ro', isa => 'Bool' ); - -has sqlClient => ( is => 'ro', init_arg => undef, writer => '_setSqlClient' ); -######################### DB Configuartion Vars ######################### -my $year = localtime->year() + 1900; -my $mos = localtime->mon() + 1; -my $day = localtime->mday; -my $nowTimestamp = sprintf( "%d-%02d-%02d", $year, $mos, $day ); - -sub BUILD { - my $self = shift; - - my $config = defined $self->connection ? { connection => $self->connection } : {}; - - $self->_setSqlClient( Utils::SqlWriter::Connection->new($config) ); -} - -=method @public sub fetchAndWriteSQLData - - Read the SQL data and write to file - -@return {DBI} - A connection object - -=cut - -sub go { - my $self = shift; - - my $extension = $self->compress ? 'gz' : 'txt'; - - my $query = $self->sql; - - my $perChromosome; - - if ( $query =~ /\%chromosomes\%/ ) { - $perChromosome = 1; - } - - # We'll return the relative path to the files we wrote - my @outRelativePaths; - CHR_LOOP: for my $chr ( $perChromosome ? 
@{ $self->chromosomes } : 'fetch' ) { - # for return data - my @sql_data = (); - - my $query = $self->sql; - - ########### Restrict SQL fetching to just this chromosome ############## - - # Get the FQ table name (i.e hg19.refSeq instead of refSeq), to avoid - if ($perChromosome) { - $query =~ s/\%chromosomes\%/'$chr'/g; - } - - # Will choose the first FROM; in complex - $query =~ m/FROM\s(\S+)/i; - - ##### use database ###### - # If given database in connection object, use that, else try to infer - my $databaseName; - - my $tableName = $1; - - # Check if table name is database.table - if ( $tableName =~ /\S+\.\S+/ ) { - ( $databaseName, $tableName ) = ( split( /\./, $tableName ) ); - } - - if ( defined $self->sqlClient->database ) { - $databaseName = $self->sqlClient->database; - } - - if ( !$databaseName ) { - $self->log( 'fatal', - "No database found: use a fully qualified table (database.table) or set the 'database' property in 'connection'" - ); - } - - $self->log( 'info', "Set database name to $databaseName\n" ); - - my $fileName = join '.', $databaseName, $tableName, $chr, $extension; - - $self->log( 'info', "Set file name to $fileName\n" ); - - my $timestampName = join '.', $nowTimestamp, $fileName; - - # Save the fetched data to a timestamped file, then symlink it to a non-timestamped one - # This allows non-destructive fetching - my $symlinkedFile = path( $self->outputDir )->child($fileName)->absolute->stringify; - my $targetFile = - path( $self->outputDir )->child($timestampName)->absolute->stringify; - - # prepare file handle - my $outFh = $self->getWriteFh($targetFile); - - $self->log( 'info', "Fetching from $databaseName: $query\n\n" ); - ########### Connect to database ################## - my $dbh = $self->sqlClient->connect($databaseName); - ########### Prepare and execute SQL ############## - my $sth = $dbh->prepare($query) or $self->log( 'fatal', $dbh->errstr ); - - $sth->execute or $self->log( 'fatal', $dbh->errstr ); - - ########### Retrieve data ############## - my $count = -1; - while ( my @row = $sth->fetchrow_array ) { - $count++; - - if ( $count == 0 ) { - # Write header - # Cleaner here, because there is nothing in {NAME} when empty query - my @stuff = @{ $sth->{NAME} }; - push @sql_data, $sth->{NAME}; - } - - my $clean_row_aref = $self->_cleanRow( \@row ); - push @sql_data, $clean_row_aref; - - if ( @sql_data > 1000 ) { - say $outFh join( "\n", map { join( "\t", @$_ ) } @sql_data ); - @sql_data = (); - } - } - - # leftovers - if (@sql_data) { - say $outFh join( "\n", map { join( "\t", @$_ ) } @sql_data ); - @sql_data = (); - } - - $sth->finish(); - # Must commit before this works, or will get DESTROY before explicit disconnect() - $dbh->disconnect(); - - # We may not have data for all chromsoomes - if ( $count > -1 ) { - $self->log( "info", "Finished writing $targetFile\n\n" ); - - if ( system("ln -s -f $targetFile $symlinkedFile") != 0 ) { - $self->log( 'fatal', "Failed to symlink $targetFile -> $symlinkedFile\n\n" ); - } - - $self->log( 'info', "Symlinked $targetFile -> $symlinkedFile\n\n" ); - - push @outRelativePaths, $fileName; - next CHR_LOOP; - } - - $self->log( "error", - "No results found for $chr: \n query: $query, \n archive: $targetFile, \n output: $symlinkedFile)\n\n" - ); - # # We may have had 0 results; - # if (-z $targetFile) { - # unlink $targetFile; - # } - - # if (-z $symlinkedFile) { - # unlink $symlinkedFile; - # } - - # Throttle connection - sleep 5; - } - - return @outRelativePaths; -} - -sub _cleanRow { - my ( $self, $aref ) = @_; - - # 
http://stackoverflow.com/questions/2059817/why-is-perl-foreach-variable-assignment-modifying-the-values-in-the-array
-  for my $ele (@$aref) {
-    if ( !defined($ele) || $ele eq "" ) {
-      $ele = "NA";
-    }
-  }
-
-  return $aref;
-}
-
-__PACKAGE__->meta->make_immutable;
-
-1;
diff --git a/perl/lib/Utils/SqlWriter/Connection.pm b/perl/lib/Utils/SqlWriter/Connection.pm
deleted file mode 100644
index 4bbaf350c..000000000
--- a/perl/lib/Utils/SqlWriter/Connection.pm
+++ /dev/null
@@ -1,64 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package Utils::SqlWriter::Connection;
-
-use DBI;
-use DBD::MariaDB 1.23; # Specify the version of DBD::MariaDB
-
-our $VERSION = '0.001';
-
-# ABSTRACT: Fetch and write some data using UCSC's public SQL db
-use Mouse 2;
-
-use namespace::autoclean;
-
-# The actual configuration
-has driver => ( is => 'ro', isa => 'Str', default => "DBI:MariaDB" ); # Use MariaDB driver
-has host => ( is => 'ro', isa => 'Str', default => "genome-mysql.soe.ucsc.edu" );
-has user => ( is => 'ro', isa => 'Str', default => "genome" );
-has password => ( is => 'ro', isa => 'Str', );
-has port     => ( is => 'ro', isa => 'Int', );
-has socket   => ( is => 'ro', isa => 'Str', );
-has database => ( is => 'ro', isa => 'Maybe[Str]' );
-
-around BUILDARGS => sub {
-  my ( $orig, $self, $data ) = @_;
-
-  if ( defined $data->{connection} ) {
-    for my $key ( keys %{ $data->{connection} } ) {
-      $data->{$key} = $data->{connection}{$key};
-    }
-  }
-
-  return $self->$orig($data);
-};
-
-sub connect {
-  my $self         = shift;
-  my $databaseName = shift;
-
-  $databaseName = $self->database || $databaseName;
-
-  my $connection = $self->driver;
-  $connection .= ":database=$databaseName;host=" . $self->host if $self->host;
-  $connection .= ";port=" . $self->port if $self->port;
-  $connection .= ";mariadb_socket=" . $self->socket if $self->socket;
-  $connection .= ";mariadb_read_default_group=client"; # MariaDB client option
-
-  return DBI->connect(
-    $connection,
-    $self->user,
-    $self->password,
-    {
-      RaiseError => 1,
-      PrintError => 1,
-      AutoCommit => 1
-    }
-  );
-}
-
-__PACKAGE__->meta->make_immutable();
-1;
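As a usage sketch for the connection class above (the database and table names are illustrative; connect() and its UCSC defaults are as defined in the module):

    use strict;
    use warnings;
    use Utils::SqlWriter::Connection;

    # Connect to UCSC's public MariaDB server using the defaults above
    my $client = Utils::SqlWriter::Connection->new();
    my $dbh    = $client->connect('hg38');

    my ($count) = $dbh->selectrow_array('SELECT COUNT(*) FROM refGene');
    print "refGene rows: $count\n";

    $dbh->disconnect();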
diff --git a/perl/lib/Utils/scripts/cadd_indel_to_vcf.sh b/perl/lib/Utils/scripts/cadd_indel_to_vcf.sh
deleted file mode 100755
index e9bfe56f5..000000000
--- a/perl/lib/Utils/scripts/cadd_indel_to_vcf.sh
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/usr/bin/env bash
-
-# Function to show help
-show_help() {
-    echo "Usage: $0 [OPTION]..."
-    echo "Convert a CADD indel file to VCF."
-    echo "Reads from standard input if '-' is provided as an argument, or use --input to specify a file."
-    echo
-    echo "  -h, --help       Display this help and exit"
-    echo "  --input FILE     Specify the input file to be converted"
-}
-
-# No arguments provided (not even a dash)
-if [ "$#" -eq 0 ]; then
-    show_help
-    exit 1
-fi
-
-# Variables
-input_file=""
-
-# Parse command-line arguments
-while [[ "$#" -gt 0 ]]; do
-    case $1 in
-        -h|--help)
-            show_help
-            exit 0
-            ;;
-        --input)
-            if [[ -n $input_file ]]; then
-                echo "Error: Cannot use --input with '-' argument."
-                exit 1
-            fi
-            input_file="$2"
-            shift # Remove argument name from processing
-            shift # Remove argument value from processing
-            ;;
-        -)
-            if [[ -n $input_file ]]; then
-                echo "Error: Cannot use '-' with --input argument."
-                exit 1
-            fi
-            input_file="-" # Indicate that we should read from stdin
-            shift
-            ;;
-        *)
-            # Unknown option
-            echo "Unknown option: $1"
-            show_help
-            exit 1
-            ;;
-    esac
-done
-
-
-# Function to extract the reference genome, CADD version, and check headers
-check_headers_and_extract_details() {
-    local input=$1
-    local line_no=0
-    local ref_genome=""
-    local cadd_version=""
-
-    cadd_header_regex='^##\s*CADD\s+(GRCh[0-9]+)-v([0-9.]+)'
-    cadd_header_line_two="^#Chrom\s+Pos\s+Ref\s+Alt\s+RawScore\s+PHRED"
-
-    while IFS= read -r line; do
-        ((line_no++))
-        if [ $line_no -eq 1 ]; then
-            if [[ $line =~ $cadd_header_regex ]]; then
-                ref_genome=${BASH_REMATCH[1]}
-                cadd_version=${BASH_REMATCH[2]}
-            else
-                echo "Error: Invalid header line or reference genome and CADD version not found." >&2
-                exit 1
-            fi
-        elif [ $line_no -eq 2 ]; then
-            if ! [[ "$line" =~ $cadd_header_line_two ]]; then
-                echo "Error: Second header line does not match expected format." >&2
-                exit 1
-            fi
-            break
-        fi
-    done < "$input"
-
-    echo "$ref_genome $cadd_version"
-}
-
-# Check input and extract reference genome and CADD version
-if [[ $input_file == "-" ]]; then
-    read ref_genome cadd_version <<< $(check_headers_and_extract_details /dev/stdin)
-else
-    if [[ ! -f $input_file ]]; then
-        echo "Error: File does not exist." >&2
-        exit 1
-    fi
-    read ref_genome cadd_version <<< $(check_headers_and_extract_details "$input_file")
-fi
-
-# Check if the ref_genome and cadd_version were properly extracted
-if [ -z "$ref_genome" ] || [ -z "$cadd_version" ]; then
-    echo "Error: Unable to extract reference genome and CADD version." >&2
-    exit 1
-fi
-
-# Output VCF Header
-cat <<EOF
-##fileformat=VCFv4.2
-##source=CADD ${ref_genome}-v${cadd_version}
-##INFO=<ID=RawScore,Number=1,Type=Float,Description="CADD raw score">
-##INFO=<ID=PHRED,Number=1,Type=Float,Description="CADD PHRED-scaled score">
-EOF
-echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"
-
-# Process the file or stdin and convert it to VCF format using awk
-if [[ $input_file == "-" ]]; then
-    # Read from stdin
-    awk 'FNR > 2 { printf("%s\t%s\t.\t%s\t%s\t.\t.\tRawScore=%s;PHRED=%s\n", $1, $2, $3, $4, $5, $6) }'
-else
-    # Check if the file exists
-    if [[ ! -f $input_file ]]; then
-        echo "Error: File does not exist." >&2
-        exit 1
-    fi
-    awk 'FNR > 2 { printf("%s\t%s\t.\t%s\t%s\t.\t.\tRawScore=%s;PHRED=%s\n", $1, $2, $3, $4, $5, $6) }' "$input_file"
-fi
\ No newline at end of file
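To make the awk conversion above concrete, here is one illustrative CADD data row and the VCF line it becomes (position and scores are invented for the example):

    use strict;
    use warnings;

    # CADD columns: Chrom Pos Ref Alt RawScore PHRED
    my $cadd_row = join "\t", qw(1 10001 T A 0.337 5.93);

    # The awk above maps it to: CHROM POS ID REF ALT QUAL FILTER INFO
    my $vcf_row = join "\t", qw(1 10001 . T A . .), 'RawScore=0.337;PHRED=5.93';
    print "$vcf_row\n";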
diff --git a/perl/lib/Utils/scripts/dbsnp_rename_chrs_25_39.sh b/perl/lib/Utils/scripts/dbsnp_rename_chrs_25_39.sh
deleted file mode 100755
index 429f3b4d0..000000000
--- a/perl/lib/Utils/scripts/dbsnp_rename_chrs_25_39.sh
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-# Check if the correct number of arguments are provided
-if [ "$#" -ne 2 ]; then
-  echo -e "\nUsage: $0 <hg19_dir> <hg38_dir>\n"
-  exit 1
-fi
-
-# Get directories from arguments and strip trailing slashes
-hg19_dir="${1%/}"
-hg38_dir="${2%/}"
-
-# Assembly reports for renaming chromosomes
-report_dir='ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405'
-wget -N "${report_dir}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_report.txt"
-wget -N "${report_dir}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_report.txt"
-# Grab the useful columns
-# The structure of the assembly report is:
-# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
-# 1 assembled-molecule 1 Chromosome CM000663.2 = NC_000001.11 Primary Assembly 248956422 chr1
-# 2 assembled-molecule 2 Chromosome CM000664.2 = NC_000002.12 Primary Assembly 242193529 chr2
-# 3 assembled-molecule 3 Chromosome CM000665.2 = NC_000003.12 Primary Assembly 198295559 chr3
-# 4 assembled-molecule 4 Chromosome CM000666.2 = NC_000004.12 Primary Assembly 190214555 chr4
-# 5 assembled-molecule 5 Chromosome CM000667.2 = NC_000005.10 Primary Assembly 181538259 chr5
-# 6 assembled-molecule 6 Chromosome CM000668.2 = NC_000006.12 Primary Assembly 170805979 chr6
-# 7 assembled-molecule 7 Chromosome CM000669.2 = NC_000007.14 Primary Assembly 159345973 chr7
-# 8 assembled-molecule 8 Chromosome CM000670.2 = NC_000008.11 Primary Assembly 145138636 chr8
-# 9 assembled-molecule 9 Chromosome CM000671.2 = NC_000009.12 Primary Assembly 138394717 chr9
-# 10 assembled-molecule 10 Chromosome CM000672.2 = NC_000010.11 Primary Assembly 133797422 chr10
-# 11 assembled-molecule 11 Chromosome CM000673.2 = NC_000011.10 Primary Assembly 135086622 chr11
-# 12 assembled-molecule 12 Chromosome CM000674.2 = NC_000012.12 Primary Assembly 133275309 chr12
-# 13 assembled-molecule 13 Chromosome CM000675.2 = NC_000013.11 Primary Assembly 114364328 chr13
-# 14 assembled-molecule 14 Chromosome CM000676.2 = NC_000014.9 Primary Assembly 107043718 chr14
-# 15 assembled-molecule 15 Chromosome CM000677.2 = NC_000015.10 Primary Assembly 101991189 chr15
-# 16 assembled-molecule 16 Chromosome CM000678.2 = NC_000016.10 Primary Assembly 90338345 chr16
-# 17 assembled-molecule 17 Chromosome CM000679.2 = NC_000017.11 Primary Assembly 83257441 chr17
-# 18 assembled-molecule 18 Chromosome CM000680.2 = NC_000018.10 Primary Assembly 80373285 chr18
-# 19 assembled-molecule 19 Chromosome CM000681.2 = NC_000019.10 Primary Assembly 58617616 chr19
-# 20 assembled-molecule 20 Chromosome CM000682.2 = NC_000020.11 Primary Assembly 64444167 chr20
-# 21 assembled-molecule 21 Chromosome CM000683.2 = NC_000021.9 Primary Assembly 46709983 chr21
-# 22 assembled-molecule 22 Chromosome CM000684.2 = NC_000022.11 Primary Assembly 50818468 chr22
-# X assembled-molecule X Chromosome CM000685.2 = NC_000023.11 Primary Assembly 156040895 chrX
-# Y assembled-molecule Y Chromosome CM000686.2 = NC_000024.10 Primary Assembly 57227415 chrY
-# MT assembled-molecule MT Chromosome J01415.2 = NC_012920.1 Primary Assembly 16569 chrM
-
-# The 10th column contains UCSC-style names. Unfortunately, not all contigs get a value
-# in the UCSC-style-name column (for example KI270728.1, and patch contigs such as this one):
-# HG1459_PATCH fix-patch X Chromosome JH806600.2 = NW_004070890.2 PATCHES 6530008 na
-# Therefore, to get a valid VCF we need to use the Sequence-Name column
-for k in *assembly_report.txt
-do
-  out=$(echo $k | sed 's/.txt/.chrnames/')
-  grep -e '^[^#]' $k | awk '{ print $7, $1 }' > $out
-done
-
-bcftools annotate \
-  --rename-chrs GCF_000001405.25_GRCh37.p13_assembly_report.chrnames \
-  --threads 10 -Oz -o ${hg19_dir}/GRCh37.dbSNP155.vcf.gz ${hg19_dir}/GCF_000001405.25.gz
-bcftools annotate \
-  --rename-chrs GCF_000001405.39_GRCh38.p13_assembly_report.chrnames \
-  --threads 10 -Oz -o ${hg38_dir}/GRCh38.dbSNP155.vcf.gz ${hg38_dir}/GCF_000001405.39.gz
\ No newline at end of file
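The awk above distills each assembly report into a two-column mapping file consumed by bcftools annotate --rename-chrs. A sketch of the first few lines of the GRCh38 .chrnames output, with accessions taken from the report excerpt above:

    use strict;
    use warnings;

    # RefSeq-Accn -> Sequence-Name pairs, one per line
    my $chrnames = <<'END';
    NC_000001.11 1
    NC_000002.12 2
    NC_012920.1 MT
    END
    print $chrnames;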
"bgzip --threads 32 -c" : "gzip -c" : "cat"; -my $num_cores = `nproc`; - -my $vcf_file_dir = path($vcf_file)->dirname; -my $vcf_file_basename = path($vcf_file)->basename( '.gz', '.bgz' ); - -# Read the VCF header -open my $fh, "-|", "$read_cmd $vcf_file" or die "Cannot open file: $!"; -my ( @header_before_contig, $final_header_line ); -my $header_count = 0; -while (<$fh>) { - if (/^#/) { - $header_count += 1; - if (/^##contig=/) { - next; - } - - if (/^#CHROM/) { - $final_header_line = $_; - last; - } - - push @header_before_contig, $_; - } -} -close $fh; - -# Sort the VCF file by chromosome -my $cmd = "$read_cmd $vcf_file"; -open my $sorted_vcf_fh, "-|", $cmd or die "Cannot open file: $!"; - -say STDERR "\nREAD COMMAND: $cmd\n"; -# Process the sorted file -my $prev_chr_file; -my %chr_fhs = (); -my $line_count = 0; -while ( my $line = <$sorted_vcf_fh> ) { - $line_count += 1; - - if ( $line_count <= $header_count ) { - say STDERR "Skipping header line: $line"; - next; - } - - if ( $line_count == $header_count + 1 ) { - say STDERR "Processing first line: $line"; - } - - my $tab_index = index( $line, "\t" ); - my $chr = substr( $line, 0, $tab_index ); - - if ( !$chr ) { - say STDERR "Couldn't find chromosome on line: $line"; - exit 1; - } - - if ( !$chr_fhs{$chr} ) { - my $chr_file = path($vcf_file_dir) - ->child( "$vcf_file_basename.$chr.vcf" . ( $is_gzipped ? ".gz" : "" ) )->stringify; - open my $fh, "|-", "$write_cmd > $chr_file" or die "Cannot open file: $!"; - - say STDERR "Opened: $chr_file"; - - my $final_header = - join( "", @header_before_contig ) . "##contig=\n" . $final_header_line; - print $fh $final_header; - - $chr_fhs{$chr} = $fh; - } - - print { $chr_fhs{$chr} } $line; -} -close $sorted_vcf_fh; - -say STDERR "Processed $line_count lines"; - -for my $chr ( keys %chr_fhs ) { - say STDERR "Closing: $chr"; - close $chr_fhs{$chr}; -} - -print "VCF file split by chromosomes completed.\n"; diff --git a/perl/lib/Utils/scripts/t/test_extract_gnomad_an_af_nhomalt.pl b/perl/lib/Utils/scripts/t/test_extract_gnomad_an_af_nhomalt.pl deleted file mode 100644 index 120bd016e..000000000 --- a/perl/lib/Utils/scripts/t/test_extract_gnomad_an_af_nhomalt.pl +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; -use YAML::Tiny; -use IO::Uncompress::Gunzip qw(gunzip $GunzipError); -use IO::Compress::Gzip qw(gzip $GzipError); -use File::Temp qw(tempfile); - -use strict; -use DDP; -use warnings; -use Test::More tests => 4; # Adjust the number of tests based on your test cases -require 'Utils/scripts/extract_gnomad_an_af_nhomalt.pl' - ; # Replace with the name of your module - -# Test for read_vcf_header with a non-gzipped VCF file -sub test_read_vcf_header_non_gzipped { - my $header = read_vcf_header("path/to/non_gzipped_sample.vcf"); - ok( $header =~ /#CHROM/, "Header read correctly for non-gzipped file" ); -} - -# Test for read_vcf_header with a gzipped VCF file -sub test_read_vcf_header_gzipped { - my $header = read_vcf_header("path/to/gzipped_sample.vcf.gz"); - ok( $header =~ /#CHROM/, "Header read correctly for gzipped file" ); -} - -# Test for parse_vcf_header -sub test_parse_vcf_header { - my $header = "##INFO=\n##INFO="; - my @types = parse_vcf_header($header); - p @types; - is( scalar @types, 2, "Two types extracted" ); - ok( grep( /AN/, @types ), "AN type extracted" ); - ok( grep( /AF/, @types ), "AF type extracted" ); -} - -# Running the tests -test_parse_vcf_header(); - -# Helper function to create a temporary VCF file -sub create_temp_vcf { - my ( $content, 
diff --git a/perl/lib/Utils/scripts/t/test_extract_gnomad_an_af_nhomalt.pl b/perl/lib/Utils/scripts/t/test_extract_gnomad_an_af_nhomalt.pl
deleted file mode 100644
index 120bd016e..000000000
--- a/perl/lib/Utils/scripts/t/test_extract_gnomad_an_af_nhomalt.pl
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env perl
-
-use strict;
-use warnings;
-use YAML::Tiny;
-use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
-use IO::Compress::Gzip qw(gzip $GzipError);
-use File::Temp qw(tempfile);
-use DDP;
-
-use Test::More tests => 5; # Adjust the number of tests based on your test cases
-require 'Utils/scripts/extract_gnomad_an_af_nhomalt.pl'; # Replace with the name of your module
-
-# Test for read_vcf_header with a non-gzipped VCF file
-sub test_read_vcf_header_non_gzipped {
-  my $header = read_vcf_header("path/to/non_gzipped_sample.vcf");
-  ok( $header =~ /#CHROM/, "Header read correctly for non-gzipped file" );
-}
-
-# Test for read_vcf_header with a gzipped VCF file
-sub test_read_vcf_header_gzipped {
-  my $header = read_vcf_header("path/to/gzipped_sample.vcf.gz");
-  ok( $header =~ /#CHROM/, "Header read correctly for gzipped file" );
-}
-
-# Test for parse_vcf_header
-sub test_parse_vcf_header {
-  my $header =
-      "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total allele number\">\n"
-    . "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Alternate allele frequency\">";
-  my @types = parse_vcf_header($header);
-  p @types;
-  is( scalar @types, 2, "Two types extracted" );
-  ok( grep( /AN/, @types ), "AN type extracted" );
-  ok( grep( /AF/, @types ), "AF type extracted" );
-}
-
-# Running the tests
-test_parse_vcf_header();
-
-# Helper function to create a temporary VCF file
-sub create_temp_vcf {
-  my ( $content, $is_gzipped ) = @_;
-
-  my ( $fh, $filename ) =
-    tempfile( SUFFIX => $is_gzipped ? '.vcf.gz' : '.vcf', UNLINK => 1 );
-
-  if ($is_gzipped) {
-    gzip \$content => $filename or die "gzip failed: $GzipError\n";
-  }
-  else {
-    print $fh $content;
-  }
-
-  close $fh;
-
-  return $filename;
-}
-
-# Example usage
-my $vcf_content =
-    "##fileformat=VCFv4.2\n"
-  . "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total allele number\">\n"
-  . "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Alternate allele frequency\">\n"
-  . "##INFO=<ID=control_AN,Number=1,Type=Integer,Description=\"Allele number in controls\">\n"
-  . "##INFO=<ID=control_AF,Number=1,Type=Float,Description=\"Allele frequency in controls\">\n"
-  . "##INFO=<ID=control_AN_nfe,Number=1,Type=Integer,Description=\"Allele number in NFE controls\">\n"
-  . "##INFO=<ID=control_AF_nfe,Number=1,Type=Float,Description=\"Allele frequency in NFE controls\">\n"
-  . "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";
-
-my $temp_vcf    = create_temp_vcf( $vcf_content, 0 ); # Create non-gzipped VCF
-my $temp_vcf_gz = create_temp_vcf( $vcf_content, 1 ); # Create gzipped VCF
-
-# Test with temporary files
-my @expected = (
-  "- AN: number",
-  "- AF: number",
-  "- control_AN: number",
-  "- control_AF: number",
-  "- control_AN_nfe: number",
-  "- control_AF_nfe: number"
-);
-my @types = parse_vcf_header( read_vcf_header($temp_vcf) );
-is_deeply( \@types, \@expected, "AN type extracted" );
-
-@types = parse_vcf_header( read_vcf_header($temp_vcf_gz) );
-is_deeply( \@types, \@expected, "AN type extracted from gzipped file" );
diff --git a/perl/t/dbmanager-cursor.t b/perl/t/dbmanager-cursor.t
deleted file mode 100644
index c7b8754c3..000000000
--- a/perl/t/dbmanager-cursor.t
+++ /dev/null
@@ -1,118 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package TestMe;
-
-use Test::More;
-
-use Path::Tiny;
-use Try::Tiny;
-
-use Seq::Tracks::Build;
-
-my $test_db_dir = Path::Tiny->tempdir();
-
-Seq::DBManager::initialize( { databaseDir => $test_db_dir } );
-
-my $db = Seq::DBManager->new();
-
-my $dbIdx = 1;
-my $pos   = 99;
-my $val   = "HELLO WORLD";
-
-ok( !%LMDB::Env::Envs, "prior to first transaction, no transactions listed" );
-
-### Test Unsafe Transactions (Manually Managed) ##########
-my $cursor = $db->dbStartCursorTxn('test');
-
-ok( keys %LMDB::Env::Envs == 1, "after opening cursors, have one transaction" );
-
-my ( $expected, $err );
-
-ok( ref $cursor && ref $cursor eq 'ARRAY', 'dbStartCursorTxn returns an array' );
-ok( ref $cursor->[0] eq 'LMDB::Txn',
-  'dbStartCursorTxn first return item is an LMDB::Txn' );
-ok( ref $cursor->[1] eq 'LMDB::Cursor',
-  'dbStartCursorTxn 2nd return item is an LMDB::Cursor' );
-
-$err = $db->dbPatchCursorUnsafe( $cursor, 'test', $dbIdx, $pos, $val );
-
-ok( $err == 0,
-  "dbPatchCursorUnsafe returns 0 error status upon successful insertion" );
-
-$expected = $db->dbReadOneCursorUnsafe( $cursor, $pos );
-
-ok(
-  defined $expected,
-  "Before committing, we can see inserted value, as we have stayed within a single transaction"
-);
-ok(
-  $#$expected == $dbIdx && !defined $expected->[0] && $expected->[1] eq $val,
-  "dbReadCursorUnsafe returns an array of track data; each index is another track"
-);
-
-$err = $db->dbEndCursorTxn('test');
-
-$expected = $db->dbReadOne( 'test', $pos );
-
-ok( defined $expected,
-  "After committing, we can see inserted value using dbReadOne w/ commit" );
-ok(
-  $#$expected == $dbIdx && !defined $expected->[0] && $expected->[1] eq $val,
-  "dbReadOne returns an array of track data; each index is another track"
-);
-
-$expected = $db->dbReadOne( 'test', $pos, 1 );
-
-ok( $expected->[1] eq $val,
-  "After committing, we can see inserted value using dbReadOne w/o commit" );
-
-my $cursorErr;
-try {
-  $cursor = $db->dbStartCursorTxn('test');
-}
-catch {
-  $cursorErr = $_;
-};
-
-ok( defined $cursorErr,
-  "Cannot open cursor transaction while active transaction for the given dbi" );
-
-my $commitErr;
-try {
-  $db->dbForceCommit( 'test', 1 );
-}
-catch {
-  $commitErr = $_;
-};
-
-ok( 
defined $commitErr && $commitErr =~ /expects existing environment/, - "Fatal errors clear dbManager environment state" ); -# ok(defined $commitErr, "dbForceCommit is a void function"); - -# Note, unfortunately we can do this, -$cursor = $db->dbStartCursorTxn('test'); - -ok( - defined $cursor, - "dbForceCommit w/o force sync successfully closes the DB associated txn, allowing us to create a new transaction" -); - -$expected = $db->dbReadOne( 'test', $pos, 1 ); - -ok( - $#$expected == $dbIdx && !defined $expected->[0] && $expected->[1] eq $val, - "Can use dbReadOne, without committing, as subtransaction of cursor-containing separate transaction" -); - -$err = $db->dbPatchCursorUnsafe( $cursor, 'test', $dbIdx, $pos, "SOMETHING NEW" ); - -ok( $err == 0, "Can run dbPatchCursorUnsafe, with uncommitted child transaction" ); - -ok( $expected->[$dbIdx] eq 'HELLO WORLD', "we don't overwrite entries" ); - -$err = $db->dbEndCursorTxn('test'); - -ok( $err == 0, 'dbEndCursorTxn returns 0 upon no error' ); - -done_testing(); diff --git a/perl/t/dbmanager-del.t b/perl/t/dbmanager-del.t deleted file mode 100644 index 9a2441e3c..000000000 --- a/perl/t/dbmanager-del.t +++ /dev/null @@ -1,107 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package TestMe; - -use Test::More; - -use Data::MessagePack; -use Path::Tiny; -use Try::Tiny; - -use Seq::Tracks::Build; - -my $mp = Data::MessagePack->new(); - -my $test_db_dir = Path::Tiny->tempdir(); - -Seq::DBManager::initialize( { databaseDir => $test_db_dir } ); - -my $db = Seq::DBManager->new(); - -my $chr = 'test'; -my $dbIdx = 1; -my $pos = 99; - -my $vals = [ 0, 1, [ 2, 3 ], "hello", "world" ]; - -# off the end of $vals; we'll periodically add a val to end at 5th index -$dbIdx = 5; - -for my $pos ( 0 .. 100 ) { - my @v = @$vals; - - if ( $pos % 2 ) { - push @v, "val " . $pos; - } - - $db->dbPut( $chr, $pos, \@v ); -} - -for my $pos ( 0 .. 100 ) { - my $readV = $db->dbReadOne( $chr, $pos ); - - if ( $pos % 2 ) { - ok( $readV->[5] eq "val " . $pos, "could insert and read values at end" ); - next; - } - - ok( !defined $readV->[5], "could insert and read values at end" ); -} - -$db->dbDeleteAll( $chr, $dbIdx ); - -for my $pos ( 0 .. 100 ) { - my $readV = $db->dbReadOne( $chr, $pos ); - - ok( !defined $readV->[5], "could delete value from end" ); - ok( $readV->[4] eq 'world', - "deleting in middle doesn\'t impact preceding adjacent value" ); - ok( $readV->[3] eq 'hello', - "deleting in middle doesn\'t impact 2nd preceding adjacent value" ); -} - -$vals = [ 0, 1, [ 2, 3 ], undef, "end" ]; - -# in the middle -$dbIdx = 3; - -for my $pos ( 0 .. 100 ) { - my @v = @$vals; - - if ( $pos % 2 ) { - $v[$dbIdx] = "val in middle " . $pos; - } - - $db->dbPut( $chr, $pos, \@v ); -} - -for my $pos ( 0 .. 100 ) { - my $readV = $db->dbReadOne( $chr, $pos ); - - if ( $pos % 2 ) { - ok( - $readV->[$dbIdx] eq "val in middle " . $pos, - "could insert and read values at middle" - ); - next; - } - - ok( !defined $readV->[5], "could insert and read values at middle" ); -} - -$db->dbDeleteAll( $chr, $dbIdx ); - -for my $pos ( 0 .. 
100 ) { - my $readV = $db->dbReadOne( $chr, $pos ); - - ok( !defined $readV->[$dbIdx], "could delete value from middle" ); - ok( - join( ',', @{ $readV->[2] } ) eq join( ',', 2, 3 ), - "deleting in middle doesn\'t impact preceding adjacent value" - ); - ok( $readV->[4] eq 'end', "deleting in middle doesn\'t impact next adjacent value" ); -} - -done_testing(); diff --git a/perl/t/definition.t b/perl/t/definition.t deleted file mode 100644 index ad65a0806..000000000 --- a/perl/t/definition.t +++ /dev/null @@ -1,73 +0,0 @@ -use Test::More; -use Seq; -use DDP; -use lib 't/lib'; - -my $test_db_dir = Path::Tiny->tempdir(); - -my %baseArgs = ( - database_dir => 't/tracks/gene/db/raw', - input_files => [ 'foo/bar.vcf.gz', 'baz/qux.vcf' ], - config => 't/tracks/gene/db/raw/config.yaml', - output_file_base => 'bar', - tracks => { - tracks => [ - { - name => 'ref', - type => 'reference', - assembly => 'hg19', - chromosomes => ['chr1'], - files_dir => 'fglla' - } - ] - }, - fileProcessors => {} -); - -sub test_dosageMatrixOut { - my $object = Seq->new(%baseArgs); - - my $outputFilesInfo = $object->outputFilesInfo; - - ok( - defined $outputFilesInfo->{dosageMatrixOutPath}, - 'dosageMatrixOutPath should be defined' - ); - like( $outputFilesInfo->{dosageMatrixOutPath}, - qr/dosage\.feather$/, 'dosageMatrixOutPath should end with dosage.feather' ); -} - -sub test_preparePreprocessorProgram { - my $object = Seq->new( - %baseArgs, - fileProcessors => { - vcf => { - args => '--sample %sampleList% --dosageOutput %dosageMatrixOutPath%', - program => 'mockProgram' - } - } - ); - - my ( $finalProgram, $errPath ) = - $object->_preparePreprocessorProgram( 'vcf', 'foo/bar.vcf.gz' ); - - like( - $finalProgram, - qr/--dosageOutput \S*bar\.dosage\.feather/, - 'Check --dosageOutput includes "bar.dosage.feather"' - ); - like( - $finalProgram, - qr/--sample \S*bar\.sample_list/, - 'Check --sample includes "bar.sample_list"' - ); - unlike( $finalProgram, qr/%sampleList%/, 'Command does not contain "%sampleList%"' ); - unlike( $finalProgram, qr/%dosageMatrixOutPath%/, - 'Command does not contain "%dosageMatrixOutPath%"' ); - -} - -test_dosageMatrixOut(); -test_preparePreprocessorProgram(); - -done_testing(); diff --git a/perl/t/headers.t b/perl/t/headers.t deleted file mode 100644 index d3abe8028..000000000 --- a/perl/t/headers.t +++ /dev/null @@ -1,253 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; - -use Seq::Headers; -use Seq::Output; - -my $head = Seq::Headers->new(); - -my %expected1Idx = ( - 'first' => 0, - 'second' => 1, - 'third' => 2, -); - -$head->addFeaturesToHeader( [ 'first', 'second', 'third' ] ); - -_checkFeatureIdx( $head, undef, \%expected1Idx, 'initial' ); - -my $str = $head->getString(); -my $expected = "first\tsecond\tthird"; -ok( $str eq $expected, "Can write basic header" ); - -my $arr = $head->getOrderedHeader(); -ok( join( "\t", @$arr ) eq $expected, "Store features in order given" ); - -$head->addFeaturesToHeader( [ 'really_first', 'really_second' ], undef, 1 ); -$expected = "really_first\treally_second\t$expected"; - -%expected1Idx = ( - 'really_first' => 0, - 'really_second' => 1, - 'first' => 2, - 'second' => 3, - 'third' => 4, -); - -_checkFeatureIdx( $head, undef, \%expected1Idx, - 'prepended really_first, really_second' ); - -$arr = $head->getOrderedHeader(); - -ok( join( "\t", @$arr ) eq $expected, "Can prepend multiple features" ); - -$head->addFeaturesToHeader( 'really_really_first', undef, 1 ); -$expected = "really_really_first\t$expected"; - -%expected1Idx = ( - 
'really_really_first' => 0, - 'really_first' => 1, - 'really_second' => 2, - 'first' => 3, - 'second' => 4, - 'third' => 5, -); - -_checkFeatureIdx( $head, undef, \%expected1Idx, 'prepended really_really_first' ); - -$arr = $head->getOrderedHeader(); -ok( join( "\t", @$arr ) eq $expected, "Can prepend features" ); - -########### Cleared here so all earlier features go away ############### -$head->initialize(); -$arr = $head->getOrderedHeader(); -ok( @$arr == 0, "Can clear header" ); - -$head->addFeaturesToHeader( [ 'child1', 'child2', 'child3' ], 'p1' ); -$expected = "child1\tchild2\tchild3"; - -# These are relative to their parent, or the root if no parent -%expected1Idx = ( - 'p1' => { #0 - 'child1' => 0, - 'child2' => 1, - 'child3' => 2, - } -); - -_checkFeatureIdx( $head, undef, \%expected1Idx, 'added p1' ); - -$arr = $head->getOrderedHeader(); - -ok( join( "\t", @{ $arr->[0] } ) eq $expected, "Can create nested features" ); - -my $idx = $head->getParentIndices(); - -ok( $idx->{p1} == 0 && keys %$idx == 1, - "Can recover top-level feature indices after addition of 1 feature" ); - -$head->addFeaturesToHeader( [ 'c1', 'c2', 'c3' ], 'p2' ); -my $e2 = "c1\tc2\tc3"; - -# These are relative to their parent, or the root if no parent -%expected1Idx = ( - 'p1' => { #0 - 'child1' => 0, - 'child2' => 1, - 'child3' => 2, - }, - 'p2' => { #1 - 'c1' => 0, - 'c2' => 1, - 'c3' => 2, - } -); - -_checkFeatureIdx( $head, undef, \%expected1Idx, 'added p2' ); - -$arr = $head->getOrderedHeader(); - -ok( join( "\t", @{ $arr->[0] } ) eq $expected && join( "\t", @{ $arr->[1] } ) eq $e2, - "Can add nested features" ); - -$idx = $head->getParentIndices(); - -ok( - $idx->{p1} == 0 && $idx->{p2} == 1 && keys %$idx == 2, - "Can recover top-level features after addition of 2nd feature" -); - -# Prepend 3rd parent -$head->addFeaturesToHeader( [ 'c1a', 'c2a', 'c3a' ], 'p3', 1 ); -my $e3 = "c1a\tc2a\tc3a"; - -# These are relative to their parent, or the root if no parent -%expected1Idx = ( - 'p3' => { #0 - 'c1a' => 0, - 'c2a' => 1, - 'c3a' => 2, - }, - 'p1' => { #1 - 'child1' => 0, - 'child2' => 1, - 'child3' => 2, - }, - 'p2' => { #2 - 'c1' => 0, - 'c2' => 1, - 'c3' => 2, - }, -); - -_checkFeatureIdx( $head, undef, \%expected1Idx, 'prepended p3' ); - -$arr = $head->getOrderedHeader(); - -ok( - join( "\t", @{ $arr->[0] } ) eq $e3 - && join( "\t", @{ $arr->[1] } ) eq $expected - && join( "\t", @{ $arr->[2] } ) eq $e2, - "Can prepend nested features" -); - -my $fIdx = $head->getFeatureIdx( 'p3', 'c1a' ); -my $fIdx2 = $head->getFeatureIdx( 'p3', 'c2a' ); -my $fIdx3 = $head->getFeatureIdx( 'p3', 'c3a' ); - -$idx = $head->getParentIndices(); - -ok( - $idx->{p3} == 0 && $idx->{p1} == 1 && $idx->{p2} == 2 && keys %$idx == 3, - "Can recover top-level features after addition of 3nd feature which is pre-pended" -); - -my $p1 = $head->getParentFeatures('p1'); -my $p2 = $head->getParentFeatures('p2'); -my $p3 = $head->getParentFeatures('p3'); - -ok( - join( "\t", @$p1 ) eq $expected - && join( "\t", @$p2 ) eq $e2 - && join( "\t", @$p3 ) eq $e3, - "Can recover features by hash" -); - -$str = $head->getString(); - -$expected = - "p3.c1a\tp3.c2a\tp3.c3a\tp1.child1\tp1.child2\tp1.child3\tp2.c1\tp2.c2\tp2.c3"; -ok( $str eq $expected, "Can build string from nested features" ); - -$head->addFeaturesToHeader('cadd'); -$head->addFeaturesToHeader( 'phyloP', undef, 1 ); - -$expected = "phyloP\t$expected\tcadd"; - -# These are relative to their parent, or the root if no parent -%expected1Idx = ( - 'phyloP' => 0, - 'p3' => { #1 - 'c1a' => 0, - 
'c2a' => 1, - 'c3a' => 2, - }, - 'p1' => { #2 - 'child1' => 0, - 'child2' => 1, - 'child3' => 2, - }, - 'p2' => { #3 - 'c1' => 0, - 'c2' => 1, - 'c3' => 2, - }, - 'cadd' => 4, -); - -_checkFeatureIdx( $head, undef, \%expected1Idx, 'prepended phylP, added cadd' ); - -$str = $head->getString(); - -ok( $str eq $expected, "Can mix and match nested features with non-nested" ); - -$idx = $head->getParentIndices(); - -ok( - $idx->{phyloP} == 0 - && $idx->{p3} == 1 - && $idx->{p1} == 2 - && $idx->{p2} == 3 - && $idx->{cadd} == 4 - && keys %$idx == 5, - "Can recover top-level feature indices after addition of non-nested features" -); - -sub _checkFeatureIdx { - my ( $header, $parent, $expectedHref, $testName ) = @_; - $testName //= 'test'; - - for my $featureName ( keys %$expectedHref ) { - my $eVal = $expectedHref->{$featureName}; - - if ( ref $eVal ) { - _checkFeatureIdx( $header, $featureName, $expectedHref->{$featureName}, $testName ); - next; - } - - my $actual = $header->getFeatureIdx( $parent, $featureName ); - - ok( - defined $actual && $actual == $eVal, - "Test: $testName. Can look up index of feature $featureName relative to parent " - . ( defined $parent ? $parent : 'header root' ) - . " (index is $eVal)" - ); - } -} - -done_testing(); -1; diff --git a/perl/t/inputFile.t b/perl/t/inputFile.t deleted file mode 100644 index dd0f08e12..000000000 --- a/perl/t/inputFile.t +++ /dev/null @@ -1,48 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; - -use Seq::InputFile; - -my $inputter = Seq::InputFile->new(); - -my $err = - $inputter->checkInputFileHeader( [ "Chrom", "Pos", "Ref", "Alt", "Type" ] ); - -ok( !defined $err ); - -$err = $inputter->checkInputFileHeader( - [ "Fragment", "Position", "Reference", "Alleles", "Type" ] ); - -ok( !defined $err ); - -$err = $inputter->checkInputFileHeader( - [ "Fragment", "Position", "Reference", "Minor_alleles", "Type" ] ); - -ok( !defined $err ); - -$err = $inputter->checkInputFileHeader( - [ "Fragment", "Position", "Reference", "Type", "Alt" ] ); - -ok( !defined $err, "Alt, Type order doesn't matter" ); - -$err = $inputter->checkInputFileHeader( - [ "Position", "Fragment", "Reference", "Type", "Alt" ] ); - -ok( $err, "Chrom, Pos, Ref order matters" ); - -$err = $inputter->checkInputFileHeader( [ "Type", "Alt" ] ); - -ok( $err, "Chrom, Pos, Ref required" ); - -$err = $inputter->checkInputFileHeader( [ "Ref", "Type", "Alt" ] ); - -ok( $err, "Chrom, Pos, Ref required" ); - -$err = $inputter->checkInputFileHeader( [ "Pos,", "Ref", "Type", "Alt" ] ); - -ok( $err, "Chrom, Pos, Ref required" ); - -done_testing(); diff --git a/perl/t/lib/TestUtils.pm b/perl/t/lib/TestUtils.pm deleted file mode 100644 index 7779ceb87..000000000 --- a/perl/t/lib/TestUtils.pm +++ /dev/null @@ -1,118 +0,0 @@ -package TestUtils; - -use 5.10.0; -use strict; -use warnings; - -use Exporter 'import'; -use Path::Tiny qw(path); -use Type::Params qw(compile); -use Types::Common::String qw(NonEmptySimpleStr); -use Types::Standard qw(ArrayRef HashRef); -use YAML::XS qw(DumpFile LoadFile); - -our @EXPORT_OK = - qw( CopyAll HaveRequiredBinary PrepareConfigWithTempdirs UpdateConfigAttrs ); - -sub HaveRequiredBinary { - my $binary = shift; - my $path_to_binary = `which $binary`; - chomp($path_to_binary); # Remove trailing newline, if any - if ($path_to_binary) { - return 1; - } - else { - return; - } -} - -# PrepareConfigWithTempdirs takes parameters below and returns a string to a -# temporary config file with updated paths in that config file and returns -# an absolute path to 
the config file -# config_file => configuration file -# src_dir => directory of raw files needed for the test -# want_dest_dirs => names of directories that will be created -# target_dir => name of directory for the raw data -# dest_dir => destination directory -sub PrepareConfigWithTempdirs { - state $check = compile( - NonEmptySimpleStr, NonEmptySimpleStr, - ArrayRef [NonEmptySimpleStr], NonEmptySimpleStr, - NonEmptySimpleStr - ); - my ( $config_file, $src_dir, $want_dest_dirs, $target_dir, $dest_dir ) = - $check->(@_); - - my %tempDirsForWantDir; - - for my $dir (@$want_dest_dirs) { - my $d = path($dest_dir)->child($dir); - $d->mkpath; - $tempDirsForWantDir{$dir} = $d->stringify; - } - - # copy files into temporary dir - CopyAll( $src_dir, $tempDirsForWantDir{$target_dir} ); - - # update config to include temp directories - my $test_config = UpdateConfigAttrs( $config_file, \%tempDirsForWantDir ); - - # write new test config to file - my $test_config_file = path($dest_dir)->child('config.yml'); - DumpFile( $test_config_file, $test_config ); - - return $test_config_file->absolute->stringify; -} - -sub CopyAll { - state $check = compile( NonEmptySimpleStr, NonEmptySimpleStr ); - my ( $src, $dest ) = $check->(@_); - - $src = path($src); - $dest = path($dest); - - if ( !$src->is_dir ) { - die "Source directory does not exist"; - } - - if ( !$dest->is_dir ) { - die "Destination directory does not exist"; - } - - # Recursive copy of directories and files from the source directory to the temporary directory - $src->visit( - sub { - my ( $path, $state ) = @_; - - # Construct the destination path in the temporary directory - my $this_dest = $dest->child( $path->relative($src) ); - - if ( $path->is_dir ) { - # Create directory if the current path is a directory - $this_dest->mkpath; - } - else { - # Copy the file otherwise - $path->copy($this_dest); - } - }, - { recurse => 1 } # Enable recursive visiting - ); -} - -sub UpdateConfigAttrs { - state $check = compile( NonEmptySimpleStr, HashRef ); - my ( $file, $href ) = $check->(@_); - - # load config yaml - my $config = LoadFile($file); - - # update directory keys with new location - for my $key ( keys %{$href} ) { - $config->{$key} = $href->{$key}; - } - - return $config; -} - -1; diff --git a/perl/t/msgpack.t b/perl/t/msgpack.t deleted file mode 100644 index a7d17453f..000000000 --- a/perl/t/msgpack.t +++ /dev/null @@ -1,30 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; - -use Data::MessagePack; - -my $mp = Data::MessagePack->new()->prefer_integer()->prefer_float32(); - -my $float = sprintf( "%0.6f", -3.563 ); - -my $packed = $mp->pack($float); -ok( length($packed) == 11, - "-3.563 as a 0.6f formatted string takes 11 bytes, 1 extra for sign" ); - -$packed = $mp->pack(-3.563); -ok( length($packed) == 5, - "floats are internally stored 4 in 4bytes + 1 for schema" ); - -$packed = $mp->pack("1.000000"); -ok( length($packed) == 10, "1.000000 float as a string takes 10 bytes" ); - -$packed = $mp->pack( "1.000000" +0 ); -ok( length($packed) == 5, "1.000000 float as number (+0) takes 5 bytes" ); - -$packed = $mp->pack( int( "1.000000" +0 ) ); -ok( length($packed) == 1, "1.000000 float as number truncated to int takes 1 byte" ); - -done_testing(); diff --git a/perl/t/output.t b/perl/t/output.t deleted file mode 100644 index e5b27a17e..000000000 --- a/perl/t/output.t +++ /dev/null @@ -1,200 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; - -use Seq::Headers; -use Seq::Output::Delimiters; -use Seq::Output; - -my 
-my $head = Seq::Headers->new();
-
-$head->addFeaturesToHeader('preProcessorHeader1');
-$head->addFeaturesToHeader( [ 'c1a', 'c1b', 'c1c' ], 'withFeaturesTrack1' );
-$head->addFeaturesToHeader('scalarTrack2');
-$head->addFeaturesToHeader( [ 'c2a', 'c2b', 'c2c_overlapped_vals' ],
-  'withFeaturesTrack3' );
-$head->addFeaturesToHeader('ref');
-
-# trackOutIndices tracks Seq features apart from those passed in by a
-# pre-processor; this allows us to skip iterating over very long feature
-# arrays on which we do no work
-my $outputter =
-  Seq::Output->new(
-  { header => $head, trackOutIndices => [ 1, 2, 3, 4 ], refTrackName => 'ref' } );
-
-my $delims = Seq::Output::Delimiters->new();
-my $header = $head->getOrderedHeader();
-
-ok( @$header == 5, "Output header matches # of tracks" );
-ok( @{ $header->[1] } == 3, "First package-defined track has 3 features" );
-ok( !ref $header->[2], "Second track has no features, is itself a feature" );
-ok( @{ $header->[3] } == 3, "Third track has 3 features" );
-ok( !ref $header->[4], "Fourth track has no features, is itself a feature" );
-
-my $hStr = $head->getString();
-
-my @headFields = split( $delims->fieldSeparator, $hStr );
-
-ok( @headFields == 9,
-  "String header contains all expected fields, including those from pre-processor" );
-
-# Everything is output as an array:
-# the first level is a track; the 2nd level holds the feature values.
-#
-# Each feature value can be nested up to 3 levels deep:
-# 1) The position value [pos1, pos2, pos3]: only 1 position for a SNP
-# 2) The feature value at that position, which can itself be nested 2 deep.
-#    For instance, in refSeq you may have
-#      transcript1;transcript2 transcript1_val1\\transcript1_val2;transcript2_onlyVal
-#    which is represented as
-#      [ [transcript1, transcript2], [ [transcript1_val1, transcript1_val2], transcript2_onlyVal ] ]
-#    The outer array is for the feature, the 1st inner array is for the
-#    position (in an indel), and the 2nd inner array is for multiple values
-#    for that feature at that position.
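-
-# To make that layout concrete, a small illustration (hypothetical values that
-# mirror the refSeq example above, not real track data):
-my $exampleFeature = [
-  [ 'transcript1', 'transcript2' ],
-  [ [ 'transcript1_val1', 'transcript1_val2' ], 'transcript2_onlyVal' ],
-];
-# overlapping values for one transcript join first, then the per-transcript
-# values join with the value delimiter
-my $exampleFlat = join(
-  $delims->valueDelimiter,
-  join( $delims->overlapDelimiter, @{ $exampleFeature->[1][0] } ),
-  $exampleFeature->[1][1]
-);
-ok( index( $exampleFlat, 'transcript1_val1' ) == 0,
-  "sketch: overlapping values for one transcript are joined first" );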
"\n"; - -my @rows = ( \@row ); - -my $str = $outputter->makeOutputString( \@rows ); - -ok( $str eq $expected, - "Can make complex output string with nested features, and with overlapping values" ); - -my @rowFields = split( $delims->fieldSeparator, $str ); - -ok( @headFields == @rowFields, - "Output string length matches flattened header length" ); - -########## Test value deduplication in makeOutputString ########## -@row = ( "somePreProcessorVal", [], [], [], [] ); -$expected = "somePreProcessorVal" . $delims->fieldSeparator; - -# Test all values duplicate -$row[1][0][0] = [ "t1_1a", "t1_1a" ]; -# When both values are duplicate in an inner array, we expect a single value, with no overlap delimiter -$row[1][1][0] = [ [ "t1_2aa", "t1_2aa" ], [ "t1_2ba", "t1_2ba" ] ]; -# If all values are duplicate across delimiters, we expect a single value, with no overlap delimiter or value delimiter -$row[1][2][0] = - [ [ "t1_3aa", "t1_3aa" ], [ "t1_3aa", "t1_3aa", "t1_3aa", "t1_3aa" ] ]; - -# Track 1 values -$expected .= "t1_1a" . $delims->fieldSeparator; #$row[1][0][0] -$expected .= - "t1_2aa" - . $delims->valueDelimiter - . "t1_2ba" - . $delims->fieldSeparator; #$row[1][1][0] -$expected .= "t1_3aa" . $delims->fieldSeparator; #$row[1][1][0] - -# We still handle scalar values just fine -$row[2][0] = [ "blah", "blah", "blah" ]; - -$expected .= "blah" . $delims->fieldSeparator; - -# If not all values deuplicated, we won't deduplcate anything -$row[3][0][0] = [ "t3_1a", "t3_1a", "t3_1b" ]; -# When not all values duplicated in inner array, we will not deduplicate -$row[3][1][0] = [ [ "t3_2aa", 't3_2aa', 't3_2ab' ], [ "t3_2ba", 't3_2ba' ] ]; -# We will deduplicate values across position delimiters too -$row[3][2][0] = "t3_3a"; -$row[3][2][1] = "t3_3a"; - -#$row[2][0][0] -$expected .= join( $delims->valueDelimiter, ( "t3_1a", "t3_1a", "t3_1b" ) ) - . $delims->fieldSeparator; -#$row[2][1][0] -$expected .= - join( $delims->overlapDelimiter, ( "t3_2aa", 't3_2aa', 't3_2ab' ) ) - . $delims->valueDelimiter - . "t3_2ba" - . $delims->fieldSeparator; -#$row[2][2][0] & $row[2][2][1] are collapsed into a single value since they are the same -$expected .= "t3_3a" . $delims->fieldSeparator; - -$row[4][0] = "T"; - -$expected .= "T" . 
"\n"; - -$str = $outputter->makeOutputString( [ \@row ] ); - -ok( $str eq $expected, "De-duplicates values" ); - -######### Test uniquefy ########## -# Test 1: All identical defined values -my $result1 = Seq::Output::uniqueify( [ 'a', 'a', 'a' ] ); -is_deeply( $result1, ['a'], "All identical values" ); - -# Test 2: All undefined values -my $result2 = Seq::Output::uniqueify( [ undef, undef, undef ] ); -is_deeply( $result2, [undef], "All undefined values" ); - -# Test 3: Mix of undefined and defined values -my $result3 = Seq::Output::uniqueify( [ 'b', undef, 'b' ] ); -is_deeply( $result3, [ 'b', undef, 'b' ], "Mix of undefined and defined values" ); - -# Test 4: Multiple distinct defined values -my $result4 = Seq::Output::uniqueify( [ 'c', 'd' ] ); -is_deeply( $result4, [ 'c', 'd' ], "Multiple distinct values" ); - -# Test 5: Empty array -my $result5 = Seq::Output::uniqueify( [] ); -is_deeply( $result5, [], "Empty array" ); - -done_testing(); -1; diff --git a/perl/t/output/delimiters.t b/perl/t/output/delimiters.t deleted file mode 100644 index 1e474420f..000000000 --- a/perl/t/output/delimiters.t +++ /dev/null @@ -1,47 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package TestMe; - -use Test::More; - -use Try::Tiny; - -use Seq::Output::Delimiters; - -my $delims = Seq::Output::Delimiters->new( {} ); - -my $oD = $delims->overlapDelimiter; - -my $line = "Stuff;1;2;3/.dasf_)," . "4\t5/6|7"; - -my $expected = "Stuff,1,2,3,.dasf_),4\t5,6,7"; - -$delims->cleanDelims->($line); - -ok( $line eq $expected, "Clean all delimiters, including / by default" ); - -my @parts = split( '\t', $line ); - -ok( @parts == 2, "Splitting on single quoted tab char ('\\t') still works" ); - -my @parts2 = split( "\t", $line ); -ok( @parts == 2, "Splitting on double quoted tab char (\"\\t\") still works" ); - -$line = "Stuff;;1;;2;;;3" . "$oD$oD$oD" . 
"4\t5//6||7|"; - -$expected = "Stuff,1,2,3,4\t5,6,7"; - -$delims->cleanDelims->($line); - -ok( $line eq $expected, "Clean all delimiters even when many instances in a row" ); - -@parts = split( '\t', $line ); - -ok( @parts == 2, "Splitting on single quoted tab char ('\\t') still works" ); - -@parts2 = split( "\t", $line ); -ok( @parts == 2, "Splitting on double quoted tab char (\"\\t\") still works" ); - -done_testing(); diff --git a/perl/t/role/02-message.t b/perl/t/role/02-message.t deleted file mode 100644 index 49239ee3d..000000000 --- a/perl/t/role/02-message.t +++ /dev/null @@ -1,127 +0,0 @@ -#Run from ../lib -use 5.10.0; -use strict; -use warnings; - -package Mock; - -use Mouse 2; - -with 'Seq::Role::Message', 'Seq::Role::IO'; - -has logPath => - ( is => 'ro', init_arg => undef, default => 't/role/02-message.test.log' ); - -sub BUILD { - my $self = shift; - - $self->setLogPath( $self->logPath ); -} - -1; - -package TestMessage; -use Test::More; -use DDP; - -my $mocker = Mock->new(); - -$mocker->log( 'warn', "HELLO WORLD" ); - -my ( $err, undef, $fh ) = $mocker->getReadFh( $mocker->logPath ); -ok( !$err, 'No error gets generated on reading fh' ); - -my @lines = <$fh>; - -$err = $mocker->safeClose($fh); -ok( !$err, 'No error gets generated on closing fh' ); - -ok( @lines == 1, 'Only one line gets written' ); -ok( index( $lines[0], "HELLO WORLD" ) > -1, 'By default warnings allowed' ); - -$mocker = Mock->new(); - -( $err, undef, $fh ) = $mocker->getReadFh( $mocker->logPath ); -ok( !$err, 'No error gets generated on reading fh nth time' ); - -@lines = <$fh>; - -$err = $mocker->safeClose($fh); -ok( !$err, 'No error gets generated on closing fh nth time' ); - -ok( @lines == 0, "Log file gets cleared" ); - -$mocker->setLogLevel('fatal'); - -$mocker->log( 'warn', "A different warning" ); - -( $err, undef, $fh ) = $mocker->getReadFh( $mocker->logPath ); -ok( !$err, 'No error gets generated on reading fh nth time' ); - -@lines = <$fh>; - -$err = $mocker->safeClose($fh); -ok( !$err, 'No error gets generated on closing fh nth time' ); - -ok( !@lines, - "setLogLevel sets log level to fatal, and warning messages don't get stored" ); - -$mocker = Mock->new(); - -$mocker->setLogLevel('info'); - -$mocker->log( 'warn', "A warning above info" ); - -( $err, undef, $fh ) = $mocker->getReadFh( $mocker->logPath ); -ok( !$err, 'No error gets generated on reading fh nth time' ); - -@lines = <$fh>; -$err = $mocker->safeClose($fh); - -ok( !$err, 'No error gets generated on closing fh nth time' ); - -ok( - index( $lines[0], 'A warning above info' ) > -1, - "Role::Message sets info level, and writes warning messages" -); - -$mocker = Mock->new(); -$mocker->log( 'warn', "A new warning above info" ); -$mocker->log( 'info', "An info message" ); -$mocker->log( 'error', "An error message" ); - -( $err, undef, $fh ) = $mocker->getReadFh( $mocker->logPath ); -ok( !$err, 'No error gets generated on reading fh nth time' ); - -@lines = <$fh>; - -$err = $mocker->safeClose($fh); -ok( !$err, 'No error gets generated on closing fh nth time' ); - -ok( @lines == 3, 'Role::Message properly writes multiple lines' ); -ok( - index( $lines[0], 'A new warning above info' ) > -1, - "Role::Message doesn't overwrite previous messages" -); -ok( - index( $lines[1], 'An info message' ) > -1, - "Role::Message records info messages at info level" -); -ok( - index( $lines[1], 'An info message' ) > -1, - "Role::Message records info messages at info level" -); - -use Try::Tiny; - -try { - $mocker->log( 'fatal', "A fatal message" ); -} -catch { 
- ok( $_ && index( $_, 'A fatal message' ) > -1, - "Role::Message throws a fatal message at info level: $_" ); -}; - -system( 'rm ' . $mocker->logPath ); - -done_testing(); diff --git a/perl/t/role/1-sparseArrays.t b/perl/t/role/1-sparseArrays.t deleted file mode 100644 index 9caf7e625..000000000 --- a/perl/t/role/1-sparseArrays.t +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/perl -# your code goes here -use 5.10.0; -use Test::More; -use strict; -use warnings; - -plan tests => 5; - -my @arr = ( 0, 1, 2 ); - -for ( my $i = @arr; $i < 4; $i++ ) { - push @arr, undef; -} - -push @arr, 3; - -ok( $arr[0] == 0 ); -ok( $arr[1] == 1 ); -ok( $arr[2] == 2 ); -ok( !defined $arr[3] ); -ok( $arr[4] == 3 ); diff --git a/perl/t/tracks/base/01-convert.t b/perl/t/tracks/base/01-convert.t deleted file mode 100644 index a0a9fb9e0..000000000 --- a/perl/t/tracks/base/01-convert.t +++ /dev/null @@ -1,76 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; -use Mouse 2; - -use Seq::Tracks::Base::Types; -use Scalar::Util qw/looks_like_number/; - -my $converter = Seq::Tracks::Base::Types->new(); - -my $floatString = '.0000000000001567'; - -my $floatConvert = $converter->convert( $floatString, 'number' ); - -my $roundedConvert = $converter->convert( $floatString, 'number(2)' ); - -ok( looks_like_number($floatConvert), "number looks like number" ); -ok( looks_like_number($roundedConvert), "number(2) looks like number" ); - -ok( $floatConvert == .0000000000001567, "number doesn't round" ); -ok( "$floatConvert" eq "1.567e-13", - "perl represents large floats in scientific notation" ); -# Is too small to display by default in decimal notation -ok( $roundedConvert == .00000000000016, "number(2) rounds to 2 sigfigs" ); - -my $exactFloat = '1.00000000'; -$floatConvert = $converter->convert( $exactFloat, 'number' ); -$roundedConvert = $converter->convert( $exactFloat, 'number(2)' ); - -ok( looks_like_number($floatConvert), "number looks like number" ); -ok( looks_like_number($roundedConvert), "number(2) looks like number" ); -ok( $floatConvert == 1, - "number converts floats to ints when exact solution possible" ); -ok( $roundedConvert == 1, - "number converts floats to ints when exact solution possible" ); - -my $roundedFloat = '1.05'; -$floatConvert = $converter->convert( $roundedFloat, 'number' ); -$roundedConvert = $converter->convert( $roundedFloat, 'number(2)' ); - -ok( looks_like_number($floatConvert), "number looks like number" ); -ok( looks_like_number($roundedConvert), "number(2) looks like number" ); -ok( $floatConvert == 1.05, "number doesn't round" ); -ok( $roundedConvert == 1.1, "number(2) rounds to 2 sigfigs" ); - -$roundedFloat = '1.0000005'; -$floatConvert = $converter->convert( $roundedFloat, 'number' ); -$roundedConvert = $converter->convert( $roundedFloat, 'number(2)' ); - -ok( looks_like_number($floatConvert), "float looks like number" ); -ok( looks_like_number($roundedConvert), "number looks like number" ); -ok( $floatConvert == 1.0000005, "number doesn't round" ); -ok( $roundedConvert == 1, "number(2) rounds to 2 sigfigs" ); - -$roundedFloat = '123000000.2590'; -$floatConvert = $converter->convert( $roundedFloat, 'number' ); -$roundedConvert = $converter->convert( $roundedFloat, 'number(2)' ); - -ok( looks_like_number($floatConvert), "float looks like number" ); -ok( looks_like_number($roundedConvert), "number looks like number" ); -ok( $floatConvert == 123000000.259, - "number Can convert number larger than 1 million to float" ); -ok( $roundedConvert == 120000000, "number(2) rounds to 2 
sigfigs" ); - -$floatConvert = $converter->convert( "1e-13", 'number' ); -$roundedConvert = $converter->convert( "1e-13", 'number(2)' ); -ok( $floatConvert == .0000000000001, - "number can convert scientific notation string" ); -ok( - $roundedConvert == .0000000000001, - "number(2) can convert scientific notation string; will not round if fewer sigfigs available than specified" -); - -done_testing(); diff --git a/perl/t/tracks/base/02-convert-pack.t b/perl/t/tracks/base/02-convert-pack.t deleted file mode 100644 index 6593d1e20..000000000 --- a/perl/t/tracks/base/02-convert-pack.t +++ /dev/null @@ -1,46 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; - -use Data::MessagePack; -use Seq::Tracks::Base::Types; - -my $mp = Data::MessagePack->new(); -$mp->prefer_integer()->prefer_float32(); - -my $converter = Seq::Tracks::Base::Types->new(); - -my $numThatShouldBeInt = '1.000000'; - -my $converted = $converter->convert( $numThatShouldBeInt, 'number' ); - -my $packed = $mp->pack($converted); - -ok( - length($packed) == 1, - "The string $numThatShouldBeInt takes 1 byte in msgpack, when using the 'number' converter" -); - -$packed = $mp->pack($numThatShouldBeInt); -ok( length($packed) == 10, "mspgack will pack a string as a string" ); - -$converted = $converter->convert( '1.1', 'number' ); - -$packed = $mp->pack($converted); - -ok( - length($packed) == 5, - "With prefer_float32 floating point numbers will be packed in 5 bytes/single precision" -); - -$packed = $mp->pack( "1.000000" +0 ); -ok( length($packed) == 5, - "The string '1.000000' + 0 takes 5bytes in msgpack with prefer_float32" ); - -$packed = $mp->pack( "-1.000000" +0 ); -ok( length($packed) == 5, - "The string -1.000000' + 0 takes 5 bytes in msgpack with prefer_float32" ); - -done_testing(); diff --git a/perl/t/tracks/base/normalizeWantedChr.t b/perl/t/tracks/base/normalizeWantedChr.t deleted file mode 100644 index 8f7e15193..000000000 --- a/perl/t/tracks/base/normalizeWantedChr.t +++ /dev/null @@ -1,116 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; -use Path::Tiny; - -use Seq::Tracks::Base; -use Seq::DBManager; -use Seq::Output::Delimiters; - -# use DDP; -# use Scalar::Util qw/looks_like_number/; -# use YAML::XS qw/LoadFile/; -# use Seq::Tracks::Gene::Site::SiteTypeMap; -# use Seq::Tracks::Reference::MapBases; - -# my $baseMapper = Seq::Tracks::Reference::MapBases->new(); -# my $siteTypes = Seq::Tracks::Gene::Site::SiteTypeMap->new(); - -Seq::DBManager::initialize( { databaseDir => Path::Tiny->tempdir() } ); - -my $seq = Seq::Tracks::Base->new( - { - chromosomes => [ 'chr1', 'chrM', '1' ], - name => 'test', - type => 'gene', - assembly => 'hg38' - } -); - -my $chr = 'chr1'; -my $wanted = $seq->normalizedWantedChr->{'chr1'}; - -ok( $chr eq $wanted, "Won't modify listed chromosomes" ); - -$wanted = $seq->normalizedWantedChr->{'1'}; - -ok( $wanted eq '1', "Won't return prefix version if non-prefix version listed" ); - -$wanted = $seq->normalizedWantedChr->{'chrM'}; -ok( $wanted eq 'chrM', "If MT given and chrM listed, returns chrM" ); - -$wanted = $seq->normalizedWantedChr->{'MT'}; -ok( $wanted eq 'chrM', "If MT given and chrM listed, returns chrM" ); - -$wanted = $seq->normalizedWantedChr->{'M'}; -ok( $wanted eq 'chrM', "If M given and chrM listed, returns chrM" ); - -$wanted = $seq->normalizedWantedChr->{'chrMT'}; -ok( $wanted eq 'chrM', "If chrMT given and chrM listed, returns chrM" ); - -$seq = Seq::Tracks::Base->new( - { - chromosomes => [ 'chrMT', '1' ], - name => 'test', - type => 'gene', - 
assembly => 'hg38' - } -); - -$wanted = $seq->normalizedWantedChr->{'chrMT'}; -ok( $wanted eq 'chrMT', "If chrMT given and chrMT listed, returns chrMT" ); - -$wanted = $seq->normalizedWantedChr->{'MT'}; -ok( $wanted eq 'chrMT', "If MT given and chrMT listed, returns chrMT" ); - -$wanted = $seq->normalizedWantedChr->{'M'}; -ok( $wanted eq 'chrMT', "If M given and chrMT listed, returns chrMT" ); - -$wanted = $seq->normalizedWantedChr->{'chrM'}; -ok( $wanted eq 'chrMT', "If chrM given and chrMT listed, returns chrMT" ); - -$seq = Seq::Tracks::Base->new( - { - chromosomes => [ 'MT', '1' ], - name => 'test', - type => 'gene', - assembly => 'hg38' - } -); - -$wanted = $seq->normalizedWantedChr->{'MT'}; -ok( $wanted eq 'MT', "If MT given and MT listed, returns MT" ); - -$wanted = $seq->normalizedWantedChr->{'chrMT'}; -ok( $wanted eq 'MT', "If chrMT given and MT listed, returns MT" ); - -$wanted = $seq->normalizedWantedChr->{'M'}; -ok( $wanted eq 'MT', "If M given and MT listed, returns MT" ); - -$wanted = $seq->normalizedWantedChr->{'chrM'}; -ok( $wanted eq 'MT', "If chrM given and MT listed, returns MT" ); - -$seq = Seq::Tracks::Base->new( - { - chromosomes => [ 'M', '1' ], - name => 'test', - type => 'gene', - assembly => 'hg38' - } -); - -$wanted = $seq->normalizedWantedChr->{'MT'}; -ok( $wanted eq 'M', "If MT given and M listed, returns M" ); - -$wanted = $seq->normalizedWantedChr->{'M'}; -ok( $wanted eq 'M', "If M given and M listed, returns M" ); - -$wanted = $seq->normalizedWantedChr->{'chrMT'}; -ok( $wanted eq 'M', "If chrMT given and M listed, returns M" ); - -$wanted = $seq->normalizedWantedChr->{'chrM'}; -ok( $wanted eq 'M', "If chrM given and M listed, returns M" ); - -done_testing(); diff --git a/perl/t/tracks/base/types.t b/perl/t/tracks/base/types.t deleted file mode 100644 index 1f666fec1..000000000 --- a/perl/t/tracks/base/types.t +++ /dev/null @@ -1,27 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; - -use Seq::Tracks::Base::Types; - -my $rounder = Seq::Tracks::Base::Types->new(); - -my $rounded = $rounder->convert( "2.032", "number(2)" ); - -ok( $rounded == 2.0 ); - -$rounded = $rounder->convert( 2.032, "number(2)" ); - -ok( $rounded == 2.0 ); - -$rounded = $rounder->convert( 0.0000000023567, "number(3)" ); - -ok( $rounded == 0.00000000236 ); - -$rounded = $rounder->convert( 1.357e-10, "number(2)" ); - -ok( $rounded == 1.4e-10 ); - -done_testing(); diff --git a/perl/t/tracks/build/02-local-files.t b/perl/t/tracks/build/02-local-files.t deleted file mode 100644 index 971680ad4..000000000 --- a/perl/t/tracks/build/02-local-files.t +++ /dev/null @@ -1,47 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; -use Path::Tiny; - -use Seq::Tracks::Build::LocalFilesPaths; - -my $localPaths = Seq::Tracks::Build::LocalFilesPaths->new(); - -my $trackName = 'something'; -my $filesDir = Path::Tiny->tempdir(); - -my $localFiles = - [ 'something.chr*.txt', 'something.shouldnt_match_glob.chr99.txt', ]; - -my @chrs = ( 'chr1', 'chr2', 'chr3', 'chr4', 'chr5' ); - -my $path = path($filesDir)->child($trackName); -$path->mkpath(); - -my @actualPaths; -for my $chr (@chrs) { - my $filePath = $path->child("something.$chr.txt")->absolute(); - - $filePath->touch(); - - push @actualPaths, $filePath->stringify; -} - -my $nonGlobFile = - $path->child("something.shouldnt_match_glob.chr99.txt")->absolute(); -$nonGlobFile->touch(); - -push @actualPaths, $nonGlobFile; - -my $computedPaths = - $localPaths->makeAbsolutePaths( $filesDir, $trackName, $localFiles ); - -for my $i ( 
0 .. $#$computedPaths ) { - ok( $computedPaths->[$i] eq $actualPaths[$i] ); -} - -$path->remove_tree(); - -done_testing(); diff --git a/perl/t/tracks/build/build_field_transformations.t b/perl/t/tracks/build/build_field_transformations.t deleted file mode 100644 index b656b7d12..000000000 --- a/perl/t/tracks/build/build_field_transformations.t +++ /dev/null @@ -1,71 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; -use Path::Tiny; - -use Seq::Tracks::Build; -use Seq::DBManager; -use Seq::Output::Delimiters; - -Seq::DBManager::initialize( { databaseDir => Path::Tiny->tempdir(), } ); - -my $seq = Seq::Tracks::Build->new( - { - chromosomes => ['chr1'], - name => 'test', - type => 'gene', - assembly => 'hg38', - features => - [ 'someFeature', 'someOther', 'someToSplit', 'someToJoinLeft', 'someToJoinRight' ], - build_field_transformations => { - someFeature => "replace /[.]+/,/", - someOther => "replace /[.]+/ /", - someToSplit => "split [,]+", - someToJoinRight => ". _with_hello_world", - someToJoinLeft => "chr ." - }, - local_files => ['fake'], - files_dir => Path::Tiny->tempdir(), - } -); - -my $str = 'criteria_provided..multiple_submitters..no_conflicts'; -my $cp = $str; -my $expected = 'criteria_provided,multiple_submitters,no_conflicts'; -my $res = $seq->transformField( 'someFeature', $str ); - -ok( $res eq $expected, "can replace multiple characters" ); -ok( $str eq $cp, "doesn't modify input in place" ); - -$str = 'criteria_provided..multiple_submitters..no_conflicts'; -$expected = 'criteria_provided multiple_submitters no_conflicts'; -$res = $seq->transformField( 'someOther', $str ); - -ok( $res eq $expected, "can replace with spaces" ); - -$str = 'something,to,,split'; -my @exp = ( 'something', 'to', 'split' ); - -$res = $seq->transformField( 'someToSplit', $str ); - -ok( @$res == 3 - && $res->[0] eq $exp[0] - && $res->[1] eq $exp[1] - && $res->[2] eq $exp[2], - "can split on simple characters, with 1+ matches" ); - -$str = 'some_long_string'; -$expected = 'some_long_string_with_hello_world'; -$res = $seq->transformField( 'someToJoinRight', $str ); - -ok( $res eq $expected, "can join to right end" ); - -$str = '1'; -$expected = 'chr1'; -$res = $seq->transformField( 'someToJoinLeft', $str ); - -ok( $res eq $expected, "can join to left end" ); - -done_testing(); diff --git a/perl/t/tracks/build/coerceFeatureType.t b/perl/t/tracks/build/coerceFeatureType.t deleted file mode 100644 index f2614bbdc..000000000 --- a/perl/t/tracks/build/coerceFeatureType.t +++ /dev/null @@ -1,175 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; -use Path::Tiny; - -use Seq::Tracks::Build; -use Seq::DBManager; -use Seq::Output::Delimiters; - -Seq::DBManager::initialize( { databaseDir => Path::Tiny->tempdir(), } ); - -my $seq = Seq::Tracks::Build->new( - { - chromosomes => ['chr1'], - name => 'test', - type => 'gene', - assembly => 'hg38', - features => [ 'someString', 'someInt: int', ], - local_files => ['fake'], - files_dir => Path::Tiny->tempdir(), - } -); - -# unit separator -my $overlap = "/"; - -my $str = - '1: Homo sapiens BRCA1/BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)'; -my $expected = - '1: Homo sapiens BRCA1,BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. 
(from RefSeq NM_001018055)';
-my $res = $seq->coerceFeatureType( 'someString', $str );
-
-# modifies the passed string, and also returns the modified value
-ok( $res eq $str && $str eq $expected, 'Can clean string containing /' );
-
-$str =
-  '2: Homo sapiens BRCA1'
-  . $overlap
-  . 'BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$expected =
-  '2: Homo sapiens BRCA1,BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$res = $seq->coerceFeatureType( 'someString', $str );
-
-ok( $res eq $str && $str eq $expected,
-  'Can clean string containing / followed by a-zA-Z' );
-
-$str =
-  '3: Homo sapiens BRCA1'
-  . $overlap
-  . 'BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$expected =
-  '3: Homo sapiens BRCA1,BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$res = $seq->coerceFeatureType( 'someString', $str );
-
-ok( $res eq $str && $str eq $expected,
-  'Can clean a second string containing / followed by a-zA-Z' );
-
-$str =
-  '4: Homo sapiens BRCA1'
-  . $overlap
-  . '(BRCA2)-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$expected =
-  '4: Homo sapiens BRCA1,(BRCA2)-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$res = $seq->coerceFeatureType( 'someString', $str );
-
-ok( $res eq $str && $str eq $expected,
-  'Can clean string containing / followed by a special character, a parenthesis' );
-
-$str =
-  '5: Homo sapiens BRCA1'
-  . $overlap
-  . '.(BRCA2)-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$expected =
-  '5: Homo sapiens BRCA1,.(BRCA2)-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$res = $seq->coerceFeatureType( 'someString', $str );
-
-ok( $res eq $str && $str eq $expected,
-  'Can clean string containing / followed by a special character, a period' );
-
-$str =
-  '6: Homo sapiens BRCA1|(BRCA2)-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$expected =
-  '6: Homo sapiens BRCA1,(BRCA2)-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$res = $seq->coerceFeatureType( 'someString', $str );
-
-ok( $res eq $str && $str eq $expected, 'Can clean string containing |(' );
-
-$str =
-  '6: Homo sapiens BRCA1|BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$expected =
-  '6: Homo sapiens BRCA1,BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$res = $seq->coerceFeatureType( 'someString', $str );
-
-ok( $res eq $str && $str eq $expected,
-  'Can clean string containing | followed by a-zA-Z' );
-
-$str =
-  '7: Homo sapiens BRCA1;BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$expected =
-  '7: Homo sapiens BRCA1,BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$res = $seq->coerceFeatureType( 'someString', $str );
-
-ok( $res eq $str && $str eq $expected,
-  'Can clean string containing ; followed by a-zA-Z' );
-
-$str =
-  '8: Homo sapiens BRCA1;.BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$expected =
-  '8: Homo sapiens BRCA1,.BRCA2-containing complex subunit 3 (BRCC3), transcript variant 2, mRNA. (from RefSeq NM_001018055)';
-$res = $seq->coerceFeatureType( 'someString', $str );
-
-ok( $res eq $str && $str eq $expected,
-  'Can clean string containing ; followed by a period' );
-
-my $test = 'NA';
-$res = $seq->coerceFeatureType( 'someString', $test );
-
-ok( !defined $test && !defined $res, "'NA' is coerced to undef" );
-
-$test = 'NA;';
-$res = $seq->coerceFeatureType( 'someString', $test );
-
-ok( !defined $test && !defined $res, "'NA;' is coerced to undef" );
-
-$test = 'NA|';
-$res = $seq->coerceFeatureType( 'someString', $test );
-
-ok( !defined $test && !defined $res, "'NA|' is coerced to undef" );
-
-$test = 'NA' . $overlap;
-$res = $seq->coerceFeatureType( 'someString', $test );
-
-ok( !defined $test && !defined $res, "'NA' followed by / is coerced to undef" );
-
-$test = '';
-$res = $seq->coerceFeatureType( 'someString', $test );
-
-ok( !defined $test && !defined $res, "the empty string is coerced to undef" );
-
-$test = '.';
-$res = $seq->coerceFeatureType( 'someString', $test );
-
-ok( !defined $test && !defined $res, "'.' is coerced to undef" );
-
-my $delims = Seq::Output::Delimiters->new();
-$test = $delims->emptyFieldChar;
-$res = $seq->coerceFeatureType( 'someString', $test );
-
-ok( !defined $test && !defined $res, "the emptyFieldChar is coerced to undef" );
-
-$expected = '. Hello World';
-$test = '. Hello World';
-$res = $seq->coerceFeatureType( 'someString', $test );
-
-ok( $test eq $res && $res eq $expected,
-  'Doesn\'t strip valued sentences of undef-flagged characters' );
-
-done_testing();
diff --git a/perl/t/tracks/build/coerceUndefinedValues.t b/perl/t/tracks/build/coerceUndefinedValues.t
deleted file mode 100644
index ca78e530e..000000000
--- a/perl/t/tracks/build/coerceUndefinedValues.t
+++ /dev/null
@@ -1,125 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-use Test::More;
-use Path::Tiny;
-
-use Seq::Tracks::Build;
-use Seq::DBManager;
-use Seq::Output::Delimiters;
-
-Seq::DBManager::initialize( { databaseDir => Path::Tiny->tempdir(), } );
-
-my $seq = Seq::Tracks::Build->new(
-  {
-    chromosomes => ['chr1'],
-    name => 'test',
-    type => 'gene',
-    assembly => 'hg38',
-    features => [ 'someString', 'someInt: int', ],
-    local_files => ['fake'],
-    files_dir => Path::Tiny->tempdir(),
-  }
-);
-
-my $delims = Seq::Output::Delimiters->new();
-
-my $test = 'NA';
-my $res = $seq->_stripAndCoerceUndef($test);
-
-ok( !defined $test && !defined $res, "Modifies passed value, and sets NA to undef" );
-
-$test = '.';
-$res = $seq->_stripAndCoerceUndef($test);
-
-ok( !defined $test && !defined $res, "Sets . to undef" );
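-
-# A minimal sketch of the coercion rule the tests below pin down (illustrative
-# only; the real logic lives in Seq::Tracks::Build::_stripAndCoerceUndef):
-my %sketchUndef = map { $_ => 1 } ( 'na', '.', '', 'unknown', 'not provided' );
-my $sketchCoerce = sub {
-  $_[0] =~ s/^\s+|\s+$//g;    # strip flanking whitespace in place
-  $_[0] = undef if exists $sketchUndef{ lc $_[0] };
-  return $_[0];
-};
-my $sketchVal = ' NA ';
-$sketchCoerce->($sketchVal);
-ok( !defined $sketchVal, "sketch: ' NA ' strips, then coerces to undef" );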
to undef" ); - -$test = 'see cases'; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, "'see cases' is not a valid value" ); - -$test = 'see cases '; -$res = $seq->_stripAndCoerceUndef($test); - -$test = 'unknown'; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, - "' unknown ' with leading/trailing whitespace not a valid value" ); - -$test = ' unknown '; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, - "' unknown ' with leading/trailing whitespace not a valid value" ); - -$test = ' see cases'; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, - "'see cases' with leading whitespace not a valid value" ); - -$test = ' see cases '; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, - "'see cases' with leading/trailing whitespace is not a valid value" ); - -$test = 'not provided'; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, "'not provided' is not a valid value" ); - -$test = 'not specified'; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, "'not specified' is not a valid value" ); - -$test = 'no assertion provided'; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, - "'no assertion provided' is not a valid value" ); - -$test = 'no assertion criteria provided'; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, - "'no assertion criteria provided' is not a valid value" ); - -$test = 'no interpretation for the single variant'; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, - "'no interpretation for the single variant' is not a valid value" ); - -$test = 'no assertion for the individual variant'; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, - "'no assertion for the individual variant' is not a valid value" ); - -$test = $delims->emptyFieldChar; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, "Sets the emptyFieldChar to undef" ); - -$test = ' NA '; -$res = $seq->_stripAndCoerceUndef($test); - -ok( !defined $test && !defined $res, "Whitespace doesnt affect coercion" ); - -my $expected = 'NA / Some value'; -$test = 'NA / Some value'; -$res = $seq->_stripAndCoerceUndef($test); - -ok( $test eq $res && $res eq $expected, "Doesn't clear valued statements" ); - -$test = " SOMETHING NOT NULL "; -$seq->_stripAndCoerceUndef($test); -ok( $test eq "SOMETHING NOT NULL", - "_stripAndCoerceUndef also strips leading/trailing spaces" ); - -done_testing(); -1; diff --git a/perl/t/tracks/build/ref_cannot_be_skipped.t b/perl/t/tracks/build/ref_cannot_be_skipped.t deleted file mode 100644 index 504121efc..000000000 --- a/perl/t/tracks/build/ref_cannot_be_skipped.t +++ /dev/null @@ -1,36 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package MockBuilder; -use Mouse; -extends 'Seq::Base'; - -1; - -use Test::More; -use Test::Exception; -use lib 't/lib'; -use TestUtils qw/ PrepareConfigWithTempdirs /; - -use Path::Tiny qw/path/; -use Scalar::Util qw/looks_like_number/; -use YAML::XS qw/DumpFile/; - -use Seq::Build; -# create temp directories -my $dir = Path::Tiny->tempdir(); - -# prepare temp directory and make test config file -my $config_file = PrepareConfigWithTempdirs( - 't/tracks/build/ref_cannot_be_skipped.yml', - 't/tracks/gene/db/raw', [ 'database_dir', 'files_dir', 'temp_dir' ], - 'files_dir', 
$dir->stringify -); - -my $config = YAML::XS::LoadFile($config_file); - -throws_ok { Seq::Build->new_with_config( { config => $config_file } ) } -qr/Reference track cannot have `no_build` set/, 'Reference tracks must be built'; - -done_testing(); diff --git a/perl/t/tracks/build/ref_cannot_be_skipped.yml b/perl/t/tracks/build/ref_cannot_be_skipped.yml deleted file mode 100644 index 7a1ac685a..000000000 --- a/perl/t/tracks/build/ref_cannot_be_skipped.yml +++ /dev/null @@ -1,12 +0,0 @@ ---- -assembly: hg19 -chromosomes: - - chrM -database_dir: ~ -files_dir: ~ -temp_dir: ~ -tracks: - tracks: - - name: ref - type: reference - no_build: true diff --git a/perl/t/tracks/cadd/db/cadd/whole_genome_SNVs.tsv.chr22.organized-by-chr.txt.sorted.29lines.txt b/perl/t/tracks/cadd/db/cadd/whole_genome_SNVs.tsv.chr22.organized-by-chr.txt.sorted.29lines.txt deleted file mode 100644 index c18f1ca26..000000000 --- a/perl/t/tracks/cadd/db/cadd/whole_genome_SNVs.tsv.chr22.organized-by-chr.txt.sorted.29lines.txt +++ /dev/null @@ -1,29 +0,0 @@ -## CADD GRCh37-v1.6 (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved. -#Chrom Pos Ref Alt RawScore PHRED -22 1 G A -0.066139 1.753 -22 1 G C -0.126062 1.181 -22 1 G T -0.115863 1.265 -22 2 A C -0.006414 2.528 -22 2 A G 0.014662 2.849 -22 2 A T -0.000645 2.614 -22 3 T A 0.068504 3.734 -22 3 T C 0.083537 3.986 -22 3 T G 0.058915 3.573 -22 4 C A -0.041268 2.049 -22 4 C G -0.054230 1.890 -22 4 C T 0.007532 2.738 -22 5 T A -0.109528 1.320 -22 5 T C -0.088833 1.513 -22 5 T G -0.113245 1.288 -22 6 G A -0.009301 2.486 -22 6 G C -0.071741 1.691 -22 6 G T -0.058815 1.836 -22 7 A C -0.084844 1.553 -22 7 A G -0.062405 1.795 -22 7 A T -0.080228 1.601 -22 8 T A -0.105626 1.355 -22 8 T C -0.087343 1.528 -22 8 T G -0.111546 1.302 -22 9 A C 0.024365 3.003 -22 9 A G 0.048167 3.394 -22 9 A T 0.030344 3.100 diff --git a/perl/t/tracks/cadd/db/ref/chr22_contrived.fastq b/perl/t/tracks/cadd/db/ref/chr22_contrived.fastq deleted file mode 100644 index ae8e95d63..000000000 --- a/perl/t/tracks/cadd/db/ref/chr22_contrived.fastq +++ /dev/null @@ -1,4 +0,0 @@ ->chr22 -GATCTGATANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -+ -IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII \ No newline at end of file diff --git a/perl/t/tracks/cadd/integration.t b/perl/t/tracks/cadd/integration.t deleted file mode 100644 index 7f8fe0875..000000000 --- a/perl/t/tracks/cadd/integration.t +++ /dev/null @@ -1,153 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package MockBuilder; - -use Mouse; - -extends 'Seq::Base'; - -1; - -use Test::More; -use lib 't/lib'; -use TestUtils qw/ PrepareConfigWithTempdirs /; -use DDP; - -use Path::Tiny qw/path/; -use Scalar::Util qw/looks_like_number/; -use YAML::XS qw/LoadFile/; - -# create temp directories -my $dir = Path::Tiny->tempdir(); - -# prepare temp directory and make test config file -my $config_file = PrepareConfigWithTempdirs( - 't/tracks/cadd/integration.yml', - 't/tracks/cadd/db', [ 'database_dir', 'files_dir', 'temp_dir' ], - 'files_dir', $dir->stringify -); - -my $runConfig = LoadFile($config_file); -p $runConfig; - -my $seq = MockBuilder->new_with_config( { config => $config_file } ); - -my $tracks = $seq->tracksObj; -my $caddBuilder = $tracks->getTrackBuilderByName('cadd'); -my $caddGetter = $tracks->getTrackGetterByName('cadd'); -my $refBuilder = $tracks->getTrackBuilderByName('ref'); - -my $db = Seq::DBManager->new(); - -$refBuilder->buildTrack(); -$caddBuilder->buildTrack(); - -my 
@localFiles = @{ $caddBuilder->local_files }; - -# # adapted from scorebuilder -my $headerRegex = qr/^#/; - -# # wigfix is 1-based: 1-start coordinate system in use for variableStep and fixedStep -# # https://genome.ucsc.edu/goldenpath/help/wiggle.html - -my $scalingFactor = $caddGetter->scalingFactor; - -my $rounder = - Seq::Tracks::Score::Build::Round->new( { scalingFactor => $scalingFactor } ); - -for my $file (@localFiles) { - my $fh = $caddBuilder->getReadFh($file); - my $start; - my $based = 1; - - my $pos = 0; - my $firstRef; - my $firstData; - my $firstChr; - - while (<$fh>) { - chomp; - - if ( $_ =~ m/$headerRegex/ ) { - next; - } - - my @fields = split "\t"; - - my $chr = "chr" . $fields[0]; - my $pos = $fields[1]; - my $ref = $fields[2]; - my $alt = $fields[3]; - - if ( !defined $firstRef ) { - $firstRef = $ref; - } - - if ( !defined $firstChr ) { - $firstChr = $chr; - } - - my $expected_score = $rounder->round( $fields[5] ) / $scalingFactor; - - my $data = $db->dbReadOne( $chr, $pos - 1 ); - - if ( !defined $firstData ) { - $firstData = $data; - } - - my @out; - - $caddGetter->get( $data, $chr, $ref, $alt, 0, \@out ); - - ok( @out == 1 ); - - # indexed by position index (here 0, we're only checking snps atm) - my $score = $out[0]; - - say STDERR - "chr: $chr, pos: $pos, ref: $ref, alt: $alt score: $score, expected: $expected_score"; - - ok( $score == $expected_score ); - - my $bystro_style_del_skipped = $caddGetter->get( $data, $chr, $ref, -10, 0, \@out ); - - ok( !defined $bystro_style_del_skipped->[0] ); - ok( @{$bystro_style_del_skipped} == 1 ); - - my $bystro_style_ins_skipped = - $caddGetter->get( $data, $chr, $ref, "+ACTG", 0, \@out ); - - ok( !defined $bystro_style_ins_skipped->[0] ); - ok( @{$bystro_style_ins_skipped} == 1 ); - - # In some Bystro tracks, we tile across indels, outputting an annotation per base disrupted - # For exact match tracks like CADD and VCF, we do not do this - # And we should always only output 1 annotation per indel - # Which is always undefined - my $bystro_style_ins_skipped_tiling = - $caddGetter->get( $data, $chr, $ref, "+ACTG", 0, \@out ); - - ok( !defined $bystro_style_ins_skipped_tiling->[0] ); - ok( @{$bystro_style_ins_skipped_tiling} == 1 ); - - $pos += 1; - } - - # We don't currently support VCF style deletions (e.g. ref: ACTG, alt: A) - my $vcf_style_del_skipped = - $caddGetter->get( $firstData, $firstChr, $firstRef . "CTG", $firstRef, 0, [] ); - - ok( !defined $vcf_style_del_skipped->[0] ); - ok( @{$vcf_style_del_skipped} == 1 ); - - # We don't currently support VCF style insertions (e.g. ref: A, alt: ACTG) - my $vcf_style_ins_skipped = - $caddGetter->get( $firstData, $firstChr, $firstRef, $firstRef . 
"CTG", 0, [] ); - - ok( !defined $vcf_style_ins_skipped->[0] ); - ok( @{$vcf_style_ins_skipped} == 1 ); -} - -done_testing(); diff --git a/perl/t/tracks/cadd/integration.yml b/perl/t/tracks/cadd/integration.yml deleted file mode 100644 index 1fa8a2d26..000000000 --- a/perl/t/tracks/cadd/integration.yml +++ /dev/null @@ -1,20 +0,0 @@ ---- -assembly: hg19 -build_author: ec2-user -build_date: 2017-11-27T05:44:00 -chromosomes: - - chr22 -database_dir: ~ -files_dir: ~ -tracks: - tracks: - - type: reference - name: ref - local_files: - - chr22_contrived.fastq - - local_files: - - whole_genome_SNVs.tsv.chr22.organized-by-chr.txt.sorted.29lines.txt - name: cadd - type: cadd - sorted: true - based: 1 diff --git a/perl/t/tracks/chrWanted.t b/perl/t/tracks/chrWanted.t deleted file mode 100644 index 00ce1c0ab..000000000 --- a/perl/t/tracks/chrWanted.t +++ /dev/null @@ -1,50 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; - -use Seq::Tracks::Build; - -# create temp directories -my $db_dir = Path::Tiny->tempdir(); -my $files_dir = Path::Tiny->tempdir(); - -Seq::DBManager::initialize( { databaseDir => $db_dir } ); - -my $t = Seq::Tracks::Build->new( - { - files_dir => $files_dir, - name => 'test', - type => 'sparse', - chromosomes => [ 'chr1', '1', 'chr2', '2', '0' ], - assembly => 'hgTest' - } -); - -my $wantedChr = $t->chrWantedAndIncomplete('chr1'); - -ok( $wantedChr eq 'chr1' ); - -$wantedChr = $t->chrWantedAndIncomplete('chr2'); - -ok( $wantedChr eq 'chr2' ); - -$wantedChr = $t->chrWantedAndIncomplete('chr3'); - -ok( !defined $wantedChr, "Unwanted chromsomes result in undef returned" ); - -$wantedChr = $t->chrWantedAndIncomplete('1'); - -ok( $wantedChr eq '1' ); - -$wantedChr = $t->chrWantedAndIncomplete('0'); - -ok( $wantedChr eq '0', "'0' accepted as a valid chromosome number" ); - -$wantedChr = $t->chrWantedAndIncomplete(''); - -ok( !defined $wantedChr, - "Empty strings not accepted as valid chromosome, result in undef returned" ); - -done_testing(); diff --git a/perl/t/tracks/gene/construct.t b/perl/t/tracks/gene/construct.t deleted file mode 100644 index 42f3944c7..000000000 --- a/perl/t/tracks/gene/construct.t +++ /dev/null @@ -1,21 +0,0 @@ -# Test to see if we can construct a gene object -use Test::More; - -use Seq::Tracks::Gene::Build; -use Seq::DBManager; - -Seq::DBManager::initialize( { databaseDir => 'bar' } ); - -my $gene = Seq::Tracks::Gene::Build->new( - { - files_dir => 'foo', - name => 'refSeqTrack', - type => 'gene', - assembly => 'hg19', - chromosomes => [ 'chr1', 'chr2' ], - } -); - -ok( $gene->isa('Seq::Tracks::Gene::Build'), 'Gene object created' ); - -done_testing(); diff --git a/perl/t/tracks/gene/db/raw/clinvar/variant_summary.txt.MT.1600_3250.gz b/perl/t/tracks/gene/db/raw/clinvar/variant_summary.txt.MT.1600_3250.gz deleted file mode 100644 index 7631b5d66d98821337b47992e6dd0597f3fc711d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 523 zcmV+m0`&bKiwFq#SZ7-R19o9@X<=@3UvqVBZDDeGE_8TwE=^P}F*YzTUo$c_FaSl8 zOK;mS49Cyxrx4J~3Z$8x*m>OyXY)KX55z&PgJQF`Fg*fE_LhD6QEAZVV2Kn-{r;q% zGbvGGxrXEQg0STZp+FrRVXLm{Y$W_Bw)rn|@o6D8yTWvOkTkRdF@pKYJRQ)^ujgX}CIc$!v$J1>Bt9cqlu;;!DpoAI2 z&t9`A9x+OqK)hTg^I4os;Wacjq7_GRI)e>1G|W|=MDc7kO(669uiztgv8J`5X#+Bc z;$Kh3h$+;bOV%JM5XRo9M~v%+bx54LWzskYEm{?t@HiBQyqhk7ew8SmEGN-8CU}Tv z1f38I6p#^_HjLBsY$(fWE5a*0pbXU2hBUzzcAIj;ZgUYeJAE`~l`ct6Iu5ERYMCaB Nr~e@Z@qxYq0018{^-cf) diff --git a/perl/t/tracks/gene/db/raw/ref/chr10_fake_overlap.fasta.gz b/perl/t/tracks/gene/db/raw/ref/chr10_fake_overlap.fasta.gz deleted 
file mode 100644 index 7d4425fe775e81fda945f87a6eb708445524dea6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8080 zcmV;BA8+6viwFouiIH0X17m1%F)&|dVQXbyZ+2yJY+-ONW?^%5VE~PtOL8U2j%4Sx znyw9$2m}M$0CrH59{2tS9T(@Qx$_g5j~PJ*@AZ$l8K6*A7@7a`zyJ6D{rCU*ubHYd z=k%X{oSIYh=hOW8?)f<9&wJCKx2w88@0~sH&-b7Ad8PmS`}2=Khy1xx|2d17y883& zpY#8`wio+cYkIFSpM(E=`{$U?7(NH?@#Xn{{{H#+^QFA{&yPRv{W<*e`E$VM0(%=y z>wWt(xP8XYh&~rO+~@OpamJr#*b~k$0G_6Or+9xp*Yepu1K&IBo%pi{f397X!7)Sg zIjH>+_Et&*$8o%g>?v44-w}2ih-_Z|~pp8MZh64Ds`? z&(iOs7OPx*-Y@$I!wt_q@EHa+tmo$}X3$951**@(?6d8;hfq5k%zLAqlzN5C=Pzr% zOZvI%=Z0q&WJ9L7XYIRLvyEWK_zB?8H?c4`XtVtx{%3QAGsM-F*65;-z3vIR3@ zV_y+TPNscw8z#rQe6C#%=H3Kg1Dg*w*~@aF4OX7*bKcp)U*vZS!^K#<%clM2jJ0~v z*@*b8GmlOs2IIJnj|M+66rIH}(AwG!h;1qk8o< z5;VqHEWrf^z?l@qNyR0!KGV>U1FL$28%Tsoj7>9Vz(|W#pBo#v6cW}L3KJwo6(B=c zyF*}p#=kgd4cp9b2009d2O|j1F@*2Lm=b}kExizvs^kM@PkK|l3xKfB1U5^u_bJ)r zNb`2QM1f#{w%Mm^I3+}3D=)m@KxSMx=Cs1q_u&EAa3+cng2V+}6oGYcD|r|osJMCp zBa$GTp(QG^$J~>;==!O&h+9j3U)s&INMrZFK_~!m;HoO+10zxe5=qro4;|ilnTkk) z0n1f~5<&T67U9q5KR8k1`<&R8tXA#qY9oMo2rvsFAk982{F(wficzFWg?6&Sa8w9Y z2Xcm_JR$?%c86vZI8|ZYn_EC@L;3~J9Z9fxCxHB&zNPWF%oSoT0}#18!Cj`fMW9W>_R=3Q)- zejmG_RxWA@sin3=%qzEr>?0c^!UKNXwzWgh*$dt(7)%E*=j2OJU`mW)mQ_+v#Pf)U zxq)@G1$J>(E$*0>dP3! zP`^5vxzCf9R1G9omv_IX0?KB@^i*{A5}6ZZtq|0^YLZCAf~r>KOwPd#x0v9^TzyKm z)UCZb5~&7G^U_kvR^M3U!Cb^F&fTsQmyN+^N08gM~TuBPTZIWM`M^ z!S1rYXb}+%6VMFwK$m#a+%fWq1Bm#vMX@YFsmZqjFUZJHU|=yOIR3=n+jN7H22Y)5 ztg$U$ZGMLSID}JoG0)|Qcq-)t>c>f|plQ}#{SN%u<=OMj?R^VtnwZUy|>W^Nj zhWqm8p0Me@gmFf-5!F9ozlu*>0xOJ0!ohqWJjV`S0?u|cmL>}4Ce&m~sPaR0-3p5k z>jbENj;M{bxPGz}OR<6I!QA6r4Gme8LU68Xh*Qs%mqHOYrYp%eT6iT>(pU}ZL++tB zg(HtzlQsz$DrYo@b9)#yt&i4}KiYUbq3Nv~a8gfOd#H`q_*S+mir~9!T)io=I?}Yg z&#r`|9;f=JlQ%kfx{QBu^4>mN|INv(IC-x({@Ka<-3$ET*v*N#d8)%He|7BYI(FxA z?E0%?cYb&5`Z{)9Z^tg;_D0qQu>2m`Dg?c}w4aV0)am8e9fue5=Zj-^{m|>!WoY_> zkGy$q@;G+6Nxg2O{3$*26py%vud*#LLuXSyY_qf=Y!T!BC@3@?Lys?1H|6*`YTiaE z+G6_5**qVCWvno#fVzgKj97SXV>^pEV3l8k2us?Bo6!0e)NmA_GWIB}j>?adJIC_2 zty0)y`Ya`$X)QyL46ycFm zeCrwd#KxX_`yYHK71e=uYeF)MI^y9iAgC)^CU_ z#zT^>W_zGB5@|op9&#V zEpiWf1-GlRaMQ7|4e)bIJlhUPAV`}LZr5Ta;QElMaI5jL&h2h7zc@nO(&?w^-{O#V z^0C9*r^2!b?1uPuVDhMADqAWnDe4*A;WB@Pg`;=)~@JQA0XfS<$A>WrPj%mI^Fu|=|0CL@bWTj)S@j7kC01kHUXgh7e{ zN!9UEWdSni>h42PbcPK*59A=8hZdFH5tdCfJTtKP=$7K1B@mNsDZyOoP)DtUi%W`aA06 zwxLNei!$UGG;JeT=WYlPlqu`qFvQ3t%UQG(9$3rwE2BuEarvAn@3bsM_S{Y*t3B4GD zC-%pV=h$Fhi2K1j0(0M(M>`@=?wLKmbX=EH=QUqnh}oU5aRl$}tU}@?W&=msiL>q4 z@uy!3CDS!DXYO>3)Af@-g1hjZei-C$3yU@PePMTqP8l>5CqdcuTG;nU%aICWEit?_ zvVX2Dx`JEm&F{*d@7r!sAPzU7^OtS6ZWE|yW$ki>7GN`EG?;6henv*~YV+2CRTPC*x7~HVwY$G`yQ9!$$@{D|&>AlgS<+#RnOeXWaZmnIMc8$RWgW zrz)LhdlB693Pz=A3;=G5bGy#(4dT-CA6<9thv<2uil7YLVC>OBh!GZ{ZN`t{fpX`D zH20!BUICwcNb-K_(aF>-Rv5#1$zG`RnHV-qJ{OgsbojQ;^%tEcBGJ-54{-j11CZ!5 zUz3X{YCD}o(eKH1<=Mq6+unVH`J(2RE$!bxQ!VpbYT0gnLiyVusQllsDDp0P?ZDmd zRE024_73;8JWKEG2x<%&N3ydMDwD1`mKp_(RMuE5&jI=Cs*mHt z?SsblZy&UdyMZP&@-s!`u`Id2SrA?*kCmm>*{j$OP5G=!aYTuawa2& z>rpR$n;vIa9=MGn#HFT@sS|K8?bw}WLpQn(!b<(lWH2*v0P!4-1@;t*T~E-9lr{cw z92mk*5QaqK{m0M*JjVVP(XBB6&l>fcTGNf9nAxgFqRt)B(Rd-dP+!KC##nh6mf)5` z^&Sp9<*KQI)ISQO{w~cXfg-!;0Pk{#Oo@RNtvn~p-W9=Tf!P&e5YLw-1sJ|^ZfeJI zLo_&B*J;6auF5HGp763e4CdUvecevY*tl^A%9r%Zb-^d z&%2PPXZL2MPa%iI{9vW>4sC@m!}#gEzF2Y!PJF?;#-){bSy^f_`b?(eRN1lMxonus z-7>R+>jVfu(*yHlvPXMnv|#fG0LMQ#;_>EU>;pkwPNF-RfVF|i3vJJF0)cTq4p<>> zqOI~0g%Q17009kw0-kYJy4(!J)z*Zu_&IN7t=G-VCJ;nez4oq52*h;&^*QX59xLXs z8RgJZhN++#DxHKvGo|v&kc5pTiUnCLokQKTrn7?AM!`M0=xLIT<{~Cd;RLipQNWnT z8spgR_2|K%oH_7at-=!_(&8rtySSiK&tQ zv=<9nlv%I$ZU8W2Xfs0?3F`C(sOFC?))EQvxJqw{9^6sIoOr|{xorUP>=#K=`LT(= 
zN`8K(?>EW+&6_w^@}KhpGdaljrqQI$Ri^5%sszX_=V88r*?a*rrSv1QGr+H4_7zjB zBeCK5$8FBf+>Py7eKDC91gRA|WSh8EcRxqj`Khaz_vAt4&|OvnZ@`uNXwah-lPaqU z)z**JB`CSK#`&!OXsYvVLrDaT3**h}dATiVr=rfFw}dYtQ_-h&u9Zr$?%Tv!s6fMjsinnH7O*v} zh`G#i&t&V0Mn~8gt1O>8+2Vj&Y?r^&X}7_9QU6?73*pf+&o6aI6-jG-HU5U_GfoN<<2((B(9$_bY`O#YI%da~m zSi0{>q?}0Pw1x!ZZy&eZRy$ke zDN0kXK$!zVNCCoIX#$V0R5BY(;y`eugy2y-gMn#&b-3ayirUsEfq85ql#!{k0XmK? zxw(BjxEaI--)>@HZR9p8n50Jr9L=!nb4pz#w|M=kzZgBO!>K#0g+vM4=UzpRXc?w; zVoo^dN=41j-h7YHfXvYYF$5B!v^suisqJ+9h|zy)sd3fB=)nQrr+H%ZvFjvizeHz! ziB2M?HA0!Qu=ZD^l3Md>r@cOQj(dCrO$+4sOxlq6TiJR$ce9s{g7KYzOU)?Z1`;6g zy!W*pK@fsVHMY%)_%1P4^8j3IHax-$5Py+KN8j-3Ma`11?3d8z2?RUJ>Es*g`_b1K z1(6D@xk66W9x?GI!CjXa%w^`fczEww=ECN?%r9E|cR2mJhopg29_nkbf=_`hgO!qf z0rmWQ|Mx4P&W~W#=)kPGIrEO#Id@V#e8B4ty3T{H!A6Fe<_7-l{c~gJ*N>BVb$#2} zNpuz5tVW9m@-sSaW7{#&;-suDoC-OMlhUfDgr@uE+{vNrtGlW8Fmn&L&Fe)3hIQHl zq(nO81!r4A6((G9L6KNY6*OTYp6D4dB4e@5pL(5mgfEltphgKQ4Cd13k5izyS5&ER zPrTHpsUYd&yD+`dh(vOzU65K1+)zP~wjG5GoRVVTtNu_a zC#^vI@XL;McjocY5S~^oX&VA+wg)qreM;F=xPep%&2=Ct3UllDLJk}>LCc-{ zX?AaERKk7pAY)`#;-Zl!=T-<P)q@4YOWi}~7TZy&I>W(q2=PIP51KZ)T zS+BXP9iD3Onu+ldAxT)$H@x?+L>X*Zah8rL`A@9*12F0Gk^W_k6>-S;+ta@#iR3(< zPZt{@Ux;2Ie`B)wg)|YLK&SrSRxN@~effU$N>%a|NBJNA7dLn6y*Wb}xU2m5A2sR9 zelv6VY*$t@w{2-YAt4>R{ztAk*;1JX+|8G}<>q!->UTf4t_3nbKHMSndb)n?xLwVZ z!u8XuK5cBnsZ7fL=2h=EF48x9=c}#$uUmEUaJ2OPq3f)(o*VqLyE9+&ZfEU%%^?-s z{&P2pK(2(teM?&Qh#Y2U@>I~ zoqe|W1Tg(a#z9R^_&?GG_Wz>YO!7Yvs<8iS-KtNwjAl!2J#^~4^=Y_&OFsTWr=BQM zwm-b5DiSeOs(qF(pYPuc&xDn%FSX~95oOPjvZK?K9T`k9Afn9x zCek6;xN)yV%xTPXDLT|Zd$$z?X^-G3-0O&na~<&$i?Spki>`Qc(5J_fOg95gQI{o* z4Z%?Kkqv4sXD`pL7jmYah7HQ*IGyKh6#0W5a}}Esgh?RQD`v08YH4GpPwZ@K(;Vw1 zScMagP;Og8N)HcgNg+8BL3L~!C7xlXM`hL|*3#J4%d(x2=;il=?6V$mQP1v5^OMo{ zm9sq4bQFKR9;B|WwrGa_-6WGO)2KQ4Sas~t#LII@yNq7M6y)6|)O1cc@f62phlfK? z$b=VZ0FJ)wIx8#<{SZ_ka zJbRZ)!Hm72g8zG$K+G2yJK7y6GF~u=?hWbfm5o@yF z;avL1KFMSDDhDnbf{l>T^7QC^NTVudr>&hs+k*BbP%99R@;UO~&RV&Z1M;?~J9!cx z;7MZ+XfsDHg~ZZ>UNV95IwH02f}_U|)c!;mfy?Z8IXd@t6c5I?{I+l{O=qo(R_1Ir zNXWjt5qn~<|+;r%?%ArcO?w3B=Wk}lP` z0611XS!1WyN2?KE=LR#4;|JSpJgksDt>tWhVdt=ot zy!LtKdM<<2j`Zu`9nF+#Go=jM>?Kd%tC6Or4v*UQ#4~%gTN$RdG6gHkwjqyoWe+`9 zoLKNwKcH}Q+vjdkfaPYB$lScEc>De&SuILG-b`6~Q3CE*h_nKMYz9o-il8N~bV{4{ zr1jL26Te|APPtKoz?03>)>0q>tYeI*gw=FF^pF#tPUuXq=$ zWX|@9hB8GUSBUYX2Z5P?ZqrOSsoC6u3!d)g!D+%F~cT$`ZN z9q~h!DP1@CLbbIt=7LM`#`SNg6ai?(Eo}5;BNrH<=A_^i+X3v?XtGVk0)8a_Bqk|* zeyMs&bKX$v%OsDr(KMMXi+ApNOb(PR!E{t-V$EgWWJkPe#07mSfWX*!?B`>jrEZ}! zV^!HRRTA7M?GG^0|8W2yjOc*5@AYC& zXkPWj1`)Nc=sfKq5g5v%=BOxJ;e-Po# z2-I|cV~E9x+*)IH0qfe z3(Bz@$3Gk6!2h`fI2SB)NuW z>F!!;xq*;bVMo@^jsvpqq5Uy%Y*ogT_6Ug9XZ6mxq5rJ;p52jv7go~}mIstusa>{f z`{5FKZkGX|rMavIy`;)Z(L8mn((*ZMp#p1Ey7IP1lc9Akks3HL#29=+;a8Oi*9Geg z-{i8lo0_9zQ5Lu2;8@?ZXC|h_9Xf{oxGVF-NsA`x_x4jb4(0!d;9}+B_a^Oxk@ooQ z#}@0It&%{Ddkf^)+P1R2k&Ueb(|~vR+n4+WL1Vl8O}r5l=w*D#Q5rntK!V;+eP&S z%yWN>VVu?``0~z418*2yCY}AB2P2YK99Y(m;As$AXC#gSvs}T3liJ5702q1Y*QE&w z?f+IG!v&12v++o+xQpe4pJUIJ>fD2J>v;{iX_aCV9-a|_o<0!&51^g2gGa!j76CwR zkPydT1H)+YbL#eq{H@++&?VP5!X!WD7pU1T)-c_zRFGC!KHXmH(zDcIIHwKVbYY{3 ea987V!v6;{3lyvqZU6xJZSv*- diff --git a/perl/t/tracks/gene/db/raw/ref/chrM.fa.gz b/perl/t/tracks/gene/db/raw/ref/chrM.fa.gz deleted file mode 100644 index 8586ef9a900a97e3f07cd0235eba0528c23b61a4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5537 zcmV;S6<+EeiwFo6zQaiX17m1%O)h3(08N`sawR*igXdaQM=&_x=*_r+6CSw79o~Pu z6h0tN)xA~a^2r~GB0&I>|MTFm+k#>?ss<1eMIX)e!+9&^Ywj9->;u%J@-AYc<#Dy-8~Fr z@9)-a`#kHpq)H#$AwMs8{$4)YI#n9R9sT?JdHwU9=VD#y|L;Sd7d~^*1QdY2BfUc? 
z{qtVlzV`m|M}6PY{GQF|VL?lbO{deZ%Kia3-WIedBFx|*Sa ztk~>oo;OfDSxfWGvptF`Kc>!lQ9QEZPle`QvkI)@*Pg=}U7`3jW6ytDH-^PS?nIw| z9o_q#&(u4TJFL~(-h|PzXff})__ckbepf{kQ*tTEoOufP9o@M}bT7Jj*{cZck`hYB zy}6@!5$Gq#dW&X1TB|1$N(rNQ+z0FHYNh-g`cYPJoBmQxwz5XA(Q;`n<58SUm%=Dk ze3ZR6CYqNfd+)q2xl73+-EguLdcZ9j&5z`|^`lfMt&2Wju-s6^r8jp}OnoCzqnv;0 z<$jQ?SQba0Yvz=46A|6_{5nmTR%@trxoff+_^2Q#vDL~keO;#sF|HB@Xla}%r4lD@ zN6D>ZAalJEyirNknz^!AFFz@zeaBv^U{%@0np<%i6-#?>5yae3v?jO$qH^BqeeLyE z{yYO)SDscyV^}?RoZfd)Z?NxNo29(*94$qW11!PKxMC@VghgFFLz;7ktb(r{KWfQk zvVa+uar0)-(QAQ`nqOORt+3?~@Ny?eZn?sA4OyWz&?+CmOKk}d9QPDz ze&#K1Qc7LS*9`rr{Z7qpeb;!bn&KI1m_aGI^g&Z`)J0MGtZN^d_b?32Kovy&HaN!m`WIN0xZ^97ocGqV?h%uLYt}=h58*7| zsyIB;-?!n?9-Flox}<^*N)pkbzc9=(${2%u2Lmi4*&W>(!3wA@~CF16;u`4NvCn+Yawm9g&U$5bOE92xVYdxL|D;mHv!0+Iv)Jwk}KhZ zrwTd{@$E5~tvfgndA43MvPML=!G@JNU&^=i?fc_KLHE85@P0aAp%P!E|M~p>{(XLb zKIiZA`TP0&{Qdp@{rx|7`{XYNx{r!19`~3Xg5B|<`{qOzo`Q~%| z^ZVZQeExec{yb08y+1!c`o_KK{5&`Q+=o3+xvzb`d!MGipNsD+?lbR$?jL)J;u02W z+$7Iw4dE!m3BUlk3Js)hQ%wr7QZnnKM7ZBM0^Uuh+ys|OC#%c|%ts%%=wJiM*c6Vs zrh=?SOZo__8lK#=fu<>zd$br-WtdvG(OZo5v_|z)^IzqrlBvbMsQvNu2~7(yI_ydi ztwvO$6GMO?@cnj|r9{-6;TQCgz~N1%wrcW2R zV1ZPtq|}-fkEvmK;l><<#;1-;Jjej+m$YLzggN|ed}8Vae=%KhuF)g9-+q3tInAx5kUKu2L6`m?Cme@UOkSI8WAw_1swh$OLdDvsr1qUPPn-_ zAkl%DUHCa-8biH>c9H#{qc~8|ToF26piZHz)lnF4oAg@p-uAk51XCA?s>kW&hdh?5 z_7?B)YG?$OawI7Pu3X-oVBUHm^3(9Wp`gKu(R>EptL2hdT4UgUD2ajstikznA5vjkO=CD})o|8`G~Fv33l@$=4L9%>=9z8reUpeFsHD z@08buC9HL-wan!(`5YJ|z5qRix3c7SZO%fUcfKUdke%)}*b)}q zdzUiwqN2gpqP&r>DHynhx)|zP3>GHk3TlwYniGJycOKZ2Gm~xtHiex>62$aS?KcrB zyE|+2jb;xowaU1eECS|!hhm$FqJSb93^eAeEE<8jYj0z0B1OBTi2+d;#)~xF@VzRu zB|XqYl`BlkMXg=r8ivGQWMWmw3)!U5o-le$p*HVu`_#NuqveOPlw?+g<2ksL^eY#w z5Z8pn)P{9*ldQLIL%mUeD*E$Yp(u>43YbraYsW6rBhE7as9DEOMQX;{FdEU5mP4}c z4-(=~?1tcnbyTPmE&bR~s&B0GwO^Y~QSOi6Sle&2Jk4Ua!_ekw5YLX}SxVbJTD;8) zH8r=;{pVY^cm{+rY((f5#Aa<&mFoPHX2yrq>c?V zrG-sKLlQMRR=HOu*lq@N`%Cw6JuGeeRsC1+;%qJ!^)ep_nWrB>wO4ll+M#=kO)uH5 zS**2*ZDaAw6R*q*txIt))}u35AA8$k<2A`y&2rEwE(ZM`dndY6Fh0E6R@a!WcMT@jh)3S8Ta0>!a*v~Re!q2RrZ6KHHQ@EUiT3c6n$i?wx= zdYw17NUO?d(NXP_rY6d5!Rj@MNORkut(@+BK%$Suz71)R2(t}9D~xVu!}i2J7(752 zY}5|JJ6&O)DgdOi<5u1raOW^e9dp{5fRnLN85LKL{%&{Uxc{ipxo;^`UY9URQJzl2 zE+U5Iv+rx26(gcn)A(29%7+Kk7H@57ZGTwjD_}8AF z8{kBR0BP)4!N&I$DHX-_S}5pKoHz$l9EOBI%^~yK-YBfrs!RWt;v?KzsmN6rdt<*E zIwl(8?pc^iWOD2fu=B)B0NT{g#j%$qQmC3{ItP*)>Hr@f(;ke#H??;8x)p_IDZu*F^_GWnFh0x?FjDs%X z72{4eWzG`>pLhay%*SSj9w0*kNHH9=CYUe3u`f%J!H}xf;-vLS?%z;mSUi;iKAJ2z5sGd6t1E=td-#SBCQm~ zc;kW4Yge?l(?;0pYs9z7wPlnwd21;xz4}#SYf_iQtM4Q3L-1hDhu6*dB|b#jMnTlX zNd?E?-gnRz%VxHwpSM>U5}h3O6$2ufv%Xvno9Mz4pKV!Zh{NTXB#z0unDrFH5>34A|@K(@sF8}C3I{L za(hf$IC!vSi#4_maIkSwCiJ=3vEpL4cJGQ?hIKr7vpPKn+jh)qUSeb-KK4vVF$ez` z1Dui_O9zx!PDtIA$z)0JBJEl9PWxv~(2S^6SjNR<@B92?rBfj0)oa3ps?k4>d8FlL zJeS^V33R_0B~~{`(aEPI%r)`a*kH8{=$6brrip@jlGX)%OcWJH0rlU}PeP zQ26Yb96Pye6zEI@NRus6a}I}O77Kdt%{%)cLMfd(vZ9^r7ZWj`+^9J1uyyW!_hfUA z9(F9EI!96<5vCwXq9r_KLQ>Yip^)||BwNDgy(6w3DCf#9n+cFz?x5{(PK4kqDuYgR5ThqI!7 zad<(WvdaZZM`_tgNsnzPTolJ!5o=cZQpk<(CpMm#mVtOH?%#y6e>sd4$r-n5MbNlKL}foR z?D)mvL#M2H*1W>R#QMEoeHpE0quXRUWYH9M%^N!;g-Z~L?d^eR!m{x4=`1kcb}*Zd zYc~4b0xKT~+^-@q$1KrNFC65xAshXV6zg$&GjgAHfmpk3a(RHuGLjSqiwvSLLTDSL z-%Ps8yYB7CacJSJwXGNApCwnz#xk)~rHvh%pA{V0&9;(lc;|_$7)#9_ONQs3=rNXU z@(z&}8-G0k2s6>NnPWnW<7mEm+;2byGKo7Tyu>^T=(HJWPKBNr1IyIaiqSm$?F#l_MLm=4 z9lC4+Uon-^Jvh@lA!7bEFmFBbS`HB4rdhlRmb3CA8Z&{ioCl=MeK&>ShO`84ywtRB=kh}wY5?m zBYb+nMq67Q#gAw^MWtOHs=(K}=NpFWbpp10JbR^IHc1PDS^?4yH&#r!#kS5UrYRDt zS2c3*f)jj(G%X)Ln-I|y_;)P=HQ?k+z19|irfi4A2gZm!fKfJQ03}3KSw>^Yg%Q(5 z#ydy03v@a?KA^plZ;x7^h?}=jiuFOg=XI22cMR6@Oq#xFB4T8{a}N8KTO=By%mOp# 
[binary patch data elided]
diff --git a/perl/t/tracks/gene/db/raw/ref/fakeRef.hg19.60950_70966.txt b/perl/t/tracks/gene/db/raw/ref/fakeRef.hg19.60950_70966.txt
deleted file mode 100644
index 419986370..000000000
--- a/perl/t/tracks/gene/db/raw/ref/fakeRef.hg19.60950_70966.txt
+++ /dev/null
@@ -1,10017 +0,0 @@
[10,017 deleted lines elided: a fake hg19 reference fixture, one base (A, C, G, or T) per line, spanning chr19 positions 60950-70966]
diff --git a/perl/t/tracks/gene/db/raw/refSeq/2018-09-06.hg19.kgXref.fetch.orig.gz b/perl/t/tracks/gene/db/raw/refSeq/2018-09-06.hg19.kgXref.fetch.orig.gz
deleted file mode 100644
index 707653cbc4330fc09f821e013fe3883264a6e2cd..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 504
[504 bytes of base85-encoded binary patch data elided]
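The hg19.refGene.chrM fixture below defines a single transcript, NR_137295, with txStart 1672 and txEnd 3230. The tests in this patch (join.t, join_no_build.t, simple.t) treat refGene coordinates as 0-based and half-open (txStart included, txEnd excluded) via checks like `$pos >= 1672 && $pos < 3230`. A minimal sketch of that convention; the helper name is illustrative, not a Bystro API:

use strict;
use warnings;

# refGene coordinates are 0-based and half-open: txStart is the first base of
# the transcript, txEnd is one past the last. For NR_137295 (txStart 1672,
# txEnd 3230), positions 1672..3229 are inside the transcript.
# in_transcript() is an illustrative helper, not part of Bystro.
sub in_transcript {
    my ( $pos, $tx_start, $tx_end ) = @_;
    return $pos >= $tx_start && $pos < $tx_end;
}

print in_transcript( 1672, 1672, 3230 ) ? "genic\n" : "intergenic\n"; # genic
print in_transcript( 3230, 1672, 3230 ) ? "genic\n" : "intergenic\n"; # intergenic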
diff --git a/perl/t/tracks/gene/db/raw/refSeq/hg19.refGene.chrM b/perl/t/tracks/gene/db/raw/refSeq/hg19.refGene.chrM
deleted file mode 100644
index 4841129ee..000000000
--- a/perl/t/tracks/gene/db/raw/refSeq/hg19.refGene.chrM
+++ /dev/null
@@ -1,2 +0,0 @@
-name chrom strand txStart txEnd cdsStart cdsEnd exonCount exonStarts exonEnds score name2 kgID mRNA spID spDisplayID refseq protAcc description rfamAcc tRnaName ensemblID geneSymbol
-NR_137295 chrM + 1672 3230 3230 3230 1 1672, 3230, 0 RNR2 NA NA NA NA NA NA NA NA NA NA NA
\ No newline at end of file
diff --git a/perl/t/tracks/gene/db/raw/trackNames_meta/data.mdb b/perl/t/tracks/gene/db/raw/trackNames_meta/data.mdb
deleted file mode 100644
index f54b9e2df6384267a6a219141839e492b8b2527e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 20480
[20,480-byte binary patch data elided]
diff --git a/perl/t/tracks/gene/db/raw/trackNames_meta/lock.mdb b/perl/t/tracks/gene/db/raw/trackNames_meta/lock.mdb
deleted file mode 100644
index e250664b49190cecacf7ec0c88043dbaf04f6967..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 8320
[8,320-byte binary patch data elided]
diff --git a/perl/t/tracks/gene/join.t b/perl/t/tracks/gene/join.t
deleted file mode 100644
index f07998143..000000000
--- a/perl/t/tracks/gene/join.t
+++ /dev/null
@@ -1,126 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package MockBuilder;
-use Mouse;
-extends 'Seq::Base';
-
-1;
-
-use Test::More;
-use lib 't/lib';
-use TestUtils qw/ PrepareConfigWithTempdirs /;
-
-use Path::Tiny qw/path/;
-use Scalar::Util qw/looks_like_number/;
-use YAML::XS qw/DumpFile/;
-
-use Seq::Tracks::Gene::Site::SiteTypeMap;
-use Seq::Tracks::Reference::MapBases;
-
-# create temp directories
-my $dir = Path::Tiny->tempdir();
-
-# prepare temp directory and make test config file
-my $config_file =
-  PrepareConfigWithTempdirs( 't/tracks/gene/join.yml',
-  't/tracks/gene/db/raw', [ 'database_dir', 'files_dir', 'temp_dir' ],
-  'files_dir', $dir->stringify );
-
-my $baseMapper = Seq::Tracks::Reference::MapBases->new();
-my $siteTypes  = Seq::Tracks::Gene::Site::SiteTypeMap->new();
-
-# Defines three tracks: a nearest gene, a nearest tss, and a region track.
-# The region track is simply a nearest track for which we storeOverlap and do not storeNearest.
-# This shows what happens when multiple transcripts (as in NR_FAKE3, NR_FAKE3B, NR_FAKE3C)
-# all share 100% of their data except for different txEnds, which could reveal issues with
-# our uniqueness algorithm, such as calculating the maximum range of the overlap: in previous
-# code iterations we removed the non-unique overlapping data without first looking at the
-# txEnd, and therefore had a smaller-than-expected maximum range.
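# An illustrative aside, not part of the original join.t: for transcripts that
# are identical except for txEnd (the caveat described above), the maximum
# overlap range has to be computed across all txEnds before deduplicating,
# otherwise the range shrinks to the shortest transcript. The records and
# field names below are hypothetical.
use List::Util qw/min max/;
my @fakeTxs = (
  { name => 'NR_FAKE3',  txStart => 100, txEnd => 500 },
  { name => 'NR_FAKE3B', txStart => 100, txEnd => 600 },
  { name => 'NR_FAKE3C', txStart => 100, txEnd => 700 },
);
my $overlapStart = min( map { $_->{txStart} } @fakeTxs );
my $overlapEnd   = max( map { $_->{txEnd} } @fakeTxs );    # 700, not 500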
-my $seq = MockBuilder->new_with_config( { config => $config_file } );
-
-my $tracks = $seq->tracksObj;
-
-my $dbPath = path( $seq->database_dir );
-$dbPath->remove_tree( { keep_root => 1 } );
-
-my $refBuilder  = $tracks->getRefTrackBuilder();
-my $geneBuilder = $tracks->getTrackBuilderByName('refSeq');
-
-my $refGetter  = $tracks->getRefTrackGetter();
-my $geneGetter = $tracks->getTrackGetterByName('refSeq');
-
-my $siteTypeDbName = $geneGetter->getFieldDbName('siteType');
-my $funcDbName     = $geneGetter->getFieldDbName('exonicAlleleFunction');
-
-$refBuilder->buildTrack();
-$geneBuilder->buildTrack();
-
-my $db = Seq::DBManager->new();
-
-my $mainDbAref     = $db->dbReadAll('chrM');
-my $regionDataAref = $db->dbReadAll('refSeq/chrM');
-
-my $geneDbName = $geneBuilder->dbName;
-my $header     = Seq::Headers->new();
-
-my $features = $header->getParentFeatures('refSeq');
-
-my ( $siteTypeIdx, $funcIdx, $alleleIdIdx );
-
-for ( my $i = 0; $i < @$features; $i++ ) {
-  my $feat = $features->[$i];
-
-  if ( $feat eq 'siteType' ) {
-    $siteTypeIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'exonicAlleleFunction' ) {
-    $funcIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'clinvar.alleleID' ) {
-    $alleleIdIdx = $i;
-  }
-}
-
-my $hasGeneCount = 0;
-my $inGeneCount  = 0;
-
-for my $pos ( 0 .. $#$mainDbAref ) {
-  my $dbData = $mainDbAref->[$pos];
-
-  my @out;
-  # not an indel
-  my $posIdx = 0;
-  $geneGetter->get( $dbData, 'chrM', $refGetter->get($dbData), 'A', $posIdx, \@out );
-
-  if ( $pos >= 1672 && $pos < 3230 ) {
-    $inGeneCount++;
-
-    ok( join( ",", @{ $out[$alleleIdIdx] } ) eq '24587', 'Found the clinvar record' );
-    ok( join( ",", @{ $out[$siteTypeIdx] } ) eq $siteTypes->ncRNAsiteType,
-      'ncRNA site type' );
-    ok( !defined $out[$funcIdx][0], 'ncRNA has no exonicAlleleFunction' );
-  }
-
-  if ( defined $dbData->[$geneDbName] ) {
-    $hasGeneCount++;
-  }
-}
-
-ok( $inGeneCount == $hasGeneCount,
-  "We have a refSeq record for every position from txStart to txEnd" );
-
-$db->cleanUp();
-$dbPath->remove_tree( { keep_root => 1 } );
-
-done_testing();
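The join.yml fixture that follows configures clinvar with build_field_transformations: `chrom: chr .` to prefix raw chromosome names and `split [;]` to break multi-valued fields apart, plus a build_row_filters entry that keeps only GRCh38 rows. A minimal sketch of how such transforms behave, assuming `chr .` means Perl-style string concatenation of the prefix and `split [;]` splits on the bracketed delimiter; the subroutine names are ours, not Bystro's:

use strict;
use warnings;

# Hypothetical stand-ins for the transformation and filter forms in the config.
sub prefix_chrom { my ($val) = @_; return 'chr' . $val; }          # "chrom: chr ."
sub split_field  { my ($val) = @_; return [ split /;/, $val ]; }   # "split [;]"
sub keep_row     { my ($row) = @_; return $row->{Assembly} eq 'GRCh38'; } # "== GRCh38"

print prefix_chrom('M'), "\n";                                       # chrM
print join( '|', @{ split_field('Benign;Likely benign') } ), "\n";   # Benign|Likely benign
print keep_row( { Assembly => 'GRCh37' } ) ? "keep\n" : "skip\n";    # skip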
diff --git a/perl/t/tracks/gene/join.yml b/perl/t/tracks/gene/join.yml
deleted file mode 100644
index 7220b065a..000000000
--- a/perl/t/tracks/gene/join.yml
+++ /dev/null
@@ -1,77 +0,0 @@
----
-assembly: hg19
-chromosomes:
-- chrM
-database_dir: t/tracks/gene/db/index
-files_dir: t/tracks/gene/db/raw
-temp_dir: /mnt/annotator/bystro-dev/tmp
-tracks:
-  tracks:
-  - local_files:
-    - chrM.fa.gz
-    name: ref
-    type: reference
-  - features:
-    - kgID
-    - mRNA
-    - spID
-    - spDisplayID
-    - protAcc
-    - description
-    - rfamAcc
-    - name
-    - name2
-    local_files:
-    - hg19.refGene.chrM
-    name: refSeq
-    type: gene
-    join:
-      features:
-      - alleleID
-      - phenotypeList
-      - clinicalSignificance
-      - type
-      - origin
-      - numberSubmitters
-      - reviewStatus
-      - chromStart
-      - chromEnd
-      track: clinvar
-  - based: 1
-    build_field_transformations:
-      chrom: chr .
-      clinicalSignificance: split [;]
-      origin: split [;]
-      phenotypeList: split [;]
-      reviewStatus: split [;]
-      type: split [;]
-    build_row_filters:
-      Assembly: == GRCh38
-    features:
-    - alleleID: number
-    - phenotypeList
-    - clinicalSignificance
-    - type
-    - origin
-    - numberSubmitters: number
-    - reviewStatus
-    - referenceAllele
-    - alternateAllele
-    fieldMap:
-      '#AlleleID': alleleID
-      AlternateAllele: alternateAllele
-      Chromosome: chrom
-      ClinicalSignificance: clinicalSignificance
-      NumberSubmitters: numberSubmitters
-      Origin: origin
-      PhenotypeIDS: phenotypeIDs
-      PhenotypeList: phenotypeList
-      ReferenceAllele: referenceAllele
-      ReviewStatus: reviewStatus
-      Start: chromStart
-      Stop: chromEnd
-      Type: type
-    local_files:
-    - variant_summary.txt.MT.1600_3250.gz
-    name: clinvar
-    type: sparse
\ No newline at end of file
diff --git a/perl/t/tracks/gene/join_no_build.t b/perl/t/tracks/gene/join_no_build.t
deleted file mode 100644
index 65c77fbe9..000000000
--- a/perl/t/tracks/gene/join_no_build.t
+++ /dev/null
@@ -1,133 +0,0 @@
-# This test demonstrates that we can join a track that is not built.
-# Here, we join the "clinvar" track's data onto refSeq, but don't build the individual clinvar track.
-
-use 5.10.0;
-use strict;
-use warnings;
-
-package MockBuilder;
-use Mouse;
-extends 'Seq::Base';
-
-use Test::More;
-use lib 't/lib';
-use TestUtils qw/ PrepareConfigWithTempdirs /;
-
-use Path::Tiny qw/path/;
-use Scalar::Util qw/looks_like_number/;
-use YAML::XS qw/DumpFile/;
-
-use Seq::Build;
-use Seq::Tracks::Gene::Site::SiteTypeMap;
-use Seq::Tracks::Reference::MapBases;
-
-my $dir = Path::Tiny->tempdir();
-
-my $config_file = PrepareConfigWithTempdirs(
-  't/tracks/gene/join_no_build.yml',
-  't/tracks/gene/db/raw', [ 'database_dir', 'files_dir', 'temp_dir' ],
-  'files_dir', $dir->stringify
-);
-
-my $config = YAML::XS::LoadFile($config_file);
-
-my $dbPath = path( $config->{database_dir} );
-$dbPath->remove_tree( { keep_root => 1 } );
-
-my $builder = Seq::Build->new_with_config( { config => $config_file } );
-
-my $tracks = $builder->tracksObj;
-
-my $clinvarBuilder = $tracks->getTrackBuilderByName('clinvar');
-
-for my $chr ( @{ $config->{chromosomes} } ) {
-  ok( !$clinvarBuilder->completionMeta->_isCompleted($chr),
-    'Clinvar track is not built' );
-}
-
-my $baseMapper = Seq::Tracks::Reference::MapBases->new();
-my $siteTypes  = Seq::Tracks::Gene::Site::SiteTypeMap->new();
-
-my $refGetter     = $tracks->getRefTrackGetter();
-my $geneGetter    = $tracks->getTrackGetterByName('refSeq');
-my $clinvarGetter = $tracks->getTrackGetterByName('clinvar');
-
-my $siteTypeDbName = $geneGetter->getFieldDbName('siteType');
-my $funcDbName     = $geneGetter->getFieldDbName('exonicAlleleFunction');
-
-my $db = Seq::DBManager->new();
-
-my $mainDbAref     = $db->dbReadAll('chrM');
-my $regionDataAref = $db->dbReadAll('refSeq/chrM');
-
-my $geneDbName = $geneGetter->dbName;
-my $header     = Seq::Headers->new();
-
-my $features = $header->getParentFeatures('refSeq');
-
-my ( $siteTypeIdx, $funcIdx, $alleleIdIdx );
-
-my $expectedNumberOfTracks = @{ $config->{tracks}->{tracks} } - 1;
-
-# Check that the join track successfully built
-for ( my $i = 0; $i < @$features; $i++ ) {
-  my $feat = $features->[$i];
-
-  if ( $feat eq 'siteType' ) {
-    $siteTypeIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'exonicAlleleFunction' ) {
-    $funcIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'clinvar.alleleID' ) {
-    $alleleIdIdx = $i;
-  }
-}
-
-my $hasGeneCount = 0;
-my $inGeneCount  = 0;
-# We still reserve an index for all specified tracks, even if they are not built.
-# Therefore, to minimize database space, such tracks should be specified last.
-ok( !defined $clinvarGetter, 'clinvar track getter does not exist' );
-
-for my $pos ( 0 .. $#$mainDbAref ) {
-  my $dbData = $mainDbAref->[$pos];
-
-  ok( defined $dbData, 'We have data for position ' . $pos );
-  ok( @$dbData <= $expectedNumberOfTracks,
-    'We don\'t have a clinvar record in database for position ' . $pos );
-
-  my @out;
-  # not an indel
-  my $posIdx = 0;
-  $geneGetter->get( $dbData, 'chrM', $refGetter->get($dbData), 'A', $posIdx, \@out );
-
-  if ( $pos >= 1672 && $pos < 3230 ) {
-    $inGeneCount++;
-
-    ok( join( ",", @{ $out[$alleleIdIdx] } ) eq '24587', 'Found the clinvar record' );
-    ok( join( ",", @{ $out[$siteTypeIdx] } ) eq $siteTypes->ncRNAsiteType,
-      'ncRNA site type' );
-    ok( !defined $out[$funcIdx][0], 'ncRNA has no exonicAlleleFunction' );
-  }
-
-  if ( defined $dbData->[$geneDbName] ) {
-    $hasGeneCount++;
-  }
-}
-
-ok( $inGeneCount == $hasGeneCount,
-  "We have a refSeq record for every position from txStart to txEnd" );
-
-$db->cleanUp();
-$dbPath->remove_tree( { keep_root => 1 } );
-
-done_testing();
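The join_no_build.yml fixture that follows sets `no_build: 1` on clinvar. The test above leans on a detail worth illustrating: every configured track keeps its assigned index in each position's row, so an unbuilt track only costs space if built tracks come after it. A small sketch of that row shape; the row contents are hypothetical, not actual database values:

use strict;
use warnings;

# Hypothetical per-position rows, indexed by track dbName (ref = 0, refSeq = 1,
# clinvar = 2). With the unbuilt clinvar track listed last, rows simply stay
# shorter; listed first, every row would need an undef placeholder to keep the
# later indices stable.
my $row_clinvar_last  = [ 0, [ ['ncRNA'] ] ];          # 2 slots; index 2 never written
my $row_clinvar_first = [ undef, 0, [ ['ncRNA'] ] ];   # placeholder wastes space

print scalar @$row_clinvar_last, " vs ", scalar @$row_clinvar_first, " elements\n";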
diff --git a/perl/t/tracks/gene/join_no_build.yml b/perl/t/tracks/gene/join_no_build.yml
deleted file mode 100644
index e0fa01974..000000000
--- a/perl/t/tracks/gene/join_no_build.yml
+++ /dev/null
@@ -1,78 +0,0 @@
----
-assembly: hg19
-chromosomes:
-  - chrM
-database_dir: t/tracks/gene/db/index
-files_dir: t/tracks/gene/db/raw
-temp_dir: /mnt/annotator/bystro-dev/tmp
-tracks:
-  tracks:
-    - local_files:
-        - chrM.fa.gz
-      name: ref
-      type: reference
-    - features:
-        - kgID
-        - mRNA
-        - spID
-        - spDisplayID
-        - protAcc
-        - description
-        - rfamAcc
-        - name
-        - name2
-      local_files:
-        - hg19.refGene.chrM
-      name: refSeq
-      type: gene
-      join:
-        features:
-          - alleleID
-          - phenotypeList
-          - clinicalSignificance
-          - type
-          - origin
-          - numberSubmitters
-          - reviewStatus
-          - chromStart
-          - chromEnd
-        track: clinvar
-    - based: 1
-      no_build: 1
-      build_field_transformations:
-        chrom: chr .
-        clinicalSignificance: split [;]
-        origin: split [;]
-        phenotypeList: split [;]
-        reviewStatus: split [;]
-        type: split [;]
-      build_row_filters:
-        Assembly: == GRCh38
-      features:
-        - alleleID: number
-        - phenotypeList
-        - clinicalSignificance
-        - type
-        - origin
-        - numberSubmitters: number
-        - reviewStatus
-        - referenceAllele
-        - alternateAllele
-      fieldMap:
-        "#AlleleID": alleleID
-        AlternateAllele: alternateAllele
-        Chromosome: chrom
-        ClinicalSignificance: clinicalSignificance
-        NumberSubmitters: numberSubmitters
-        Origin: origin
-        PhenotypeIDS: phenotypeIDs
-        PhenotypeList: phenotypeList
-        ReferenceAllele: referenceAllele
-        ReviewStatus: reviewStatus
-        Start: chromStart
-        Stop: chromEnd
-        Type: type
-      local_files:
-        - variant_summary.txt.MT.1600_3250.gz
-      name: clinvar
-      type: sparse
diff --git a/perl/t/tracks/gene/ncrna.t b/perl/t/tracks/gene/ncrna.t
deleted file mode 100644
index 2211ab5c6..000000000
--- a/perl/t/tracks/gene/ncrna.t
+++ /dev/null
@@ -1,211 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package MockBuilder;
-use Mouse;
-extends 'Seq::Base';
-
-1;
-
-use Test::More;
-use lib 't/lib';
-use TestUtils qw/ PrepareConfigWithTempdirs /;
-
-use Path::Tiny qw/path/;
-use Scalar::Util qw/looks_like_number/;
-use YAML::XS qw/DumpFile/;
-
-use Seq::Tracks::Gene::Site::SiteTypeMap;
-use Seq::Tracks::Reference::MapBases;
-use Seq::DBManager;
-
-# create temp directories
-my $dir = Path::Tiny->tempdir();
-
-# prepare temp directory and make test config file
-my $config_file =
-  PrepareConfigWithTempdirs( 't/tracks/gene/ncrna.yml',
-  't/tracks/gene/db/raw', [ 'database_dir', 'files_dir', 'temp_dir' ],
-  'files_dir', $dir->stringify );
-
-my $baseMapper  = Seq::Tracks::Reference::MapBases->new();
-my $siteTypeMap = Seq::Tracks::Gene::Site::SiteTypeMap->new();
-
-# Defines three tracks: a nearest gene, a nearest tss, and a region track.
-# The region track is simply a nearest track for which we storeOverlap and do not storeNearest.
-# This shows what happens when multiple transcripts (as in NR_FAKE3, NR_FAKE3B, NR_FAKE3C)
-# all share 100% of their data except for different txEnds, which could reveal issues with
-# our uniqueness algorithm, such as calculating the maximum range of the overlap: in previous
-# code iterations we removed the non-unique overlapping data without first looking at the
-# txEnd, and therefore had a smaller-than-expected maximum range.
-my $seq = MockBuilder->new_with_config( { config => $config_file } );
-
-my $tracks = $seq->tracksObj;
-
-###### First make fake reference track, using the column of sequence data #####
-
-my $refBuilder = $tracks->getRefTrackBuilder();
-my $refIdx     = $refBuilder->dbName;
-
-my @pos = ( 60950 .. 70966 );
-
-my $file     = $seq->getReadFh( $refBuilder->local_files->[0] );
-my @sequence = <$file>;
-chomp @sequence;
-
-if ( @sequence != @pos ) {
-  die "malformed test, sequence != pos array";
-}
-
-my $db = Seq::DBManager->new();
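# Illustrative aside, not part of the original ncrna.t: the assertions further
# down check splice sites for NR_033266, a minus-strand transcript. Because
# transcription runs right-to-left in genome coordinates on the minus strand,
# the two intronic bases just past an exonEnds boundary are the splice
# ACCEPTOR, and the two just before the next exonStarts are the splice DONOR,
# the mirror image of a plus-strand gene. minus_strand_site() is a
# hypothetical helper over half-open exon bounds, not a Bystro API.
sub minus_strand_site {
  my ( $pos, $exonEnd, $nextExonStart ) = @_;
  return 'spliceAcceptor' if $pos == $exonEnd || $pos == $exonEnd + 1;
  return 'spliceDonor' if $pos == $nextExonStart - 2 || $pos == $nextExonStart - 1;
  return 'intronic' if $pos > $exonEnd + 1 && $pos < $nextExonStart - 2;
  return;
}
# e.g. with exonEnds 61894 and the next exonStarts 66345 (NR_033266):
# 61894, 61895 -> spliceAcceptor; 66343, 66344 -> spliceDonor; between -> intronic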
-for my $idx ( 0 .. $#pos ) {
-  if ( !defined $baseMapper->baseMap->{ $sequence[$idx] } ) {
-    die "malformed test, reference base mapper broken for base $sequence[$idx]";
-  }
-
-  my $base = $baseMapper->baseMap->{ $sequence[$idx] };
-
-  $db->dbPatch( 'chr19', $refIdx, $pos[$idx], $base );
-}
-
-for my $idx ( 0 .. $#pos ) {
-  my $result = $db->dbReadOne( 'chr19', $pos[$idx] );
-
-  my $base = $baseMapper->baseMapInverse->[ $result->[$refIdx] ];
-
-  if ( $base ne $sequence[$idx] ) {
-    die "malformed test, creating fake ref track didn't work";
-  }
-}
-
-my $geneBuilder = $tracks->getTrackBuilderByName('refSeq');
-
-my $refGetter  = $tracks->getRefTrackGetter();
-my $geneGetter = $tracks->getTrackGetterByName('refSeq');
-
-my $siteTypeDbName = $geneGetter->getFieldDbName('siteType');
-my $funcDbName     = $geneGetter->getFieldDbName('exonicAlleleFunction');
-
-$geneBuilder->buildTrack();
-
-my $regionDataAref = $db->dbReadAll('refSeq/chr19');
-
-my $geneIdx = $geneBuilder->dbName;
-my $header  = Seq::Headers->new();
-
-my $features = $header->getParentFeatures('refSeq');
-
-my ( $siteTypeIdx, $funcIdx );
-
-for ( my $i = 0; $i < @$features; $i++ ) {
-  my $feat = $features->[$i];
-
-  if ( $feat eq 'siteType' ) {
-    $siteTypeIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'exonicAlleleFunction' ) {
-    $funcIdx = $i;
-    next;
-  }
-}
-
-# Safe for use when instantiated to a static variable; no settable properties
-my $coding         = $siteTypeMap->codingSiteType;
-my $utr5           = $siteTypeMap->fivePrimeSiteType;
-my $utr3           = $siteTypeMap->threePrimeSiteType;
-my $spliceAcceptor = $siteTypeMap->spliceAcSiteType;
-my $spliceDonor    = $siteTypeMap->spliceDonSiteType;
-my $ncRNA          = $siteTypeMap->ncRNAsiteType;
-my $intronic       = $siteTypeMap->intronicSiteType;
-
-# txStart txEnd cdsStart cdsEnd exonStarts exonEnds
-# NR_033266 chr19 - 60950 70966 70966 70966 3 60950,66345,70927, 61894,66499,70966,
-for my $pos ( 0 .. 100000 ) {
-  my $mainDbAref = $db->dbReadOne( 'chr19', $pos );
-
-  # 70966 is +1 of the transcript (txEnd is open)
-  # and 60950 is +0 of the transcript (txStart is closed);
-  # anything outside of that is missing/intergenic
-  if ( $pos < 60950 || $pos > 70965 ) {
-    ok( !defined $mainDbAref->[$geneIdx] );
-    next;
-  }
-
-  ok( defined $mainDbAref->[$geneIdx] );
-
-  my $refBase = $refGetter->get($mainDbAref);
-  my $alt     = 'A';
-
-  my $out        = [];
-  my $refSeqData = $geneGetter->get( $mainDbAref, 'chr19', $refBase, $alt, 0, $out );
-
-  # non-coding transcripts don't have UTR3/5 (not translated);
-  # exonEnds are open intervals, so we show the < boundary explicitly
-  if ( $pos >= 60950 && $pos < 61894 ) {
-    ok( $out->[$siteTypeIdx][0] eq $ncRNA );
-  }
-
-  if ( $pos >= 61894 && $pos < 66345 ) {
-    if ( $pos == 61894 || $pos == 61895 ) {
-      # we're on the negative strand, so this should be the acceptor
-      ok( $out->[$siteTypeIdx][0] eq $spliceAcceptor );
-      next;
-    }
-
-    if ( $pos == 66343 || $pos == 66344 ) {
-      # we're on the negative strand, so this should be the donor at the "end"
-      ok( $out->[$siteTypeIdx][0] eq $spliceDonor );
-      next;
-    }
-
-    # everything else in the intron is plain intronic
-    ok( $out->[$siteTypeIdx][0] eq $intronic );
-  }
-
-  if ( $pos >= 66345 && $pos < 66499 ) {
-    ok( $out->[$siteTypeIdx][0] eq $ncRNA );
-  }
-
-  # before the 3rd exon:
-  # 66499 is exonEnds of exon 2
-  # 70927 is exonStarts of exon 3
-  if ( $pos >= 66499 && $pos < 70927 ) {
-    if ( $pos == 66499 || $pos == 66500 ) {
-      # negative strand
-      ok( $out->[$siteTypeIdx][0] eq $spliceAcceptor );
-      next;
-    }
-
-    if ( $pos == 70925 || $pos == 70926 ) {
-      # negative strand
-      ok( $out->[$siteTypeIdx][0] eq $spliceDonor );
-      next;
-    }
-
-    ok( $out->[$siteTypeIdx][0] eq $intronic );
-  }
-
-  # 3rd exon
-  if ( $pos >= 70927 && $pos < 70966 ) {
-    ok( $out->[$siteTypeIdx][0] eq $ncRNA );
-  }
-}
-
-$db->cleanUp();
-
-done_testing();
diff --git a/perl/t/tracks/gene/ncrna.yml b/perl/t/tracks/gene/ncrna.yml
deleted file mode 100644
index 3dc4ede11..000000000
--- a/perl/t/tracks/gene/ncrna.yml
+++ /dev/null
@@ -1,27 +0,0 @@
----
-assembly: hg19
-chromosomes:
-- chr19
-database_dir: t/tracks/gene/db/index
-files_dir: t/tracks/gene/db/raw
-temp_dir: /mnt/annotator/bystro-dev/tmp
-tracks:
-  tracks:
-  - local_files:
-    - fakeRef.hg19.60950_70966.txt
-    name: ref
-    type: reference
-  - features:
-    - kgID
-    - mRNA
-    - spID
-    - spDisplayID
-    - protAcc
-    - description
-    - rfamAcc
-    - name
-    - name2
-    local_files:
-    - hg19.complex.txt
-    name: refSeq
-    type: gene
\ No newline at end of file
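The ncrna.t walkthrough above hinges on a refGene convention: NR_033266 carries cdsStart == cdsEnd (both 70966, equal to txEnd), which marks a transcript as non-coding, so its exonic bases are classified ncRNA rather than UTR5/UTR3/exonic. A minimal sketch of that check; the helper name is illustrative, not a Bystro API:

use strict;
use warnings;

# In refGene rows, a zero-length coding region (cdsStart == cdsEnd) marks a
# non-coding transcript. is_noncoding() is an illustrative helper only.
sub is_noncoding {
    my ($tx) = @_;
    return $tx->{cdsStart} == $tx->{cdsEnd};
}

my %nr_033266 = ( name => 'NR_033266', cdsStart => 70966, cdsEnd => 70966 );
print is_noncoding( \%nr_033266 ) ? "ncRNA exons\n" : "coding exons\n"; # ncRNA exons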
diff --git a/perl/t/tracks/gene/overlap.t b/perl/t/tracks/gene/overlap.t
deleted file mode 100644
index b2bb8a9fd..000000000
--- a/perl/t/tracks/gene/overlap.t
+++ /dev/null
@@ -1,261 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package MockBuilder;
-
-use Mouse;
-extends 'Seq::Base';
-
-1;
-
-use Test::More;
-use lib 't/lib';
-use TestUtils qw/ PrepareConfigWithTempdirs /;
-
-use Path::Tiny qw/path/;
-use Scalar::Util qw/looks_like_number/;
-use YAML::XS qw/DumpFile/;
-
-use Seq::Tracks::Gene::Site::SiteTypeMap;
-use Seq::Tracks::Reference::MapBases;
-use Seq::DBManager;
-
-# create temp directories
-my $dir = Path::Tiny->tempdir();
-
-# prepare temp directory and make test config file
-my $config_file =
-  PrepareConfigWithTempdirs( 't/tracks/gene/overlap.yml',
-  't/tracks/gene/db/raw', [ 'database_dir', 'files_dir', 'temp_dir' ],
-  'files_dir', $dir->stringify );
-
-my $baseMapper  = Seq::Tracks::Reference::MapBases->new();
-my $siteTypeMap = Seq::Tracks::Gene::Site::SiteTypeMap->new();
-
-# Defines three tracks: a nearest gene, a nearest tss, and a region track.
-# The region track is simply a nearest track for which we storeOverlap and do not storeNearest.
-# This shows what happens when multiple transcripts (as in NR_FAKE3, NR_FAKE3B, NR_FAKE3C)
-# all share 100% of their data except for different txEnds, which could reveal issues with
-# our uniqueness algorithm, such as calculating the maximum range of the overlap: in previous
-# code iterations we removed the non-unique overlapping data without first looking at the
-# txEnd, and therefore had a smaller-than-expected maximum range.
-my $seq = MockBuilder->new_with_config( { config => $config_file } );
-
-my $tracks = $seq->tracksObj;
-
-###### First make fake reference track, using the column of sequence data #####
-
-my $refBuilder = $tracks->getRefTrackBuilder();
-my $refIdx     = $refBuilder->dbName;
-
-$refBuilder->buildTrack();
-
-my $db = Seq::DBManager->new();
-
-my $geneBuilder = $tracks->getTrackBuilderByName('refSeq');
-
-my $refGetter  = $tracks->getRefTrackGetter();
-my $geneGetter = $tracks->getTrackGetterByName('refSeq');
-
-my $siteTypeDbName = $geneGetter->getFieldDbName('siteType');
-my $funcDbName     = $geneGetter->getFieldDbName('exonicAlleleFunction');
-
-$geneBuilder->buildTrack();
-
-my $regionDataAref = $db->dbReadAll('refSeq/chr10');
-
-my $geneIdx = $geneBuilder->dbName;
-my $header  = Seq::Headers->new();
-
-my $features = $header->getParentFeatures('refSeq');
-
-my (
-  $siteTypeIdx, $funcIdx, $nameIdx, $name2Idx, $spDispIdx,
-  $mrnaIdx, $spIdx, $descIdx, $txNumberIdx
-);
-
-my $dbLen = $db->dbGetNumberOfEntries('chr10');
-
-for ( my $i = 0; $i < @$features; $i++ ) {
-  my $feat = $features->[$i];
-
-  if ( $feat eq 'siteType' ) {
-    $siteTypeIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'exonicAlleleFunction' ) {
-    $funcIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'name' ) {
-    $nameIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'name2' ) {
-    $name2Idx = $i;
-    next;
-  }
-
-  if ( $feat eq 'mRNA' ) {
-    $mrnaIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'spID' ) {
-    $spIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'spDisplayID' ) {
-    $spDispIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'description' ) {
-    $descIdx = $i;
-    next;
-  }
-
-  if ( $feat eq 'txNumber' ) {
-    $txNumberIdx = $i;
-    next;
-  }
-}
-
-# Safe for use when instantiated to a static variable; no settable properties
-my $coding         = $siteTypeMap->codingSiteType;
-my $utr5           = $siteTypeMap->fivePrimeSiteType;
-my $utr3           = $siteTypeMap->threePrimeSiteType;
-my $spliceAcceptor = $siteTypeMap->spliceAcSiteType;
-my $spliceDonor    = $siteTypeMap->spliceDonSiteType;
-my $ncRNA          = $siteTypeMap->ncRNAsiteType;
-my $intronic       = $siteTypeMap->intronicSiteType;
-
-# txStart txEnd cdsStart cdsEnd exonStarts exonEnds
-# NR_033266 chr19 - 60950 70966 70966 70966 3 60950,66345,70927, 61894,66499,70966,
-
-# NM_001009943 index; has some overlapping values in mRNA, rfamAcc, description
-my $targetIdx;
-# NM_001009941
-my $target2Idx;
-my $expSpIds = join( "\t", sort { $a cmp $b } ( 'F8WEI4', 'Q6P6B7' ) );
-
-my $expDesc = join(
-  "\t",
-  sort { $a cmp $b } (
-    'Homo sapiens ankyrin repeat domain 16 (ANKRD16), transcript variant 4, mRNA.',
-    'Homo sapiens ankyrin repeat domain 16 (ANKRD16), transcript variant 1, mRNA.'
-  )
-);
-my $exp2SpIds = 'Q6P6B7';
-
-my $exp2Desc =
-  'Homo sapiens ankyrin repeat domain 16 (ANKRD16), transcript variant 2, mRNA.';
-
-# We're always only getting a "snp", so posIdx is 0 for the allele
-my $posIdx = 0;
-for my $pos ( 0 .. $dbLen - 1 ) {
-  my $mainDbAref = $db->dbReadOne( 'chr10', $pos );
-
-  my $refBase = $refGetter->get($mainDbAref);
-  my $alt     = 'A';
-
-  my $out        = [];
-  my $refSeqData =
-    $geneGetter->get( $mainDbAref, 'chr10', $refBase, $alt, $posIdx, $out );
-
-  my $siteType = $out->[$siteTypeIdx][$posIdx];
-  my $names    = $out->[$nameIdx][$posIdx];
-  my $symbol   = $out->[$name2Idx][$posIdx];
-  my $txNumber = $out->[$txNumberIdx][$posIdx];
-
-  # Tx is NM_019046; no others overlap here.
-  # It is the 3rd tx in the list (NM_001009941 is 1st/idx 0, NM_001009943 is 2nd/idx 1)
-  if ( $pos < 16361 ) {
-    ok( !ref $siteType );
-
-    ok(
-      $txNumber == 2, "The 3rd transcript in the input file is NM_019046,
-      and we get the correct txNumber"
-    );
-    ok( $names eq 'NM_019046' );
-
-    # The last exon is utr3; this tx is on the negative strand, so the first exon
-    # in genome order is really the last.
-    # exonStarts: 0,16371,18572,21280,22243,23997,26121,27315,
-    # exonEnds: 966,16562,18651,21442,22352,24040,26342,28172
-    # 966 is an exonEnds value; exonEnds are open intervals, so the last base is 966 - 1
-    if ( $pos <= 966 - 1 ) {
-      ok( $siteType eq $utr3 );
-      next;
-    }
-
-    # closing boundary not needed since > 16361; for clarity only
-    # 16371 is exonStarts[1], which is a closed interval (so no - 1 to get the first base)
-    if ( $pos > 966 - 1 && $pos < 16371 ) {
-      if ( $pos == 966 || $pos == 967 ) {
-        # The first 2 intronic bases in genome order are really the last, so on the
-        # negative strand they should be the spliceAcceptor instead of the spliceDonor
-        ok( $siteType eq $spliceAcceptor );
-        next;
-      }
-
-      ok( $siteType eq $intronic );
-    }
-
-    next;
-  }
-
-  # The merge order may not be completely deterministic.
-  # However, txNumber is always the order found in the input file,
-  # and the txNumber will always match the transcript, no matter
-  # whether that tx is merged first, second, or third.
-  ok( $txNumber->[0] == 2 && $txNumber->[1] == 0 && $txNumber->[2] == 1,
-    "Can output txNumbers" );
-  ok( $names->[0] eq 'NM_019046',    "Can get the correct tx name for 1st merged" );
-  ok( $names->[1] eq 'NM_001009941', "Can get the correct tx name for 2nd merged" );
-  ok( $names->[2] eq 'NM_001009943', "Can get the correct tx name for 3rd merged" );
-
-  # first condition not needed, simply for consistency
-  # 16562 is the first exonEnds for NM_001009943 and NM_001009941, and the 2nd for NM_019046
-  if ( $pos >= 16361 && $pos <= 16562 - 1 ) {
-    if ( !defined $targetIdx ) {
-      my $idx = 0;
-
-      for my $name (@$names) {
-        if ( $name eq 'NM_001009943' ) {
-          $targetIdx = $idx;
-        }
-        elsif ( $name eq 'NM_001009941' ) {
-          $target2Idx = $idx;
-        }
-
-        $idx++;
-      }
-    }
-
-    my $spIds = $out->[$spIdx][$posIdx][$targetIdx];
-    my $descs = $out->[$descIdx][$posIdx][$targetIdx];
-    ok( join( "\t", sort { $a cmp $b } @$spIds ) eq $expSpIds );
-    ok( join( "\t", sort { $a cmp $b } @$descs ) eq $expDesc );
-
-    my $spIds2 = $out->[$spIdx][$posIdx][$target2Idx];
-    my $descs2 = $out->[$descIdx][$posIdx][$target2Idx];
-
-    ok( $spIds2 eq $exp2SpIds );
-    ok( $descs2 eq $exp2Desc );
-  }
-}
-
-$db->cleanUp();
-
-done_testing();
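overlap.t above indexes its output as $out->[featureIdx][posIdx][txIdx]: at positions where transcripts merge, each feature holds one slot per transcript in merge order, and a slot may itself be an array when the source row carried several ';'-joined values. A sketch of the shape those assertions walk; the spID value for NM_019046 is invented for illustration:

use strict;
use warnings;

# One feature's slice at a merged position: one slot per transcript, merge order.
my $names = [ 'NM_019046', 'NM_001009941', 'NM_001009943' ];
my $spIDs = [ 'placeholder-spID', 'Q6P6B7', [ 'F8WEI4', 'Q6P6B7' ] ]; # first slot invented

# NM_001009943 (slot 2 here) carries two SwissProt IDs in the fixture, so the
# test sorts and tab-joins them before comparing:
print join( "\t", sort { $a cmp $b } @{ $spIDs->[2] } ), "\n";   # F8WEI4  Q6P6B7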
diff --git a/perl/t/tracks/gene/overlap.yml b/perl/t/tracks/gene/overlap.yml
deleted file mode 100644
index 9f718a61a..000000000
--- a/perl/t/tracks/gene/overlap.yml
+++ /dev/null
@@ -1,64 +0,0 @@
-assembly: hg19
-chromosomes:
-  - chr10
-database_dir: t/tracks/gene/db/index
-files_dir: t/tracks/gene/db/raw
-temp_dir: /mnt/annotator/bystro-dev/tmp
-tracks:
-  tracks:
-    - local_files:
-        - chr10_fake_overlap.fasta.gz
-      name: ref
-      type: reference
-    - build_field_transformations:
-        description: split [;]
-        ensemblID: split [;]
-        kgID: split [;]
-        mRNA: split [;]
-        name: split [;]
-        name2: split [;]
-        protAcc: split [;]
-        rfamAcc: split [;]
-        spDisplayID: split [;]
-        spID: split [;]
-        tRnaName: split [;]
-      features:
-        - kgID
-        - mRNA
-        - spID
-        - spDisplayID
-        - protAcc
-        - description
-        - rfamAcc
-        - name
-        - name2
-      local_files:
-        - hg19.kgXref.fetch.gz
-      reportTxNumber: true
-      name: refSeq
-      type: gene
-  utils:
-    - args:
-        connection:
-          database: hg19
-          host: genome-mysql.soe.ucsc.edu
-          port: 3306
-          user: genome
-        sql:
-          SELECT r.*, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.kgID, '')) SEPARATOR
-          ';') FROM kgXref x WHERE x.refseq=r.name) AS kgID, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.description,
-          '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS description,
-          (SELECT GROUP_CONCAT(DISTINCT(NULLIF(e.value, '')) SEPARATOR ';') FROM knownToEnsembl
-          e JOIN kgXref x ON x.kgID = e.name WHERE x.refseq = r.name) AS ensemblID,
-          (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.tRnaName, '')) SEPARATOR ';') FROM
-          kgXref x WHERE x.refseq=r.name) AS tRnaName, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.spID,
-          '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS spID, (SELECT
-          GROUP_CONCAT(DISTINCT(NULLIF(x.spDisplayID, '')) SEPARATOR ';') FROM kgXref
-          x WHERE x.refseq=r.name) AS spDisplayID, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.protAcc,
-          '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS protAcc, (SELECT
-          GROUP_CONCAT(DISTINCT(NULLIF(x.mRNA, '')) SEPARATOR ';') FROM kgXref x WHERE
-          x.refseq=r.name) AS mRNA, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.rfamAcc,
-          '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS rfamAcc FROM
-          refGene r WHERE r.name='NM_019046' OR r.name='NM_001009943' OR r.name='NM_001009941';
-      completed: 2018-09-06T17:18:00
-      name: fetch
diff --git a/perl/t/tracks/gene/region.t b/perl/t/tracks/gene/region.t
deleted file mode 100644
index d57f1fb77..000000000
--- a/perl/t/tracks/gene/region.t
+++ /dev/null
@@ -1,79 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package MockBuilder;
-use Mouse;
-extends 'Seq::Base';
-
-1;
-
-use Test::More;
-use lib 't/lib';
-use TestUtils qw/ PrepareConfigWithTempdirs /;
-
-use Path::Tiny qw/path/;
-use Scalar::Util qw/looks_like_number/;
-use YAML::XS qw/DumpFile/;
-
-use Seq::Tracks::Gene::Site::SiteTypeMap;
-use Seq::Tracks::Reference::MapBases;
-
-# create temp directories
-my $dir = Path::Tiny->tempdir();
-
-# prepare temp directory and make test config file
-my $config_file =
-  PrepareConfigWithTempdirs( 't/tracks/gene/region.yml',
-  't/tracks/gene/db/raw', [ 'database_dir', 'files_dir', 'temp_dir' ],
-  'files_dir', $dir->stringify );
-
-my $baseMapper = Seq::Tracks::Reference::MapBases->new();
-my $siteTypes  = Seq::Tracks::Gene::Site::SiteTypeMap->new();
-
-# Defines three tracks: a nearest gene, a nearest tss, and a region track.
-# The region track is simply a nearest track for which we storeOverlap and do not storeNearest.
-# This shows what happens when multiple transcripts (as in NR_FAKE3, NR_FAKE3B, NR_FAKE3C)
-# all share 100% of their data except for different txEnds, which could reveal issues with
-# our uniqueness algorithm, such as calculating the maximum range of the overlap: in previous
-# code iterations we removed the non-unique overlapping data without first looking at the
-# txEnd, and therefore had a smaller-than-expected maximum range.
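# Illustrative aside, not part of the original region.t: the checks below
# assume getFieldDbName maps each feature name to a compact integer key, and
# that each region-db entry is a hash keyed by those integers. In miniature
# (the mapping and values here are invented):
my %fieldDbNameSketch = ( chrom => 0, txStart => 1, txEnd => 2 );
my $regionEntrySketch = { 0 => 'chr19', 1 => 60950, 2 => 70966 };
print exists $regionEntrySketch->{ $fieldDbNameSketch{txStart} }
  ? "txStart stored\n"
  : "txStart missing\n";    # txStart stored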
calculating the maximum range of the overlap: in previous code iterations -# we removed the non-unique overlapping data, without first looking at the txEnd -# and therefore had a smaller-than-expected maximum range -my $seq = MockBuilder->new_with_config( { config => $config_file } ); - -my $tracks = $seq->tracksObj; - -my $refBuilder = $tracks->getRefTrackBuilder(); -my $geneBuilder = $tracks->getTrackBuilderByName('refSeq'); - -$refBuilder->buildTrack(); -$geneBuilder->buildTrack(); - -my $refGetter = $tracks->getRefTrackGetter(); -my $geneGetter = $tracks->getTrackGetterByName('refSeq'); - -my $db = Seq::DBManager->new(); - -my $mainDbAref = $db->dbReadAll('chr19'); -my $regionDataAref = $db->dbReadAll( 'refSeq/chr19', 0, 1 ); - -# What we expect to be found in the region db -# Enough to precisely describe the tx -my @coordinateFields = ( - 'chrom', 'txStart', 'txEnd', 'cdsStart', - 'cdsEnd', 'exonStarts', 'exonEnds', 'strand' -); - -for my $regionEntry ( values @$regionDataAref ) { - for my $f (@coordinateFields) { - my $idx = $geneGetter->getFieldDbName($f); - ok( exists $regionEntry->{$idx}, "Expected $f to exist at index $idx" ); - } - - for my $f ( @{ $geneGetter->features } ) { - my $idx = $geneGetter->getFieldDbName($f); - - ok( exists $regionEntry->{$idx}, "Expected $f to exist at index $idx" ); - } -} - -done_testing(); diff --git a/perl/t/tracks/gene/region.yml b/perl/t/tracks/gene/region.yml deleted file mode 100644 index c8350036f..000000000 --- a/perl/t/tracks/gene/region.yml +++ /dev/null @@ -1,27 +0,0 @@ -assembly: hg19 -chromosomes: - - chr19 -database_dir: t/tracks/gene/db/index -files_dir: t/tracks/gene/db/raw -temp_dir: /mnt/annotator/bystro-dev/tmp -tracks: - tracks: - - local_files: - - fakeRef.hg19.60950_70966.txt - name: ref - type: reference - - features: - - kgID - - mRNA - - spID - - spDisplayID - - protAcc - - descriptionNonNull - - rfamAcc - - name - - name2 - local_files: - - hg19.complex.txt - name: refSeq - type: gene - build_region_track_only: true diff --git a/perl/t/tracks/gene/simple.t b/perl/t/tracks/gene/simple.t deleted file mode 100644 index b91f9d7f5..000000000 --- a/perl/t/tracks/gene/simple.t +++ /dev/null @@ -1,123 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package MockBuilder; -use Mouse; -extends 'Seq::Base'; - -1; - -use Test::More; -use lib 't/lib'; -use TestUtils qw/ PrepareConfigWithTempdirs /; - -use Path::Tiny qw/path/; -use Scalar::Util qw/looks_like_number/; -use YAML::XS qw/DumpFile/; - -use Seq::Tracks::Gene::Site::SiteTypeMap; -use Seq::Tracks::Reference::MapBases; - -# create temp directories -my $dir = Path::Tiny->tempdir(); - -# prepare temp directory and make test config file -my $config_file = - PrepareConfigWithTempdirs( 't/tracks/gene/simple.yml', - 't/tracks/gene/db/raw', [ 'database_dir', 'files_dir', 'temp_dir' ], - 'files_dir', $dir->stringify ); - -my $baseMapper = Seq::Tracks::Reference::MapBases->new(); -my $siteTypes = Seq::Tracks::Gene::Site::SiteTypeMap->new(); - -# Defines three tracks, a nearest gene , a nearest tss, and a region track -# The region track is simply a nearest track for which we storeOverlap and do not storeNearest -# To show what happens when multiple transcripts (as in NR_FAKE3, NR_FAKE3B, NR_FAKE3C) -# all share 100% of their data, except have different txEnd's, which could reveal issues with our uniqueness algorithm -# such as calculating the maximum range of the overlap: in previous code iterations -# we removed the non-unique overlapping data, without first looking at the txEnd -# 
and therefore had a smaller-than-expected maximum range -my $seq = MockBuilder->new_with_config( { config => $config_file } ); - -my $tracks = $seq->tracksObj; - -my $refBuilder = $tracks->getRefTrackBuilder(); -my $geneBuilder = $tracks->getTrackBuilderByName('refSeq'); - -my $refGetter = $tracks->getRefTrackGetter(); -my $geneGetter = $tracks->getTrackGetterByName('refSeq'); - -$refBuilder->buildTrack(); -$geneBuilder->buildTrack(); - -### We have: - -my $db = Seq::DBManager->new(); - -my $mainDbAref = $db->dbReadAll('chrM'); -my $regionDataAref = $db->dbReadAll('refSeq/chrM'); - -my $geneDbName = $geneBuilder->dbName; -my $header = Seq::Headers->new(); - -my $features = $header->getParentFeatures('refSeq'); - -# my $txNumberOutIdx = first_index { $_ eq $geneGetter->txNumberKey } @$features; - -my ( $siteTypeIdx, $funcIdx, $txNumberOutIdx ); - -for ( my $i = 0; $i < @$features; $i++ ) { - my $feat = $features->[$i]; - - if ( $feat eq 'siteType' ) { - $siteTypeIdx = $i; - next; - } - - if ( $feat eq 'exonicAlleleFunction' ) { - $funcIdx = $i; - next; - } - - if ( $feat eq $geneGetter->txNumberKey ) { - $txNumberOutIdx = $i; - next; - } -} - -my $hasGeneCount = 0; -my $inGeneCount = 0; - -for my $pos ( 0 .. $#$mainDbAref ) { - my $dbData = $mainDbAref->[$pos]; - - if ( $pos >= 1672 && $pos < 3230 ) { - $inGeneCount++; - - my @out; - - # not an indel - my $posIdx = 0; - $geneGetter->get( $dbData, 'chrM', $refGetter->get($dbData), 'A', $posIdx, \@out ); - - ok( join( ",", @{ $out[$siteTypeIdx] } ) eq $siteTypes->ncRNAsiteType, - 'ncRNA site type' ); - ok( !defined $out[$funcIdx][0], 'ncRNA has no exonicAlleleFunction' ); - ok( $out[$txNumberOutIdx][0] == 0, 'txNumber is outputted if requested' ); - } - - if ( defined $dbData->[$geneDbName] ) { - $hasGeneCount++; - } - - my @out; - my $refSite = $refGetter->get($dbData); -} - -ok( $inGeneCount == $hasGeneCount, - "We have a refSeq record for every position from txStart to txEnd" ); - -$db->cleanUp(); - -done_testing(); diff --git a/perl/t/tracks/gene/simple.yml b/perl/t/tracks/gene/simple.yml deleted file mode 100644 index 069541009..000000000 --- a/perl/t/tracks/gene/simple.yml +++ /dev/null @@ -1,27 +0,0 @@ -assembly: hg19 -chromosomes: - - chrM -database_dir: t/tracks/gene/db/index -files_dir: t/tracks/gene/db/raw -temp_dir: /mnt/annotator/bystro-dev/tmp -tracks: - tracks: - - local_files: - - chrM.fa.gz - name: ref - type: reference - - features: - - kgID - - mRNA - - spID - - spDisplayID - - protAcc - - description - - rfamAcc - - name - - name2 - reportTxNumber: true - local_files: - - hg19.refGene.chrM - name: refSeq - type: gene diff --git a/perl/t/tracks/gene/site.t b/perl/t/tracks/gene/site.t deleted file mode 100644 index d19a0a133..000000000 --- a/perl/t/tracks/gene/site.t +++ /dev/null @@ -1,56 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package Testing; - -use Test::More; - -use Seq::Tracks::Gene::Site; - -my $siteHandler = Seq::Tracks::Gene::Site->new(); - -my $packedData = $siteHandler->pack( ( 0, 'intronic', '-' ) ); - -say "Packed data for '(0, intronic, -)' is "; - -my ( $txNumber, $unpackedData ) = $siteHandler->unpack($packedData); - -my ( $strand, $siteType, $codonNumber, $codonPosition, $codonSequence ); - -( $strand, $siteType ) = @$unpackedData; - -ok( $txNumber == 0, 'returns txNumber' ); -ok( $strand eq '-', 'reads strand ok' ); -ok( $siteType eq 'intronic', 'reads intronic ok from shortened site' ); -ok( @$unpackedData == 2, 'intronic sites have only siteType and strand' ); - -$packedData = 
$siteHandler->pack( ( 1541, 'ncRNA', '+' ) ); - -( $txNumber, $unpackedData ) = $siteHandler->unpack($packedData); - -( $strand, $siteType ) = @$unpackedData; - -ok( $txNumber == 1541, 'returns txNumber' ); -ok( $strand eq '+', 'reads strand ok' ); -ok( $siteType eq 'ncRNA', 'reads intronic ok from shortened site' ); -ok( @$unpackedData == 2, 'intronic sites have only siteType and strand' ); - -$packedData = $siteHandler->pack( ( 65000, 'exonic', '+', 1, 2, 'ATG' ) ); - -say "Packed data for (65000, 'Coding', '+', 1, 2, 'ATG') is "; - -( $txNumber, $unpackedData ) = $siteHandler->unpack($packedData); - -( $strand, $siteType, $codonNumber, $codonPosition, $codonSequence ) = - @$unpackedData; - -ok( $txNumber == 65000, 'returns txNumber' ); -ok( $strand eq '+', 'reads strand ok' ); -ok( $siteType eq 'exonic', 'reads intronic ok from shortened site' ); -ok( $codonNumber == 1, 'reads codonPosition' ); -ok( $codonPosition == 2, 'reads codonNumber' ); -ok( $codonSequence eq 'ATG', 'reads codon sequnce' ); -ok( @$unpackedData == 5, 'intronic sites have only siteType and strand' ); - -done_testing(); diff --git a/perl/t/tracks/gene/test-prepare-ref.yml b/perl/t/tracks/gene/test-prepare-ref.yml deleted file mode 100644 index 22ca88667..000000000 --- a/perl/t/tracks/gene/test-prepare-ref.yml +++ /dev/null @@ -1,13 +0,0 @@ ---- -assembly: hg19 -chromosomes: -- chrY -database_dir: t/tracks/gene/db/index-prepared-ref/ -files_dir: t/tracks/gene/db/raw -temp_dir: /mnt/annotator/bystro-dev/tmp -tracks: - tracks: - - local_files: - - chrY.fa.gz - name: ref - type: reference diff --git a/perl/t/tracks/merge.t b/perl/t/tracks/merge.t deleted file mode 100644 index a6c65e1c0..000000000 --- a/perl/t/tracks/merge.t +++ /dev/null @@ -1,49 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; - -use Seq::Tracks::Build; - -# create temp directories -my $db_dir = Path::Tiny->tempdir(); -my $files_dir = Path::Tiny->tempdir(); - -Seq::DBManager::initialize( { databaseDir => $db_dir } ); - -my $t = Seq::Tracks::Build->new( - { - files_dir => $db_dir, - name => 'test', - type => 'sparse', - chromosomes => ['testChr'], - assembly => 'hgTest' - } -); - -my ( $mergeFunc, $cleanUp ) = $t->makeMergeFunc(); -my @testVals = ( 67, 45, 22, 35 ); -my @testVals2 = ( 33, 25, 21, 65 ); - -my $chr = 'testChr'; -my $pos = 1; - -my ( $err, $result ) = $mergeFunc->( $chr, $pos, \@testVals, \@testVals2 ); - -ok( join( ',', @{ $result->[0] } ) eq join( ',', 67, 33 ) ); -ok( join( ',', @{ $result->[1] } ) eq join( ',', 45, 25 ) ); -ok( join( ',', @{ $result->[2] } ) eq join( ',', 22, 21 ) ); -ok( join( ',', @{ $result->[3] } ) eq join( ',', 35, 65 ) ); - -# @testVals2 = (3334, 225, 201, 605,777, 888); -# ($err, $result) = $mergeFunc->($chr, $pos, $result, \@testVals2); -# p $result; -# # ok(join(',', @{$result->[0]} eq join(',', 67))); - -# @testVals2 = ('short1', 'short2'); -# ($err, $result) = $mergeFunc->($chr, $pos, $result, \@testVals2); - -$t->db->cleanUp(); - -done_testing(); diff --git a/perl/t/tracks/nearest/db/hg19/raw/ref/chrM.fa.gz b/perl/t/tracks/nearest/db/hg19/raw/ref/chrM.fa.gz deleted file mode 100644 index 8586ef9a900a97e3f07cd0235eba0528c23b61a4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5537 zcmV;S6<+EeiwFo6zQaiX17m1%O)h3(08N`sawR*igXdaQM=&_x=*_r+6CSw79o~Pu z6h0tN)xA~a^2r~GB0&I>|MTFm+k#>?ss<1eMIX)e!+9&^Ywj9->;u%J@-AYc<#Dy-8~Fr z@9)-a`#kHpq)H#$AwMs8{$4)YI#n9R9sT?JdHwU9=VD#y|L;Sd7d~^*1QdY2BfUc? 
z{qtVlzV`m|M}6PY{GQF|VL?lbO{deZ%Kia3-WIedBFx|*Sa ztk~>oo;OfDSxfWGvptF`Kc>!lQ9QEZPle`QvkI)@*Pg=}U7`3jW6ytDH-^PS?nIw| z9o_q#&(u4TJFL~(-h|PzXff})__ckbepf{kQ*tTEoOufP9o@M}bT7Jj*{cZck`hYB zy}6@!5$Gq#dW&X1TB|1$N(rNQ+z0FHYNh-g`cYPJoBmQxwz5XA(Q;`n<58SUm%=Dk ze3ZR6CYqNfd+)q2xl73+-EguLdcZ9j&5z`|^`lfMt&2Wju-s6^r8jp}OnoCzqnv;0 z<$jQ?SQba0Yvz=46A|6_{5nmTR%@trxoff+_^2Q#vDL~keO;#sF|HB@Xla}%r4lD@ zN6D>ZAalJEyirNknz^!AFFz@zeaBv^U{%@0np<%i6-#?>5yae3v?jO$qH^BqeeLyE z{yYO)SDscyV^}?RoZfd)Z?NxNo29(*94$qW11!PKxMC@VghgFFLz;7ktb(r{KWfQk zvVa+uar0)-(QAQ`nqOORt+3?~@Ny?eZn?sA4OyWz&?+CmOKk}d9QPDz ze&#K1Qc7LS*9`rr{Z7qpeb;!bn&KI1m_aGI^g&Z`)J0MGtZN^d_b?32Kovy&HaN!m`WIN0xZ^97ocGqV?h%uLYt}=h58*7| zsyIB;-?!n?9-Flox}<^*N)pkbzc9=(${2%u2Lmi4*&W>(!3wA@~CF16;u`4NvCn+Yawm9g&U$5bOE92xVYdxL|D;mHv!0+Iv)Jwk}KhZ zrwTd{@$E5~tvfgndA43MvPML=!G@JNU&^=i?fc_KLHE85@P0aAp%P!E|M~p>{(XLb zKIiZA`TP0&{Qdp@{rx|7`{XYNx{r!19`~3Xg5B|<`{qOzo`Q~%| z^ZVZQeExec{yb08y+1!c`o_KK{5&`Q+=o3+xvzb`d!MGipNsD+?lbR$?jL)J;u02W z+$7Iw4dE!m3BUlk3Js)hQ%wr7QZnnKM7ZBM0^Uuh+ys|OC#%c|%ts%%=wJiM*c6Vs zrh=?SOZo__8lK#=fu<>zd$br-WtdvG(OZo5v_|z)^IzqrlBvbMsQvNu2~7(yI_ydi ztwvO$6GMO?@cnj|r9{-6;TQCgz~N1%wrcW2R zV1ZPtq|}-fkEvmK;l><<#;1-;Jjej+m$YLzggN|ed}8Vae=%KhuF)g9-+q3tInAx5kUKu2L6`m?Cme@UOkSI8WAw_1swh$OLdDvsr1qUPPn-_ zAkl%DUHCa-8biH>c9H#{qc~8|ToF26piZHz)lnF4oAg@p-uAk51XCA?s>kW&hdh?5 z_7?B)YG?$OawI7Pu3X-oVBUHm^3(9Wp`gKu(R>EptL2hdT4UgUD2ajstikznA5vjkO=CD})o|8`G~Fv33l@$=4L9%>=9z8reUpeFsHD z@08buC9HL-wan!(`5YJ|z5qRix3c7SZO%fUcfKUdke%)}*b)}q zdzUiwqN2gpqP&r>DHynhx)|zP3>GHk3TlwYniGJycOKZ2Gm~xtHiex>62$aS?KcrB zyE|+2jb;xowaU1eECS|!hhm$FqJSb93^eAeEE<8jYj0z0B1OBTi2+d;#)~xF@VzRu zB|XqYl`BlkMXg=r8ivGQWMWmw3)!U5o-le$p*HVu`_#NuqveOPlw?+g<2ksL^eY#w z5Z8pn)P{9*ldQLIL%mUeD*E$Yp(u>43YbraYsW6rBhE7as9DEOMQX;{FdEU5mP4}c z4-(=~?1tcnbyTPmE&bR~s&B0GwO^Y~QSOi6Sle&2Jk4Ua!_ekw5YLX}SxVbJTD;8) zH8r=;{pVY^cm{+rY((f5#Aa<&mFoPHX2yrq>c?V zrG-sKLlQMRR=HOu*lq@N`%Cw6JuGeeRsC1+;%qJ!^)ep_nWrB>wO4ll+M#=kO)uH5 zS**2*ZDaAw6R*q*txIt))}u35AA8$k<2A`y&2rEwE(ZM`dndY6Fh0E6R@a!WcMT@jh)3S8Ta0>!a*v~Re!q2RrZ6KHHQ@EUiT3c6n$i?wx= zdYw17NUO?d(NXP_rY6d5!Rj@MNORkut(@+BK%$Suz71)R2(t}9D~xVu!}i2J7(752 zY}5|JJ6&O)DgdOi<5u1raOW^e9dp{5fRnLN85LKL{%&{Uxc{ipxo;^`UY9URQJzl2 zE+U5Iv+rx26(gcn)A(29%7+Kk7H@57ZGTwjD_}8AF z8{kBR0BP)4!N&I$DHX-_S}5pKoHz$l9EOBI%^~yK-YBfrs!RWt;v?KzsmN6rdt<*E zIwl(8?pc^iWOD2fu=B)B0NT{g#j%$qQmC3{ItP*)>Hr@f(;ke#H??;8x)p_IDZu*F^_GWnFh0x?FjDs%X z72{4eWzG`>pLhay%*SSj9w0*kNHH9=CYUe3u`f%J!H}xf;-vLS?%z;mSUi;iKAJ2z5sGd6t1E=td-#SBCQm~ zc;kW4Yge?l(?;0pYs9z7wPlnwd21;xz4}#SYf_iQtM4Q3L-1hDhu6*dB|b#jMnTlX zNd?E?-gnRz%VxHwpSM>U5}h3O6$2ufv%Xvno9Mz4pKV!Zh{NTXB#z0unDrFH5>34A|@K(@sF8}C3I{L za(hf$IC!vSi#4_maIkSwCiJ=3vEpL4cJGQ?hIKr7vpPKn+jh)qUSeb-KK4vVF$ez` z1Dui_O9zx!PDtIA$z)0JBJEl9PWxv~(2S^6SjNR<@B92?rBfj0)oa3ps?k4>d8FlL zJeS^V33R_0B~~{`(aEPI%r)`a*kH8{=$6brrip@jlGX)%OcWJH0rlU}PeP zQ26Yb96Pye6zEI@NRus6a}I}O77Kdt%{%)cLMfd(vZ9^r7ZWj`+^9J1uyyW!_hfUA z9(F9EI!96<5vCwXq9r_KLQ>Yip^)||BwNDgy(6w3DCf#9n+cFz?x5{(PK4kqDuYgR5ThqI!7 zad<(WvdaZZM`_tgNsnzPTolJ!5o=cZQpk<(CpMm#mVtOH?%#y6e>sd4$r-n5MbNlKL}foR z?D)mvL#M2H*1W>R#QMEoeHpE0quXRUWYH9M%^N!;g-Z~L?d^eR!m{x4=`1kcb}*Zd zYc~4b0xKT~+^-@q$1KrNFC65xAshXV6zg$&GjgAHfmpk3a(RHuGLjSqiwvSLLTDSL z-%Ps8yYB7CacJSJwXGNApCwnz#xk)~rHvh%pA{V0&9;(lc;|_$7)#9_ONQs3=rNXU z@(z&}8-G0k2s6>NnPWnW<7mEm+;2byGKo7Tyu>^T=(HJWPKBNr1IyIaiqSm$?F#l_MLm=4 z9lC4+Uon-^Jvh@lA!7bEFmFBbS`HB4rdhlRmb3CA8Z&{ioCl=MeK&>ShO`84ywtRB=kh}wY5?m zBYb+nMq67Q#gAw^MWtOHs=(K}=NpFWbpp10JbR^IHc1PDS^?4yH&#r!#kS5UrYRDt zS2c3*f)jj(G%X)Ln-I|y_;)P=HQ?k+z19|irfi4A2gZm!fKfJQ03}3KSw>^Yg%Q(5 z#ydy03v@a?KA^plZ;x7^h?}=jiuFOg=XI22cMR6@Oq#xFB4T8{a}N8KTO=By%mOp# 
z#Vm1xW0%~jWo=$Vsm2>SFD__Lj!d>pF^93GR^K%YR<1eBFxgkw1Yn{*UmEWJ&-}B? z3dI>Kg|vY+VSt0(qqEL_{V`v0oSqdKMiGx#AEAtQ5!JJKkB3z6gyQzR5Nn@{Gwb6m z6l+vFQO8?ZJMRW}#kI1j>gKbyKii|{Qsm5zQlmwMer|Co##pB+^8$zK`v;$VF6wx7sZg|<*k7jkw~ za&OJcjDaUi&<9h!?^6Fd9N+e@pr6-+oR@}2bF=WBh8aO{-hOIx%H-dP^KJ#^eEqEhi@|hKl`7@ny=IHDs``Ctbwru>H7XXwn6CC zmJNH4B026VN#1%pdh90?!Lo-TfTi<<=^G>V?SU1sV(pak$sJ%VE4g*oGRbf-Z_(^= zd4?(l^+}3;wXkdQKKbwrwoFOdNH(Iv+IM@O%N9XbHkV%Jh9plVSD;b19namE?NC~X zZTNe#T7f^|V3iblvDmx?Jf3Uo!K~ExCBeti>Zs^NVZj_gBjb& zs`GAHY|>yN)C%9jeBo6%oKY0Kj(@&v@j7fmY>pscOSTfmLkbauZID}UeitW6MYA~% zV|fbCium#F=(}f*A6e8sJ}3N`MgT%mR#v1bz&)I{iM)|a%OM!b8yF~ehvg?yg@I%W zN5Op3#y|ZV(cq&95JisT#Futcy~IPL_ZC;iXdj0;7W|S3&WZv0i(% zwaJh(sc@LdInKkF7TIcYd3k?atq0M=0vwamj`K3FZu4N0M27ym^A2|hHhJI|b jEz!#-49kT1m0*qI#7_*tS1qLE!=C>Ko&@jh4MG3_K3x28 diff --git a/perl/t/tracks/nearest/db/hg19/raw/refSeq/hg19.refGene.chrM b/perl/t/tracks/nearest/db/hg19/raw/refSeq/hg19.refGene.chrM deleted file mode 100644 index aa581ae67..000000000 --- a/perl/t/tracks/nearest/db/hg19/raw/refSeq/hg19.refGene.chrM +++ /dev/null @@ -1,8 +0,0 @@ -name chrom strand txStart txEnd cdsStart cdsEnd exonCount exonStarts exonEnds score name2 kgID mRNA spID spDisplayID refseq protAcc description rfamAcc tRnaName ensemblID geneSymbol -NR_137295 chrM + 1672 3230 3230 3230 1 1672, 3230, 0 RNR2 NA NA NA NA NA NA NA NA NA NA NA -NR_FAKE1 chrM + 2000 2300 3230 3230 1 1672, 3230, 0 RNR2_FAKE1 NA NA NA NA NA NA NA NA NA NA NA -NR_FAKE2 chrM + 2200 3400 3230 3230 1 1672, 3230, 0 RNR2_FAKE2 NA NA NA NA NA NA NA NA NA NA NA -NR_FAKE3 chrM + 3800 4000 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA -NR_FAKE3B chrM + 3800 4100 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA -NR_FAKE3C chrM + 3800 4500 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA -NR_FAKE4 chrM + 4300 5000 3810 3900 1 1672, 3230, 0 FAKE4 NA NA NA NA NA NA NA NA NA NA NA \ No newline at end of file diff --git a/perl/t/tracks/nearest/integration.t b/perl/t/tracks/nearest/integration.t deleted file mode 100644 index 32cb4b026..000000000 --- a/perl/t/tracks/nearest/integration.t +++ /dev/null @@ -1,540 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package MockBuilder; - -use Mouse; -extends 'Seq::Base'; - -1; - -use Test::More; -use lib 't/lib'; -use TestUtils qw/ PrepareConfigWithTempdirs /; - -use Path::Tiny; -use Scalar::Util qw/looks_like_number/; - -# create temp directories -my $dir = Path::Tiny->tempdir(); - -# prepare temp directory and make test config file -my $config_file = PrepareConfigWithTempdirs( - 't/tracks/nearest/test.yml', - 't/tracks/nearest/db/hg19/raw', - [ 'database_dir', 'files_dir', 'temp_dir' ], - 'files_dir', $dir->stringify -); - -my $baseMapper = Seq::Tracks::Reference::MapBases->new(); - -# Defines three tracks, a nearest gene , a nearest tss, and a region track -# The region track is simply a nearest track for which we storeOverlap and do not storeNearest -# To show what happens when multiple transcripts (as in NR_FAKE3, NR_FAKE3B, NR_FAKE3C) -# all share 100% of their data, except have different txEnd's, which could reveal issues with our uniqueness algorithm -# such as calculating the maximum range of the overlap: in previous code iterations -# we removed the non-unique overlapping data, without first looking at the txEnd -# and therefore had a smaller-than-expected maximum range -my $seq = MockBuilder->new_with_config( { config => $config_file } ); - -my $tracks = $seq->tracksObj; -my $refBuilder = $tracks->getRefTrackBuilder(); -my 
$nearestTssBuilder = $tracks->getTrackBuilderByName('refSeq.nearestTss'); -my $nearestBuilder = $tracks->getTrackBuilderByName('refSeq.nearest'); -my $geneBuilder = $tracks->getTrackBuilderByName('refSeq.gene'); -my $nearestGetter = $tracks->getTrackGetterByName('refSeq.nearest'); -my $nearestTssGetter = $tracks->getTrackGetterByName('refSeq.nearestTss'); -my $geneGetter = $tracks->getTrackGetterByName('refSeq.gene'); - -my $db = Seq::DBManager->new(); - -$refBuilder->buildTrack(); -$nearestTssBuilder->buildTrack(); -$nearestBuilder->buildTrack(); -$geneBuilder->buildTrack(); - -### We have: -#name chrom strand txStart txEnd -#NR_137295 chrM + 1672 3230 -#NR_FAKE1 chrM + 2000 2300 -#NR_FAKE2 chrM + 2200 3400 - -my $mainDbAref = $db->dbReadAll('chrM'); -my $regionDataAref = $db->dbReadAll('refSeq.nearest/chrM'); -my $hasNearestCount = 0; -my $hasNearestTssCount = 0; -my $nearestDbName = $nearestBuilder->dbName; -my $nearestTssDbName = $nearestTssBuilder->dbName; -my $geneDbName = $geneBuilder->dbName; - -for my $dbData (@$mainDbAref) { - if ( defined $dbData->[$nearestDbName] ) { - $hasNearestCount++; - } - - if ( defined $dbData->[$nearestTssDbName] ) { - $hasNearestTssCount++; - } -} - -ok( $hasNearestCount == @$mainDbAref ); -ok( $hasNearestTssCount == @$mainDbAref ); - -# Want to make sure that order is always preserved -my %map = ( - NR_137295 => 'RNR2', - NR_FAKE1 => 'RNR2_FAKE1', - NR_FAKE2 => 'RNR2_FAKE2', - NR_FAKE3 => 'FAKE3', - NR_FAKE3B => 'FAKE3', - NR_FAKE3C => 'FAKE3', - NR_FAKE4 => 'FAKE4', -); - -for my $pos ( 0 .. $#$mainDbAref ) { - my $dbData = $mainDbAref->[$pos]; - - my @out; - my $nGeneData = $nearestGetter->get( $dbData, 'chrM', 'C', 'A', 0, \@out, $pos ); - - my @outTss; - my $nTssGeneData = - $nearestTssGetter->get( $dbData, 'chrM', 'C', 'A', 0, \@outTss, $pos ); - - my @outGene; - my $geneData = $geneGetter->get( $dbData, 'chrM', 'C', 'A', 0, \@outGene, $pos ); - - # config features order is name, name2, and then we add dist in the 3rd position - # so 0 == name 1 == name2 2 == dist - my $name = $out[0][0]; - my $name2 = $out[1][0]; - my $dist = $out[2][0]; - - my $nameTss = $outTss[0][0]; - my $name2Tss = $outTss[1][0]; - my $distTss = $outTss[2][0]; - - # This track we specify should not have a distance feature in YAML - # Only feature is name2 and we're not calculating a distnace - # so array should have only 1 value per position entry - ok( !@outGene || @{ $outGene[0] } == 1 ); - - my $name2Gene = $outGene[0][0]; - - # check that order preserved - if ( ref $name ) { - my $i = -1; - for my $n (@$name) { - $i++; - my $expected = $map{$n}; - - # If we have an array, we expect all other values (except dist) - # to be in array form - # ie - # if name = [val1, val2] - # name2 = [name2] even when there are no other unique name2 values - # name2 = [name2_forVal1, name2_forVal2] when there is more than one unique value - # This is done because features aren't guaranteed to be scalar - # Let's say we have a feautre for tissues the gene is expressed in - # for the above, when all tissues identical between transcripts, we can, without - # loss of information, compress to - # expression = [[kidney, spleen, pons, medulla]] - # whereas - # expression = [kidney, spleen, pons, medulla] would be completely incorrect - # More detail in lib/Seq/Tracks/Nearest/Build.pm - my $actual; - if ( @{$name2} > 1 ) { - $actual = $name2->[$i]; - } - else { - $actual = $name2->[0]; - } - - ok( $actual eq $expected ); - } - } - else { - my $expected = $map{$name}; - ok( $name2 eq $expected ); - 
} - - if ( ref $nameTss ) { - my $i = -1; - for my $n (@$nameTss) { - $i++; - my $expected = $map{$n}; - - # If we have an array, we expect all other values (except dist) - # to be in array form - # ie - # if name = [val1, val2] - # name2 = [name2] even when there are no other unique name2 values - # name2 = [name2_forVal1, name2_forVal2] when there is more than one unique value - # This is done because features aren't guaranteed to be scalar - # Let's say we have a feautre for tissues the gene is expressed in - # for the above, when all tissues identical between transcripts, we can, without - # loss of information, compress to - # expression = [[kidney, spleen, pons, medulla]] - # whereas - # expression = [kidney, spleen, pons, medulla] would be completely incorrect - # More detail in lib/Seq/Tracks/Nearest/Build.pm - my $actual; - if ( @{$name2Tss} > 1 ) { - $actual = $name2Tss->[$i]; - } - else { - $actual = $name2Tss->[0]; - } - - ok( $actual eq $expected ); - } - } - else { - my $expected = $map{$nameTss}; - ok( $name2Tss eq $expected ); - } - - # An intergenic position - # In all of these tests, we assume that the end position (here txEnd) is 0-based - # This should be ensured by the nearest builder function - if ( $pos <= 1672 ) { - ok( $dist == 1672 - $pos ); - ok( $name eq 'NR_137295' ); - ok( $name2 eq 'RNR2' ); - - # for intergenic stuff, all should be identical between - # nearest tracks that go from .. to and those with just one endpoint (from) - ok( $distTss == $dist ); - ok( $nameTss eq $name ); - ok( $name2Tss eq $name2 ); - } - - # 2000 here is the txStart of NR_FAKE1, so -1 that is the last non-NR_FAKE1 base - # NR_137295 chrM + 1672 3230 3230 3230 1 1672, 3230, 0 RNR2 NA NA NA NA NA NA NA NA NA NA NA - if ( $pos >= 1672 && $pos < 2000 ) { - ok( $dist == 0 ); - ok( $name eq 'NR_137295' ); - ok( $name2 eq 'RNR2' ); - - # We always expect our gene track to provide scalars, when there is a single unique - # value, since we can do so without any ambiguity (since 1 feature only) - # The uniquness algorithm should do so only when that is possible - ok( $name2Gene eq 'RNR2' ); - - # for spaces between two from points of adjacent transcripts/regions - # nearest tracks of only 'from' coordinate should treat these as essentially - # intergenic - # The midoint should be calcualted as txStartPrevious + txStartPrevious ... 
txStartNext / 2 - # since txStart is 0-based closed - # In reality we round off, such that the actual midpoint is assigne to the downstream values - # when we consider - if ( $pos < 1672 + ( 2000 - 1672 ) / 2 ) { - ok( $distTss == 1672 - $pos ); - ok( $nameTss eq 'NR_137295' ); - ok( $name2Tss eq 'RNR2' ); - } - else { - ok( $distTss == 2000 - $pos ); - ok( $nameTss eq 'NR_FAKE1' ); - ok( $name2Tss eq 'RNR2_FAKE1' ); - } - } - - # 2000 is the txStart of NR_FAKE1 - # 2200 here is the txStart of NR_FAKE2, the next closest transcript when measured by txStart - if ( $pos >= 2000 && $pos < 2200 ) { - ok( $dist == 0 ); - ok( join( ",", @$name ) eq 'NR_137295,NR_FAKE1' ); - ok( join( ",", @$name2 ) eq 'RNR2,RNR2_FAKE1' ); - - # same as nGene within the gene when storeNearest is false - ok( join( ",", @$name2Gene ) eq 'RNR2,RNR2_FAKE1' ); - # ok(!defined $distGene); - - # for spaces between two from points of adjacent transcripts/regions - # nearest tracks of only 'from' coordinate should treat these as essentially - # intergenic - my $midPoint = 2000 + ( 2200 - 2000 ) / 2; - if ( $pos < $midPoint ) { - ok( $distTss == 2000 - $pos ); - ok( $nameTss eq 'NR_FAKE1' ); - ok( $name2Tss eq 'RNR2_FAKE1' ); - } - else { - ok( $distTss == 2200 - $pos ); - ok( $nameTss eq 'NR_FAKE2' ); - ok( $name2Tss eq 'RNR2_FAKE2' ); - } - } - - # 2300 is txEnd, so -1 for last pos in RNR2_FAKE1 - # This is the closest tx when measured by txStart .. txEnd - # In this range, up to 3 tx overlap whn looking txStart .. txEnd - # NR_137295 chrM + 1672 3230 3230 3230 1 1672, 3230, 0 RNR2 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE1 chrM + 2000 2300 3230 3230 1 1672, 3230, 0 RNR2_FAKE1 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE2 chrM + 2200 3400 3230 3230 1 1672, 3230, 0 RNR2_FAKE2 NA NA NA NA NA NA NA NA NA NA NA - if ( $pos >= 2200 && $pos < 2300 ) { - ok( $dist == 0 ); - ok( - join( ",", sort { $a cmp $b } @$name ) eq - join( ",", sort { $a cmp $b } 'NR_137295', 'NR_FAKE1', 'NR_FAKE2' ) ); - ok( - join( ",", sort { $a cmp $b } @$name2 ) eq - join( ',', sort { $a cmp $b } 'RNR2', 'RNR2_FAKE1', 'RNR2_FAKE2' ) ); - - # same as nGene within the gene when storeNearest is false - ok( - join( ",", sort { $a cmp $b } @$name2Gene ) eq - join( ",", sort { $a cmp $b } 'RNR2', 'RNR2_FAKE1', 'RNR2_FAKE2' ) ); - # ok(!defined $distGene); - - #For single-point (from) nearest tracks, this case is equivalent to being - #intergenic (or on top of @2200) and past/on top of the last transcript (NR_FAKE2) - ok( $distTss == 2200 - $pos ); - ok( $nameTss eq 'NR_FAKE2' ); - ok( $name2Tss eq 'RNR2_FAKE2' ); - } - - # 3230 is 0-based open interval (txEnd), so -1 that is the end of RNR2 - # This is the next closest point by txEnd, and in this interval up to 2 transcripts overlap - # when measured by txStart .. 
txEnd - 1 - # NR_137295 chrM + 1672 3230 3230 3230 1 1672, 3230, 0 RNR2 NA NA NA NA NA NA NA NA NA NA NA - # -> can't overlap, since ends at 2300 - 1 == 2299: NR_FAKE1 chrM + 2000 2300 3230 3230 1 1672, 3230, 0 RNR2_FAKE1 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE2 chrM + 2200 3400 3230 3230 1 1672, 3230, 0 RNR2_FAKE2 NA NA NA NA NA NA NA NA NA NA NA - if ( $pos >= 2300 && $pos < 3230 ) { - ok( $dist == 0 ); - ok( - join( ",", sort { $a cmp $b } @$name ) eq - join( ",", sort { $a cmp $b } 'NR_137295', 'NR_FAKE2' ) ); - ok( - join( ",", sort { $a cmp $b } @$name2 ) eq - join( ',', sort { $a cmp $b } 'RNR2', 'RNR2_FAKE2' ) ); - - # We don't guarantee transcript order atm, but all features will be correct relative to - # all transcripts - ok( - join( ",", sort { $a cmp $b } @$name2Gene ) eq - join( ",", sort { $a cmp $b } 'RNR2', 'RNR2_FAKE2' ) ); - # ok(!defined $distGene); - - # For txStart, these flank 2300 - 3230 - # NR_FAKE2 chrM + 2200 3400 3230 3230 1 1672, 3230, 0 RNR2_FAKE2 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE3 chrM + 3800 4000 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE3B chrM + 3800 4100 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE3C chrM + 3800 4500 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA - - if ( $pos < 2200 + ( 3800 - 2200 ) / 2 ) { - #For single-point (from) nearest tracks, this case is equivalent to being - #intergenic (or on top of @2200) and past/on top of the last transcript (NR_FAKE2) - ok( $distTss == 2200 - $pos ); - ok( $nameTss eq 'NR_FAKE2' ); - ok( $name2Tss eq 'RNR2_FAKE2' ); - } - else { - #For single-point (from) nearest tracks, this case is equivalent to being - #intergenic (or on top of @2200) and past/on top of the last transcript (NR_FAKE2) - ok( $distTss == 3800 - $pos ); - ok( - join( ';', sort { $a cmp $b } @$nameTss ) eq - join( ';', sort { $a cmp $b } 'NR_FAKE3', 'NR_FAKE3B', 'NR_FAKE3C' ) ); - - # when multiple transcripts overlap AND there is at least one non-scalar value, - # all data is represented in array form to help distinguish between multiple - # overlapping transcripts, and one transcript with some features that contain multiple values - # or multiple transcripts with a mix of unique and non-unique features - ok( @$name2Tss == 1 && $name2Tss->[0] eq 'FAKE3' ); - } - } - - # Between 3230 txStart and 3400 - 1 txEnd (since the 3400 is 0-based, open) - # NR_137295 chrM + 1672 3230 3230 3230 1 1672, 3230, 0 RNR2 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE1 chrM + 2000 2300 3230 3230 1 1672, 3230, 0 RNR2_FAKE1 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE2 chrM + 2200 3400 3230 3230 1 1672, 3230, 0 RNR2_FAKE2 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE3 chrM + 3800 4000 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA - # So we have 1672, 2000, 2200 txStarts - # Closest to 3230 - 3400 is 2200, or 3800 - # Midpoint is 3800-2200 / 2 = +800, or 3000, so it's actually always close to NR_FAKE3 - if ( $pos >= 3230 && $pos < 3400 ) { - ok( $dist == 0 ); - ok( $name eq 'NR_FAKE2' ); - ok( $name2 eq 'RNR2_FAKE2' ); - - ok( $name2Gene eq $name2 ); - # ok(!defined $distGene); - - #For single-point (from) nearest tracks, this case is equivalent to being - #intergenic (or on top of @2200) and past/on top of the last transcript (NR_FAKE2) - ok( $distTss == 3800 - $pos ); - ok( - join( ",", sort { $a cmp $b } @$nameTss ) eq - join( ",", sort { $a cmp $b } 'NR_FAKE3', 'NR_FAKE3B', 'NR_FAKE3C' ) ); - ok( join( ",", @$name2Tss ) eq 'FAKE3' ); - } - - # 
Testing that if multiple transcripts share a start, but not an end, that 1) we consider the intergenic region - # by the longest end - # And, that we don't consider the overlap incorrectly by modifying the end to the longest end... - # that is critical of course - # NR_FAKE2 chrM + 2200 3400 3230 3230 1 1672, 3230, 0 RNR2_FAKE2 NA NA NA NA NA NA NA NA NA NA NA - #then ... - # NR_FAKE3 chrM + 3800 4000 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE3B chrM + 3800 4100 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE3C chrM + 3800 4500 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE4 chrM + 4300 5000 3810 3900 1 1672, 3230, 0 FAKE4 NA NA NA NA NA NA NA NA NA NA NA - if ( $pos >= 3400 && $pos < 3800 ) { - # nearest is from txStart to txEnd, so use end to calc distance - - # for refSeq.gene, we don't consider intergenice - # ok(!defined $distGene); - # Relevant: - # NR_FAKE2 chrM + 2200 3400 3230 3230 1 1672, 3230, 0 RNR2_FAKE2 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE3 chrM + 3800 4000 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE3B chrM + 3800 4100 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA - # NR_FAKE3C chrM + 3800 4500 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA - ok( !defined $name2Gene ); - - if ( $pos < 3399 + ( 3800 - 3399 ) / 2 ) { - ok( $dist = 3399 - $pos ); - ok( $name eq 'NR_FAKE2' ); - ok( $name2 eq 'RNR2_FAKE2' ); - } - else { - ok( $dist = 3800 - $pos ); - ok( - join( ",", sort { $a cmp $b } @$nameTss ) eq - join( ",", sort { $a cmp $b } 'NR_FAKE3', 'NR_FAKE3B', 'NR_FAKE3C' ) ); - ok( join( ",", @$name2 ) eq 'FAKE3' ); - } - - if ( $pos < 2200 + ( 3800 - 2200 ) / 2 ) { - # should never appear here - ok( $nameTss eq 'NR_FAKE2' ); - ok( $name2Tss eq 'RNR2_FAKE2' ); - # for nearestTss, only txStart is considered of NR_FAKE2 - ok( $distTss == 2200 - $pos ); - } - else { - ok( - join( ",", sort { $a cmp $b } @$nameTss ) eq - join( ",", sort { $a cmp $b } 'NR_FAKE3', 'NR_FAKE3B', 'NR_FAKE3C' ) ); - ok( join( ",", @$name2Tss ) eq 'FAKE3' ); - # for nearestTss, only txStart is considered of NR_FAKE2 - ok( $distTss == 3800 - $pos ); - } - } - - if ( $pos >= 3800 && $pos < 5000 ) { - if ( $pos < 4000 ) { - ok( $dist == 0 ); - ok( - join( ",", sort { $a cmp $b } @$name ) eq - join( ",", sort { $a cmp $b } 'NR_FAKE3', 'NR_FAKE3B', 'NR_FAKE3C' ) ); - ok( join( ",", @$name2 ) eq 'FAKE3' ); - - # name2 gene will have no array values at all here; it's equivalent of only 1 gene - # since we're not recording any per-transcript info - # so no array - # TODO: this may be somewhat confusing... 
- ok( $name2Gene eq 'FAKE3' ); - - #For single-point (from) nearest tracks, this case is equivalent to being - #intergenic (or on top of txStart) - ok( $distTss == 3800 - $pos ); - ok( - join( ",", sort { $a cmp $b } @$nameTss ) eq join( ",", sort { $a cmp $b } @$name ) - ); - ok( join( ",", @$name2Tss ) eq join( ",", @$name2 ) ); - } - - if ( $pos >= 4000 && $pos < 4100 ) { - ok( $dist == 0 ); - ok( - join( ",", sort { $a cmp $b } @$name ) eq - join( ",", sort { $a cmp $b } 'NR_FAKE3B', 'NR_FAKE3C' ) ); - # We de-dup, to unique values - ok( join( ",", @$name2 ) eq 'FAKE3' ); - - # if all values can be represented as scalars they are, else all represented - # as arrays - ok( $name2Gene eq 'FAKE3' ); - } - - # midopint to NR_FAKE4 chrM + 4300 5000 3810 3900 1 1672, 3230, 0 FAKE4 NA NA NA NA NA NA NA NA NA NA NA - if ( $pos < 3800 + ( 4300 - 3800 ) / 2 ) { - #For single-point (from) nearest tracks, this case is equivalent to being - #intergenic (or on top of txStart) - ok( $distTss == 3800 - $pos ); - ok( - join( ",", sort { $a cmp $b } @$nameTss ) eq - join( ",", sort { $a cmp $b } 'NR_FAKE3', 'NR_FAKE3B', 'NR_FAKE3C' ) ); - ok( join( ",", @$name2Tss ) eq join( ",", @$name2 ) ); - } - else { - ok( $distTss == 4300 - $pos ); - ok( $nameTss eq 'NR_FAKE4' ); - ok( $name2Tss eq 'FAKE4' ); - } - - # end of NR_FAKE3B chrM + 3800 4100 - # to beginning of NR_FAKE3C chrM + 3800 4500 - if ( $pos >= 4100 ) { - if ( $pos < 4300 ) { - ok( $dist == 0 ); - ok( $name eq 'NR_FAKE3C' ); - ok( $name2 eq 'FAKE3' ); - - # ok($distGene == 0); - ok( $name2Gene eq $name2 ); - } - - # Within txEnd bound of NR_FAKE3C chrM + 3800 4500 3810 3900 1 1672, 3230, 0 FAKE3 NA NA NA NA NA NA NA NA NA NA NA - # and txStart bound of NR_FAKE4 chrM + 4300 5000 3810 3900 1 1672, 3230, 0 FAKE4 NA NA NA NA NA NA NA NA NA NA NA - if ( $pos >= 4300 && $pos < 4500 ) { - ok( $dist == 0 ); - ok( - join( ",", sort { $a cmp $b } @$name ) eq - join( ",", sort { $a cmp $b } 'NR_FAKE3C', 'NR_FAKE4' ) ); - ok( - join( ",", sort { $a cmp $b } @$name2 ) eq - join( ",", sort { $a cmp $b } 'FAKE3', 'FAKE4' ) ); - - # ok($distGene == 0); - ok( - join( ",", sort { $a cmp $b } @$name2Gene ) eq - join( ",", sort { $a cmp $b } @$name2 ) ); - } - - if ( $pos >= 4500 ) { - ok( $dist == 0 ); - ok( $name eq 'NR_FAKE4' ); - ok( $name2 eq 'FAKE4' ); - - # if(!defined $distGene) { - # p @outGene; - # } - # ok($distGene == 0); - ok( $name2Gene eq $name2 ); - } - } - } - - # Intergenic, after the end - if ( $pos >= 5000 ) { - # 5000 is the end + 1 of the 0-based end - ok( $dist == 4999 - $pos ); - ok( $name eq 'NR_FAKE4' ); - ok( $name2 eq 'FAKE4' ); - - #For single-point (from) nearest tracks, this case is equivalent to being - #intergenic (or on top of @2200) and past/on top of the last transcript (NR_FAKE2) - ok( $distTss == 4300 - $pos ); - ok( $nameTss eq 'NR_FAKE4' ); - ok( $name2Tss eq 'FAKE4' ); - - ok( !defined $name2Gene ); - # ok(!defined $distGene); - } -} - -done_testing(); diff --git a/perl/t/tracks/nearest/test.yml b/perl/t/tracks/nearest/test.yml deleted file mode 100644 index 9503b6a16..000000000 --- a/perl/t/tracks/nearest/test.yml +++ /dev/null @@ -1,94 +0,0 @@ ---- -assembly: hg19 -build_author: ec2-user -build_date: 2017-11-27T05:44:00 -chromosomes: -- chr1 -- chr2 -- chr3 -- chr4 -- chr5 -- chr6 -- chr7 -- chr8 -- chr9 -- chr10 -- chr11 -- chr12 -- chr13 -- chr14 -- chr15 -- chr16 -- chr17 -- chr18 -- chr19 -- chr20 -- chr21 -- chr22 -- chrM -- chrX -- chrY -database_dir: t/tracks/nearest/db/hg19/index -files_dir: 
t/tracks/nearest/db/hg19/raw -snpProcessor: bystro-snp -statistics: - dbSNPnameField: dbSNP.name - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tsv - tab: .statistics.tsv - programPath: bystro-stats - refTrackField: ref - siteTypeField: refSeq.siteType -temp_dir: /mnt/annotator/bystro-dev/tmp -tracks: - tracks: - - build_author: ec2-user - build_date: 2017-11-27T05:44:00 - local_files: - - chrM.fa.gz - name: ref - type: reference - - features: - - kgID - - mRNA - - spID - - spDisplayID - - protAcc - - description - - rfamAcc - - name - - name2 - local_files: - - hg19.refGene.chrM - name: refSeq - type: gene - - dist: true - features: - - name - - name2 - from: txStart - name: refSeq.nearest - ref: refSeq - to: txEnd - type: nearest - - dist: true - features: - - name - - name2 - from: txStart - name: refSeq.nearestTss - ref: refSeq - type: nearest - - dist: false - features: - - name2 - from: txStart - to: txEnd - name: refSeq.gene - ref: refSeq - storeNearest: false - type: nearest -vcfProcessor: bystro-vcf - diff --git a/perl/t/tracks/reference/db/raw/refTest/chrM.fa.gz b/perl/t/tracks/reference/db/raw/refTest/chrM.fa.gz deleted file mode 100644 index eaebd9e159e4c22de51ac20439688a75a3604d07..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5549 zcmV;e6;kRSiwFpBw&79$17m1%O)h3(08N{*awNNsM16kLMj&9o+GpVh6f%144gG&S zB2R)n_s(dfR#z1X0*TBd`+t7_@BjbzRCGp0RcGtloPRs+H<9ymRYl$J`o7S0|DNvq z?)#qZoU2OzwSJkE_l7ZV>-o0km98_lNZrT}xEJ2<=c=6V@4fDI_YwE1d1~ij7(IVe zm-W5ZeM^LXF+;u&xc}bo=jV+3vVf{@%rg9+VZVm9;HOLU+4T0s;-H!QxLXSU`y{~SeT!g6Q1 z!JVJyqGFwNOW)>Rb664&EcmLx%zK)!&h*nWs2k*YZQhZLegD_0F)VH}Zx5!giFiI0 zZzMCU*2+%0%q6X*j5y`T=EQu~WZqaxEv1@Ye!*b5AcD6}?x>VC76s0mU2eNwCCdaNy%Tzdf%$aWwL-7hEa1eXh2CX zPKf8O#chQSq+2+n=38aJ4RW)T!o_0vF{7=16+NZldeHO*zmfd{94t6tPY{0mG4rz5d?58xwdO@=jI+%$sl95@V14`xtV&$vr87tsF-@G2?!YzcqS6&shSLmCiE>`Mr9CEV zQFKWK9h4-ZMSo$KWt1@n`3eRYMly{~IR+w-8w@Ko=`RZs*isO8_31hd5~#kpXLGA$ zsTEWe+I@-y__DK`r4Fb!~iTVSCQU#0)^^ZES!e13oae$LO|@6Yew z-{K0o(|`!{nZO|(eLMc^st|MmwD`cb?&Br@0;flbI-Zad=V!+U_3Q$is!V3aD?FmV1QgL z38ZgRO$xD6a@R+R07@U237xVxxKuh>Wkz5&dapc-4J2byIO>`TvKlSvBdi+eQv#}V zlBO8$(PB`s!_>N!df@@%+N~RS1qV~(vCp0O*$Yoc8Xf>h`>nZ}&8rZ!3 zKTB!hCF&Hl{YEt(cf>NF?eB_hP;RUtxOHpIP>g(rmJ266f=G zZYKPaGe1A%rBZG3=TGmOOq_K2RUY0yXVUBb;aQM<6Wj$7ftY?PP`sE7l~nd*t`rG) zbUB^0rb|MOsQ~TR^u8=#wjV0+9l zysZxRBw=gyPk*(6UoxLHyC7j5C3T`+7H!1g3u&rL6iVfn7O<9_n*2Nn`XUMya~G?D>F8|cAQ>*ZXGIHai5dR4cvs&!V(4- zuJ7Dn-Z32_UkTwWDjJ|D&8Hx~TCVBOF*zpDQocvgx7q}md$l>2=6awnmZ*;uy9tr0 zMjZz*j|d&hO&?3&QtuE@kZ(+XltrgwxTL(zfznjKQXx_OrP_B;H1yo^>ZJ*7p=vQR zdcVAU?Lf;`=$DkevqId?4Nh8|@ga8)QXA;!XTcNBdEmnVxfT zW%|{*;4;m}ul@e0c}=JLO!xXxUz&b(-=w$RUP1eTluihfQ-Oz@NLmEXV> zm>V9Ft!gC&9LZpyG1sV96l%B4jk1Td;$@o{5EU3i?Gn@DA`qAKKr>a|VLI2xkP}`< zslU~U5zAi4CVI*-rNvMX!4eRJ88Lw|g zY(-+nq`bbxIZB-EVBuukOWYr%#NpTp!(Y}B%bnJQ zq?Y0v>;2x3-A)nikKkC^aC`cc#ZHGIP1InXw~O^F6_Q2di(qFgH#N85`t$16rh!7a zY((g`h|SulDtnX1%C}cIG>&JQG1w6pv~2u_LahR}D%q}nYE;q}@p;m^yxqEK8P*}s zWwm(I#m6&f4?*JfA+1}NwWuF?)$$lwuj4&MPWS0SiL5z(%d%OR&aV5SZmFBx*chd2)8YaS7+T)7e%_DD*Wdml6x55IBBZ_ zKr1_L;oSjuj^%QPx~*DpGB$!Py^SN+EQ(?S0w1Z}x0ET*CS0YcQc2jws$m)J{k*ea zMAYgwezo3%*@3ixq*&aWLeaj{q(s6BS3YBDO_UR~%>4z@6=oZumkTdt+q7A!#M(khWJEM`!LvriKchNfW)NZPqO*LLnXJBvt>L6)ve4A; z=_gWR71y#5bepX>2U8q|gs_^!OSCx(tF`LV&vKlk#mJYHt1$M&e>HT}YKZ&gm`m*B z=v{0*F%y7x>t}H6u|x`0)7;L1BkT@`W0UgNK|h9ty~Y_rdwe2Y;SRu-T8{rS^(xM_dCRMDDzz>VREz?YQx?$p>W zx6O-DVeUtB!95zZq=eimYUzlxgY1D{fOe@~NpRD;{?(PT3f)|*N)=?bLZLNgq`Bf8 
zifuV>-Y8|kU_gA6pEQ7mMMdTA0f}1kxG%w?k2#UUCPBvHW*zVUXtU|{5QF^}jB*Li zxIL&9*ENHgU*n~xj`qHlop@PHcDI^-SfoY)wD4 z`vxRBIczeU4b6o^qTfIYLH;H$colXz@vzmXn@CGW;T}4YkQE}W!fS-DugPlVKd&Y z*In9sUx}O*3)0JdDNtQc6%B{leBu zm`sKQFVdbx&u#y#37RpyBf>d`+h5rD`NvA9K+LP#gb7u{kECTDucJ zMm9%(!uSy#bJe9BZ|_h?fPgjb~a-JDRmUDs24i;g@69Y40xE1gLUD;yz7B zQ6WoKrB%R^;x>AgJ=|Wg>TM2s!k+Q)pq(++a4WPD4zsNRlZxOGZIe3gjfSw*$qj;Y zkcs*<)!#CPX+t(=mnVFFlKb1#pcfl?%BT(jKV8qTb}=u*pu1To&>v_Px$ne-wxtu_ z_6Q=oo40!%-$OcoaJi<;2-i!@|qQv%q}&HmS|W zHXHqJfrU>5I{E>Xp{EUXXv5TrC^R1hNKJMje;h;z;*yE7^v3p16v!)a!Eh)XRmF^>Yebb!hm9=TgK5B{~RbXa4kX~x;T zrEQXCQ#2xTYi~C2s5)$x0twm#e`dI>f;RC6bV)B zH5|O)2%jO1=?Axv&#f0)c{k7^%@tJf$MfTM-0=P1kWD6HkGZThB(z}E53IqX|*5hrxImTJ_) zES&peBNWxrHm{*n<&B+(3+&0AetRCythA*T-!)vUY;%}mvQOA7z^wXwG<^S@e|A|{ z*s*Ne2HJ!H4mQWLea=i@>MM@7XRQpQh_&2@P_B2es;Bok9#Xsst=sb;);^c2y~!j514#Y>LjT;Xn}X(n$CieQyH zx~w}TW%`3`rTaWxr-dt)SHhTRG&aw8TN2+eekrOD1`9}t_QWY zW5niOo0k~_!-pt5^YXq+{OfRh+dt)V)`J8uO>hiuu`e+v!|vIBYIABuebl!5*Ko0k zBF^5xlrT)m7PE6x+cG#B4)fZ0vZ>e4VCyx1D&IKy@Ygy(R>=7uESN{)p_9(a29 zG5!gqg;+0t->fF^#~dtP3gvf5@Ruf~y`b#9J~8mrx~m$z$c-rsE6&=>@#aCK%u$!& zXv)+rXGjoYxfy8S%2k^?N`ULI0}5`38Ke8xtKGefPIq8-;l+x6Tu~KVYq4#(wJ{WX zm@^h1y)J1V@ zwnc0QGqxK?&T_`yG^iD7f$w3y@Krd+N}zlY0oDSUiPvl+ZA+#S#zR^n2-_gHUVaBB zNjU*?*8V8(;`E9i?~cAbbNtAl_VGF4$20;ElCrWQ-2!awisq9yl4*Mg#vASZKonkH z{zR%Xg`;3TY2y$7u4wR41c)NXan_f1Q{-@Q_qNKgM>|e$deCePEy~0AdV$fwRF;Y- zS6Qz;ZEZ5-ObQ%kcY_o8Xpya^EidOml3L5JTU*b)P_=C5F{b_|k)dB7WJ_K$zx7NO zX?kVdrh4$vKSN~Mr{3=R21yxcO*(z5bG?V)piA0UjkkO&%bol-jdOniBK_Nz!(9g3 zlM@jgwhd{>(a-L(72?^GQQP>!2h8HzJj!kMS)swlyf|wOPbf<{%4w-WzpwNrKFnf3 z>D{vcC6(VembI43VAU}Yxxv*_#2lj9&yQg-s6nt%uOQw*(IHV#>1i< zcw#Rt-QLpM2B#y>kmdl(pFhg=$3YhTvnS?x@$aLr{(#{s9Czk+KV4RlCi7Io$sN@j z9P+Ls*GD#g15r3Q?ZT`baP96Q4P5Y7-lm|7y{z|{0MI#?>9Oh2>-z91o9|(Nm3;e* v1SU2)4AxPyVb6a7zUY~a3qk+@a7P_M diff --git a/perl/t/tracks/reference/integration.t b/perl/t/tracks/reference/integration.t deleted file mode 100644 index 6c88d083b..000000000 --- a/perl/t/tracks/reference/integration.t +++ /dev/null @@ -1,74 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package MockBuilder; - -use Mouse; -extends 'Seq::Base'; - -1; - -use Test::More; -use lib 't/lib'; -use TestUtils qw/ PrepareConfigWithTempdirs /; - -use Path::Tiny; -use Scalar::Util qw/looks_like_number/; -use YAML::XS qw/ LoadFile /; - -# create temp directories -my $dir = Path::Tiny->tempdir(); - -# prepare temp directory and make test config file -my $config_file = PrepareConfigWithTempdirs( - 't/tracks/reference/integration.yml', - 't/tracks/reference/db/raw', [ 'database_dir', 'files_dir', 'temp_dir' ], - 'files_dir', $dir->stringify -); - -# get chromosomes that are considered -my $runConfig = LoadFile($config_file); -my %wantedChr = map { $_ => 1 } @{ $runConfig->{chromosomes} }; - -my $seq = MockBuilder->new_with_config( { config => $config_file } ); - -my $tracks = $seq->tracksObj; -my $refBuilder = $tracks->getRefTrackBuilder(); -my $refGetter = $tracks->getRefTrackGetter(); -my $db = Seq::DBManager->new(); - -$refBuilder->buildTrack(); - -my @localFiles = @{ $refBuilder->local_files }; - -for my $file (@localFiles) { - my $fh = $refBuilder->getReadFh($file); - my ( $chr, $pos ); - - while (<$fh>) { - chomp; - - if ( $_ =~ m/>(\S+)/ ) { - $chr = $1; - $pos = 0; - next; - } - - if ( !$wantedChr{$chr} ) { - next; - } - - for my $base ( split '', $_ ) { - my $data = $db->dbReadOne( $chr, $pos ); - my $out = []; - my $dbBase = $refGetter->get($data); - - ok( uc($base) eq $dbBase ); - - $pos++; - } - } -} - -done_testing(); diff --git 
a/perl/t/tracks/reference/integration.yml b/perl/t/tracks/reference/integration.yml deleted file mode 100644 index 6f874a240..000000000 --- a/perl/t/tracks/reference/integration.yml +++ /dev/null @@ -1,15 +0,0 @@ ---- -assembly: hg19 -build_author: ec2-user -build_date: 2017-11-27T05:44:00 -chromosomes: -- chrM -database_dir: t/tracks/reference/db/index -files_dir: t/tracks/reference/db/raw -tracks: - tracks: - - type: reference - name: refTest - local_files: - - '*' - diff --git a/perl/t/tracks/score/build/rounder.t b/perl/t/tracks/score/build/rounder.t deleted file mode 100644 index 4fd8da72a..000000000 --- a/perl/t/tracks/score/build/rounder.t +++ /dev/null @@ -1,32 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; - -use Seq::Tracks::Score::Build::Round; - -plan tests => 4; - -my $scalingFactor = 1000; -my $rounder = - Seq::Tracks::Score::Build::Round->new( { scalingFactor => $scalingFactor } ); - -say "\n Testing rounder functionality \n"; - -ok( - $rounder->round(.068) / $scalingFactor == .068, - "as long as enough precision, no rounding" -); -ok( - $rounder->round(.063) / $scalingFactor == .063, - "as long as enough precision, no rounding" -); -ok( - $rounder->round(.065) / $scalingFactor == .065, - "as long as enough precision, no rounding" -); -ok( - $rounder->round(-0.475554) / $scalingFactor == -0.476, - "rounds beyond scaling factor precision to nearest digit" -); diff --git a/perl/t/tracks/score/db/raw/phastCons/chrM.phastCons100way.wigFix.gz b/perl/t/tracks/score/db/raw/phastCons/chrM.phastCons100way.wigFix.gz deleted file mode 100644 index 73d568f6dcab942ae9f87a74fb48a6ca0e3c0216..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13098 zcmV+_Gu6x=iwFQNUHDP}1MPiDk}N%r-u|7UPat!SPG8g!w9*M=Qnht2WNK^GLdVx1 z_{bngfP;Hv=6hwP4KvNd2M!1zV2D;*sllc3En_gj_~VW$1@+<)R#`!`l9@5`rpw#jt1$d zbKcYm^7O1vb>i{@h&E3v?xZ(+dZ(wsc)G{oiL#C{n0Rj2BFhjo%H6Do;}^P#XY7zo8L3qB!;bm zWiyX;kF|Adbs{}`VWYm@Y=|BVz2BtQua-#P(%3Dntfjs=T2n`F@bpDb8#~Z;3h8xc zq*adeRHUsNsn3x%bEG~*T7^LDARHIH=XJk7U-x_3*q)y9G)7P3@VNZRC;dReRV^8#JqB%@N?TKdQqHTZC=q?)XMZLOc z2CgfzmabpE{z~JzXq7Me?nSG1(Hth~8)m~^7tQujS2ois=+nIE#M9h{L^FKU**@vl zldhi^{=evd8Y8XydD0bH&C_9JU*B@a>sJeAP4^!cs1Y2%RZ2UKL2IVJK5IwseAW|O z2bX{V@a(#LO0xUWoyXuF`c@VpUo{)@>-#hR?AiG$nCV>2S8Bh{;Lr3wQMe;+;1W?U z?Kl@0=gh*Ji}RnP+xcAWtm(7X{oq*cQmhs2==x}`M&qS*&%4aAR(}01O@6XF_TLXK z^f|BKv-gv<{rRZ>gnr)b1N2p3Hi-(XV6Y zUJF{qPSk0J&I@!xxuSND_LM#L_RD7L*h%)*+;(F$ZZP)e^M8?67R@$CB~Int^qq7b z!8xlP9d=4x&C}Y3eqO(NF*@EioNd2iuTA<76WDuxzWV)v)-m~wo?Ic1x}THr9)$c_ zzu8WIy~G!5n_!{3o^oKx)r-6*MUE%6R zbY0jZl@62Okj^oC+H}r)$+g)}uW_3<4f`F<=Unz}-0G*T6u?9J{&+Gjd)Q%o?0Rw; zHn;IhdAmZ-#Ql;V(8*U>7+4EyVWjnPg=1_=bK?Bj(j_#Uy5ovo3N6CWD9i;}`I_Mf zOWJ!EJQxMZ0_eK48P;aiO(bC1O@{2oSRMX;x2YBU3GI=cDEf{r6u#KkkNNgqd&auU z;wdHgPCJ!NzSnX=nKo)WuDvrQ+~2>Uw5_J`C#?9cOrbL)`8Ygrg9|ecOE}q$jWh4P zQ1B9NlB}V_%(fg9r zuRDKsze~z|TYgjt2)BLr1{&81-6k;|GW`^#&3U`dsp_hMmge^#bRsweRjk_8iQtZo zv^%=@o6q~S^ji1Ts=et}_U8-@Z+v-pA%Ar(^u(#bvmF5pfYEh>)2=w)Xl7n39^c&H zdTb?=r@x=wt?6xEuV-wfA_>|Uo zKjwt5y-5g%qql;_c*pKntlan3s~{_Ij!p!JN&!;F{HorHg0I*c`}sy<)V1T09_qA=op(Vo4B=yu;ILTM+Af{WIk$oxs zDzYGZPGPan9y-hBt<~6PeG~rH?$-zN#yv`8rNntKrM*4=;_w`sH0m3E#p&4e&)T&f zh51Z`w+w-!1&H$9SmTvetTm_l9xus@N@w1paeJr#^HtvGMa%n7aNY0P(OELmIm-~` zLkBRn>JNc*p>VRf@>9QXZufX#)wQbTW(Y%TfaawWjVFF+h0ggy_#d)6S<*bu6X=(C z^Xuz7Ev>a9-sPabrI9p;pSs9``8?OT_3X+MJ|?k4j&RiJl3yfDa@70ZbCM(DBJTHR z)<)7>5-^`OyuX4|gSYT?j*uST)|#`MrP1h86rB7cWJ_;vznHuL?#q}J*V^ZA<9mEY zTzoBlb>6c5k|YoLkv8-giJ+c;XDv}L!_ud$4Tg8?tn?97MCEC6-0(}#UZ7$Ow| 
zA{RC+qP9ftbn~{UHI3h#rtt_Vd!{OP&HDv`YC+7^Db7S0CxghEv7~7EbNeg2_E;V8 z0Z5;!j~SXzbD^c)62HBOP9oXi;SJA`Ryn!{2v3T*A$-CK%c0_1=P?f0LE}>fw3Oen zv&6jdD<+TPXYlPb4{>HmaIpjg_L|;J{TZzZzxb*AuU|IiT3_uU-|g`Vc_2>t%i&Ep1FOZ z5C1uB;n)FQvv)kpeE%MAY&l{8b;Pm@u~`dg1603~Mmx0COg2ZC<0a2uZ*P}Diyi&M zQYtfh)Ay`nZX@1+aF}l;B>WUVR0ghO~4ySGfZd6((_Y2FS<9;GthcYmvk?y@@OrNcZQ#5I#uOt z#pX$Go63mx>)?30#J5T$>G;IGPjz~Kz9H^rtL>xaFFFIJOjrGyHaThtA8Uqq4;igf z|rkJ5IzsZRG?InyQEtM&_t|1@f0+qL9Jh3jG=- z;i*hf^GK>!lo(-s)hsOSZ=8A0XZY9q*{BGX)w&yjS-K8n>mt8r+}00wu9*H?$<-VY z?i#&E#7xO(g3bQCDJ^JtgYi@Oyf(1{soye)Y?AA>i*}ejbxhGyh5 z#Mu~zW7kJ9!`Q?y+r?VUSKlHM|74jKt^n_CV(+OZ-_d)tmgEug@7bPvuiDCOEJ@7Z z$5c0`EACCx=uq;gdo;iM9>Vesc-^JyY0viH>nx9~ChPY!Euqm)FLqp*(GSORw>=!? ztl)13BI0~gX~k*Odqg&gP_QvN-yob!?4=bj@Lp)1ZM!XrxLy~=me19D%;*%%{EVgb zlUco@Hm=nbnO9+J?19_03PyK_8_(z&8F*eCBM^!*q}3eu)Pi{Sq&^j+mR;oT*!$VL zBSH4x+*87-&ln*V!YOhSfdjx>9o7ipBqVH0&pb;Tf41M1QCvwv&jhAg?x5k=EsIcV zX1+=y8<1U4=+(-uoV5lv>Si`Jkc6Z~ge5Xi;~TTKqcr9jky6I_exxY~REUf7B+33E@#)T78y2cVj?;*4l@X5eNG}hVb|D!j+-9;l0=RmOWuDv%o>jsvVjoeLb3;*ezK6gC9&ax;wneW2%@-%S_Ei4bW{Pm+XtsJ=L z02xn6r20<5FH!^ z_gYglY}AOnV_K6TPHQ$|yR=rEMlXJaG+5>lkZ0jSEo>J=e#X~qYpRwt$uoJkZNCWJ z-sDIMA>*=5so=`Kst-GTyZ`LfNE!0lcO1K))lm-8_^aAKH)7ThSwD?d=<`w@P1ZZ) z;5s(4$Og|UYcyYxUzd5e=!6bS(OER)$+%!qCoKh@S=3gzL=5%JX9Nvlq8rTT7P-O@ zqcreK%BQTWUcB$hL&8_#7L}?v($9MG=@_eiNSsPo^Yr`sU4Y1LWHq!4s!-{{_A$IU z6O_#G{C%yxm$|6$d|`c6xWS&^vyL5^E7tvq%{eq?8teJ|*bC5qotJ3h$UufM7Iw>% zA?u=_%W)nfDS0@LtXb z{;hG`-`xlu)BP_!dNpqP$Y%$fXYDn%r|nAc*HfM&PtLJl^-$0et92jG#%>SWUg>KZ z-|L7gsEXo6oeLxO`z0|0LjSK?dTl>LQ_tn>j6yznbD`MvJ?zRNBk|9-PtFL&Ke4Oq z%xV&uZ(GXb9%?<3#pb$NrM}PjTan|Uc@4YKzBt2VpXT!~Vp4+ko^ofjIA08`OzvJ) z1+y6=e}ARo7WSsK0cu)-WgNi3J1;ni!r9!w+K|8U3^z+B<#YVz3AIG2j|u%pSiM>q zr|&*T6zhOl!aNAS_UEmq9TAjV8SB;wbRX9x?zFvhR_J+x68VhJ#|g$@er2?q^!~VJ z-8+rZ?Bx6uPc;!797o4(gL`k5yLI&SvX{@GdD-O7(!S;BwCJLIb5DoYJ8u2KTHHLp zoFX6Qe4j?p(o{AfzrERrjxj<<(%Sw#9NChqZ3_P5gq?8q=E-idbMzEh&s5~$^6QFk z_u205+ZCt%s#efIt~1N7un3yF?~n4tx^Whs?G){_4s^6Px*sjm8(FPW`avs%(%$@f zK39rU?F66mo_VtP`6Mcs4D#?BTeI1hd2T%*;}c?QCqZ@EdpK1|jkD9bMiqf!{*8CT zQM~*YRP}~)Bvj%!3vRscA(K$ed8gkI+-IaqqVH@i*)IGRH?D0!#oNZ#6jfTa3%qOm z)yhGa(E@_=GtCp(W=h=z)WSw}q52-|Tr=VGvw2=4A!nTS-U>=7Tg)IXfGosA`0)G7I8qbUeOaMgjXnB7S&YXsHj#C@%aX4LvvdD(@l1TpWn?lWU5e^lxr6t^@N> z^6ZV*G366Y>qmQeLM82eXFZ_yTVC|1wi9Z1JtZjm&na}Lp*8}~La9F1eT#J0tFfY! 
z*ZUf9S2NDK@D;O{xVPm%HXh^*M-Q3iIm`~nC@_K#k1`k4b-|AIgf_7!T3`2jljG=U zuG|^jJVr^`S)`{ql7o)?ZVs_mOP<+(cPEoDxT}o1Em)UV%)%qdLXz<$%8B%qc;e)O zmSoDob4v4Q;*UM`z^>zzozL2{Y81nq*yocxbxmvaH$2%*WnG4pkD6$(w_w+h6*s}g zXl5rS;0JyJzohp;SlC=Ik@X#|$(KHF*b6(vPR*|MtJ{k|{#biy&buM-=R0Ax1lC%0B6;qIHosaeT{zw2i_W1D?$#f^9Gd2tokjrp1aH^?{=>ShX#bltRz_R$$tmnQuo zwE12>m${`j!I$kFHAeDlxoF)x&*?eSjNKpmon1NNv=8J(;?_caUC<`pL#( z*B;e-(&}ZUM6BOu5s2Rtr)JpZE}=9+!}`AE&urc^mVkC0R{ZITt@WKjc_?vH8) zQADV%_cFtKoIAScevXKzT&wa`vcB5Brp*3(Mp>aQ=Ku0pW9Mf*+ud{Id{*n-l-_=# zd%NK2tUkX&*1x0XOncs+PIDNJ6!~!OMfjGe+rd+2P2q=X(^&Fwg-blu*$x&T^lYBh zy~CnbhwoWd_kK!0E`FzFU}t`QOoAxd`~JGGbx~yanPn@|oYoV>q%aZ&( za~L-=?OdX|jl3r*@y_z=Z3USZo|)P6Cs>+0lzGC(x~WUtzp`@@-@Ed4aojJTpwBDy z@=eQAz}K8=HT=)F?e1pIw=F03=PVWLD{2pS=2_$vHLOmGo|k^IFOYL7ets^%*@t72 z$6R!evz77Uj6ci8VVxd>T%^psNQ$qPO?Lg}sO|SxbRNz!HxXRO)$rxi`yNI9f5)P@ z#}g{e09~+cJ4IYJM;##Q=Z0MV$qYj4E5?i1QmM(PGPmxReQEVwa&Wj_iA(eJUh&@A z)1Rq(yPv0w4Z)7|%$*EEm8;0?K#-M2&8AX#9W2a;$R^#4f^j7Le>JfKEv=R|&Z~sL zhGWNkRoW{1Unvugg*W-$$xgP202*4+7iivni`kTKoVy&!NZw*sy4FZu^jB(Txt%Pg z+)`0;ZtO~7JHVoIkFz($Hud4o6*-(0-`2C!`n%7?Zl=@N&RSGj>akV+xM$lzh2=QG zKdWE&z}@K?pXQX1MD+bf>m}y4&{&OHwb=?mNS<4Ju=aU(b%M}W=W}Sx?s{pbtv$=h zvv&NR*m#%Bn-7fMmUY^WXSPx=Q{#4Kzbip5P%SMkQwV<;(p?MQjz3P&7!Z1c>|0Lw z5Noa8F_%yzh~144LAH6%v-gvx5t(fn`Lcl6H@HVs4bXO0(H!RVw%;BNy5m)eYf|GoJ#tCT^Wq`iTchD!C zUyf^&qA0Upj#ZQF6JTReR7)b4wFZwt8b z$+G&C2*)u>UrC6Sk`4kr&IReB4Ze^Nhlbc(;Mp*>uG%@TUTYbm;LC{75_X3~~f*($5 zhVICVZZ^E;U7GAlCFXk8W}AcZYaK;I3@ObUtaA=p1hic2L}-s@v^(TZ;jLvezaedD zU4qEmdtR^>mK>60FC(ABeuihG`R0?oI1; z36jtL*!%O0Y6_kBU%dq35@Eh0`lqbOU9Y{T-!HDoUF^EOJmdo+TGlc?7H9_r-+#+= zb<@fm7P~IhsJ-skQvNyGPcyyzhL)^M{+-)O-*?=~E#HGP)cPcwGev_2)Os;53qVVg zRZknCWoUQPu7CN7dd{<5`xQILBj?QBy4WHoHZn(z>V#rHGL0^gr6mar$sy|N?X)KH zGcULw;}3)t^ou$*uhvg|%H!4vjv=GPM^y9)CBsi)u1c#e!Q$8NP1DpdY&p3$(vv*e zr^u2mGj?iie~%f(G>ItkJbQ_skJvp%N`V~>Q%WwWN%DMI>-bP)U~YVu4m!y!J26Jo zR(U`+#ZR*(YZicFpM0-r&ikjIp%VGvkh|DPsm&}uB|mxJhd3QYv1!Y!x$P)gdt+(q zJUf!Jo@E)0@ZE3a`sS2F<5e}WdJkqs^ci*=kq?CTR{CiCF|$19yoibC);Qnurav`0 z3C>T8t1E~Qfu4M&bNJ@JTB2uSG2dP`1A?%DB}XGJgN`B+$V39QgW|Au+8^z2@3i^{oCX0|~!x1+@1 zA44QenIGS**ZVzjkLq?$IwQ`u)7m9PNlR~_GhJ-nle&>cWGWDrRUdZqggi$tzsJS* zAkU{NpAyI=t%hT5dkeO9ek@)9-5Vv>tLm}+KK;~#4}aOLzRy0-ocFMfeU@AJ*SkBn8(F)PS}AP8H%PVy?&sP-nvL7~EHn)-E27FdY|sK#j@+zK5=_xIiJp?9-bnCvmhd5IrsN5pM{;r?7M z1$eEx=Nwy^pGh^(%0Jij$Gxa|t+b8Dry~FN`>f?T;gVFdY47aZ^?Hr8EYA11eBt|o zp{ZrgG@O;}Dmv!PH++fic|c=k4~sHu_dMe|Z=6auzRK_HxKDR(#+)EW^x5yG`&u9T zZoB^dSH<2sYDTMwgn~>o@;!5#?Nf}_E`>jv=N(O>`to|oc-)9JD_?})Q`Z?v4*ZhC z0hOvYR~hFHtI3P~8QWQ}J^f$idCS)s&&>6pvu-xheDIW4?zuHCd(3?&=HSyTyu#}=YpfvH#+-XsG0m1J1A!wneB_ioa!j<@AumJ ze4-phBG=642g`h(>syiE!TB9Y%~1gZ{*03>vQlvT4C#1GY3*@;02iY4whHEkhufT1$c-S@iE6q_Bhf}JC50(#s0W$ zy^f=rO_=eGQtw0ONH2ff>RUidIy&X~#-rEvf4yE$^Ba#P^7Blg+S_*X<*u%rPDkX zWtIAs(>Ie=Y{F;`XYJY8C3dU9O1y{U;X8I4{`CuJitOc=T%y;|LO#(@)C0>ZcEVf7 zF-4}ta^COZeCN*Y(dfsIur%-%HBmg{O#3>Ad}hzr=DA2!>G3o`zy{xZ_uc0l#WUl* zm-J?K^dsNgy=P#(uQ8jj>PY^x$BoO|F!MEM*F98!;oj#YXHjXRK+)fc)~vX3%>Sw8 zCD7%L$W@hX1#E{}>G#*F!1Uungcrvu#0fpu?6vd#)t>B5E3v*e`R(m`$DXz9dRSY? 
zCAYD6drNH_k%!puHRn$g8f#Z})aOyenf9Jt{!>vd{EE8nmxj83>pWJo_O!H@gq5$a z@HL^jbAaQ$-1=F&=NWvzv%|}+c5%B>EB)}k<_5&q0eqlmQ7R1k2V0OcB^P?#*CUM5V(VN|be%XGT9U$(oqY&P1Q?f}Pfq)znVVl0Vx+ z4QWgm6(P-|$kijOho59cho$ZeEjs#%9=;(fJ>#p~)(wQrH9j+rX+AMogUUxqXhf%o zsf2&+RCDFkJVO1hv`mLQNw?>E%Z1D{ydohoDY0yIIl8vVY_Rf?sbi=8Xqc9ZxR}KJ zO$O)6fIMKBXKF()o30_AobDv7CVw-$3FXR0^s!yvgUIeH2(*>d%^2rFZRL~Mes|&r zPg?zi9RUP2ynaIB7#5?Ny$mUYIXwwVWJ?OmH&f4}!D9kpNg*%!0q z^1P0@q>VY25?MqITqgibgWLjo1Lad{K1lS$gnj*wfpOBeBy2!&0v*p zTE96x8=}>Q!bx)Z?3LK52&3=u2qcooK6h+b1ykbIGkq`L^N*J}zEi~ddC~#=x$;X^ z+1E<*y)#|OLnTrDbL=QInf^*cKR^RcvK^`?xCZMYbt)Ze7g3L1jU|6+zn;;6t{Db= zHwaw|*uQ4RTGZ1$Ivm{-e`&VgX1z91pp$jq%#prNKQr18CzR0Ziu{Tvyowib#EKp@ z&5T`mg@}uv;t37e59O)4E1w`&?S{3p z=8V-f6W4rQ^06JQ^8S@Lzz1r%7FW6G?_as-%$_sWHfM8W61bab4RgqM3RypRMXK1& zbJrhaF^#-GUx#j0`<&pxAxTa15Miw~g3rr51{>`uG7HVo6lX>!-nG8&a-lT!Ch(|s zoepR6w4Wa*q(i&Ebj>qcQ_nrDc(LR+I78Fg>N^^{;jc&Ep6E!swd6W(FsV;*Oh+-E zpY@xUS6I!IIVJR7%l@9V?)S0@Yi~Bd`c5s+j(k;aVuKpbkG4~o4LMu*cd~b~H+WlqG zj(CK`JtJflDZbRv%)^^A&xdcDr{1d|^n}E`!+SOm+9RAQK+kS%N8+O}$Vq5N(m`-_ zmOo9IRUTy)ygJgY+-~QtEU>`TQU5FY>yCt-x5@RidLS-FqA7&+z@;%I@AXGkcr@gD zGGgXkV)ZB@UQzVpA98<8LHLH?eo%tCwZwjU*im-)*tV9>_(M} zy>=SEAF#Xo*?4AKaMR3$xmuaT--=vn{R-9-8EmaBW}ut5RP`kdcPG}pu}tpfXu2l# zZri| zeMC{=(r&rPzW&eHe&WrUug)O1(oTCjmIT~j%}-`+|5SXpG0IuuehP`xe$?VCJmwTL z{3xgQosoXN?tbvA@Rtq7(jdSa|KOCy><%63_gH1U1ft35njUW>v*8%@hv|p8B4KBX zE^^0)id}TDAnnJ|WKEtjisa1LZGM>At8uM|*|)gw_0>HOu0M~Y&c@;d}*NnPg*91wp*8T3)89atqcgz*7uf6BP%9N~V z@-q$~GpQDj72$I2EJJk^7763`ERfw;nsX>gVP2a67_gE3oUK~ZA7Fk&&5^{i&7&sc z#bb8xwuwDf($9EY&A0L?gN-_gQ@J^ty`!(!DvMo5^ABqyKDlzLd6FmeBM>q%%ax`GVQ5AR-F)^dFT7{>F%r+-pjct_e$7N7JcNk7FE%!ooD%V za6)ODGO%CWtNs~K^}x+-o4vd!Yb`j5pULGm10>mGYg zjd;{oJ@vNvUpn7-#V&o}w^&ZRC!=^a8&9uTH`37NNdErHE23@r^<`$cuO0hQYMsaF zJ?MYzecyavB-7U)S7bp1#!{~vSZG6!R!bsocL%xAP~8dK)OSRyJ&C6{a5h<-iQ1XJ z!&?G71+1}=m5-6UhyzRr)@~B}&{D|VKF5hy{mCv^p;q>J%L6{Ysng!S0)cae>CeE?B zy|b&LL$kDfx1yLfJd4F1_Y)kIRmvLSqtO?9@cOIIljjFX$z&JKhM2*R#A+LbMZ^BlP)*EA?c zd&(kq!;EKaJcnGjDRw0Cdy2eCTQp|&;`{o0RyDXaE-LRaV)aMV1%EEjf|#T1&6>BQ zNp;A6@9b5?-rmO(4fsckoXXI>4YCZ}n7g`!KH)o@aceYPY z%7}6{QIlkka@;Zri(K8yCn^sTRh;Tc^SzfFX$9&Myl99^*Ut_<)1k;X&1|`rPI-1x zb)ubHeaF4_{)=m+Q}|^TR3tt4;k{o*Pa$88S$M8@&Z@h(qtdSR%uCfX@6n--@Lbh{ zTYC&OtD0z;|W`{GQuN6#qrY1>~HubyG5^r&f%o|7d)PgbzU0;!74`xhMTYJ_T zcaK&BSq4(h=tYu+xNagn(~{N!#MGSk!sBo%`M2*-OKm^HrMFKdbsc+t_!_T=beq;C z$1K@Sqwa0*9I-H8*+1Sd^KdnH_fE#rli%NC|4wU}HB@&Y=aR0;*w9s*@QwRDEIIT)(K)O-2AJVEo4rJ*`1H=ha4_8a_=`d0 z204IqVsby=yWEeEby1d~2*^ECE0}=uJBC*&vBP}+eNV$mc1IDRi5QAGdMEW`2NK(- z*=!gk|CNX)P*96IxwQrL(oV!F17}P_$o51lo)lk6JzH_RS?V(RXnl4fBC;SY>sOHH z&EG`%BCl_QCOb*VqZ6Waqi;pksGAa5^L&Ful@S&Dxy(OySu5DWmz-^)e+#97m?qk>VsPb7ixvw#E*>UdP@?-*eRHtk3WFAo|9%6gwMfl@adhCAAg& z+=|%73#T(L?a8BBMW86517OE7%>)_LW4tG4!%qDOGo$2One-}=s+AIGEw(Z`GCIsz ziRPmaIf^sKu5=i7zAHRyz=WSHPP}3=wbeo1wya%n`%?B+Gb&~3XxzX-= zi-DIj11r|lGHg!4LpE&SPQ)OfuGzsnwl)rEX1ZP_S3`X&D79C1DCZMv7_>xI1&PQ0 zBlj9wMKheW+;5VaS`1P%sZki7_Kv$iE$oZLRXk1#N=|P&M-Ym}i@dmoq*X{`bXmvW zlo$}|bJ)p%r1|mhc;C3CLPkj#1*1iH;6sbVEF38QAjWJbN=R*y|i+2aleR`NyE!G&M(9eOt8> z(2gBuN#P3C;tsC9_xu`lP19L^o%Gp^WPyCoUc0x;%5Hk%Q!X}b8nta}fpu#=@CkAa z?UYd=Mvp;L@ShfOVWV9@d!P1l*s`Rp*1hYGLp!q!pO~MdbL=von2olIMCdJ>)&h0iNCy1hDjQYw8$&!>mJ@ma0&7uO06*Xr)ssA98Tubrn3QK@0`sc*e< zWJHigKO>}~t^=g(rRBV(wg2d=<7carTTtox%%Wqt%ddz^E4CV0iS=T?ZrfYF?TQh( zITYjluQ@#aDX;AW;t#Fe1JnMM~^{mLZT2Yrl`kJG=`({)UdE)sgQCPf74m5Fo7g3+eJv;j==^WAuFG;DUcQ0DJJ#8^g z%XN6c_hcOwc?G)SJ&xV;7O6}0SFG=6^-{BEs8Z%Er^|!#Mc?nC6Pf*n`_#faaOK@P zda9$tTqW~P9s`Y6lqwA4^ZPx`VO{x%$08Lf`nw_&-jmznPB+fX-0jC4Cg0x(56{neY>OU*v}K$BlWZA 
zIjy@Fi08Q2@10XhSXl?K>Cfy}Ls3qa*Qs&HBi-Y{qV#SnHnd8;81=|zZsIC8=^dI+ zk#XsBFO$4Bhd}(|J#mmb^*K}pU+vi2y zor}!F!tdGCk9%gX<_H|@lsKTzN0fkkj5uAQ%{()*FMdK@Pcl`ut*b4=Ft6S=Lg3>) z7}w6*?v8Y{$|Uzu>zrEDaJQZ#0p>mbe4gj+-L)QlagKxijySTpE#ldHK5IE9imxn> z^q==@ppMAXj#%>ig%Ez?9X-W2Bbp7YsU5&+AWRFKy zdH6)2Q3HKJC%7DK4Z0#4XyWMc>qzVVUHd4oUI968YyZ(JAel*rcQopBwsmu=WxK{- z$1sgD{Z`-#}V&< z6PX*1y`C3v=k<&GgPxr3v8FU$*a{Xl%pS=(<}tempdir(); - -# prepare temp directory and make test config file -my $config_file = PrepareConfigWithTempdirs( - 't/tracks/score/integration.yml', - 't/tracks/score/db/raw', [ 'database_dir', 'files_dir', 'temp_dir' ], - 'files_dir', $dir->stringify -); - -use Seq::Tracks::Score::Build::Round; - -my $runConfig = LoadFile($config_file); - -my %wantedChr = map { $_ => 1 } @{ $runConfig->{chromosomes} }; - -my $seq = MockBuilder->new_with_config( { config => $config_file } ); - -my $tracks = $seq->tracksObj; -my $scoreBuilder = $tracks->getTrackBuilderByName('phastCons'); -my $scoreGetter = $tracks->getTrackGetterByName('phastCons'); - -my $db = Seq::DBManager->new(); - -$scoreBuilder->buildTrack(); - -my @localFiles = @{ $scoreBuilder->local_files }; - -# adapted from scorebuilder -my $headerRegex = qr/^(fixedStep)\s+chrom=(\S+)\s+start=(\d+)\s+step=(\d+)/; - -# wigfix is 1-based: 1-start coordinate system in use for variableStep and fixedStep -# https://genome.ucsc.edu/goldenpath/help/wiggle.html - -my $scalingFactor = $scoreGetter->scalingFactor; - -my $rounder = - Seq::Tracks::Score::Build::Round->new( { scalingFactor => $scalingFactor } ); - -for my $file (@localFiles) { - my $fh = $scoreBuilder->getReadFh($file); - my $step; - my $pos; - my $chr; - my $start; - my $based = 1; - - while (<$fh>) { - chomp; - - if ( $_ =~ m/$headerRegex/ ) { - $chr = $2; - - $step = $4; - - $start = $3; - - $pos = $start - $based; - - next; - } - - if ( !$wantedChr{$chr} ) { - next; - } - - my $value = $_; - - my $rounded = $rounder->round($_) / $scalingFactor; - - my $data = $db->dbReadOne( $chr, $pos ); - - my $out = []; - - $scoreGetter->get( $data, $chr, 'C', 'T', 0, $out ); - - # indexed by position index (here 0, we're only checking snps atm) - my $score = $out->[0]; - - ok( $score == $rounded ); - - # comes after, because first position after header is $start - $pos += $step; - } -} - -done_testing(); diff --git a/perl/t/tracks/score/integration.yml b/perl/t/tracks/score/integration.yml deleted file mode 100644 index 76fc9f7af..000000000 --- a/perl/t/tracks/score/integration.yml +++ /dev/null @@ -1,21 +0,0 @@ ---- -assembly: hg19 -build_author: ec2-user -build_date: 2017-11-27T05:44:00 -chromosomes: -- chrM -database_dir: t/tracks/score/db/index -files_dir: t/tracks/score/db/raw -tracks: - tracks: - - type: reference - name: fakeRef - local_files: - - 'fake' - - local_files: - - chr*.phastCons100way.wigFix.gz - name: phastCons - type: score - scalingFactor: 10 -vcfProcessor: bystro-vcf - diff --git a/perl/t/tracks/sparse/build_clinvar.t b/perl/t/tracks/sparse/build_clinvar.t deleted file mode 100644 index 6e6cc11e5..000000000 --- a/perl/t/tracks/sparse/build_clinvar.t +++ /dev/null @@ -1,140 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package MockBuild; - -use Mouse 2; - -use Seq::Tracks::Build; - -extends "Seq::Base"; - -# TODO: allow building just one track, identified by name - -has config => ( is => 'ro', isa => 'Str', required => 1 ); - -package MockAnnotate; - -use Test::More; -use lib 't/lib'; -use TestUtils qw/ PrepareConfigWithTempdirs /; - -use Path::Tiny; - -use Seq::DBManager; -use Seq; - 
diff --git a/perl/t/tracks/sparse/build_clinvar.t b/perl/t/tracks/sparse/build_clinvar.t
deleted file mode 100644
index 6e6cc11e5..000000000
--- a/perl/t/tracks/sparse/build_clinvar.t
+++ /dev/null
@@ -1,140 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package MockBuild;
-
-use Mouse 2;
-
-use Seq::Tracks::Build;
-
-extends "Seq::Base";
-
-# TODO: allow building just one track, identified by name
-
-has config => ( is => 'ro', isa => 'Str', required => 1 );
-
-package MockAnnotate;
-
-use Test::More;
-use lib 't/lib';
-use TestUtils qw/ PrepareConfigWithTempdirs /;
-
-use Path::Tiny;
-
-use Seq::DBManager;
-use Seq;
-
-# create temp directories
-my $dir = Path::Tiny->tempdir();
-
-# prepare temp directory and make test config file
-my $config_file = PrepareConfigWithTempdirs(
-  't/tracks/sparse/clinvar-test-config.yml',
-  't/tracks/sparse/raw', [ 'database_dir', 'files_dir', 'temp_dir' ],
-  'files_dir', $dir->stringify
-);
-
-my $mock = MockBuild->new_with_config(
-  {
-    config      => $config_file,
-    chromosomes => ['chrY'],
-    verbose     => 0
-  }
-);
-
-my $trackBuilder = $mock->tracksObj->getTrackBuilderByName('clinvar');
-
-$trackBuilder->buildTrack();
-
-my $clinvarTrackGetter = $mock->tracksObj->getTrackGetterByName('clinvar');
-
-# The config file has these features
-# - alleleID: number
-# - phenotypeList
-# - clinicalSignificance
-# - type
-# - origin
-# - numberSubmitters
-# - reviewStatus
-# - referenceAllele
-# - alternateAllele
-
-my $db = Seq::DBManager->new();
-
-# We expect an overlap, first the indel, then the snp
-my $dataAref = $db->dbReadOne( 'chrY', 2787237 - 1 );
-
-#p $dataAref;
-
-my $clinvarTrackIndex = $clinvarTrackGetter->dbName;
-
-# p $clinvarTrackIndex;
-
-my $clinvarDataAref = $dataAref->[$clinvarTrackIndex];
-my @clinvarData = @$clinvarDataAref;
-#p @clinvarData;
-
-ok( $clinvarData[0][0] == 24776 && $clinvarData[0][1] == 99999 );
-ok( $clinvarData[1][0] eq "46,XY sex reversal, type 1"
-  && $clinvarData[1][1] eq "46,XY sex reversal, type 1 (FAKE TO TEST OVERLAP)" );
-ok( $clinvarData[2][0] eq "Pathogenic" && $clinvarData[2][1] eq "Pathogenic" );
-ok( $clinvarData[3][0] eq "deletion"
-  && $clinvarData[3][1] eq "single nucleotide variant" );
-ok( $clinvarData[4][0] eq "germline" && $clinvarData[4][1] eq "germline" );
-#numberSubmitters
-ok( $clinvarData[5][0] == 1 && $clinvarData[5][1] == 1 );
-ok(
-  !defined $clinvarData[6][0] && !defined $clinvarData[6][1],
-  "no assertion criteria provided coerced to undefined"
-);
-ok( $clinvarData[7][0] eq "TCTC" && $clinvarData[7][1] eq "A" );
-ok( $clinvarData[8][0] eq "-" && $clinvarData[8][1] eq "G" );
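The assertions above depend on the sparse-track overlap layout: when two ClinVar records land on one position (here the TCTC deletion and the fake SNV at 2787237), each stored feature value becomes an array with one entry per record, in input order. A toy sketch of that accumulation; the record hashes are invented stand-ins for parsed ClinVar rows, not the builder's real intermediate form:

```perl
use strict;
use warnings;

# Hypothetical records overlapping one genomic position. When a second
# record lands on the same site, each feature value becomes an array
# holding one entry per overlapping record, in insertion order.
my @records = (
    { alleleID => 24776, type => 'deletion' },
    { alleleID => 99999, type => 'single nucleotide variant' },
);

my @features = qw/alleleID type/;
my $site;    # what would be stored in the db for this position

for my $rec (@records) {
    for my $i ( 0 .. $#features ) {
        push @{ $site->[$i] }, $rec->{ $features[$i] };
    }
}

# $site->[0] is now [ 24776, 99999 ] and $site->[1] is
# [ 'deletion', 'single nucleotide variant' ], i.e. the
# $clinvarData[$featureIdx][$recordIdx] shape the test checks.
```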
-
-# Only the deletion covers these next positions, so each feature is stored as a scalar
-$dataAref = $db->dbRead( 'chrY', [ 2787238 - 1 .. 2787240 - 1 ] );
-
-# A deletion spans all deleted bases (this one goes from 2787237 to 2787240 in 1-based notation)
-
-for my $data (@$dataAref) {
-  $clinvarDataAref = $data->[$clinvarTrackIndex];
-  @clinvarData = @$clinvarDataAref;
-
-  #p @clinvarData;
-
-  ok( $clinvarData[0] == 24776 );
-  ok( $clinvarData[1] eq "46,XY sex reversal, type 1" );
-  ok( $clinvarData[2] eq "Pathogenic" );
-  ok( $clinvarData[3] eq "deletion" );
-  ok( $clinvarData[4] eq "germline" );
-
-  #numberSubmitters
-
-  ok( $clinvarData[5] == 1 );
-  ok( !defined $clinvarData[6],
-    "no assertion criteria provided coerced to undefined" );
-  ok( $clinvarData[7] eq "TCTC" );
-  ok( $clinvarData[8] eq "-" );
-}
-
-$dataAref = $db->dbReadOne( 'chrY', 2787334 - 1 );
-
-$clinvarDataAref = $dataAref->[$clinvarTrackIndex];
-@clinvarData = @$clinvarDataAref;
-#p @clinvarData;
-
-ok( $clinvarData[0] == 24780 );
-ok( $clinvarData[1][0] eq "46,XY sex reversal, type 1"
-  && $clinvarData[1][1] eq "46,XY true hermaphroditism, SRY-related" );
-ok( $clinvarData[2] eq "Pathogenic" );
-ok( $clinvarData[3] eq "single nucleotide variant" );
-ok( $clinvarData[4] eq "germline" );
-#numberSubmitters
-ok( $clinvarData[5] == 1 );
-ok( !defined $clinvarData[6],
-  "no assertion criteria provided coerced to undefined" );
-ok( $clinvarData[7] eq "G" );
-ok( $clinvarData[8] eq "C" );
-
-$db->cleanUp();
-done_testing();
diff --git a/perl/t/tracks/sparse/clinvar-test-config.yml b/perl/t/tracks/sparse/clinvar-test-config.yml
deleted file mode 100644
index f04fa4a27..000000000
--- a/perl/t/tracks/sparse/clinvar-test-config.yml
+++ /dev/null
@@ -1,380 +0,0 @@
----
-assembly: hg38
-build_author: ec2-user
-build_date: 2017-02-15T04:07:00
-chromosomes:
-- chr1
-- chr2
-- chr3
-- chr4
-- chr5
-- chr6
-- chr7
-- chr8
-- chr9
-- chr10
-- chr11
-- chr12
-- chr13
-- chr14
-- chr15
-- chr16
-- chr17
-- chr18
-- chr19
-- chr20
-- chr21
-- chr22
-- chrM
-- chrX
-- chrY
-database_dir: ./t/tracks/sparse/index
-files_dir: ./t/tracks/sparse/raw/
-statistics:
-  dbSNPnameField: dbSNP.name
-  exonicAlleleFunctionField: refSeq.exonicAlleleFunction
-  outputExtensions:
-    json: .statistics.json
-    qc: .statistics.qc.tab
-    tab: .statistics.tab
-  refTrackField: ref
-  siteTypeField: refSeq.siteType
-temp_dir: /mnt/annotator/tmp
-tracks:
-  tracks:
-  - build_author: ec2-user
-    build_date: 2017-02-14T20:18:00
-    fetch_date: 2017-02-09T16:56:00
-    local_files:
-    - chr1.fa.gz
-    - chr2.fa.gz
-    - chr3.fa.gz
-    - chr4.fa.gz
-    - chr5.fa.gz
-    - chr6.fa.gz
-    - chr7.fa.gz
-    - chr8.fa.gz
-    - chr9.fa.gz
-    - chr10.fa.gz
-    - chr11.fa.gz
-    - chr12.fa.gz
-    - chr13.fa.gz
-    - chr14.fa.gz
-    - chr15.fa.gz
-    - chr16.fa.gz
-    - chr17.fa.gz
-    - chr18.fa.gz
-    - chr19.fa.gz
-    - chr20.fa.gz
-    - chr21.fa.gz
-    - chr22.fa.gz
-    - chrM.fa.gz
-    - chrX.fa.gz
-    - chrY.fa.gz
-    name: ref
-    remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/
-    remote_files:
-    - chr1.fa.gz
-    - chr2.fa.gz
-    - chr3.fa.gz
-    - chr4.fa.gz
-    - chr5.fa.gz
-    - chr6.fa.gz
-    - chr7.fa.gz
-    - chr8.fa.gz
-    - chr9.fa.gz
-    - chr10.fa.gz
-    - chr11.fa.gz
-    - chr12.fa.gz
-    - chr13.fa.gz
-    - chr14.fa.gz
-    - chr15.fa.gz
-    - chr16.fa.gz
-    - chr17.fa.gz
-    - chr18.fa.gz
-    - chr19.fa.gz
-    - chr20.fa.gz
-    - chr21.fa.gz
-    - chr22.fa.gz
-    - chrM.fa.gz
-    - chrX.fa.gz
-    - chrY.fa.gz
-    type: reference
-    version: 45
-  - build_author: ec2-user
-    build_date: 2017-02-09T21:53:00
-    features:
-    - kgID
-    - mRNA
-    - spID
-    - spDisplayID
-    - geneSymbol
-    - refseq
-    - protAcc
-    - description
-    - rfamAcc
-    - name
-    fetch_date: 2017-02-09T17:07:00
-    join:
-      features:
- alleleID - - phenotypeList - - clinicalSignificance - - reviewStatus - track: clinvar - local_files: - - hg38.refGene.chr1.gz - - hg38.refGene.chr2.gz - - hg38.refGene.chr3.gz - - hg38.refGene.chr4.gz - - hg38.refGene.chr5.gz - - hg38.refGene.chr6.gz - - hg38.refGene.chr7.gz - - hg38.refGene.chr8.gz - - hg38.refGene.chr9.gz - - hg38.refGene.chr10.gz - - hg38.refGene.chr11.gz - - hg38.refGene.chr12.gz - - hg38.refGene.chr13.gz - - hg38.refGene.chr14.gz - - hg38.refGene.chr15.gz - - hg38.refGene.chr16.gz - - hg38.refGene.chr17.gz - - hg38.refGene.chr18.gz - - hg38.refGene.chr19.gz - - hg38.refGene.chr20.gz - - hg38.refGene.chr21.gz - - hg38.refGene.chr22.gz - - hg38.refGene.chrM.gz - - hg38.refGene.chrX.gz - - hg38.refGene.chrY.gz - name: refSeq - nearest: - - name - - geneSymbol - sql_statement: SELECT * FROM hg38.refGene LEFT JOIN hg38.kgXref ON hg38.kgXref.refseq - = hg38.refGene.name - type: gene - version: 32 - - build_author: ec2-user - build_date: 2017-02-09T21:53:00 - fetch_date: 2017-02-09T17:13:00 - local_files: - - chr1.phastCons100way.wigFix.gz - - chr2.phastCons100way.wigFix.gz - - chr3.phastCons100way.wigFix.gz - - chr4.phastCons100way.wigFix.gz - - chr5.phastCons100way.wigFix.gz - - chr6.phastCons100way.wigFix.gz - - chr7.phastCons100way.wigFix.gz - - chr8.phastCons100way.wigFix.gz - - chr9.phastCons100way.wigFix.gz - - chr10.phastCons100way.wigFix.gz - - chr11.phastCons100way.wigFix.gz - - chr12.phastCons100way.wigFix.gz - - chr13.phastCons100way.wigFix.gz - - chr14.phastCons100way.wigFix.gz - - chr15.phastCons100way.wigFix.gz - - chr16.phastCons100way.wigFix.gz - - chr17.phastCons100way.wigFix.gz - - chr18.phastCons100way.wigFix.gz - - chr19.phastCons100way.wigFix.gz - - chr20.phastCons100way.wigFix.gz - - chr21.phastCons100way.wigFix.gz - - chr22.phastCons100way.wigFix.gz - - chrX.phastCons100way.wigFix.gz - - chrY.phastCons100way.wigFix.gz - - chrM.phastCons100way.wigFix.gz - name: phastCons - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/phastCons100way/hg38.100way.phastCons/ - remote_files: - - chr1.phastCons100way.wigFix.gz - - chr2.phastCons100way.wigFix.gz - - chr3.phastCons100way.wigFix.gz - - chr4.phastCons100way.wigFix.gz - - chr5.phastCons100way.wigFix.gz - - chr6.phastCons100way.wigFix.gz - - chr7.phastCons100way.wigFix.gz - - chr8.phastCons100way.wigFix.gz - - chr9.phastCons100way.wigFix.gz - - chr10.phastCons100way.wigFix.gz - - chr11.phastCons100way.wigFix.gz - - chr12.phastCons100way.wigFix.gz - - chr13.phastCons100way.wigFix.gz - - chr14.phastCons100way.wigFix.gz - - chr15.phastCons100way.wigFix.gz - - chr16.phastCons100way.wigFix.gz - - chr17.phastCons100way.wigFix.gz - - chr18.phastCons100way.wigFix.gz - - chr19.phastCons100way.wigFix.gz - - chr20.phastCons100way.wigFix.gz - - chr21.phastCons100way.wigFix.gz - - chr22.phastCons100way.wigFix.gz - - chrX.phastCons100way.wigFix.gz - - chrY.phastCons100way.wigFix.gz - - chrM.phastCons100way.wigFix.gz - type: score - version: 12 - - build_author: ec2-user - build_date: 2017-02-09T21:53:00 - fetch_date: 2017-02-09T17:21:00 - local_files: - - chr1.phyloP100way.wigFix.gz - - chr2.phyloP100way.wigFix.gz - - chr3.phyloP100way.wigFix.gz - - chr4.phyloP100way.wigFix.gz - - chr5.phyloP100way.wigFix.gz - - chr6.phyloP100way.wigFix.gz - - chr7.phyloP100way.wigFix.gz - - chr8.phyloP100way.wigFix.gz - - chr9.phyloP100way.wigFix.gz - - chr10.phyloP100way.wigFix.gz - - chr11.phyloP100way.wigFix.gz - - chr12.phyloP100way.wigFix.gz - - chr13.phyloP100way.wigFix.gz - - chr14.phyloP100way.wigFix.gz - - 
chr15.phyloP100way.wigFix.gz - - chr16.phyloP100way.wigFix.gz - - chr17.phyloP100way.wigFix.gz - - chr18.phyloP100way.wigFix.gz - - chr19.phyloP100way.wigFix.gz - - chr20.phyloP100way.wigFix.gz - - chr21.phyloP100way.wigFix.gz - - chr22.phyloP100way.wigFix.gz - - chrX.phyloP100way.wigFix.gz - - chrY.phyloP100way.wigFix.gz - - chrM.phyloP100way.wigFix.gz - name: phyloP - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/phyloP100way/hg38.100way.phyloP100way/ - remote_files: - - chr1.phyloP100way.wigFix.gz - - chr2.phyloP100way.wigFix.gz - - chr3.phyloP100way.wigFix.gz - - chr4.phyloP100way.wigFix.gz - - chr5.phyloP100way.wigFix.gz - - chr6.phyloP100way.wigFix.gz - - chr7.phyloP100way.wigFix.gz - - chr8.phyloP100way.wigFix.gz - - chr9.phyloP100way.wigFix.gz - - chr10.phyloP100way.wigFix.gz - - chr11.phyloP100way.wigFix.gz - - chr12.phyloP100way.wigFix.gz - - chr13.phyloP100way.wigFix.gz - - chr14.phyloP100way.wigFix.gz - - chr15.phyloP100way.wigFix.gz - - chr16.phyloP100way.wigFix.gz - - chr17.phyloP100way.wigFix.gz - - chr18.phyloP100way.wigFix.gz - - chr19.phyloP100way.wigFix.gz - - chr20.phyloP100way.wigFix.gz - - chr21.phyloP100way.wigFix.gz - - chr22.phyloP100way.wigFix.gz - - chrX.phyloP100way.wigFix.gz - - chrY.phyloP100way.wigFix.gz - - chrM.phyloP100way.wigFix.gz - type: score - version: 12 - - build_author: ec2-user - build_date: 2017-02-09T21:53:00 - caddToBed_date: 2017-01-19T04:37:00 - local_files: - - whole_genome_SNVs.tsv.bed.mapped.chr*.organized-by-chr.txt.sorted.txt.gz - name: cadd - sortCadd_date: 2017-01-20T23:53:00 - sorted: 1 - type: cadd - version: 33 - - build_author: ec2-user - build_date: 2017-02-09T21:53:00 - build_field_transformations: - alleleFreqs: split [,] - alleleNs: split [,] - alleles: split [,] - func: split [,] - observed: split [\/] - features: - - name - - strand - - observed - - class - - func - - alleles - - alleleNs: number - - alleleFreqs: number - fetch_date: 2017-02-09T18:13:00 - local_files: - - hg38.snp147.chr1.gz - - hg38.snp147.chr2.gz - - hg38.snp147.chr3.gz - - hg38.snp147.chr4.gz - - hg38.snp147.chr5.gz - - hg38.snp147.chr6.gz - - hg38.snp147.chr7.gz - - hg38.snp147.chr8.gz - - hg38.snp147.chr9.gz - - hg38.snp147.chr10.gz - - hg38.snp147.chr11.gz - - hg38.snp147.chr12.gz - - hg38.snp147.chr13.gz - - hg38.snp147.chr14.gz - - hg38.snp147.chr15.gz - - hg38.snp147.chr16.gz - - hg38.snp147.chr17.gz - - hg38.snp147.chr18.gz - - hg38.snp147.chr19.gz - - hg38.snp147.chr20.gz - - hg38.snp147.chr21.gz - - hg38.snp147.chr22.gz - - hg38.snp147.chrM.gz - - hg38.snp147.chrX.gz - - hg38.snp147.chrY.gz - name: dbSNP - sql_statement: SELECT * FROM hg38.snp147 - type: sparse - version: 18 - - based: 1 - build_author: ec2-user - build_date: 2017-02-15T04:07:00 - build_field_transformations: - chrom: chr . 
- clinicalSignificance: split [;] - otherIDs: split [;,] - phenotypeIDs: split [;,] - phenotypeList: split [;] - build_row_filters: - Assembly: == GRCh38 - features: - - alleleID: number - - phenotypeList - - clinicalSignificance - - type - - origin - - numberSubmitters - - reviewStatus - - referenceAllele - - alternateAllele - fetch_date: 2017-02-09T19:16:00 - fieldMap: - '#AlleleID': alleleID - AlternateAllele: alternateAllele - Chromosome: chrom - ClinicalSignificance: clinicalSignificance - Origin: origin - OtherIDs: otherIDs - PhenotypeIDS: phenotypeIDs - NumberSubmitters: numberSubmitters - PhenotypeList: phenotypeList - ReferenceAllele: referenceAllele - ReviewStatus: reviewStatus - Start: chromStart - Stop: chromEnd - Type: type - local_files: - - clinvar-small.tsv - name: clinvar - remote_files: - - ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - type: sparse - version: 80 - version: 140 - diff --git a/perl/t/tracks/sparse/raw/clinvar/clinvar-small.tsv b/perl/t/tracks/sparse/raw/clinvar/clinvar-small.tsv deleted file mode 100644 index a50805829..000000000 --- a/perl/t/tracks/sparse/raw/clinvar/clinvar-small.tsv +++ /dev/null @@ -1,16 +0,0 @@ -#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories -24776 deletion NM_003140.2(SRY):c.364_367delGAGA (p.Glu122Asnfs) 6736 SRY HGNC:11311 Pathogenic 1 Jan 01, 1993 606231178 - RCV000010390 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 germline germline GRCh38 NC_000024.10 Y 2787237 2787240 TCTC - Yp11.2 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0001 1 -24776 deletion NM_003140.2(SRY):c.364_367delGAGA (p.Glu122Asnfs) 6736 SRY HGNC:11311 Pathogenic 1 Jan 01, 1993 606231178 - RCV000010390 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 germline germline GRCh37 NC_000024.9 Y 2655278 2655281 TCTC - Yp11.31 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0001 1 -99999 single nucleotide variant NM_003140.2(SRY):c.326T>C (p.Phe109Ser) 6736 SRY HGNC:11311 Pathogenic 1 Dec 01, 1992 104894956 - RCV000010391 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 (FAKE TO TEST OVERLAP) germline germline GRCh38 NC_000024.9 Y 2787237 2787237 A G Yp11.3 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0003,UniProtKB (protein):Q05066#VAR_003730_FAKE_9999 1 -24777 single nucleotide variant NM_003140.2(SRY):c.326T>C (p.Phe109Ser) 6736 SRY HGNC:11311 Pathogenic 1 Dec 01, 1992 104894956 - RCV000010391 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 germline germline GRCh37 NC_000024.9 Y 2655319 2655319 A G Yp11.3 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0003,UniProtKB (protein):Q05066#VAR_003730 1 -24777 single nucleotide variant NM_003140.2(SRY):c.326T>C (p.Phe109Ser) 6736 SRY HGNC:11311 Pathogenic 1 Dec 01, 1992 104894956 - RCV000010391 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 germline germline GRCh38 NC_000024.10 Y 2787278 2787278 A G Yp11.2 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0003,UniProtKB (protein):Q05066#VAR_003730 1 -24778 single nucleotide variant NM_003140.2(SRY):c.178G>C (p.Val60Leu) 6736 SRY HGNC:11311 Pathogenic 1 May 01, 1992 104894957 - RCV000010392 MedGen:C2748896,OMIM:400044 46,XY 
sex reversal, type 1 germline germline GRCh37 NC_000024.9 Y 2655467 2655467 C G Yp11.3 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0004,UniProtKB (protein):Q05066#VAR_003719 1 -24778 single nucleotide variant NM_003140.2(SRY):c.178G>C (p.Val60Leu) 6736 SRY HGNC:11311 Pathogenic 1 May 01, 1992 104894957 - RCV000010392 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 germline germline GRCh38 NC_000024.10 Y 2787426 2787426 C G Yp11.2 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0004,UniProtKB (protein):Q05066#VAR_003719 1 -24779 single nucleotide variant NM_003140.2(SRY):c.277C>T (p.Gln93Ter) 6736 SRY HGNC:11311 Pathogenic 1 Jul 01, 1992 104894958 - RCV000010393 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 germline germline GRCh37 NC_000024.9 Y 2655368 2655368 G A Yp11.3 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0005 1 -24779 single nucleotide variant NM_003140.2(SRY):c.277C>T (p.Gln93Ter) 6736 SRY HGNC:11311 Pathogenic 1 Jul 01, 1992 104894958 - RCV000010393 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 germline germline GRCh38 NC_000024.10 Y 2787327 2787327 G A Yp11.2 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0005 1 -24780 single nucleotide variant NM_003140.2(SRY):c.270C>G (p.Ile90Met) 6736 SRY HGNC:11311 Pathogenic 1 Apr 01, 2003 104894959 - RCV000010394;RCV000010395 MedGen:C2748896,OMIM:400044;MedGen:C2748897 46,XY sex reversal, type 1;46,XY true hermaphroditism, SRY-related germline germline GRCh37 NC_000024.9 Y 2655375 2655375 G C Yp11.3 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0006,OMIM Allelic Variant:480000.0014,UniProtKB (protein):Q05066#VAR_003724 1 -24780 single nucleotide variant NM_003140.2(SRY):c.270C>G (p.Ile90Met) 6736 SRY HGNC:11311 Pathogenic 1 Apr 01, 2003 104894959 - RCV000010394;RCV000010395 MedGen:C2748896,OMIM:400044;MedGen:C2748897 46,XY sex reversal, type 1;46,XY true hermaphroditism, SRY-related germline germline GRCh38 NC_000024.10 Y 2787334 2787334 G C Yp11.2 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0006,OMIM Allelic Variant:480000.0014,UniProtKB (protein):Q05066#VAR_003724 1 -24781 single nucleotide variant NM_003140.2(SRY):c.317A>T (p.Lys106Ile) 6736 SRY HGNC:11311 Pathogenic 1 Nov 01, 1992 104894964 - RCV000010396 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 germline germline GRCh37 NC_000024.9 Y 2655328 2655328 T A Yp11.3 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0007,OMIM Allelic Variant:480000.0017,UniProtKB (protein):Q05066#VAR_003728 1 -24781 single nucleotide variant NM_003140.2(SRY):c.317A>T (p.Lys106Ile) 6736 SRY HGNC:11311 Pathogenic 1 Nov 01, 1992 104894964 - RCV000010396 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 germline germline GRCh38 NC_000024.10 Y 2787287 2787287 T A Yp11.2 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0007,OMIM Allelic Variant:480000.0017,UniProtKB (protein):Q05066#VAR_003728 1 -24782 deletion NM_003140.2(SRY):c.324delA (p.Phe109Serfs) 6736 SRY HGNC:11311 Pathogenic 1 Jan 01, 1993 606231179 - RCV000010397 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 germline germline GRCh38 NC_000024.10 Y 2787280 2787280 T - Yp11.2 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0008 1 -24782 deletion NM_003140.2(SRY):c.324delA (p.Phe109Serfs) 6736 SRY HGNC:11311 Pathogenic 1 Jan 01, 1993 606231179 - RCV000010397 MedGen:C2748896,OMIM:400044 46,XY sex reversal, type 1 germline germline GRCh37 NC_000024.9 Y 2655321 
2655321 T - Yp11.31 no assertion criteria provided 1 N OMIM Allelic Variant:480000.0008 1
diff --git a/perl/t/tracks/vcf/README.md b/perl/t/tracks/vcf/README.md
deleted file mode 100644
index 8569eec6e..000000000
--- a/perl/t/tracks/vcf/README.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# Test data note
-
-The data is fake at most of the 10 sites shown; the real hg38 data is mostly discordant in chr22.
-
-15927835 is left in as a discordant example (it shouldn't make it into the database).
-
-There are two 15927834 sites, to check what happens when sites overlap because they're cryptic multiallelics.
-
-15927759 is actually a real, non-discordant site with 2 alleles, A and C.
-
-15927755 is a site set to a T ref (to be non-discordant).
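A "discordant" site in these notes is one whose claimed reference allele disagrees with the reference track, and such records are expected never to reach the database. A minimal sketch of that concordance gate, with `check_ref` as a hypothetical helper rather than the builder's actual code path (the allele letters below are illustrative):

```perl
use strict;
use warnings;

# Keep only records whose stated REF allele matches the assembly's
# reference base(s) at that site; discordant rows are dropped.
sub check_ref {
    my ( $ref_allele, $genome_bases ) = @_;
    return uc($ref_allele) eq uc($genome_bases);
}

my @rows = (
    { pos => 15927755, ref => 'T', genome => 'T' },    # concordant: kept
    { pos => 15927835, ref => 'C', genome => 'G' },    # discordant: dropped
);

for my $row (@rows) {
    next unless check_ref( $row->{ref}, $row->{genome} );
    print "would insert record at $row->{pos}\n";
}
```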
diff --git a/perl/t/tracks/vcf/clinvar.t b/perl/t/tracks/vcf/clinvar.t
deleted file mode 100644
index a63865f85..000000000
--- a/perl/t/tracks/vcf/clinvar.t
+++ /dev/null
@@ -1,150 +0,0 @@
-use 5.10.0;
-use strict;
-use warnings;
-
-package MockBuilder;
-
-use Mouse;
-
-extends 'Seq::Base';
-
-1;
-
-use Test::More;
-use lib 't/lib';
-use TestUtils qw/ HaveRequiredBinary PrepareConfigWithTempdirs /;
-
-use Path::Tiny;
-use Scalar::Util qw/looks_like_number/;
-
-use Seq::Tracks::Reference::Build;
-use Seq::Tracks::Reference::MapBases;
-use Seq::Tracks::Vcf::Build;
-
-# Check required binary is available
-if ( !HaveRequiredBinary("bystro-vcf") ) {
-  plan skip_all => "Testing relies on bystro-vcf binary, which is not present";
-}
-
-# create temp directories
-my $dir = Path::Tiny->tempdir();
-
-# prepare temp directory and make test config file
-my $config_file =
-  PrepareConfigWithTempdirs( 't/tracks/vcf/clinvar.yml',
-  't/tracks/vcf/raw', [ 'database_dir', 'files_dir', 'temp_dir' ],
-  'files_dir', $dir->stringify );
-
-my $baseMapper = Seq::Tracks::Reference::MapBases->new();
-
-my $seq = MockBuilder->new_with_config( { config => $config_file } );
-
-my $tracks = $seq->tracksObj;
-my $refBuilder = $tracks->getRefTrackBuilder();
-my $refGetter = $tracks->getRefTrackGetter();
-
-$refBuilder->db->dbPatch(
-  'chr1', $refBuilder->dbName,
-  1022260 - 1,
-  $baseMapper->baseMap->{'C'}
-); #chr1:1022260
-
-$refBuilder->db->cleanUp();
-
-my $dbVar = $refBuilder->db->dbReadOne( 'chr1', 1022260 - 1 );
-ok( $refGetter->get($dbVar) eq 'C' );
-
-my $vcfBuilder = $tracks->getTrackBuilderByName('clinvar.match');
-
-$vcfBuilder->buildTrack();
-
-my $vcf = $tracks->getTrackGetterByName('clinvar.match');
-
-my $db = Seq::DBManager->new();
-
-############### Feature tests ################
-# The vcf file contains the following items:
-# 1 1022260 . C T . . START=1022260;STOP=1022260;STRAND=+;VARIATION_TYPE=Variant;VARIATION_ID=128296;RCV=RCV000116258|RCV000550396;SCV=SCV000317028|SCV000150176|SC
-# V000653894;ALLELE_ID=133745;SYMBOL=AGRN;HGVS_C=NM_198576.3:c.261C>T;HGVS_P=NP_940978.2:p.Asp87_eq_;MOLECULAR_CONSEQUENCE=NM_198576.3:c.261C>T:synonymous_variant;CLINICAL_SIGNIFICANCE=Benign/Likely_
-# benign;CLINICAL_SIGNIFICANCE_ORDERED=likely_benign|benign;PATHOGENIC=0;LIKELY_PATHOGENIC=0;UNCERTAIN_SIGNIFICANCE=0;LIKELY_BENIGN=1;BENIGN=2;REVIEW_STATUS=criteria_provided..multiple_submitters..no
-# _conflicts;REVIEW_STATUS_ORDERED=no_assertion_criteria_provided|criteria_provided..single_submitter;LAST_EVALUATED=Aug_11..2017;ALL_SUBMITTERS=PreventionGenetics|Genetic_Services_Laboratory..Univer
-# sity_of_Chicago|Invitae;SUBMITTERS_ORDERED=Genetic_Services_Laboratory..University_of_Chicago|PreventionGenetics|Invitae;ALL_TRAITS=not_specified|AllHighlyPenetrant|NOT_SPECIFIED|Myasthenic_syndrom
-# e..congenital..8;ALL_PMIDS=25741868|20301347|28492532;ORIGIN=germline;XREFS=MedGen:CN169374|GeneReviews:NBK1168|MedGen:C3808739|OMIM:615120|Orphanet:590;DATES_ORDERED=0000-00-00|2017-08-11
-
-my $href = $db->dbReadOne( 'chr1', 1022260 - 1 );
-
-#my ($vcf, $href, $chr, $refBase, $allele, $alleleIdx, $positionIdx, $outAccum) = @_;
-# At this position the vcf holds a single C>T record
-my $out = [];
-
-# Vcf tracks are going to be treated differently from transcripts and sparse tracks
-# They should return nothing for the nth position in a deletion
-# Or if the allele doesn't match exactly.
-$vcf->get( $href, 'chr1', 'C', 'G', 0, $out );
-
-ok( @{$out} == 0, "Alleles that don't match won't get reported" );
-
-$out = [];
-$vcf->get( $href, 'chr1', 'C', 'T', 0, $out );
-
-ok( @{$out} == @{ $vcf->features },
-  "Report matching alleles, with one entry per requested feature" );
-
-my $vidIdx = $vcf->getFieldDbName('variation_id');
-my $aidIdx = $vcf->getFieldDbName('allele_id');
-my $rcvIdx = $vcf->getFieldDbName('rcv');
-my $scvIdx = $vcf->getFieldDbName('scv');
-my $hgvscIdx = $vcf->getFieldDbName('hgvs_c');
-my $hgvspIdx = $vcf->getFieldDbName('hgvs_p');
-my $molIdx = $vcf->getFieldDbName('molecular_consequence');
-my $submittersIdx = $vcf->getFieldDbName('all_submitters');
-my $datesIdx = $vcf->getFieldDbName('dates');
-my $pathIdx = $vcf->getFieldDbName('pathogenic');
-my $lpathIdx = $vcf->getFieldDbName('likely_pathogenic');
-my $ucIdx = $vcf->getFieldDbName('uncertain_significance');
-my $benignIdx = $vcf->getFieldDbName('benign');
-my $lbenignIdx = $vcf->getFieldDbName('likely_benign');
-my $rsIdx = $vcf->getFieldDbName('review_status');
-
-# we specify "ALL_TRAITS" => 'traits' in YAML
-my $traitsIdx = $vcf->getFieldDbName('traits');
-
-my $numFeatures = scalar @{ $vcf->features };
-ok( @{$out} == $numFeatures,
-  "vcf array contains an entry for each requested feature" );
-
-ok( $out->[$vidIdx][0] == 128296, 'can reproduce variant_id' );
-ok( $out->[$aidIdx][0] == 133745, 'can reproduce allele_id' );
-
-ok( $out->[$rcvIdx][0][0] eq 'RCV000116258',
-  'can reproduce first rcv, by splitting on pipe as defined in YAML' );
-ok( $out->[$rcvIdx][0][1] eq 'RCV000550396',
-  'can reproduce second rcv, by splitting on pipe as defined in YAML' );
-
-ok( $out->[$scvIdx][0][0] eq 'SCV000317028',
-  'can reproduce first scv, by splitting on pipe as defined in YAML' );
-ok( $out->[$scvIdx][0][1] eq 'SCV000150176',
-  'can reproduce second scv, by splitting on pipe as defined in YAML' );
-ok( $out->[$scvIdx][0][2] eq 'SCV000653894',
-  'can reproduce third scv, by splitting on pipe as 
defined in YAML' ); - -ok( - !defined $out->[$traitsIdx][0][0], - 'traits are split by pipe as specified in YAML, and not_specified is cleaned to undef in the array' -); -ok( - $out->[$traitsIdx][0][1] eq 'AllHighlyPenetrant', - 'traits are split by pipe as specified in YAML, finding AllHighlyPenetrant in the correct order' -); -ok( - !defined $out->[$traitsIdx][0][2], - 'traits are split by pipe as specified in YAML, and NOT_SPECIFIED is cleaned to undef in the array' -); -ok( - $out->[$traitsIdx][0][3] eq 'Myasthenic_syndrome..congenital..8', - 'traits are split by pipe as specified in YAML, finding Myasthenic_syndrome..congenital..8 in the correct order' -); - -$db->cleanUp(); - -done_testing(); diff --git a/perl/t/tracks/vcf/clinvar.yml b/perl/t/tracks/vcf/clinvar.yml deleted file mode 100644 index 3301704f4..000000000 --- a/perl/t/tracks/vcf/clinvar.yml +++ /dev/null @@ -1,79 +0,0 @@ -assembly: hg38 -build_author: ec2-user -build_date: 2017-08-08T03:49:00 -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA - program: bystro-vcf -chromosomes: - - chr1 -database_dir: t/tracks/vcf/index/ -files_dir: t/tracks/vcf/raw/ -temp_dir: "~" -tracks: - tracks: - - name: ref - type: reference - - features: - - alt - - variation_id: number - - allele_id: number - - strand - - rcv - - scv - - hgvs_c - - hgvs_p - - traits - - molecular_consequence - - clinical_significance - - pathogenic: number - - likely_pathogenic: number - - uncertain_significance: number - - likely_benign: number - - benign: number - - review_status - - last_evaluated - - submitters - - pmids - - origin - - xrefs - build_field_transformations: - molecular_consequence: split [|] - clinical_significance: split [|/] - review_status: split [|] - submitters: split [|] - xrefs: split [|] - traits: split [|] - pmids: split [|] - rcv: split [|] - scv: split [|] - fieldMap: - VARIATION_ID: variation_id - ALLELE_ID: allele_id - RCV: rcv - SCV: scv - STRAND: strand - HGVS_C: hgvs_c - HGVS_P: hgvs_p - ALL_TRAITS: traits - MOLECULAR_CONSEQUENCE: molecular_consequence - CLINICAL_SIGNIFICANCE: clinical_significance - PATHOGENIC: pathogenic - LIKELY_PATHOGENIC: likely_pathogenic - UNCERTAIN_SIGNIFICANCE: uncertain_significance - LIKELY_BENIGN: likely_benign - BENIGN: benign - REVIEW_STATUS: review_status - LAST_EVALUATED: last_evaluated - ALL_SUBMITTERS: submitters - ALL_PMIDS: pmids - ORIGIN: origin - XREFS: xrefs - LAST_EVALUATED: last_evaluated - local_files: - - clinvar_alleles.single.b38.vcf.1lines.gz - name: clinvar.match - type: vcf diff --git a/perl/t/tracks/vcf/integration.t b/perl/t/tracks/vcf/integration.t deleted file mode 100644 index afbb2202d..000000000 --- a/perl/t/tracks/vcf/integration.t +++ /dev/null @@ -1,909 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package MockBuilder; - -use Mouse; - -extends 'Seq::Base'; - -1; - -use Test::More; -use lib 't/lib'; -use TestUtils qw/ HaveRequiredBinary PrepareConfigWithTempdirs /; - -use Path::Tiny; -use Scalar::Util qw/looks_like_number/; - -use Seq::Tracks::Reference::Build; -use Seq::Tracks::Reference::MapBases; -use Seq::Tracks::Vcf::Build; - -# Check required binary is available -if ( !HaveRequiredBinary("bystro-vcf") ) { - plan skip_all => "Testing relies on bystro-vcf binary, which is not present"; -} - -# create temp directories -my $dir = Path::Tiny->tempdir(); - -# prepare temp directory and make test config file -my $config_file = PrepareConfigWithTempdirs( - 't/tracks/vcf/test.hg38.chr22.yml', - 
't/tracks/vcf/raw', [ 'database_dir', 'files_dir', 'temp_dir' ], - 'files_dir', $dir->stringify -); - -my $baseMapper = Seq::Tracks::Reference::MapBases->new(); - -my $seq = MockBuilder->new_with_config( { config => $config_file } ); - -my $tracks = $seq->tracksObj; -my $refBuilder = $tracks->getRefTrackBuilder(); -my $refGetter = $tracks->getRefTrackGetter(); - -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927888 - 1, - $baseMapper->baseMap->{'C'} -); #chr14:19792736-19792737 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927876 - 1, - $baseMapper->baseMap->{'G'} -); #chr14:19792727 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927837 - 1, - $baseMapper->baseMap->{'A'} -); #chr14:19792869-19792870 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927835 - 1, - $baseMapper->baseMap->{'G'} -); #chr14:19792857-19792858 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927834 - 1, - $baseMapper->baseMap->{'G'} -); #chr14:19792818-19792819 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927765 - 1, - $baseMapper->baseMap->{'A'} -); #chr14:19792816-19792817 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927759 - 1, - $baseMapper->baseMap->{'A'} -); #chr14:19792815-19792816 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927755 - 1, - $baseMapper->baseMap->{'T'} -); #On #chr14:19792746-19792747 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927745 - 1, - $baseMapper->baseMap->{'A'} -); #chr14:19792740-19792741 #same - -$refBuilder->db->cleanUp(); - -my $dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927888 - 1 ); -ok( $refGetter->get($dbVar) eq 'C' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927876 - 1 ); -ok( $refGetter->get($dbVar) eq 'G' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927837 - 1 ); -ok( $refGetter->get($dbVar) eq 'A' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927835 - 1 ); -ok( $refGetter->get($dbVar) eq 'G' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927834 - 1 ); -ok( $refGetter->get($dbVar) eq 'G' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927765 - 1 ); -ok( $refGetter->get($dbVar) eq 'A' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927759 - 1 ); -ok( $refGetter->get($dbVar) eq 'A' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927755 - 1 ); #on -ok( $refGetter->get($dbVar) eq 'T' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927745 - 1 ); -ok( $refGetter->get($dbVar) eq 'A' ); - -my $vcfBuilder = $tracks->getTrackBuilderByName('gnomad.genomes'); - -$vcfBuilder->buildTrack(); - -my $vcf = $tracks->getTrackGetterByName('gnomad.genomes'); - -my $db = Seq::DBManager->new(); - -############### Feature tests ################ -# The vcf file contains the following items: -# Comma separated values are treated as belonging to diff alleles -# The G allele is the 2nd of the two at this site: -# C CACA,G -# Therefore, for this first test, we expect the 2nd of two values, when present -# AC=3,0; -# AF=1.81378e-04, 0.00000e+00; -# AN=16540; -# AC_AFR=0,0; -# AC_AMR=0,0; -# AC_ASJ=0,0; -# AC_EAS=0,0; -# AC_FIN=0,0; -# AC_NFE=3,0; -# AC_OTH=0,0; -# AC_Male=3,0; -# AC_Female=0,0; -# AN_AFR=3614; -# AN_AMR=528; -# AN_ASJ=180; -# AN_EAS=892; -# AN_FIN=2256; -# AN_NFE=8466; -# AN_OTH=604; -# AN_Male=9308; -# AN_Female=7232; -# AF_AFR=0.00000e+00, 0.00000e+00; -# AF_AMR=0.00000e+00, 0.00000e+00; -# AF_ASJ=0.00000e+00, 0.00000e+00; -# 
AF_EAS=0.00000e+00, 0.00000e+00; -# AF_FIN=0.00000e+00, 0.00000e+00; -# AF_NFE=3.54359e-04, 0.00000e+00; -# AF_OTH=0.00000e+00, 0.00000e+00; -# AF_Male=3.22303e-04, 0.00000e+00; -# AF_Female=0.00000e+00, 0.00000e+00; -# AS_FilterStatus=PASS,RF|AC0 - -# Notice that this site has an RF|AC0 value for AS_FilterStatus -# Therefore it doesn't pass - -my $href = $db->dbReadOne( 'chr22', 15927888 - 1 ); - -#my ($vcf, $href, $chr, $refBase, $allele, $alleleIdx, $positionIdx, $outAccum) = @_; -# At this position we have CACA,G alleles -my $out = []; - -# Vcf tracks are going to be treated differently from transcripts and sparse tracks -# They should return nothing for the nth position in a deletion -# Or if the allele doesn't match exactly. -$vcf->get( $href, 'chr22', 'C', 'G', 0, 0, $out ); - -ok( - @{$out} == 0, - "Non PASS AS_FilterStatus causes alleles to be skipped in multiallelic (testing build_row_filters on INFO values)" -); - -# my $numFeatures = scalar @{$vcf->features}; -# ok(@{$out} == $numFeatures, "vcf array contians an entry for each requested feature"); - -# for my $feature (@{$vcf->features}) { -# my $idx = $vcf->getFieldDbName($feature); - -# ok(@{$out->[$idx]} == 1, "Every feature is considered bi-allelelic (because alleles are decomposed into bi-allelic sites, with 1 entry"); -# ok(@{$out->[$idx][0]} == 1 && !ref $out->[$idx][0], "Every feature contains a single position's worth of data, and that value is scalar"); -# } - -# We define these as lower case in our test yaml, so that is how we will look them up -# No decision is made on behalf of the user how to name these; will be taken as defined - -my $trTvIdx = $vcf->getFieldDbName('trTv'); -my $idIdx = $vcf->getFieldDbName('id'); -my $acIdx = $vcf->getFieldDbName('ac'); -my $afIdx = $vcf->getFieldDbName('af'); -my $anIdx = $vcf->getFieldDbName('an'); - -my $acAfrIdx = $vcf->getFieldDbName('ac_afr'); -my $acAmrIdx = $vcf->getFieldDbName('ac_amr'); -my $acAsjIdx = $vcf->getFieldDbName('ac_asj'); -my $acEasIdx = $vcf->getFieldDbName('ac_eas'); -my $acFinIdx = $vcf->getFieldDbName('ac_fin'); -my $acNfeIdx = $vcf->getFieldDbName('ac_nfe'); -my $acOthIdx = $vcf->getFieldDbName('ac_oth'); -my $acMaleIdx = $vcf->getFieldDbName('ac_male'); -my $acFemaleIdx = $vcf->getFieldDbName('ac_female'); - -my $anAfrIdx = $vcf->getFieldDbName('an_afr'); -my $anAmrIdx = $vcf->getFieldDbName('an_amr'); -my $anAsjIdx = $vcf->getFieldDbName('an_asj'); -my $anEasIdx = $vcf->getFieldDbName('an_eas'); -my $anFinIdx = $vcf->getFieldDbName('an_fin'); -my $anNfeIdx = $vcf->getFieldDbName('an_nfe'); -my $anOthIdx = $vcf->getFieldDbName('an_oth'); -my $anMaleIdx = $vcf->getFieldDbName('an_male'); -my $anFemaleIdx = $vcf->getFieldDbName('an_female'); - -my $afAfrIdx = $vcf->getFieldDbName('af_afr'); -my $afAmrIdx = $vcf->getFieldDbName('af_amr'); -my $afAsjIdx = $vcf->getFieldDbName('af_asj'); -my $afEasIdx = $vcf->getFieldDbName('af_eas'); -my $afFinIdx = $vcf->getFieldDbName('af_fin'); -my $afNfeIdx = $vcf->getFieldDbName('af_nfe'); -my $afOthIdx = $vcf->getFieldDbName('af_oth'); -my $afMaleIdx = $vcf->getFieldDbName('af_male'); -my $afFemaleIdx = $vcf->getFieldDbName('af_female'); - -# ok($out->[$trTvIdx][0] == 0, "indels and multiallelics have 0 trTv value"); -# ok(!defined $out->[$idIdx][0], "correctly finds that this site has no rsID"); -# ok($out->[$acIdx][0] == 0, "correctly finds first the G alleles ac value"); -# ok($out->[$afIdx][0] == 0, "correctly finds first the G alleles af value"); -# ok($out->[$anIdx][0] == 16540, "correctly finds first the G 
alleles an value, which has only one value across all alleles"); - -# ok($out->[$acAfrIdx][0] == 0, "correctly finds first the G alleles ac_afr value"); -# ok($out->[$acAmrIdx][0] == 0, "correctly finds first the G alleles ac_amr value"); -# ok($out->[$acAsjIdx][0] == 0, "correctly finds first the G alleles ac_asj value"); -# ok($out->[$acEasIdx][0] == 0, "correctly finds first the G alleles ac_eas value"); -# ok($out->[$acFinIdx][0] == 0, "correctly finds first the G alleles ac_fin value"); -# ok($out->[$acNfeIdx][0] == 0, "correctly finds first the G alleles ac_nfe value"); -# ok($out->[$acOthIdx][0] == 0, "correctly finds first the G alleles ac_oth value"); -# ok($out->[$acMaleIdx][0] == 0, "correctly finds first the G alleles ac_male value"); -# ok($out->[$acFemaleIdx][0] == 0, "correctly finds first the G alleles ac_female value"); - -# ok($out->[$anAfrIdx][0] == 3614, "correctly finds first the G alleles an_afr value, which has only one value across all alleles"); -# ok($out->[$anAmrIdx][0] == 528, "correctly finds first the G alleles an_amr value, which has only one value across all alleles"); -# ok($out->[$anAsjIdx][0] == 180, "correctly finds first the G alleles an_asj value, which has only one value across all alleles"); -# ok($out->[$anEasIdx][0] == 892, "correctly finds first the G alleles an_eas value, which has only one value across all alleles"); -# ok($out->[$anFinIdx][0] == 2256, "correctly finds first the G alleles an_fin value, which has only one value across all alleles"); -# ok($out->[$anNfeIdx][0] == 8466, "correctly finds first the G alleles an_nfe value, which has only one value across all alleles"); -# ok($out->[$anOthIdx][0] == 604, "correctly finds first the G alleles an_oth value, which has only one value across all alleles"); -# ok($out->[$anMaleIdx][0] == 9308, "correctly finds first the G alleles an_male value, which has only one value across all alleles"); -# ok($out->[$anFemaleIdx][0] == 7232, "correctly finds first the G alleles an_female value, which has only one value across all alleles"); - -# ok($out->[$afAfrIdx][0] == 0, "correctly finds first the G alleles af_afr value"); -# ok($out->[$afAmrIdx][0] == 0, "correctly finds first the G alleles af_amr value"); -# ok($out->[$afAsjIdx][0] == 0, "correctly finds first the G alleles af_asj value"); -# ok($out->[$afEasIdx][0] == 0, "correctly finds first the G alleles af_eas value"); -# ok($out->[$afFinIdx][0] == 0, "correctly finds first the G alleles af_fin value"); -# ok($out->[$afNfeIdx][0] == 0, "correctly finds first the G alleles af_nfe value"); -# ok($out->[$afOthIdx][0] == 0, "correctly finds first the G alleles af_oth value"); -# ok($out->[$afMaleIdx][0] == 0, "correctly finds first the G alleles af_male value"); -# ok($out->[$afFemaleIdx][0] == 0, "correctly finds first the G alleles af_female value"); - -############### Feature tests ################ -# The vcf file contains the following items: -# Comma separated values are treated as belonging to diff alleles -# The +ACA allele is the 1st of the two at this site: -# C CACA,G -# Therefore, for this 2nd test, we expect the 1st of two values, when present -# AC=3,0; -# AF=1.81378e-04, 0.00000e+00; -# AN=16540; -# AC_AFR=0,0; -# AC_AMR=0,0; -# AC_ASJ=0,0; -# AC_EAS=0,0; -# AC_FIN=0,0; -# AC_NFE=3,0; -# AC_OTH=0,0; -# AC_Male=3,0; -# AC_Female=0,0; -# AN_AFR=3614; -# AN_AMR=528; -# AN_ASJ=180; -# AN_EAS=892; -# AN_FIN=2256; -# AN_NFE=8466; -# AN_OTH=604; -# AN_Male=9308; -# AN_Female=7232; -# AF_AFR=0.00000e+00, 0.00000e+00; -# AF_AMR=0.00000e+00, 
0.00000e+00;
-# AF_ASJ=0.00000e+00, 0.00000e+00;
-# AF_EAS=0.00000e+00, 0.00000e+00;
-# AF_FIN=0.00000e+00, 0.00000e+00;
-# AF_NFE=3.54359e-04, 0.00000e+00;
-# AF_OTH=0.00000e+00, 0.00000e+00;
-# AF_Male=3.22303e-04, 0.00000e+00;
-# AF_Female=0.00000e+00, 0.00000e+00;
-
-# Vcf tracks are going to be treated differently from transcripts and sparse tracks
-# They should return nothing for the nth position in a deletion
-# Or if the allele doesn't match exactly.
-$out = [];
-$vcf->get( $href, 'chr22', 'C', '+ACA', 0, $out );
-
-my $numFeatures = scalar @{ $vcf->features };
-ok( @{$out} == $numFeatures,
-  "vcf array contains an entry for each requested feature" );
-
-for my $feature ( @{ $vcf->features } ) {
-  my $idx = $vcf->getFieldDbName($feature);
-
-  ok(
-    @{ $out->[$idx] } == 1,
-    "Every feature is considered bi-allelic (because alleles are decomposed into bi-allelic sites), with 1 entry"
-  );
-  ok(
-    !ref $out->[$idx][0],
-    "Every feature contains a single position's worth of data, and that value is scalar"
-  );
-}
-
-$out = [];
-$vcf->get( $href, 'chr22', 'C', '+ACA', 1, $out );
-ok(
-  @$out == 0,
-  "Vcf getter does not tile annotations by the position of the allele. Only accepts position index 0"
-);
-
-$out = [];
-$vcf->get( $href, 'chr22', 'C', '+ACA', 0, $out );
-
-# # Although we've deprecated single-line multiallelics, we must continue to support them
-# # Until we've decided to drop support
-# # Note that if for some reason a person wanted to fetch only one of the two alleles
-# # the site would appear as a [feature1, feature2] rather than [[feature1_allele1...], [feature1_allele2...]]
-# for my $feature (@{$vcf->features}) {
-#   my $idx = $vcf->getFieldDbName($feature);
-
-#   ok(@{$out->[$idx]} == 2, "For multiallelic sites, where both alleles are on a single input row, each feature is given an array of length == \# of alleles");
-# }
-
-ok( $out->[$trTvIdx][0] == 0, "indels and multiallelics have 0 trTv value" );
-ok( !defined $out->[$idIdx][0], "correctly finds that this site has no rsID" );
-ok( $out->[$acIdx][0] == 3, "correctly finds first the +ACA allele ac value" );
-ok(
-  $out->[$afIdx][0] == unpack( 'f', pack( 'f', 1.81378e-04 ) ),
-  "correctly finds first the +ACA allele af value"
-);
-ok(
-  $out->[$anIdx][0] == 16540,
-  "correctly finds first the +ACA allele an value, which has only one value across all alleles"
-);
-
-ok( $out->[$acAfrIdx][0] == 0,
-  "correctly finds first the +ACA allele ac_afr value" );
-ok( $out->[$acAmrIdx][0] == 0,
-  "correctly finds first the +ACA allele ac_amr value" );
-ok( $out->[$acAsjIdx][0] == 0,
-  "correctly finds first the +ACA allele ac_asj value" );
-ok( $out->[$acEasIdx][0] == 0,
-  "correctly finds first the +ACA allele ac_eas value" );
-ok( $out->[$acFinIdx][0] == 0,
-  "correctly finds first the +ACA allele ac_fin value" );
-ok( $out->[$acNfeIdx][0] == 3,
-  "correctly finds first the +ACA allele ac_nfe value" );
-ok( $out->[$acOthIdx][0] == 0,
-  "correctly finds first the +ACA allele ac_oth value" );
-ok( $out->[$acMaleIdx][0] == 3,
-  "correctly finds first the +ACA allele ac_male value" );
-ok( $out->[$acFemaleIdx][0] == 0,
-  "correctly finds first the +ACA allele ac_female value" );
-
-ok(
-  $out->[$anAfrIdx][0] == 3614,
-  "correctly finds first the +ACA allele an_afr value, which has only one value across all alleles"
-);
-ok(
-  $out->[$anAmrIdx][0] == 528,
-  "correctly finds first the +ACA allele an_amr value, which has only one value across all alleles"
-);
-ok(
-  $out->[$anAsjIdx][0] == 180,
-  "correctly finds first the +ACA 
allele an_asj value, which has only one value across all alleles" -); -ok( - $out->[$anEasIdx][0] == 892, - "correctly finds first the +ACA allele an_eas value, which has only one value across all alleles" -); -ok( - $out->[$anFinIdx][0] == 2256, - "correctly finds first the +ACA allele an_fin value, which has only one value across all alleles" -); -ok( - $out->[$anNfeIdx][0] == 8466, - "correctly finds first the +ACA allele an_nfe value, which has only one value across all alleles" -); -ok( - $out->[$anOthIdx][0] == 604, - "correctly finds first the +ACA allele an_oth value, which has only one value across all alleles" -); -ok( - $out->[$anMaleIdx][0] == 9308, - "correctly finds first the +ACA allele an_male value, which has only one value across all alleles" -); -ok( - $out->[$anFemaleIdx][0] == 7232, - "correctly finds first the +ACA allele an_female value, which has only one value across all alleles" -); - -ok( $out->[$afAfrIdx][0] == 0, - "correctly finds first the +ACA allele af_afr value" ); -ok( $out->[$afAmrIdx][0] == 0, - "correctly finds first the +ACA allele af_amr value" ); -ok( $out->[$afAsjIdx][0] == 0, - "correctly finds first the +ACA allele af_asj value" ); -ok( $out->[$afEasIdx][0] == 0, - "correctly finds first the +ACA allele af_eas value" ); -ok( $out->[$afFinIdx][0] == 0, - "correctly finds first the +ACA allele af_fin value" ); -ok( - $out->[$afNfeIdx][0] == unpack( 'f', pack( 'f', 3.54359e-04 ) ), - "correctly finds first the +ACA allele af_nfe value" -); -ok( $out->[$afOthIdx][0] == 0, - "correctly finds first the +ACA allele af_oth value" ); -ok( - $out->[$afMaleIdx][0] == unpack( 'f', pack( 'f', 3.22303e-04 ) ), - "correctly finds first the +ACA allele af_male value" -); -ok( $out->[$afFemaleIdx][0] == 0, - "correctly finds first the +ACA allele af_female value" ); - -##### TESTING VARIANTS THAT AREN'T PASS OR . -# chr22 15927755 . T G 296.53 NON_PASS -$out = []; -$href = $db->dbReadOne( 'chr22', 15927755 - 1 ); -$vcf->get( $href, 'chr22', 'T', 'G', 0, $out ); - -ok( @$out == 0, 'NON PASS/. variants are skipped' ); - -# Next let's check variants that are bi-allelic -# chr22 15927745 . 
A C 718.20 PASS -# AC=2; -# AF=6.93049e-05; -# AN=28858; -# AC_AFR=0; -# AC_AMR=0; -# AC_ASJ=0; -# AC_EAS=0; -# AC_FIN=0; -# AC_NFE=2; -# AC_OTH=0; -# AC_Male=1; -# AC_Female=1; -# AN_AFR=8454; -# AN_AMR=782; -# AN_ASJ=232; -# AN_EAS=1606; -# AN_FIN=3132; -# AN_NFE=13774; -# AN_OTH=878; -# AN_Male=15900; -# AN_Female=12958; -# AF_AFR=0.00000e+00; -# AF_AMR=0.00000e+00; -# AF_ASJ=0.00000e+00; -# AF_EAS=0.00000e+00; -# AF_FIN=0.00000e+00; -# AF_NFE=1.45201e-04; -# AF_OTH=0.00000e+00; -# AF_Male=6.28931e-05; -# AF_Female=7.71724e-05; - -$out = []; -$href = $db->dbReadOne( 'chr22', 15927745 - 1 ); -$vcf->get( $href, 'chr22', 'A', 'C', 0, $out ); - -ok( $out->[$trTvIdx][0] == 2, "A->C is a transversion, so given value of 2" ); -ok( !defined $out->[$idIdx][0], "correctly finds that this site has no rsID" ); -ok( $out->[$acIdx][0] == 2, "correctly finds the ac value for a biallelic site" ); -ok( - $out->[$afIdx][0] == unpack( 'f', pack( 'f', 6.93049e-05 ) ), - "correctly finds the af value for a biallelic site" -); -ok( $out->[$anIdx][0] == 28858, - "correctly finds the an value for a biallelic site" ); - -ok( $out->[$acAfrIdx][0] == 0, "correctly finds the C allele ac_afr value" ); -ok( $out->[$acAmrIdx][0] == 0, "correctly finds C allele allele ac_amr value" ); -ok( $out->[$acAsjIdx][0] == 0, "correctly finds C allele allele ac_asj value" ); -ok( $out->[$acEasIdx][0] == 0, "correctly finds C allele allele ac_eas value" ); -ok( $out->[$acFinIdx][0] == 0, "correctly finds C allele allele ac_fin value" ); -ok( $out->[$acNfeIdx][0] == 2, "correctly finds C allele allele ac_nfe value" ); -ok( $out->[$acOthIdx][0] == 0, "correctly finds C allele allele ac_oth value" ); -ok( $out->[$acMaleIdx][0] == 1, "correctly finds C allele allele ac_male value" ); -ok( $out->[$acFemaleIdx][0] == 1, - "correctly finds C allele allele ac_female value" ); - -ok( - $out->[$anAfrIdx][0] == 8454, - "correctly finds the C allele an_afr value, which has only one value across all alleles" -); -ok( - $out->[$anAmrIdx][0] == 782, - "correctly finds the C allele an_amr value, which has only one value across all alleles" -); -ok( - $out->[$anAsjIdx][0] == 232, - "correctly finds the C allele an_asj value, which has only one value across all alleles" -); -ok( - $out->[$anEasIdx][0] == 1606, - "correctly finds the C allele an_eas value, which has only one value across all alleles" -); -ok( - $out->[$anFinIdx][0] == 3132, - "correctly finds the C allele an_fin value, which has only one value across all alleles" -); -ok( - $out->[$anNfeIdx][0] == 13774, - "correctly finds the C allele an_nfe value, which has only one value across all alleles" -); -ok( - $out->[$anOthIdx][0] == 878, - "correctly finds the C allele an_oth value, which has only one value across all alleles" -); -ok( - $out->[$anMaleIdx][0] == 15900, - "correctly finds the C allele an_male value, which has only one value across all alleles" -); -ok( - $out->[$anFemaleIdx][0] == 12958, - "correctly finds the C allele an_female value, which has only one value across all alleles" -); - -ok( $out->[$afAfrIdx][0] == 0, "correctly finds the C allele af_afr value" ); -ok( $out->[$afAmrIdx][0] == 0, "correctly finds the C allele af_amr value" ); -ok( $out->[$afAsjIdx][0] == 0, "correctly finds the C allele af_asj value" ); -ok( $out->[$afEasIdx][0] == 0, "correctly finds the C allele af_eas value" ); -ok( $out->[$afFinIdx][0] == 0, "correctly finds the C allele af_fin value" ); -ok( $out->[$afNfeIdx][0] == unpack( 'f', pack( 'f', 1.45201e-04 ) ), - "correctly finds the C 
allele af_nfe value" );
-ok( $out->[$afOthIdx][0] == 0, "correctly finds the C allele af_oth value" );
-ok( $out->[$afMaleIdx][0] == unpack( 'f', pack( 'f', 6.28931e-05 ) ),
-  "correctly finds the C allele af_male value" );
-ok( $out->[$afFemaleIdx][0] == unpack( 'f', pack( 'f', 7.71724e-05 ) ),
-  "correctly finds the C allele af_female value" );
-
-$out = [];
-
-$href = $db->dbReadOne( 'chr22', 15927745 - 1 );
-$vcf->get( $href, 'chr22', 'A', 'T', 0, $out );
-
-ok( @$out == 0, "Alleles that don't match are skipped" );
-
-# Next let's check a variant that has 2 non-reference alleles, but which came from different lines
-# The order of the lines doesn't matter
-# TODO: write explicit test for order not mattering
-# The first one
-
-# chr22 15927834 . G T 183.64 PASS
-# AC=1;
-# AF=4.21905e-05;
-# AN=23702;
-# AC_AFR=1;
-# AC_AMR=0;
-# AC_ASJ=0;
-# AC_EAS=0;
-# AC_FIN=0;
-# AC_NFE=0;
-# AC_OTH=0;
-# AC_Male=1;
-# AC_Female=0;
-# AN_AFR=6452;
-# AN_AMR=638;
-# AN_ASJ=222;
-# AN_EAS=1398;
-# AN_FIN=2270;
-# AN_NFE=12010;
-# AN_OTH=712;
-# AN_Male=13204;
-# AN_Female=10498;
-# AF_AFR=1.54991e-04;
-# AF_AMR=0.00000e+00;
-# AF_ASJ=0.00000e+00;
-# AF_EAS=0.00000e+00;
-# AF_FIN=0.00000e+00;
-# AF_NFE=0.00000e+00;
-# AF_OTH=0.00000e+00;
-# AF_Male=7.57346e-05;
-# AF_Female=0.00000e+00;
-
-$out = [];
-my $firstAllele = [];
-
-$href = $db->dbReadOne( 'chr22', 15927834 - 1 );
-$vcf->get( $href, 'chr22', 'G', 'T', 0, $firstAllele );
-
-ok(
-  $firstAllele->[$trTvIdx][0] == 2, "trTv is 0 for multiallelics.
-  However, when our vcf parser is passed 2 alleles at the same position, but on different source file lines, it treats that as a SNP.
-  Therefore it calls it as a G->T transversion, or 2"
-);
-ok( !defined $firstAllele->[$idIdx][0],
-  "correctly finds that this site has no rsID" );
-ok( $firstAllele->[$acIdx][0] == 1,
-  "correctly finds the ac value for a biallelic site" );
-ok( $firstAllele->[$afIdx][0] == unpack( 'f', pack( 'f', 4.21905e-05 ) ),
-  "correctly finds the af value for a biallelic site" );
-ok( $firstAllele->[$anIdx][0] == 23702,
-  "correctly finds the an value for a biallelic site" );
-
-ok( $firstAllele->[$acAfrIdx][0] == 1, "correctly finds the T allele ac_afr value" );
-ok( $firstAllele->[$acAmrIdx][0] == 0,
-  "correctly finds the T allele ac_amr value" );
-ok( $firstAllele->[$acAsjIdx][0] == 0,
-  "correctly finds the T allele ac_asj value" );
-ok( $firstAllele->[$acEasIdx][0] == 0,
-  "correctly finds the T allele ac_eas value" );
-ok( $firstAllele->[$acFinIdx][0] == 0,
-  "correctly finds the T allele ac_fin value" );
-ok( $firstAllele->[$acNfeIdx][0] == 0,
-  "correctly finds the T allele ac_nfe value" );
-ok( $firstAllele->[$acOthIdx][0] == 0,
-  "correctly finds the T allele ac_oth value" );
-ok( $firstAllele->[$acMaleIdx][0] == 1,
-  "correctly finds the T allele ac_male value" );
-ok( $firstAllele->[$acFemaleIdx][0] == 0,
-  "correctly finds the T allele ac_female value" );
-
-ok(
-  $firstAllele->[$anAfrIdx][0] == 6452,
-  "correctly finds the T allele an_afr value, which has only one value across all alleles"
-);
-ok(
-  $firstAllele->[$anAmrIdx][0] == 638,
-  "correctly finds the T allele an_amr value, which has only one value across all alleles"
-);
-ok(
-  $firstAllele->[$anAsjIdx][0] == 222,
-  "correctly finds the T allele an_asj value, which has only one value across all alleles"
-);
-ok(
-  $firstAllele->[$anEasIdx][0] == 1398,
-  "correctly finds the T allele an_eas value, which has only one value across all alleles"
-);
-ok(
-  $firstAllele->[$anFinIdx][0] 
== 2270, - "correctly finds the T allele an_fin value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anNfeIdx][0] == 12010, - "correctly finds the T allele an_nfe value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anOthIdx][0] == 712, - "correctly finds the T allele an_oth value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anMaleIdx][0] == 13204, - "correctly finds the T allele an_male value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anFemaleIdx][0] == 10498, - "correctly finds the T allele an_female value, which has only one value across all alleles" -); - -ok( $firstAllele->[$afAfrIdx][0] == unpack( 'f', pack( 'f', 1.54991e-04 ) ), - "correctly finds the T allele af_afr value" ); -ok( $firstAllele->[$afAmrIdx][0] == 0, "correctly finds the T allele af_amr value" ); -ok( $firstAllele->[$afAsjIdx][0] == 0, "correctly finds the T allele af_asj value" ); -ok( $firstAllele->[$afEasIdx][0] == 0, "correctly finds the T allele af_eas value" ); -ok( $firstAllele->[$afFinIdx][0] == 0, "correctly finds the T allele af_fin value" ); -ok( $firstAllele->[$afNfeIdx][0] == 0, "correctly finds the T allele af_nfe value" ); -ok( $firstAllele->[$afAsjIdx][0] == 0, "correctly finds the T allele af_asj value" ); -ok( $firstAllele->[$afOthIdx][0] == 0, "correctly finds the T allele af_oth value" ); -ok( $firstAllele->[$afMaleIdx][0] == unpack( 'f', pack( 'f', 7.57346e-05 ) ), - "correctly finds the T allele af_male value" ); -ok( - $firstAllele->[$afFemaleIdx][0] == 0, - "correctly finds the T allele af_female value" -); - -# The 2nd one: -# chr22 15927834 rs199856444 G C 1458410.68 PASS -# AC=5232; -# AF=2.00721e-01; -# AN=26066; -# AC_AFR=1195; -# AC_AMR=199; -# AC_ASJ=48; -# AC_EAS=462; -# AC_FIN=539; -# AC_NFE=2634; -# AC_OTH=155; -# AC_Male=2860; -# AC_Female=2372; -# AN_AFR=7838; -# AN_AMR=630; -# AN_ASJ=216; -# AN_EAS=1372; -# AN_FIN=2596; -# AN_NFE=12638; -# AN_OTH=776; -# AN_Male=14358; -# AN_Female=11708; -# AF_AFR=1.52462e-01; -# AF_AMR=3.15873e-01; -# AF_ASJ=2.22222e-01; -# AF_EAS=3.36735e-01; -# AF_FIN=2.07627e-01; -# AF_NFE=2.08419e-01; -# AF_OTH=1.99742e-01; -# AF_Male=1.99192e-01; -# AF_Female=2.02597e-01; - -my $secondAllele = []; -$href = $db->dbReadOne( 'chr22', 15927834 - 1 ); -$vcf->get( $href, 'chr22', 'G', 'C', 0, $secondAllele ); - -ok( - $secondAllele->[$trTvIdx][0] == 2, "trTv is 0 for multiallelics. \ - However, when our vcf parser is passed 2 alleles at the same position, but on different source file lines, it treats that as a SNP. 
- Therefore it calls it as a G->C transversion, or 2" -); -ok( - $secondAllele->[$idIdx][0] eq 'rs199856444', - "correctly finds that this site has an rsID" -); -ok( $secondAllele->[$acIdx][0] == 5232, - "correctly finds the ac value for a biallelic site" ); -ok( $secondAllele->[$afIdx][0] == unpack( 'f', pack( 'f', 2.00721e-01 ) ), - "correctly finds the af value for a biallelic site" ); -ok( $secondAllele->[$anIdx][0] == 26066, - "correctly finds the an value for a biallelic site" ); - -ok( - $secondAllele->[$acAfrIdx][0] == 1195, - "correctly finds the C allele ac_afr value" -); -ok( - $secondAllele->[$acAmrIdx][0] == 199, - "correctly finds C allele allele ac_amr value" -); -ok( $secondAllele->[$acAsjIdx][0] == 48, - "correctly finds C allele allele ac_asj value" ); -ok( - $secondAllele->[$acEasIdx][0] == 462, - "correctly finds C allele allele ac_eas value" -); -ok( - $secondAllele->[$acFinIdx][0] == 539, - "correctly finds C allele allele ac_fin value" -); -ok( - $secondAllele->[$acNfeIdx][0] == 2634, - "correctly finds C allele allele ac_nfe value" -); -ok( - $secondAllele->[$acOthIdx][0] == 155, - "correctly finds C allele allele ac_oth value" -); -ok( - $secondAllele->[$acMaleIdx][0] == 2860, - "correctly finds C allele allele ac_male value" -); -ok( - $secondAllele->[$acFemaleIdx][0] == 2372, - "correctly finds C allele allele ac_female value" -); - -ok( - $secondAllele->[$anAfrIdx][0] == 7838, - "correctly finds the C allele an_afr value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anAmrIdx][0] == 630, - "correctly finds the C allele an_amr value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anAsjIdx][0] == 216, - "correctly finds the C allele an_asj value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anEasIdx][0] == 1372, - "correctly finds the C allele an_eas value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anFinIdx][0] == 2596, - "correctly finds the C allele an_fin value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anNfeIdx][0] == 12638, - "correctly finds the C allele an_nfe value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anOthIdx][0] == 776, - "correctly finds the C allele an_oth value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anMaleIdx][0] == 14358, - "correctly finds the C allele an_male value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anFemaleIdx][0] == 11708, - "correctly finds the C allele an_female value, which has only one value across all alleles" -); - -ok( $secondAllele->[$afAfrIdx][0] == unpack( 'f', pack( 'f', 1.52462e-01 ) ), - "correctly finds the C allele af_afr value" ); -ok( $secondAllele->[$afAmrIdx][0] == unpack( 'f', pack( 'f', 3.15873e-01 ) ), - "correctly finds the C allele af_amr value" ); -ok( $secondAllele->[$afAsjIdx][0] == unpack( 'f', pack( 'f', 2.22222e-01 ) ), - "correctly finds the C allele af_asj value" ); -ok( $secondAllele->[$afEasIdx][0] == unpack( 'f', pack( 'f', 3.36735e-01 ) ), - "correctly finds the C allele af_eas value" ); -ok( $secondAllele->[$afFinIdx][0] == unpack( 'f', pack( 'f', 2.07627e-01 ) ), - "correctly finds the C allele af_fin value" ); -ok( $secondAllele->[$afNfeIdx][0] == unpack( 'f', pack( 'f', 2.08419e-01 ) ), - "correctly finds the C allele af_nfe value" ); -ok( $secondAllele->[$afOthIdx][0] == unpack( 'f', pack( 'f', 1.99742e-01 ) ), - "correctly finds the C allele af_oth value" 
); -ok( $secondAllele->[$afMaleIdx][0] == unpack( 'f', pack( 'f', 1.99192e-01 ) ), - "correctly finds the C allele af_male value" ); -ok( $secondAllele->[$afFemaleIdx][0] == unpack( 'f', pack( 'f', 2.02597e-01 ) ), - "correctly finds the C allele af_female value" ); - -# Let's see what happens if a user wants to show both alleles on the same line -my $multiallelic = [ [], [] ]; - -$href = $db->dbReadOne( 'chr22', 15927834 - 1 ); -$vcf->get( $href, 'chr22', 'G', 'T', 0, $multiallelic->[0] ); - -$vcf->get( $href, 'chr22', 'G', 'C', 0, $multiallelic->[1] ); - -for my $alleleIdx ( 0 .. $#$multiallelic ) { - for my $feature ( @{ $vcf->features } ) { - my $featureIdx = $vcf->getFieldDbName($feature); - my $posIdx = 0; - - if ( $alleleIdx == 0 ) { - if ( !defined $multiallelic->[$alleleIdx][$featureIdx][$posIdx] ) { - ok( - !defined $firstAllele->[$alleleIdx][$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . $vcf->features->[$featureIdx] - ); - } - elsif ( looks_like_number( $multiallelic->[$featureIdx][$alleleIdx][$posIdx] ) ) { - ok( - $multiallelic->[$alleleIdx][$featureIdx][$posIdx] - == $firstAllele->[$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . $vcf->features->[$featureIdx] - ); - } - else { - ok( - $multiallelic->[$alleleIdx][$featureIdx][$posIdx] eq - $firstAllele->[$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . $vcf->features->[$featureIdx] - ); - } - - next; - } - - if ( $alleleIdx == 1 ) { - if ( !defined $multiallelic->[$alleleIdx][$featureIdx][$posIdx] ) { - ok( - !defined $secondAllele->[$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . $vcf->features->[$featureIdx] - ); - } - elsif ( looks_like_number( $multiallelic->[$alleleIdx][$featureIdx][$posIdx] ) ) { - ok( - $multiallelic->[$alleleIdx][$featureIdx][$posIdx] - == $secondAllele->[$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . $vcf->features->[$featureIdx] - ); - } - else { - ok( - $multiallelic->[$alleleIdx][$featureIdx][$posIdx] eq - $secondAllele->[$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . 
$vcf->features->[$featureIdx] - ); - } - } - } -} - -$db->cleanUp(); - -done_testing(); diff --git a/perl/t/tracks/vcf/integration_scrambled_multiple_files.t b/perl/t/tracks/vcf/integration_scrambled_multiple_files.t deleted file mode 100644 index 59dc20007..000000000 --- a/perl/t/tracks/vcf/integration_scrambled_multiple_files.t +++ /dev/null @@ -1,1012 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -package MockBuilder; - -use Mouse; - -extends 'Seq::Base'; - -1; - -use Test::More; -use lib 't/lib'; -use TestUtils qw/ HaveRequiredBinary PrepareConfigWithTempdirs /; - -use Path::Tiny; -use Scalar::Util qw/looks_like_number/; - -use Seq::Tracks::Reference::Build; -use Seq::Tracks::Reference::MapBases; -use Seq::Tracks::Vcf::Build; - -# Check required binary is available -if ( !HaveRequiredBinary("bystro-vcf") ) { - plan skip_all => "Testing relies on bystro-vcf binary, which is not present"; -} - -# create temp directories -my $dir = Path::Tiny->tempdir(); - -# prepare temp directory and make test config file -my $config_file = PrepareConfigWithTempdirs( - 't/tracks/vcf/test.scrambled_multiple_files.yml', - 't/tracks/vcf/raw', [ 'database_dir', 'files_dir', 'temp_dir' ], - 'files_dir', $dir->stringify -); - -my $baseMapper = Seq::Tracks::Reference::MapBases->new(); - -my $seq = MockBuilder->new_with_config( { config => $config_file } ); - -my $tracks = $seq->tracksObj; -my $refBuilder = $tracks->getRefTrackBuilder(); -my $refGetter = $tracks->getRefTrackGetter(); - -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927888 - 1, - $baseMapper->baseMap->{'C'} -); #chr14:19792736-19792737 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927876 - 1, - $baseMapper->baseMap->{'G'} -); #chr14:19792727 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927837 - 1, - $baseMapper->baseMap->{'A'} -); #chr14:19792869-19792870 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927835 - 1, - $baseMapper->baseMap->{'G'} -); #chr14:19792857-19792858 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927834 - 1, - $baseMapper->baseMap->{'G'} -); #chr14:19792818-19792819 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927765 - 1, - $baseMapper->baseMap->{'A'} -); #chr14:19792816-19792817 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927759 - 1, - $baseMapper->baseMap->{'A'} -); #chr14:19792815-19792816 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927755 - 1, - $baseMapper->baseMap->{'T'} -); #On #chr14:19792746-19792747 #same -$refBuilder->db->dbPatch( - 'chr22', $refBuilder->dbName, - 15927745 - 1, - $baseMapper->baseMap->{'A'} -); #chr14:19792740-19792741 #same - -$refBuilder->db->cleanUp(); - -my $dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927888 - 1 ); -ok( $refGetter->get($dbVar) eq 'C' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927876 - 1 ); -ok( $refGetter->get($dbVar) eq 'G' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927837 - 1 ); -ok( $refGetter->get($dbVar) eq 'A' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927835 - 1 ); -ok( $refGetter->get($dbVar) eq 'G' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927834 - 1 ); -ok( $refGetter->get($dbVar) eq 'G' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927765 - 1 ); -ok( $refGetter->get($dbVar) eq 'A' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927759 - 1 ); -ok( $refGetter->get($dbVar) eq 'A' ); - -$dbVar = $refBuilder->db->dbReadOne( 
'chr22', 15927755 - 1 ); #on -ok( $refGetter->get($dbVar) eq 'T' ); - -$dbVar = $refBuilder->db->dbReadOne( 'chr22', 15927745 - 1 ); -ok( $refGetter->get($dbVar) eq 'A' ); - -my $vcfBuilder = $tracks->getTrackBuilderByName('gnomad.genomes.scrambled'); - -$vcfBuilder->buildTrack(); - -my $vcf = $tracks->getTrackGetterByName('gnomad.genomes.scrambled'); - -my $db = Seq::DBManager->new(); - -############### Feature tests ################ -# The vcf file contains the following items: -# Comma separated values are treated as belonging to diff alleles -# The G allele is the 2nd of the two at this site: -# C CACA,G -# Therefore, for this first test, we expect the 2nd of two values, when present -# AC=3,0; -# AF=1.81378e-04, 0.00000e+00; -# AN=16540; -# AC_AFR=0,0; -# AC_AMR=0,0; -# AC_ASJ=0,0; -# AC_EAS=0,0; -# AC_FIN=0,0; -# AC_NFE=3,0; -# AC_OTH=0,0; -# AC_Male=3,0; -# AC_Female=0,0; -# AN_AFR=3614; -# AN_AMR=528; -# AN_ASJ=180; -# AN_EAS=892; -# AN_FIN=2256; -# AN_NFE=8466; -# AN_OTH=604; -# AN_Male=9308; -# AN_Female=7232; -# AF_AFR=0.00000e+00, 0.00000e+00; -# AF_AMR=0.00000e+00, 0.00000e+00; -# AF_ASJ=0.00000e+00, 0.00000e+00; -# AF_EAS=0.00000e+00, 0.00000e+00; -# AF_FIN=0.00000e+00, 0.00000e+00; -# AF_NFE=3.54359e-04, 0.00000e+00; -# AF_OTH=0.00000e+00, 0.00000e+00; -# AF_Male=3.22303e-04, 0.00000e+00; -# AF_Female=0.00000e+00, 0.00000e+00; -# AS_FilterStatus=PASS,RF|AC0 - -# Notice that this site has an RF|AC0 value for AS_FilterStatus -# Therefore it doesn't pass - -my $href = $db->dbReadOne( 'chr22', 15927888 - 1 ); - -#my ($vcf, $href, $chr, $refBase, $allele, $alleleIdx, $positionIdx, $outAccum) = @_; -# At this position we have CACA,G alleles -my $out = []; - -# Vcf tracks are going to be treated differently from transcripts and sparse tracks -# They should return nothing for the nth position in a deletion -# Or if the allele doesn't match exactly. 
-$vcf->get( $href, 'chr22', 'C', 'G', 0, $out ); - -ok( - @{$out} == 0, - "Non PASS AS_FilterStatus causes alleles to be skipped in multiallelic (testing build_row_filters on INFO values)" -); - -# my $numFeatures = scalar @{$vcf->features}; -# ok(@{$out} == $numFeatures, "vcf array contians an entry for each requested feature"); - -# for my $feature (@{$vcf->features}) { -# my $idx = $vcf->getFieldDbName($feature); - -# ok(@{$out->[$idx]} == 1, "Every feature is considered bi-allelelic (because alleles are decomposed into bi-allelic sites, with 1 entry"); -# ok(@{$out->[$idx][0]} == 1 && !ref $out->[$idx][0], "Every feature contains a single position's worth of data, and that value is scalar"); -# } - -# We define these as lower case in our test yaml, so that is how we will look them up -# No decision is made on behalf of the user how to name these; will be taken as defined - -my $trTvIdx = $vcf->getFieldDbName('trTv'); -my $idIdx = $vcf->getFieldDbName('id'); -my $acIdx = $vcf->getFieldDbName('ac'); -my $afIdx = $vcf->getFieldDbName('af'); -my $anIdx = $vcf->getFieldDbName('an'); - -my $acAfrIdx = $vcf->getFieldDbName('ac_afr'); -my $acAmrIdx = $vcf->getFieldDbName('ac_amr'); -my $acAsjIdx = $vcf->getFieldDbName('ac_asj'); -my $acEasIdx = $vcf->getFieldDbName('ac_eas'); -my $acFinIdx = $vcf->getFieldDbName('ac_fin'); -my $acNfeIdx = $vcf->getFieldDbName('ac_nfe'); -my $acOthIdx = $vcf->getFieldDbName('ac_oth'); -my $acMaleIdx = $vcf->getFieldDbName('ac_male'); -my $acFemaleIdx = $vcf->getFieldDbName('ac_female'); - -my $anAfrIdx = $vcf->getFieldDbName('an_afr'); -my $anAmrIdx = $vcf->getFieldDbName('an_amr'); -my $anAsjIdx = $vcf->getFieldDbName('an_asj'); -my $anEasIdx = $vcf->getFieldDbName('an_eas'); -my $anFinIdx = $vcf->getFieldDbName('an_fin'); -my $anNfeIdx = $vcf->getFieldDbName('an_nfe'); -my $anOthIdx = $vcf->getFieldDbName('an_oth'); -my $anMaleIdx = $vcf->getFieldDbName('an_male'); -my $anFemaleIdx = $vcf->getFieldDbName('an_female'); - -my $afAfrIdx = $vcf->getFieldDbName('af_afr'); -my $afAmrIdx = $vcf->getFieldDbName('af_amr'); -my $afAsjIdx = $vcf->getFieldDbName('af_asj'); -my $afEasIdx = $vcf->getFieldDbName('af_eas'); -my $afFinIdx = $vcf->getFieldDbName('af_fin'); -my $afNfeIdx = $vcf->getFieldDbName('af_nfe'); -my $afOthIdx = $vcf->getFieldDbName('af_oth'); -my $afMaleIdx = $vcf->getFieldDbName('af_male'); -my $afFemaleIdx = $vcf->getFieldDbName('af_female'); - -# ok($out->[$trTvIdx][0] == 0, "indels and multiallelics have 0 trTv value"); -# ok(!defined $out->[$idIdx][0], "correctly finds that this site has no rsID"); -# ok($out->[$acIdx][0] == 0, "correctly finds first the G alleles ac value"); -# ok($out->[$afIdx][0] == 0, "correctly finds first the G alleles af value"); -# ok($out->[$anIdx][0] == 16540, "correctly finds first the G alleles an value, which has only one value across all alleles"); - -# ok($out->[$acAfrIdx][0] == 0, "correctly finds first the G alleles ac_afr value"); -# ok($out->[$acAmrIdx][0] == 0, "correctly finds first the G alleles ac_amr value"); -# ok($out->[$acAsjIdx][0] == 0, "correctly finds first the G alleles ac_asj value"); -# ok($out->[$acEasIdx][0] == 0, "correctly finds first the G alleles ac_eas value"); -# ok($out->[$acFinIdx][0] == 0, "correctly finds first the G alleles ac_fin value"); -# ok($out->[$acNfeIdx][0] == 0, "correctly finds first the G alleles ac_nfe value"); -# ok($out->[$acOthIdx][0] == 0, "correctly finds first the G alleles ac_oth value"); -# ok($out->[$acMaleIdx][0] == 0, "correctly finds first the G alleles 
ac_male value"); -# ok($out->[$acFemaleIdx][0] == 0, "correctly finds first the G alleles ac_female value"); - -# ok($out->[$anAfrIdx][0] == 3614, "correctly finds first the G alleles an_afr value, which has only one value across all alleles"); -# ok($out->[$anAmrIdx][0] == 528, "correctly finds first the G alleles an_amr value, which has only one value across all alleles"); -# ok($out->[$anAsjIdx][0] == 180, "correctly finds first the G alleles an_asj value, which has only one value across all alleles"); -# ok($out->[$anEasIdx][0] == 892, "correctly finds first the G alleles an_eas value, which has only one value across all alleles"); -# ok($out->[$anFinIdx][0] == 2256, "correctly finds first the G alleles an_fin value, which has only one value across all alleles"); -# ok($out->[$anNfeIdx][0] == 8466, "correctly finds first the G alleles an_nfe value, which has only one value across all alleles"); -# ok($out->[$anOthIdx][0] == 604, "correctly finds first the G alleles an_oth value, which has only one value across all alleles"); -# ok($out->[$anMaleIdx][0] == 9308, "correctly finds first the G alleles an_male value, which has only one value across all alleles"); -# ok($out->[$anFemaleIdx][0] == 7232, "correctly finds first the G alleles an_female value, which has only one value across all alleles"); - -# ok($out->[$afAfrIdx][0] == 0, "correctly finds first the G alleles af_afr value"); -# ok($out->[$afAmrIdx][0] == 0, "correctly finds first the G alleles af_amr value"); -# ok($out->[$afAsjIdx][0] == 0, "correctly finds first the G alleles af_asj value"); -# ok($out->[$afEasIdx][0] == 0, "correctly finds first the G alleles af_eas value"); -# ok($out->[$afFinIdx][0] == 0, "correctly finds first the G alleles af_fin value"); -# ok($out->[$afNfeIdx][0] == 0, "correctly finds first the G alleles af_nfe value"); -# ok($out->[$afOthIdx][0] == 0, "correctly finds first the G alleles af_oth value"); -# ok($out->[$afMaleIdx][0] == 0, "correctly finds first the G alleles af_male value"); -# ok($out->[$afFemaleIdx][0] == 0, "correctly finds first the G alleles af_female value"); - -############### Feature tests ################ -# The vcf file contains the following items: -# Comma separated values are treated as belonging to diff alleles -# The +ACA allele is the 1st of the two at this site: -# C CACA,G -# Therefore, for this 2nd test, we expect the 1st of two values, when present -# AC=3,0; -# AF=1.81378e-04, 0.00000e+00; -# AN=16540; -# AC_AFR=0,0; -# AC_AMR=0,0; -# AC_ASJ=0,0; -# AC_EAS=0,0; -# AC_FIN=0,0; -# AC_NFE=3,0; -# AC_OTH=0,0; -# AC_Male=3,0; -# AC_Female=0,0; -# AN_AFR=3614; -# AN_AMR=528; -# AN_ASJ=180; -# AN_EAS=892; -# AN_FIN=2256; -# AN_NFE=8466; -# AN_OTH=604; -# AN_Male=9308; -# AN_Female=7232; -# AF_AFR=0.00000e+00, 0.00000e+00; -# AF_AMR=0.00000e+00, 0.00000e+00; -# AF_ASJ=0.00000e+00, 0.00000e+00; -# AF_EAS=0.00000e+00, 0.00000e+00; -# AF_FIN=0.00000e+00, 0.00000e+00; -# AF_NFE=3.54359e-04, 0.00000e+00; -# AF_OTH=0.00000e+00, 0.00000e+00; -# AF_Male=3.22303e-04, 0.00000e+00; -# AF_Female=0.00000e+00, 0.00000e+00; - -# Vcf tracks are going to be treated differently from transcripts and sparse tracks -# They should return nothing for the nth position in a deletion -# Or if the allele doesn't match exactly. 
-$out = []; -$vcf->get( $href, 'chr22', 'C', '+ACA', 0, $out ); - -my $numFeatures = scalar @{ $vcf->features }; -ok( @{$out} == $numFeatures, - "vcf array contains an entry for each requested feature" ); - -for my $feature ( @{ $vcf->features } ) { - my $idx = $vcf->getFieldDbName($feature); - - ok( - @{ $out->[$idx] } == 1, - "Every feature is considered bi-allelelic (because alleles are decomposed into bi-allelic sites, with 1 entry" - ); - ok( - !ref $out->[$idx][0], - "Every feature contains a single position's worth of data, and that value is scalar" - ); -} - -$out = []; -$vcf->get( $href, 'chr22', 'C', '+ACA', 1, $out ); -ok( - @$out == 0, - "Vcf getter does not tile annotations by the position of the allele. Only accepts position index 0" -); - -$out = []; -$vcf->get( $href, 'chr22', 'C', '+ACA', 0, $out ); - -# # Although we've deprecated single-line multiallelics, we must continue to support them -# # Until we've decided to drop support -# # Note, taht is for some reason a person wanted to fetch only one of the two alleles -# # the site would appear as a [feature1, featuer2] rather than [[feature1_allele1...], [feature1_allele2...]] -# for my $feature (@{$vcf->features}) { -# my $idx = $vcf->getFieldDbName($feature); - -# ok(@{$out->[$idx]} == 2, "For multiallelic sites, where both alleles on a single input row, each feature is given an array of length == \# of alleles"); -# } - -ok( $out->[$trTvIdx][0] == 0, "indels and multiallelics have 0 trTv value" ); -ok( !defined $out->[$idIdx][0], "correctly finds that this site has no rsID" ); -ok( $out->[$acIdx][0] == 3, "correctly finds first the +ACA allele ac value" ); -ok( - $out->[$afIdx][0] == unpack( 'f', pack( 'f', 1.81378e-04 ) ), - "correctly finds first the +ACA allele af value" -); -ok( - $out->[$anIdx][0] == 16540, - "correctly finds first the +ACA allele an value, which has only one value across all alleles" -); - -ok( $out->[$acAfrIdx][0] == 0, - "correctly finds first the +ACA allele ac_afr value" ); -ok( $out->[$acAmrIdx][0] == 0, - "correctly finds first the +ACA allele ac_amr value" ); -ok( $out->[$acAsjIdx][0] == 0, - "correctly finds first the +ACA allele ac_asj value" ); -ok( $out->[$acEasIdx][0] == 0, - "correctly finds first the +ACA allele ac_eas value" ); -ok( $out->[$acFinIdx][0] == 0, - "correctly finds first the +ACA allele ac_fin value" ); -ok( $out->[$acNfeIdx][0] == 3, - "correctly finds first the +ACA allele ac_nfe value" ); -ok( $out->[$acOthIdx][0] == 0, - "correctly finds first the +ACA allele ac_oth value" ); -ok( $out->[$acMaleIdx][0] == 3, - "correctly finds first the +ACA allele ac_male value" ); -ok( $out->[$acFemaleIdx][0] == 0, - "correctly finds first the +ACA allele ac_female value" ); - -ok( - $out->[$anAfrIdx][0] == 3614, - "correctly finds first the +ACA allele an_afr value, which has only one value across all alleles" -); -ok( - $out->[$anAmrIdx][0] == 528, - "correctly finds first the +ACA allele an_amr value, which has only one value across all alleles" -); -ok( - $out->[$anAsjIdx][0] == 180, - "correctly finds first the +ACA allele an_asj value, which has only one value across all alleles" -); -ok( - $out->[$anEasIdx][0] == 892, - "correctly finds first the +ACA allele an_eas value, which has only one value across all alleles" -); -ok( - $out->[$anFinIdx][0] == 2256, - "correctly finds first the +ACA allele an_fin value, which has only one value across all alleles" -); -ok( - $out->[$anNfeIdx][0] == 8466, - "correctly finds first the +ACA allele an_nfe value, which has only one value 
across all alleles" -); -ok( - $out->[$anOthIdx][0] == 604, - "correctly finds first the +ACA allele an_oth value, which has only one value across all alleles" -); -ok( - $out->[$anMaleIdx][0] == 9308, - "correctly finds first the +ACA allele an_male value, which has only one value across all alleles" -); -ok( - $out->[$anFemaleIdx][0] == 7232, - "correctly finds first the +ACA allele an_female value, which has only one value across all alleles" -); - -ok( $out->[$afAfrIdx][0] == 0, - "correctly finds first the +ACA allele af_afr value" ); -ok( $out->[$afAmrIdx][0] == 0, - "correctly finds first the +ACA allele af_amr value" ); -ok( $out->[$afAsjIdx][0] == 0, - "correctly finds first the +ACA allele af_asj value" ); -ok( $out->[$afEasIdx][0] == 0, - "correctly finds first the +ACA allele af_eas value" ); -ok( $out->[$afFinIdx][0] == 0, - "correctly finds first the +ACA allele af_fin value" ); -ok( - $out->[$afNfeIdx][0] == unpack( 'f', pack( 'f', 3.54359e-04 ) ), - "correctly finds first the +ACA allele af_nfe value" -); -ok( $out->[$afOthIdx][0] == 0, - "correctly finds first the +ACA allele af_oth value" ); -ok( - $out->[$afMaleIdx][0] == unpack( 'f', pack( 'f', 3.22303e-04 ) ), - "correctly finds first the +ACA allele af_male value" -); -ok( $out->[$afFemaleIdx][0] == 0, - "correctly finds first the +ACA allele af_female value" ); - -##### TESTING VARIANTS THAT AREN'T PASS OR . -# chr22 15927755 . T G 296.53 NON_PASS -$out = []; -$href = $db->dbReadOne( 'chr22', 15927755 - 1 ); -$vcf->get( $href, 'chr22', 'T', 'G', 0, $out ); - -ok( @$out == 0, 'NON PASS/. variants are skipped' ); - -# Next let's check variants that are bi-allelic -# chr22 15927745 . A C 718.20 PASS -# AC=2; -# AF=6.93049e-05; -# AN=28858; -# AC_AFR=0; -# AC_AMR=0; -# AC_ASJ=0; -# AC_EAS=0; -# AC_FIN=0; -# AC_NFE=2; -# AC_OTH=0; -# AC_Male=1; -# AC_Female=1; -# AN_AFR=8454; -# AN_AMR=782; -# AN_ASJ=232; -# AN_EAS=1606; -# AN_FIN=3132; -# AN_NFE=13774; -# AN_OTH=878; -# AN_Male=15900; -# AN_Female=12958; -# AF_AFR=0.00000e+00; -# AF_AMR=0.00000e+00; -# AF_ASJ=0.00000e+00; -# AF_EAS=0.00000e+00; -# AF_FIN=0.00000e+00; -# AF_NFE=1.45201e-04; -# AF_OTH=0.00000e+00; -# AF_Male=6.28931e-05; -# AF_Female=7.71724e-05; - -$out = []; -$href = $db->dbReadOne( 'chr22', 15927745 - 1 ); -$vcf->get( $href, 'chr22', 'A', 'C', 0, $out ); - -ok( $out->[$trTvIdx][0] == 2, "A->C is a transversion, so given value of 2" ); -ok( !defined $out->[$idIdx][0], "correctly finds that this site has no rsID" ); -ok( $out->[$acIdx][0] == 2, "correctly finds the ac value for a biallelic site" ); -ok( - $out->[$afIdx][0] == unpack( 'f', pack( 'f', 6.93049e-05 ) ), - "correctly finds the af value for a biallelic site" -); -ok( $out->[$anIdx][0] == 28858, - "correctly finds the an value for a biallelic site" ); - -ok( $out->[$acAfrIdx][0] == 0, "correctly finds the C allele ac_afr value" ); -ok( $out->[$acAmrIdx][0] == 0, "correctly finds C allele allele ac_amr value" ); -ok( $out->[$acAsjIdx][0] == 0, "correctly finds C allele allele ac_asj value" ); -ok( $out->[$acEasIdx][0] == 0, "correctly finds C allele allele ac_eas value" ); -ok( $out->[$acFinIdx][0] == 0, "correctly finds C allele allele ac_fin value" ); -ok( $out->[$acNfeIdx][0] == 2, "correctly finds C allele allele ac_nfe value" ); -ok( $out->[$acOthIdx][0] == 0, "correctly finds C allele allele ac_oth value" ); -ok( $out->[$acMaleIdx][0] == 1, "correctly finds C allele allele ac_male value" ); -ok( $out->[$acFemaleIdx][0] == 1, - "correctly finds C allele allele ac_female value" ); - -ok( - 
$out->[$anAfrIdx][0] == 8454, - "correctly finds the C allele an_afr value, which has only one value across all alleles" -); -ok( - $out->[$anAmrIdx][0] == 782, - "correctly finds the C allele an_amr value, which has only one value across all alleles" -); -ok( - $out->[$anAsjIdx][0] == 232, - "correctly finds the C allele an_asj value, which has only one value across all alleles" -); -ok( - $out->[$anEasIdx][0] == 1606, - "correctly finds the C allele an_eas value, which has only one value across all alleles" -); -ok( - $out->[$anFinIdx][0] == 3132, - "correctly finds the C allele an_fin value, which has only one value across all alleles" -); -ok( - $out->[$anNfeIdx][0] == 13774, - "correctly finds the C allele an_nfe value, which has only one value across all alleles" -); -ok( - $out->[$anOthIdx][0] == 878, - "correctly finds the C allele an_oth value, which has only one value across all alleles" -); -ok( - $out->[$anMaleIdx][0] == 15900, - "correctly finds the C allele an_male value, which has only one value across all alleles" -); -ok( - $out->[$anFemaleIdx][0] == 12958, - "correctly finds the C allele an_female value, which has only one value across all alleles" -); - -ok( $out->[$afAfrIdx][0] == 0, "correctly finds the C allele af_afr value" ); -ok( $out->[$afAmrIdx][0] == 0, "correctly finds the C allele af_amr value" ); -ok( $out->[$afAsjIdx][0] == 0, "correctly finds the C allele af_asj value" ); -ok( $out->[$afEasIdx][0] == 0, "correctly finds the C allele af_eas value" ); -ok( $out->[$afFinIdx][0] == 0, "correctly finds the C allele af_fin value" ); -ok( $out->[$afNfeIdx][0] == unpack( 'f', pack( 'f', 1.45201e-04 ) ), - "correctly finds the C allele af_nfe value" ); -ok( $out->[$afOthIdx][0] == 0, "correctly finds the C allele af_oth value" ); -ok( $out->[$afMaleIdx][0] == unpack( 'f', pack( 'f', 6.28931e-05 ) ), - "correctly finds the C allele af_male value" ); -ok( $out->[$afFemaleIdx][0] == unpack( 'f', pack( 'f', 7.71724e-05 ) ), - "correctly finds the C allele af_female value" ); - -$out = []; - -$href = $db->dbReadOne( 'chr22', 15927745 - 1 ); -$vcf->get( $href, 'chr22', 'A', 'T', 0, $out ); - -ok( @$out == 0, "Alelleles that don't match are skipped" ); - -# Next let's check a variant that has 2 non-reference alleles, but which came from different lines -# The order of the lines doesn't matter -# TODO: write explicit test for order not mattering -# The first one - -# chr22 15927834 . G T 183.64 PASS -# AC=1; -# AF=4.21905e-05; -# AN=23702; -# AC_AFR=1; -# AC_AMR=0; -# AC_ASJ=0; -# AC_EAS=0; -# AC_FIN=0; -# AC_NFE=0; -# AC_OTH=0; -# AC_Male=1; -# AC_Female=0; -# AN_AFR=6452; -# AN_AMR=638; -# AN_ASJ=222; -# AN_EAS=1398; -# AN_FIN=2270; -# AN_NFE=12010; -# AN_OTH=712; -# AN_Male=13204; -# AN_Female=10498; -# AF_AFR=1.54991e-04; -# AF_AMR=0.00000e+00; -# AF_ASJ=0.00000e+00; -# AF_EAS=0.00000e+00; -# AF_FIN=0.00000e+00; -# AF_NFE=0.00000e+00; -# AF_OTH=0.00000e+00; -# AF_Male=7.57346e-05; -# AF_Female=0.00000e+00; - -$out = []; -my $firstAllele = []; - -$href = $db->dbReadOne( 'chr22', 15927834 - 1 ); -$vcf->get( $href, 'chr22', 'G', 'T', 0, $firstAllele ); - -ok( - $firstAllele->[$trTvIdx][0] == 2, "trTv is 0 for multiallelics. - However, when our vcf parser is passed 2 alleles at the same position, but on different source file lines, it treats that as a SNP. 
- Therefore it calls it as a G->T transversion, or 2" -); -ok( !defined $firstAllele->[$idIdx][0], - "correctly finds that this site has no rsID" ); -ok( $firstAllele->[$acIdx][0] == 1, - "correctly finds the ac value for a biallelic site" ); -ok( $firstAllele->[$afIdx][0] == unpack( 'f', pack( 'f', 4.21905e-05 ) ), - "correctly finds the af value for a biallelic site" ); -ok( $firstAllele->[$anIdx][0] == 23702, - "correctly finds the an value for a biallelic site" ); - -ok( $firstAllele->[$acAfrIdx][0] == 1, "correctly finds the T allele ac_afr value" ); -ok( $firstAllele->[$acAmrIdx][0] == 0, - "correctly finds T allele allele ac_amr value" ); -ok( $firstAllele->[$acAsjIdx][0] == 0, - "correctly finds T allele allele ac_asj value" ); -ok( $firstAllele->[$acEasIdx][0] == 0, - "correctly finds T allele allele ac_eas value" ); -ok( $firstAllele->[$acFinIdx][0] == 0, - "correctly finds T allele allele ac_fin value" ); -ok( $firstAllele->[$acNfeIdx][0] == 0, - "correctly finds T allele allele ac_nfe value" ); -ok( $firstAllele->[$acOthIdx][0] == 0, - "correctly finds T allele allele ac_oth value" ); -ok( $firstAllele->[$acMaleIdx][0] == 1, - "correctly finds T allele allele ac_male value" ); -ok( $firstAllele->[$acFemaleIdx][0] == 0, - "correctly finds T allele allele ac_female value" ); - -ok( - $firstAllele->[$anAfrIdx][0] == 6452, - "correctly finds the T allele an_afr value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anAmrIdx][0] == 638, - "correctly finds the T allele an_amr value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anAsjIdx][0] == 222, - "correctly finds the T allele an_asj value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anEasIdx][0] == 1398, - "correctly finds the T allele an_eas value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anFinIdx][0] == 2270, - "correctly finds the T allele an_fin value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anNfeIdx][0] == 12010, - "correctly finds the T allele an_nfe value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anOthIdx][0] == 712, - "correctly finds the T allele an_oth value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anMaleIdx][0] == 13204, - "correctly finds the T allele an_male value, which has only one value across all alleles" -); -ok( - $firstAllele->[$anFemaleIdx][0] == 10498, - "correctly finds the T allele an_female value, which has only one value across all alleles" -); - -ok( $firstAllele->[$afAfrIdx][0] == unpack( 'f', pack( 'f', 1.54991e-04 ) ), - "correctly finds the T allele af_afr value" ); -ok( $firstAllele->[$afAmrIdx][0] == 0, "correctly finds the T allele af_amr value" ); -ok( $firstAllele->[$afAsjIdx][0] == 0, "correctly finds the T allele af_asj value" ); -ok( $firstAllele->[$afEasIdx][0] == 0, "correctly finds the T allele af_eas value" ); -ok( $firstAllele->[$afFinIdx][0] == 0, "correctly finds the T allele af_fin value" ); -ok( $firstAllele->[$afNfeIdx][0] == 0, "correctly finds the T allele af_nfe value" ); -ok( $firstAllele->[$afAsjIdx][0] == 0, "correctly finds the T allele af_asj value" ); -ok( $firstAllele->[$afOthIdx][0] == 0, "correctly finds the T allele af_oth value" ); -ok( $firstAllele->[$afMaleIdx][0] == unpack( 'f', pack( 'f', 7.57346e-05 ) ), - "correctly finds the T allele af_male value" ); -ok( - $firstAllele->[$afFemaleIdx][0] == 0, - "correctly finds the T allele af_female value" -); - -# The 2nd 
one: -# chr22 15927834 rs199856444 G C 1458410.68 PASS -# AC=5232; -# AF=2.00721e-01; -# AN=26066; -# AC_AFR=1195; -# AC_AMR=199; -# AC_ASJ=48; -# AC_EAS=462; -# AC_FIN=539; -# AC_NFE=2634; -# AC_OTH=155; -# AC_Male=2860; -# AC_Female=2372; -# AN_AFR=7838; -# AN_AMR=630; -# AN_ASJ=216; -# AN_EAS=1372; -# AN_FIN=2596; -# AN_NFE=12638; -# AN_OTH=776; -# AN_Male=14358; -# AN_Female=11708; -# AF_AFR=1.52462e-01; -# AF_AMR=3.15873e-01; -# AF_ASJ=2.22222e-01; -# AF_EAS=3.36735e-01; -# AF_FIN=2.07627e-01; -# AF_NFE=2.08419e-01; -# AF_OTH=1.99742e-01; -# AF_Male=1.99192e-01; -# AF_Female=2.02597e-01; - -my $secondAllele = []; -$href = $db->dbReadOne( 'chr22', 15927834 - 1 ); -$vcf->get( $href, 'chr22', 'G', 'C', 0, $secondAllele ); - -ok( - $secondAllele->[$trTvIdx][0] == 2, "trTv is 0 for multiallelics. \ - However, when our vcf parser is passed 2 alleles at the same position, but on different source file lines, it treats that as a SNP. - Therefore it calls it as a G->C transversion, or 2" -); -ok( - $secondAllele->[$idIdx][0] eq 'rs199856444', - "correctly finds that this site has an rsID" -); -ok( $secondAllele->[$acIdx][0] == 5232, - "correctly finds the ac value for a biallelic site" ); -ok( $secondAllele->[$afIdx][0] == unpack( 'f', pack( 'f', 2.00721e-01 ) ), - "correctly finds the af value for a biallelic site" ); -ok( $secondAllele->[$anIdx][0] == 26066, - "correctly finds the an value for a biallelic site" ); - -ok( - $secondAllele->[$acAfrIdx][0] == 1195, - "correctly finds the C allele ac_afr value" -); -ok( - $secondAllele->[$acAmrIdx][0] == 199, - "correctly finds C allele allele ac_amr value" -); -ok( $secondAllele->[$acAsjIdx][0] == 48, - "correctly finds C allele allele ac_asj value" ); -ok( - $secondAllele->[$acEasIdx][0] == 462, - "correctly finds C allele allele ac_eas value" -); -ok( - $secondAllele->[$acFinIdx][0] == 539, - "correctly finds C allele allele ac_fin value" -); -ok( - $secondAllele->[$acNfeIdx][0] == 2634, - "correctly finds C allele allele ac_nfe value" -); -ok( - $secondAllele->[$acOthIdx][0] == 155, - "correctly finds C allele allele ac_oth value" -); -ok( - $secondAllele->[$acMaleIdx][0] == 2860, - "correctly finds C allele allele ac_male value" -); -ok( - $secondAllele->[$acFemaleIdx][0] == 2372, - "correctly finds C allele allele ac_female value" -); - -ok( - $secondAllele->[$anAfrIdx][0] == 7838, - "correctly finds the C allele an_afr value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anAmrIdx][0] == 630, - "correctly finds the C allele an_amr value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anAsjIdx][0] == 216, - "correctly finds the C allele an_asj value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anEasIdx][0] == 1372, - "correctly finds the C allele an_eas value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anFinIdx][0] == 2596, - "correctly finds the C allele an_fin value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anNfeIdx][0] == 12638, - "correctly finds the C allele an_nfe value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anOthIdx][0] == 776, - "correctly finds the C allele an_oth value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anMaleIdx][0] == 14358, - "correctly finds the C allele an_male value, which has only one value across all alleles" -); -ok( - $secondAllele->[$anFemaleIdx][0] == 11708, - "correctly finds the C allele an_female value, 
which has only one value across all alleles" -); - -ok( $secondAllele->[$afAfrIdx][0] == unpack( 'f', pack( 'f', 1.52462e-01 ) ), - "correctly finds the C allele af_afr value" ); -ok( $secondAllele->[$afAmrIdx][0] == unpack( 'f', pack( 'f', 3.15873e-01 ) ), - "correctly finds the C allele af_amr value" ); -ok( $secondAllele->[$afAsjIdx][0] == unpack( 'f', pack( 'f', 2.22222e-01 ) ), - "correctly finds the C allele af_asj value" ); -ok( $secondAllele->[$afEasIdx][0] == unpack( 'f', pack( 'f', 3.36735e-01 ) ), - "correctly finds the C allele af_eas value" ); -ok( $secondAllele->[$afFinIdx][0] == unpack( 'f', pack( 'f', 2.07627e-01 ) ), - "correctly finds the C allele af_fin value" ); -ok( $secondAllele->[$afNfeIdx][0] == unpack( 'f', pack( 'f', 2.08419e-01 ) ), - "correctly finds the C allele af_nfe value" ); -ok( $secondAllele->[$afOthIdx][0] == unpack( 'f', pack( 'f', 1.99742e-01 ) ), - "correctly finds the C allele af_oth value" ); -ok( $secondAllele->[$afMaleIdx][0] == unpack( 'f', pack( 'f', 1.99192e-01 ) ), - "correctly finds the C allele af_male value" ); -ok( $secondAllele->[$afFemaleIdx][0] == unpack( 'f', pack( 'f', 2.02597e-01 ) ), - "correctly finds the C allele af_female value" ); - -# this allele is found in the 2nd file as well -# this tests whether concurrency is broken by having 2 updates issued from one process -# and only 1 from another process, at roughly the same time -my $thirdAllele = []; -$href = $db->dbReadOne( 'chr22', 15927834 - 1 ); -$vcf->get( $href, 'chr22', 'G', 'A', 0, $thirdAllele ); - -ok( - $thirdAllele->[$trTvIdx][0] == 1, "trTv is 0 for multiallelics. \ - However, when our vcf parser is passed 2 alleles at the same position, but on different source file lines, it treats that as a SNP. - Therefore it calls it as a G->A transition, or 1" -); -ok( $thirdAllele->[$idIdx][0] eq 'rsFake', "correctly finds the rsid" ); -ok( $thirdAllele->[$acIdx][0] == 5232, - "correctly finds the ac value for a biallelic site" ); -ok( $thirdAllele->[$afIdx][0] == unpack( 'f', pack( 'f', 2.00721e-01 ) ), - "correctly finds the af value for a biallelic site" ); -ok( $thirdAllele->[$anIdx][0] == 26066, - "correctly finds the an value for a biallelic site" ); - -ok( - $thirdAllele->[$acAfrIdx][0] == 1195, - "correctly finds the C allele ac_afr value" -); -ok( $thirdAllele->[$acAmrIdx][0] == 199, - "correctly finds C allele allele ac_amr value" ); -ok( $thirdAllele->[$acAsjIdx][0] == 48, - "correctly finds C allele allele ac_asj value" ); -ok( $thirdAllele->[$acEasIdx][0] == 462, - "correctly finds C allele allele ac_eas value" ); -ok( $thirdAllele->[$acFinIdx][0] == 539, - "correctly finds C allele allele ac_fin value" ); -ok( - $thirdAllele->[$acNfeIdx][0] == 2634, - "correctly finds C allele allele ac_nfe value" -); -ok( $thirdAllele->[$acOthIdx][0] == 155, - "correctly finds C allele allele ac_oth value" ); -ok( - $thirdAllele->[$acMaleIdx][0] == 2860, - "correctly finds C allele allele ac_male value" -); -ok( - $thirdAllele->[$acFemaleIdx][0] == 2372, - "correctly finds C allele allele ac_female value" -); - -ok( - $thirdAllele->[$anAfrIdx][0] == 7838, - "correctly finds the C allele an_afr value, which has only one value across all alleles" -); -ok( - $thirdAllele->[$anAmrIdx][0] == 630, - "correctly finds the C allele an_amr value, which has only one value across all alleles" -); -ok( - $thirdAllele->[$anAsjIdx][0] == 216, - "correctly finds the C allele an_asj value, which has only one value across all alleles" -); -ok( - $thirdAllele->[$anEasIdx][0] == 1372, - "correctly 
finds the C allele an_eas value, which has only one value across all alleles" -); -ok( - $thirdAllele->[$anFinIdx][0] == 2596, - "correctly finds the C allele an_fin value, which has only one value across all alleles" -); -ok( - $thirdAllele->[$anNfeIdx][0] == 12638, - "correctly finds the C allele an_nfe value, which has only one value across all alleles" -); -ok( - $thirdAllele->[$anOthIdx][0] == 776, - "correctly finds the C allele an_oth value, which has only one value across all alleles" -); -ok( - $thirdAllele->[$anMaleIdx][0] == 14358, - "correctly finds the C allele an_male value, which has only one value across all alleles" -); -ok( - $thirdAllele->[$anFemaleIdx][0] == 11708, - "correctly finds the C allele an_female value, which has only one value across all alleles" -); - -ok( $thirdAllele->[$afAfrIdx][0] == unpack( 'f', pack( 'f', 1.52462e-01 ) ), - "correctly finds the C allele af_afr value" ); -ok( $thirdAllele->[$afAmrIdx][0] == unpack( 'f', pack( 'f', 3.15873e-01 ) ), - "correctly finds the C allele af_amr value" ); -ok( $thirdAllele->[$afAsjIdx][0] == unpack( 'f', pack( 'f', 2.22222e-01 ) ), - "correctly finds the C allele af_asj value" ); -ok( $thirdAllele->[$afEasIdx][0] == unpack( 'f', pack( 'f', 3.36735e-01 ) ), - "correctly finds the C allele af_eas value" ); -ok( $thirdAllele->[$afFinIdx][0] == unpack( 'f', pack( 'f', 2.07627e-01 ) ), - "correctly finds the C allele af_fin value" ); -ok( $thirdAllele->[$afNfeIdx][0] == unpack( 'f', pack( 'f', 2.08419e-01 ) ), - "correctly finds the C allele af_nfe value" ); -ok( $thirdAllele->[$afOthIdx][0] == unpack( 'f', pack( 'f', 1.99742e-01 ) ), - "correctly finds the C allele af_oth value" ); -ok( $thirdAllele->[$afMaleIdx][0] == unpack( 'f', pack( 'f', 1.99192e-01 ) ), - "correctly finds the C allele af_male value" ); -ok( $thirdAllele->[$afFemaleIdx][0] == unpack( 'f', pack( 'f', 2.02597e-01 ) ), - "correctly finds the C allele af_female value" ); - -# Let's see what happens if a user wants to show both alleles on the same line -my $multiallelic = [ [], [] ]; - -$href = $db->dbReadOne( 'chr22', 15927834 - 1 ); - -$vcf->get( $href, 'chr22', 'G', 'T', 0, $multiallelic->[0] ); - -$vcf->get( $href, 'chr22', 'G', 'C', 0, $multiallelic->[1] ); - -for my $alleleIdx ( 0 .. $#$multiallelic ) { - for my $feature ( @{ $vcf->features } ) { - my $featureIdx = $vcf->getFieldDbName($feature); - my $posIdx = 0; - - if ( $alleleIdx == 0 ) { - if ( !defined $multiallelic->[$alleleIdx][$featureIdx][$posIdx] ) { - ok( - !defined $firstAllele->[$alleleIdx][$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . $vcf->features->[$featureIdx] - ); - } - elsif ( looks_like_number( $multiallelic->[$featureIdx][$alleleIdx][$posIdx] ) ) { - ok( - $multiallelic->[$alleleIdx][$featureIdx][$posIdx] - == $firstAllele->[$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . $vcf->features->[$featureIdx] - ); - } - else { - ok( - $multiallelic->[$alleleIdx][$featureIdx][$posIdx] eq - $firstAllele->[$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . $vcf->features->[$featureIdx] - ); - } - - next; - } - - if ( $alleleIdx == 1 ) { - if ( !defined $multiallelic->[$alleleIdx][$featureIdx][$posIdx] ) { - ok( - !defined $secondAllele->[$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . 
$vcf->features->[$featureIdx] - ); - } - elsif ( looks_like_number( $multiallelic->[$alleleIdx][$featureIdx][$posIdx] ) ) { - ok( - $multiallelic->[$alleleIdx][$featureIdx][$posIdx] - == $secondAllele->[$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . $vcf->features->[$featureIdx] - ); - } - else { - ok( - $multiallelic->[$alleleIdx][$featureIdx][$posIdx] eq - $secondAllele->[$featureIdx][$posIdx], - "multiallelics are reproduced just like bi-allelics, but on single line for feature " - . $vcf->features->[$featureIdx] - ); - } - } - } -} - -$db->cleanUp(); -done_testing(); diff --git a/perl/t/tracks/vcf/raw/clinvar.match/clinvar_alleles.single.b38.vcf.1klines.gz b/perl/t/tracks/vcf/raw/clinvar.match/clinvar_alleles.single.b38.vcf.1klines.gz deleted file mode 100644 index 4d72e3ca41dbfc4e43db298beb18c2ef5f33eced..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 62902 zcmV)oK%BoHiwFQL#(`S^1MGcUQya;a?z8vLm>n_ap*M8iFZQ7$)EHxCV_*P#rk@mQ zEE~m<1tZ~}VS*ojR@QZ8R+h+;g}JCEdUkgusijg?e(Qeu``^DzrsFSH^KYY@olm>( z{u0~tZ-4*$;_CbSV!U%Poy`6k&0*o-=-u(o|2=rSv-|$^_`^SrzJL33Jl~=JIKRCf z@0{JtC$q2rcspKP%qQ13ldIXzzhwLV>%ZS@b=)P?S4vmX}h0-Cz15Av+!es!^6G9y|jC-_6Fbl$A{mK4^zH8+vtDz{r;!3 z;cmi*^No=YryQJoqyOO#$A^2n9}fqo!`9vmI)4i1NB z2m40{@8CbvNJl@HKzew5`gZShFXbIB{d~Z52IudO_xE6X3h${l`$P^8{<(Mf$FLKv z#%FwzAK{Kq&j$xbT_11!gVFo_9lmz|DE(gZ{XVH=8nF+b?tMDg`;Q^;%JYvYyx-fu zgj9#468&88v<7GA!@W;~!;gdWB%1yQ8mwq|_VM=*2j}N|rzyYL|3JW$s`U=PxcQ;x z9fBM=I8VL1d~=h?$%litsWh^^K8b^)_j{)Y=NWWA9KYR5A=UUya98`l1>OxEM@tg^ zXnVjlPTlKG52T`1D}^AO(&2M*qYA2|7~B zL(cXEKLrkYmow3W|K2H+OkCIqXuam$DbPhdgK*4vmN@cV;p`-g-O#w6NCDQ!ID zoSNWx8&NX6w~QJgX1hMU#Oel$i)|Kfqs?u_m!by(=TP&O=gaUmK_rQNDf|U*qoVP; zi!a60jM6R7jmY{ZPy_e~-$L=F5>sQwhDS92Qm*he3U4*O4ZQ^loK}Zyw0Nh&+oo*z zPV;TldfVU$N59nHrc8q+7St4aAX+KXLXCOUfK5lY5&S^3Bn|dbyn6h20cbS-+ys6g z@CzExt$4iNHraBJ7{zY^&rHrN;crTAq@)5dB9#oGYg zEeFZlCb^A&Da9{Kf(j7-QtnYBT8ij48Eg~r=O*#v6~;7PR<=!grgzt?qDWfb}aN}?h6;S|4gT4{eI~|7!L*EFxZi5LGPXfIm0sJxk8!5%R zSQ{6`v_N%kp(XIHrpi^^7sI<`ixj||R=698KUMsVAaYyoi+H>_wqQ<@aJ&~F6()S^RT_Rm3Ej-CmQ@1ASbf)0W23w}#{mqc%yE4a-#I`B6# z(vTzhiPq>ub6{a{S`wWJrd!^�ms!<0c&IguvVzZ@Z9hp5P*Jm4#z;P>}ZVZ&ZjD zn1(IV2$2F=6vc+Xycc9?{E=dAuT@+|N`p5M5k2zKphpEtPn;)VEd}hXD8!^eH?C;I zM94Q;bWd@IewU6y+^+e zZUZx!IQIa<-8b{%hYs652J0*39}*6@M;vV1>JZ=+!qg1<4tiZi!1cRnTbCF zt|{IW*N!;0iI{2ur^a6gmVWe!V{kGRM~lGX%{j#4Pn_UQ@l(L@$z${V?7xR$oM@G;cbee#s9jv=n%}K z6qkDfqlLAYr_!?nY+Q5X1lx%=&Z0SH(bTpe82fTS@roWg;n~8*#5Dx#IT}n0SHU-K zNSqxS&q6oxlc_xWJbqNbpz&?}_$J>rj~^A!?ou-T#5MCa#h*Bq!BobdxHS0dWSp^b z%nqG#9DgnN6o{60NASaJ@nL`=fH$=mo}R}{#L>b%t6s!y6)rq0yy22BUW>VZlk`pj zsNQi(o|ztJ7{WtKi>|`4yaQ`9eox*b;4Sxr%{9v6PBekFL(m~xj*919h-(urjk zaprGokqF+VMl`0lapi4_KLY&3ZTzH2?7OfH&uwu~u-_$aWPO<*RC264xmv*)D%Nsv+YUk5vJ{iq! 
z-rzK-9RX&i;crfNKe;3*FbaW1D^A~>!NOU#prjOvzo`1ge+c$Y;(ayoyK9z6s zOrf2l4@2VAKf29dUu-M3|Jxw{)5*@!iTe|AkNoxZ_StyO=wLB>6V2?2(RuxCvRI5~ zi}CQU;&(*TYIc4f&n91I@jsR!-km@5O?9@*4tdkafPuAm_Da1u&mw{A;y>scFM(s{ zVm`SU!`%+A=U0DCK94_dZ!adZuha2x@%_(llN)e-UQex!h)T}-sEBZov-8hx9Mayev^gjcRq|i?~iA%cLkxK z+3~yM4+kG!E2=0{cgOQ)ezl~-?cu)=t zGQ@&AZ!sWb>pzHsyMqrOJ{}#v-Ghg60B`LN_o4qdIDbF4Rf)G>>0UfzTDEw1Ehfy7c4y@PE$uByJ6K$|K5ulHnK%7Vuh!dj|pab@Q zD|7~nYi)<){~9J?C#Up!aSQ0V{dVrz%=GpkjV+u@h1`S8QlbbRrB3Lk(Y3P*o7-rhbL&2MI4Hw@p8N7I|j-O+qJJb~-0 z*=RcX-|^>aYYbXzrH}AAySf=JuE!31xr8X&dCT7J<*e^V|Y59@EjSc9|hvV;* ziotaBWs!upg4pXX^U=5Q;&Srk<{rrP>fG~E=ja^>3TX|4|M+Q|9hWu}V4-HmB?gsT z1ymvgl?g$mLQt6$P-(m0ZWMi|Os85!Ii=nzdRZ`f1P}>HqvjE;*C5I-x_aBFXC84z zS?Fz}p|?#XfJkV%#ItZqrB1ewa#Fp0lxQFEjj)e+i409M@v<8tLt|v{@-|dJNz>R9aC@bx96fArUVhBxZ|<3?%L{_pbR)glxS8NqNx;B!VjGfy6~kJ&}1Pb&x2qEvD$1ihmh%r4Hnn!_z0+^qT}Q20`!*AWHwyDkMt#)P)^u8ep}CN=iNF)KN}s}Kdh2)pH!aFXg?MA1O#U`WlsE*Ci=nQkt*Uso19 zcMm0h5|p%W-={Z|YmAG<_BPlza9qDkCl@!1^=fU*!~y0`_uK!uTY|G-N2?F*LBJPy z*zVQ~Mzuxs7L|9U(#K4JOICR5_5)nPdX3Bcq6;pyW}0}oWZH5JMfgFC_~@g;+>bsX zc7NNCqYscrB|tC5@7k-q zdpVhYo{wk$X90_I(EZ+DEv_AqhNIc%;VxigaGv2_D@x+{cf zaXma6&;OcSj2FW?%(cD!aW?r2e);6)c6jw=2sphMeZ6Xv%-yKmwcb)`NRc3H-uhAb0!fZc850WOlJPcFx4EWpv}W%Q5A=(F+lkmxva3rw_X!>N(v)x*XX zRMYaHa~m}h8Mo-5^Ns;7MnUjl4+Ga7uIaW2n~oyu9@F&41>n7wtuOPs*|~)vyn7l3uW*c|Ay+eBF3m%M zATW(f2pfd9uUoI!O>xnM-6&2K_DQ+^7g7ZrCXdsWd;}hpN|39-{I)0`fc=}oEvntG z0Tc79Ss&|bLH5@DaE@UzvMUW(>rx-bbit0Knw2DKhS~Z(j{4; zta-}>@ej11G(j*jruN&y6Z_L!tj)FsMK{+9vnp_{ zNOM<5?`+8NnYpW-E?S`|gcX6GMWo0AMO%%m!Q$H7lN8dR)Ya4S1Y;5G;ruSkakh#( zWfRoN!F2k5^7V3hd*VJaha>e;G|XjcC8s=7D?^S0>ov#8FS<%$9Y|qDLn*96DV&fZ zz^NFtvyvGl@?e_gu*oU)Vz?K>z8H2bd}38;*?I|z@}gtjjPTMfB`}FFZ$^g6Wg;QU zFRW;luP%_GWUALDr_|?`b(4?B7dK_ma|0wVE>}ivQBoE^0}t0Wf|YEcS5)eY(u zKLaqlh5(tflcE5iGNH3GCE<*z<=%glteNJd(lyhc?DAZ{uQLI058~@wMS;x#n&;Yx z2T@=jOmST)0=U+gpF_L33;Z|2bq|gBZ?g3o`1wUgAt|lb2}Dt010f0Q1Pe(`NJ6k* zx~iP^^ipbhI@U8%eQ82s7U^2DuAhNq%(F;E2@oNiWE>2_7@_+U5{|li{f?lD`w7@Y zi7){>?3SYve4Ns%p|s}ncskf`fy&|Sf?5kZ`q1_YNhwrs4i?xJ&4cA&*=rHW+|1T1 zAQTtfqE{|o4Y?UW)alBxDF|me43>NGaq7plI7#qjYGzxLI`^lyOr(pEoYXTBBM<1X ze0Y!K5_oNJ=(*i72w*g^rdHWHLTuL>hzzmZL#$GYRiOwsIVMCb$B1PiVpRfSxmgx{ zfLlmMY)0xq!w*H=;ep*HPpcFntIcbvHR{p4I+_V9(QKVZ^E^~U-8w?E?ds8l-^{R= z27Y=_Ibq;=;1(x>Q7RK0Gg`9qWoc-Vg2|Q!*^E-3n7Nk5jx88{eiu-7}%T-cxI`JH8gzFy5@7y36O_+QN%m-d0&)AH-D{D$vSx2R| zsH&BBQSza}dDh@V*m=gfI?pmMvKMKQ+%5QCWGe$NvK0wDg|*jI4d!2&*1NY$dJ{G% z^~(DX^N{QLkhb(1c!{oOu|-XR9$6$Pgsx|a5)_JyZs;M_Yl6eMUpV2CWt9%mli*;c zpaisfm;pXqvi<38THB7CRF9vv7{6=ju=YIg1Lia@@ncw2`bfhNeoV(WKV{0n1#7`V zurhR$6>VTgFcpSZ<^EajsMj|a6zUXR7vP{<1=Id?#wB0c;1HC>gt zbX7v{g3MKiLmsIT%+@hjyN)7p1tc^qAmO#iYP{hJS$6BC(I%RNaB7mIgB&8#>$*x_42^ zcx!*4!F0s6OfC5=NIh4kBc5>BxP_}kJmP!%hr^ODAYat>_TKmT)tAX^QS{dWl>q&D z6~?sAW?voM{?GhJCWaoou4g|t4iR)qd&sy1v7p)ARSno!!K~R9k1=Z*tEbP_5wn&Z z>*+H;hw}X##>&9Yp^4|`0JfZ>(@{>^6SC)Dxz?cBC9>CyRPX0lM^93+mPuRB!_NWM zlPvM0Nz`3k7OQ;0^wJS=?F>az0NRK~a$E>*%|O+1hgGUZVJvRk9g?Qf1=vJdmG^ ziduda7hOBCn6Mw33OliQwezkMOUh^ouGr`A*DXofzyXDnpoV@~oR8-w4K)SDeiY?1 z=*seQnz6uFDY-d*7c&yncFxe%FK=Ug=NE#(++*Py>ETF z5v?x;XDq9&+ie(2@RvHJk9Ti=>AJZRFR~(x*CjnW)FwF8Da|k8l!CC5!~Eak^LV;@ zo_f3+AM6kM{T2*s>4E-C&U|SNFkPN`5v|H`>*&O<5=&Inuga0K!N$YrUrM7n%9l6Q z66sB~3R1l_u&MlB*q92#aUlusTu72x!d?Qm&_u#m+v8?hL-jsUYlERzzf9=Tz^)`oxyD;`8b#RJ&ov^5r! 
z0Vci0aY3sG%qqbbqBaLG0x(ko#+bN-F<84=X*z`0Uq=egMOE{xGSk5y6+!UCQWff5 zDQ+dfMXT&Nznp`R>j0&UR^MS~&1HvuPmUkx$x%4AI5*J-if+BeEbm%akzv;Fumf6O zU0!Tlqf{jEC}URE(NoSWik?z${&46FqbW%WGWUeFwab!}BxhHcj@2HSnoNB)S?h*f zX#%@4DWh8Q_R8Sh+l8d*&GQOc>9W1gT57j6r>&F7U!vly%aP)s9rA0hHzvz0TrSB|p?vPRd2q|hSjQah$v%_BXIVEs9IX1i+;j5HnHl^CD zD?W{;Qi>1bn?#7HF4YxJ*6?~-{*_GWYVlS^jHO2`DeDWp9=6actL|d_u2N1*udXV> zAfa-KOS6a~r6}d?p;#kdi+O zs*PTvvdomR9r#Q}>jm(cr`0?vE1|P}e$#sKtgHHOfNFGnf?LOU?w<8dy$NEHQ%1w= z1k=cRPbl1DWO-;+S<(Tud2F>$;$5ZF@&>u9WaaJ^T$z}=*R9up$uGKU-6GVwRjqYf zip#-W2Ni=mEK-A2aDhdm3{~(17crxTx4MYIn%B&H8cowMmXqpc%|BA)Ry;Y`*Ou_J z&K4;TuS@?x)7Bpl!HqsDPeB$hYs=!e%w3rLBt*Prpj)pQj_4X+$~11jzbn|v4JTnE zsGX4f#F!MnP?eh7QE%^aJ2oS?!-Z;NN7}7dN4mJ^dZevyL{dX)BT|^qbH#8WjA<0Y zBqYl+rrLu+gXu_1b$*!^3@S)Hp8{moK^Ogra`Wx@qV#m9v=-}a#-u!VRym#lnb8=Q zTgUM28pBc!k$$#eR@UhwHk<~ZWHT~NmsLt@nO^u=(CTf*jg?-^*jU}OTd$9Qe$gdi z!Dyke3siuOM{$#kY)D6#u)tCjGOp<_l)2L9q!V6kPgBVH1CD%^@`>gkIGu<0;nwEo&7DEu3 zH{09)gzp-@8%-AQr!yx6N7KpTrmj2r*Wu}SF&@n?E{Cp$Wjhz#+sXE zl40GwKng|TVp|c!g;dwZzAj%`d-=-Qq-NCc+GQnsy-Zvpa!Tk@Xud2zz%ZFQ*SsLr zXXme@x}|7@En}APV~liIp=A39Tdm4$@Pa6TWmIeui(5y)G944?xUrU+pzMVNWiN_| z(2JDfL>1Pax5@33%dz3n51d5dB6S2_LFze46!rrW%AGZO%%TL>&F_e0Xl@SyGa}*+y zAZKQN(G?=z7+0xXYw!RfEP#keXf(nkZFgp#U(OBh;LOZv^~&2yC73P2jh6@*S$qGb ziVl`=>j*Giv!=6@5pS@Bgx?re3NdW$xReo=BvZKhMM#I4G)$D+`b2Hk7COV-DQsBk$x=$nrCd6N+sNc8+y$v;QP1+MgGzl~Zeof>6|2!ES}Eez z5n#;{W7DM` zUCvT6>v@^EZsg^rbpK>ZE-hs?MkI$Sb5hUAvHUbcl}A^94XJsm0T#MNO?BGiMHsM- zPkTp$BXdD%&jn>DHK7a2pT6Z%gpyAC@-T`K?YbXZn4{J4 zsx4SYc-9@OwiuVvuex=$3Y9yKYEJ7mY*f2)!#YRPP(XhPZZ|6j_hhr4M}TW4Q@qm3 zE-{IwBcPhxcqS*6swD0fkZ?J~6W2=rdOp4xxl_1sS%cT3o69SAel8|Aw=1cRYic(e z=HqmcniuWAah*ElytQr3T89*LA8+?Fzdm49RZ-jzBowV-AYL&g;bAk2r0W*IXu-mG zF(5Dh&(eIgHtwu}b;QkVM`ulDzT)&($+Ynll8iGo%E3}}5KWinm0OaYSDusV&DV9z zlHMfqRaEAyA}l&J!L3*LDK5I=$9mP2m>D8} z(aGHg%iUeln0n_=f*@hMJ!cwe%*_^qu)4dW|)6w0A$P=jP zXmQaM5Y)GrB;g?-Sm<#`Y1?Lv-d)mK)LOMa{ho%FoK$Zyts~08TXGm*B7XE1rzL(& zw1h+9ieA;ZqBpac{65si5E5EU;f(Q=wuc-(+iEeL%&#aRG}&Y-i1jAZ#+po$iRk&4 ztnSBDTy&Km!JJ>}McFd>!9Dq*tVr+}aKdft`F}rIP+%z?&VNR%uZvCeO1A{SUZRW; zRa6I#YP4j;ts{)^73yNCH%*9HKLb6(NH|R>#jD2Qep(Ad&M)VH5e2GNB1AG;y^{6g zO4eo#!&cQWY$fA0S4v6@psqDnS!-S$w&u0LKq1Z0Kq1a?Yu-jT-Xv(Lc4(cA0a`5G zV_gt?5b<;OFs`}^OZ-t?btzqVSLxJ?JP9dN7WP-2PV~@IiGH9@Z(F0V#^9kG}x*UWW-uQv5&t|teh}qg%jQG z3dzJYQ-eiX52k`vKVE$O>7Ok!JA%EnW@2^AC4G>uPE1VoGQoas*da^-d&sO+w#9gz zz*_1yipL#DuwHYj{GzLI;B^3(BI`%$8wV@N=}7lgn`;STb4Fbp<#o5krKY)F8y>#CLU$!Gv5kg>9VFzwCEEp8i^KG zt7riftRoERju>)bQ{#2haO0L(7#wZ%z)7f|g=24ip*FS7sovffNNi%ds^)^lGKt)J zb-;^@uD(?p`c|vF0B5h*(4A_r2NiXQv6O~R(-c`Eh*;LjnNluOdpQ@ho~zUJ`HuMR z&<6U^%Ln@r---RFfl4%7>DCd;v8!ardp;0Fv=O0R8cxNL32ouIknMiLOfQuk{De8N z-UqT$xKeM97Fv&5(}M8W5Ky;XBPPG-CZstbp^&cHxN&7oqer7b>Ar^G04Ofdr#zI{ zDVnQLvUPCiU^)Vnb?`Li#9l!A#%C4j?3muSzDVa|1t%X(#Q$jI@FztI>%2ZKl!#?< zxOK$DR7bI&F)YVZ_?668}+$$geKduQD5d%@EctMJXq*d2uq52ii1*$ zC5>Vi*A!r!Ztmp7dZ)rFp)I;WsYsxm7pX{EQ4y6tiXweHzv!xuvrr!wfwSM>8se(% zFOy6nRKb&kku?BF z4OD9G^-0Uvho`S!XhTsFZF*a_R8CUzmnY4y#5))GD3 zsE&+Pyxq`vyxm|$-*i>42d$>T+nKE+<7Et0nT-6+AK~k_#SlT0MZ)k_6f^ zxa}nZ2vlEL0;uRn5lSF~I#LuDT><2^tK){#M#4%sL?tQtO9;|z3_L;z*9 zdI0rPh*q6Kq`^pyOCW(%xcs8q$aSy;af1qO1R?bpLPl}TKKyhmIkFWQj(fOa>>|b0 zq-=YQLJlo{KEF6!#-G*>a7-s0=np-M$2wJr}E%&ZhZ5vBRT!uUkjh>DrtzGaS9HUy?B!4iIGFyc?N>9o?RI=i0C^X;Mc_7qoiA zaUJSyTloGZ8jf{*+kn9q^Sqd>grYL)lH|q3mwvgcr~jaE_%{Nyc45Sr9ZxDwZ%y94fA~ zmFW&6)^cJ!V!9&6O`oRZiT1mubc>b}0l*TAH@o%veCHQkn?!oG48(E#ut}uni+4>T zQ#w=|*VCXyB4iFjqd+hu9`B4+uTuT!aep`dv$gB=<`^s4n31wYF@Pp6Ib4wsLe>gZ zLdkr`02CE!|i*s1u2(I*}L}F9C4t2tX^9 z0En;zKvkI>Oc)iq%w^qtgcE`*`tiU8hI{B`^#Cb-!Hrtp?}r(i9RJE`J=fl0&!e!v 
zZ$r|zAxUgQ0?bKA-7VQP?V5w>6;uG?xmz@Cw2h`JaJT3*Zlqh?Ed@8~U_55TdgHP0 zK9<^j%mC8kh%s2^n-%&8r>JYsC_4#U|8>Q&LQ<5=%`{ z&RDW3n$vprT5-Z?gQuhw&xS004B1mufBO>II(JQNvC^%@cr^_KK1sXj#XNpwV3y&#P-d5?<* zlng~sGF=NKjkR7}N^rx4^lKs&4Ko^OLWQyOcI~%8>C5_mv6^^UucaF|Pdm09b#mn4_gVilE#^}sBlR!UG# z+hawSV*rBgAX748&q<~{(Q>IQ0@UKJ-RJ_~Y#bzG$M^5)|-8|@$s9;azKkJHR- zNFCac)^A75wCAbPxVc-yQq%ld}6X94KfQ{6-+A zp80Nxh^8OftS@JU7_rJYUhXy7Fp5{{hg|q`1T($V;`Bp`tw>yxn04Gp$ng=($fc4= zFqfd)b+Nx!Ef3!VWwZ;1_2ON+RZvU33+vexqQp zX~X1x9*zJ>$ukB}n_rTv_^aJ(pP;jT!!rT94_g`87IA!u^05xdSulgLY2FP8IP<82 zKz7kpKH~IzCaE?&5Q(xeMk*nxC_B`Y4!U3<+unmC#YG*W#zAr9G-C&km6*ica)x18}F9J{B;wm%&lAt8pTUTbnN7Cd}qMX+Asn+37IDNuA zQqt~vzhJX^+PcPkTa4F!GE;C59`7mR)@$y;JNJ}vI;l6qL?~$*I%!zMr&BxwT-(;3 zGH?#4P~`kJt*1;us}I)puC)+W{~rJV|Nrb=X>(f1lKnZebfBnx@rM^dV?Wt5vaGOp zJ>yRVZQ(Z47!)A&8%^-zC#&vORd-Pp0t<=ToBI~1jO0AI=gH{Q8f`JZ)HS~})%@E2 zC>8Wa$!dp*A~IyY;KXf~Qm*-}HA_WsH!;>>eoeC!QwcTQ?6Q8w=I74X7-lol()ioY z*t|Fkj=_TGT8E*=85?t6X`SpL)ogofStui@R?`YskI5)?(zQ!>bBkHRF5UdHK9#hK zLO`lnA-p;}ol(6MjxG~f@^usH#UL zZ(Qic#4Hz%*PA`0qDgv5CsV@<(ZoUk#n>N?JJy0Nrr9xG*4v+xtv++zxP084 zJMym+2QzKdr^BcZ7f~N(h&XF3IdaF=Y}BWn5gA@FAtLE5|MG^UZ9Et=*~wXZEzkl8 zvsf^llAb5BIEoz^?$X`B%W(*%`cj zakHK+|78P*>#q-!_p8mVaT+Jn<;`RSpnPOg*?uC+gM8Yjjqa3xygxT)EuT4-1d#)` zXd=F?)v$!1O>^$%yi%tkbxEQPXT3Tvk(`9PE?8}3C`KC!Oj_sMtu~}8c9V8~SD%I6 z%Pz#0>eQ=?ZnG_gMRY4<{gEMsdDJYfAno(>WyVQSX&YqoG@OvGZIHXz27eC{gsUMjFAJyP;%o!yn1;(lJCGJiML%HQ$&h8eS^G zJ7;w6>}>pfHgTf7GQNZNU%~P8X12KfKA(0)qjWRclEBrQ`OSj+1J8PK8d0!seEtAL}SrZx^wk1b*|nFSt*x^f??wjl&%A2 zf`=1VE+HEW#;-`sYHYK_^t_Gy3EXka1qXyhLd)LuQuEjNSV!s0c(mtc^D${=Vt8ra zDtM|!es6Q`z*D8#H_CjbU5?<}_hujsT)iFJW+7xuXe=r294O-{tcclVeHAvDDCYi; z3Bv6Ilc&M7D-4wxDi~y>E$BA5if%zy?o!ajI4tPWvW82M#z75NW0ygOxCLG5+Q4vu zorobigBZL!>8cW%@UqKZK~L`i=@9o{#I?31tawZfN51i{ zNATAe{yQ}PHynby?SspVv6bM`JkdiS8vDs4q6PJwZOXXvCU}VCG-^^#N774`G-v^NSZ9V0L%#wCDaY>}G4F4mV74 z?MA9KUoc`-D^parqQ}@LfoTGI>}9VYQa!$eEpA!n+A+!*=Nz6UPlL_x7W;4jtEw@- z3+FDN)ec6_W76l0doHJitq$s!BBp++I)(&H7lyzrx(h=k4GHu1soG3RQs@-2zQvsYgDu5?$ zqcKjgM&6_PlOMTX!LFDBw|<5s+A-&tvoa<=cBA;js2Ivf73>Fki6 zgT*XBGBQf-)JL>P+qd^3eIg=F8aEW#^Gc*iYA2-$<(b#2MU^CBD>4r!Tvb#kuRE6F zZEi83DIo)AFYEKyk!;^5G^iv8Smqd^q-jveo>v&j9#SyEvEUv?2ny0&@Xh<}jLd*P z0wY?;&PhMPMf9X!cGpXa$&WMLWkPV$vN$qztaZ;NO&ggKmWw;X(k|sW55yhLD{;>r zQoS61X`RGcR>>)|&)jR>smMaD|Kl>n@Nj7zi&=D{C4d?Y|8^cJ_{G~_fHU;Go0IoI z$KBmT|M(Z3Ryju4aWWjzW&MY>s0-4jJ?6JTIh7xfZdTG zH9)#hOpRQFZY{8r>SzoxS^NA?F0pNBE?#y(I4S`&`B4X0LzuZkbcrMf zPnBeNxw;kzGtodElOdz@v1QZlS9|WlA#wMs^|6_sNkoblc3OZT_Izwg6lboT=B{bx zp^qIAc^dS!W@mELL}zF6%U(!!=J;H2+!Af}F8IE9?{VdVSJ1Y!HVx3$PPT}p(%RUJ zs<%y%5{yG$C@zAGs*dPsgiq5&fxmf0)#ETJdeoOq)%|Ri>Z%A4fu)!R(97< z%Jh)enCUU{xP2tuQ+rPmkhUMcz{h!Seq595FPCpWfBN|D>@g9XbI;Rej7SHx)Y}Nc6cyS9 zE%y9rf1IJRgGToaZ5ZGOCt0Pfs@9I%|GN`Q#C?TdfO+PbkO$#Y3YCWhWQ8Fb0d!s`-OcqCs4nauY&bnqiT1i>7N1NSsz&dVT zz&&sd>Uo)1nhLcT^;zUY&ecNDTKz97Fs~XHsoL`j^J=#BL)+unh;~oOT^?4sfh7+)=;6t zX%9g{s}fV2B%BR+EcIUb?ibLp`^>`}%L!>=NO_GA5XFXilb>sJtjxIs0ltf4C4I+A ztB|lu(7H)MDlDiWEkQm?+K(uMey*+mTL=zd0 z4S@W4TqXso0VwFO#8Isqly+?1TucE#Dt1SGepe5mLv~2NMgUa>CBQ}_>4Q0U0kjJ> zkMO1sgg`i~A=N5HQ7#;n?TIrBARV7L%P#8^XQxVff+=rWs$~8cSZs*NO!-ikRdW6=ukN2q_eG9Dwco@d)W&qyBt@k)MNx`e5tf29gZrF@`6R& z;IV0`oIQ6zo48cYP6@DZN}vEOVP@xYIWmBhlHr_tIoeK?;O@}7bVmk++wp5h24YI8 zly`Len#6_T@retb((!xv-O+n(rwBdFly-`rvO?Mp9kkS(4l)ezOhX4HE5P2=(;mhR zIFmS=?1qq*P!+dMoa& znu?kErCg0W`=^yiaY4*wKYb^{!H78y=%qZJm}3|64zv1)v@9~&=ls;6RoD~er4GIF ztH!CJ2c&d^R3jOkId>F?N^E+uv5N|psYvXT1qkf!(vvj~u9&WED$y?(#}blOZ$B`J z^nuzs$#Jm3?*j8rU+;<~DyOhu52Iqe3CFCN=GUB85M!w{zl<>C*=7b}?ys+wR4Q+$ z7raNt^p#QdNP$Kt6lhdmpxM`Y6VyVO+K!gTps_E$M{rkjdQ$&HNsSL 
z1AEc^n&_5bXH@A$dwyfedeZdoLZ@3U08l!8q(56v-4fz3Zel#gMNhh8?+Zf2324Qin!zBZP#iTkVLAA1V-X8xb^ z-Vo#ca=7yAVtw zTEYB*S)P?yq00|Uulb?!@uO7Q-e9O;W+yldDFyMsaYCEY39ekOk_>UU`9ynGmOn5( z`9$BW?D++qJ1aSR$iCl1$yscX9%HnvyE*057Bp8dwFNEWGRU#aBCuC#5{(k5)WpQ> vhPDNruiJt=Hl;?h+7?vi?%@*?$?2%k7TnR^^pe|x`0xJ@F~Yq*X+H}9Ei=U6 diff --git a/perl/t/tracks/vcf/raw/clinvar.match/clinvar_alleles.single.b38.vcf.1lines.gz b/perl/t/tracks/vcf/raw/clinvar.match/clinvar_alleles.single.b38.vcf.1lines.gz deleted file mode 100644 index 14c061a5be8515e7c15a177d0a074f042d806dc5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2627 zcmV-J3cU3niwFSO&VgG11D#r1bK+JKerA3~=5Z(Z+%IO8Dh6X)HC!CP-prGVVPrk@`TW8PPY_*Jj zIpBWz4U^st$2AB66_Sm**EPrh6_T}klNw}z3dzQk_NWfw0aqj)-qoTU-W3T)?ZH(o z(mgOFerS*4_9Pw-lF8S*ZVe_OK^3#Ojy@P(a9Zb~j?+311xwT-8yCsiyFeL5|d)>}`uRTgS!@;=w`+axNsS^Yw zUC6rAiwAM1-Al&t%^<#pKkKn3>1wb}hNG+Qs9VQ}e@R~`I_=5r@TLp!I<@Bn4w>}g zFWuhPHD8};xbWL zL+B&%q^@>HaGA+nKfbEVGako7NMZ2Bb_8=bD8JkKpAa=_!ZRtIRJsVT?*J!-wo36Lz?}e0TF|n@ z!M_8vGu#?(3IK67xZqxDQB+c#3~m(s!;LKhPWD#DGVh%%0!}i(9o#8XwA9(y-U+X@ zF6wZ$wP%7WKokIHQ#-Du6D1BOOLNN{)7};VCrfkf!4cjTpW+v4k{77Iik+ohRjE-N!yvmW0x5uL>1m-HJ4?d|Kns-tH)gdWVY&cQwB0l|uX3*B ztOHC3(JLJn#RSaFi(M~md34Cb5_>^FGE@W-){P6T$r7!k3t@~7a*4|zgHA^47C4JB zS(+&M!`f@@c<}xy(o_vd8x#_gelEjM@I#E@$&qadmV+E_1ag|N#u0+ zP#VErVmw!KD`!w9d)1kesigz?VOX(EtBm~~4XYmUP_x3S9@#d9;qD$F-3;FcF0sP^w2d4PbsixktwDfQrz&; zif{yZq!R=NtrK>iHgpBlIQF#EWKcneKnZQ=PM{sGAh1f(A%!_9y1PnjOxWH@ai9h+ zGIaHT#XVf}=%B;9pq3Ru7vyM_op2dqrBjXEEl6>q-RH)%7({Pb(jbcqN1H%ejzRQv zB=JrpOhs~9EDkn((&9Kdg`u-W;r6ya5Th;5Fo=!@mUTGUS!azU>A0do)CO+(BLYn3Rc`Px*WP-Lw9s! znqWVtLj@L^aM!0n;ciZcO0ahsO&lJ3gA+4FXNwHBdPVOlQUzOh=}10zi~HXo9Yg_)-ZZCRO{WWn3bLhQ zR*|^h!L~{36qrC(=!DlbF6cuvg{MQ9AuB>9I9;G83>}G@E+z54${BqE4sEaw$Oj-g#zhFK=m-I*^fo3}+yo%n z1lZ!9jueg0E_~_aHV+hh?m};Q3h$NPRU`q{r*!a`V-&_5WUbT3be%3|X>@VFN!Oc; zfB)ylbUEkp;%YT}c}kbt&BZ+1UVNW!rx){dxq3=3=IhxH=`L5dxvSDGJq11t*58f!EgoyG+I+iV;zbVd31 zt^Fk}^kIMPD*9V1ygC&>xChSb)m^RT}JwLh}wMrgM?e$aZ~GFxN# zb$s{&R%|BR?)U*#Ms7|1!Dy%is&%@rba)x6bnAElDQe%02Cd_LqG-@hxOW<|MqbXE z!tl;-ljCo9(cmueig^fearxYAZ=Rh^(m#_Hc>{>#bh&w3u9k04tCvmknm;mm11I{N zF0=XaqL=+MUA!gV4xeb~`@7+2fw8-s@ zw$8RGJW2ArUcF}D)9=mZ)5~I;JulK^^YZN}+rrPAW^=hpAbuYg*=)OMkyl9~fJ`?K zwEGv|Nr~@{khIB`b1bJ7-Y1FLFY^Rehvxo8Rx7+765XxSS4antb(1dBZ8qEN@;{Ps zx_-@O=_ct-zpd8O?P~qjY~C-kS9p?a`PSaNTsz~&Ia3gej z7jV8l0(ROao9A?vJ!W8*_F{3H&3`Q3?)D^W2&vs*I7!BL-41xOd$sGoO*h*g5Dv2h z!teVUQoh*)UqJQ)=S8#WTBmy@(f++TtORZ+V5s3z*`098!)7X4$BPirJY7F6vSr#j zTyBW^>37ievNPbuLsA`VI7(l$^k19H!RIfKvTpZ|k&Zbl{chNg``&s}1 diff --git a/perl/t/tracks/vcf/raw/gnomad.genomes.scrambled/test.vcf b/perl/t/tracks/vcf/raw/gnomad.genomes.scrambled/test.vcf deleted file mode 100644 index 99113db5a..000000000 --- a/perl/t/tracks/vcf/raw/gnomad.genomes.scrambled/test.vcf +++ /dev/null @@ -1,200 +0,0 @@ -##fileformat=VCFv4.2 -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FILTER= -##FILTER== 20, DP >= 10, AB => 0.2 for het calls))"> -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= 
-##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta -#CHROM POS ID REF ALT QUAL FILTER INFO -chr22 15927888 . C CACA,G 1164.92 PASS AC=3,0;AF=1.81378e-04,0.00000e+00;AN=16540;BaseQRankSum=-2.24000e-01;ClippingRankSum=-2.63000e-01;DP=230703;FS=8.86700e+00;InbreedingCoeff=5.80000e-03;MQ=3.46800e+01;MQRankSum=1.43000e-01;QD=1.01300e+01;ReadPosRankSum=-2.13000e-01;SOR=3.30000e-02;VQSLOD=-1.37800e+00;VQSR_culprit=SOR;VQSR_NEGATIVE_TRAIN_SITE;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|1|0|0|4,0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0;DP_HIST_ALT=0|3|2|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|2|0|1|1|1|0|0|0|0|1|0|0|0|0,0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0;GQ_HIST_ALL=104|464|608|1985|2741|1441|2526|1859|721|1069|690|260|538|62|159|47|92|8|46|68;DP_HIST_ALL=1018|6123|5177|2097|791|161|97|24|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|2|0|1|1|1|0|1|0|0|1|0|0|0|0;AC_AFR=0,0;AC_AMR=0,0;AC_ASJ=0,0;AC_EAS=0,0;AC_FIN=0,0;AC_NFE=3,0;AC_OTH=0,0;AC_Male=3,0;AC_Female=0,0;AN_AFR=3614;AN_AMR=528;AN_ASJ=180;AN_EAS=892;AN_FIN=2256;AN_NFE=8466;AN_OTH=604;AN_Male=9308;AN_Female=7232;AF_AFR=0.00000e+00,0.00000e+00;AF_AMR=0.00000e+00,0.00000e+00;AF_ASJ=0.00000e+00,0.00000e+00;AF_EAS=0.00000e+00,0.00000e+00;AF_FIN=0.00000e+00,0.00000e+00;AF_NFE=3.54359e-04,0.00000e+00;AF_OTH=0.00000e+00,0.00000e+00;AF_Male=3.22303e-04,0.00000e+00;AF_Female=0.00000e+00,0.00000e+00;GC_AFR=1807,0,0,0,0,0;GC_AMR=264,0,0,0,0,0;GC_ASJ=90,0,0,0,0,0;GC_EAS=446,0,0,0,0,0;GC_FIN=1128,0,0,0,0,0;GC_NFE=4230,3,0,0,0,0;GC_OTH=302,0,0,0,0,0;GC_Male=4651,3,0,0,0,0;GC_Female=3616,0,0,0,0,0;AC_raw=6,1;AN_raw=30976;AF_raw=1.93698e-04,3.22831e-05;GC_raw=15481,6,0,1,0,0;GC=8267,3,0,0,0,0;Hom_AFR=0,0;Hom_AMR=0,0;Hom_ASJ=0,0;Hom_EAS=0,0;Hom_FIN=0,0;Hom_NFE=0,0;Hom_OTH=0,0;Hom_Male=0,0;Hom_Female=0,0;Hom_raw=0,0;Hom=0,0;POPMAX=NFE,.;AC_POPMAX=3,.;AN_POPMAX=8466,.;AF_POPMAX=3.54359e-04,.;DP_MEDIAN=10,8;DREF_MEDIAN=3.15558e-20,2.51189e-13;GQ_MEDIAN=99,72;AB_MEDIAN=4.41558e-01,6.25000e-01;AS_RF=5.16800e-01,3.29197e-01;AS_FilterStatus=PASS,RF|AC0;CSQ=C|intergenic_variant|MODIFIER||||||||||||||||2||||insertion|1|||||||||||||||||||||||||||||||||||||||||||| -chr22_ALT 15927876 . 
G A 108.57 PASS AC=1;AF=5.09632e-05;AN=19622;BaseQRankSum=-7.51000e-01;ClippingRankSum=-1.43000e-01;DP=255559;FS=0.00000e+00;InbreedingCoeff=-4.70000e-03;MQ=3.40100e+01;MQRankSum=-3.32000e-01;QD=9.87000e+00;ReadPosRankSum=6.60000e-02;SOR=9.90000e-02;VQSLOD=-3.45100e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1;DP_HIST_ALT=0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=90|295|443|1443|2398|1347|2546|2110|845|1318|842|356|727|79|237|59|159|7|90|99;DP_HIST_ALL=626|4935|5582|2741|1103|289|169|45|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=1;AC_OTH=0;AC_Male=1;AC_Female=0;AN_AFR=4696;AN_AMR=588;AN_ASJ=206;AN_EAS=1092;AN_FIN=2392;AN_NFE=9986;AN_OTH=662;AN_Male=10956;AN_Female=8666;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=1.00140e-04;AF_OTH=0.00000e+00;AF_Male=9.12742e-05;AF_Female=0.00000e+00;GC_AFR=2348,0,0;GC_AMR=294,0,0;GC_ASJ=103,0,0;GC_EAS=546,0,0;GC_FIN=1196,0,0;GC_NFE=4992,1,0;GC_OTH=331,0,0;GC_Male=5477,1,0;GC_Female=4333,0,0;AC_raw=1;AN_raw=30980;AF_raw=3.22789e-05;GC_raw=15489,1,0;GC=9810,1,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=NFE;AC_POPMAX=1;AN_POPMAX=9986;AF_POPMAX=1.00140e-04;DP_MEDIAN=11;DREF_MEDIAN=7.94328e-18;GQ_MEDIAN=99;AB_MEDIAN=5.45455e-01;AS_RF=4.14273e-01;AS_FilterStatus=PASS;CSQ=A|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr1 15927837 . A C 215.56 PASS AC=1;AF=4.23837e-05;AN=23594;BaseQRankSum=1.45000e+00;ClippingRankSum=9.70000e-02;DP=287254;FS=0.00000e+00;InbreedingCoeff=-3.60000e-03;MQ=3.28600e+01;MQRankSum=2.26000e-01;QD=9.37000e+00;ReadPosRankSum=-4.20000e-01;SOR=2.75000e-01;VQSLOD=-3.70600e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1;DP_HIST_ALT=0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=74|217|284|953|1552|1124|2262|2199|980|1697|1212|502|1089|163|422|118|256|20|147|222;DP_HIST_ALL=374|3170|5440|3653|1801|617|309|107|16|5|1|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=1;AC_Male=1;AC_Female=0;AN_AFR=6398;AN_AMR=642;AN_ASJ=220;AN_EAS=1378;AN_FIN=2270;AN_NFE=11958;AN_OTH=728;AN_Male=13148;AN_Female=10446;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=1.37363e-03;AF_Male=7.60572e-05;AF_Female=0.00000e+00;GC_AFR=3199,0,0;GC_AMR=321,0,0;GC_ASJ=110,0,0;GC_EAS=689,0,0;GC_FIN=1135,0,0;GC_NFE=5979,0,0;GC_OTH=363,1,0;GC_Male=6573,1,0;GC_Female=5223,0,0;AC_raw=1;AN_raw=30986;AF_raw=3.22726e-05;GC_raw=15492,1,0;GC=11796,1,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=.;AC_POPMAX=.;AN_POPMAX=.;AF_POPMAX=.;DP_MEDIAN=23;DREF_MEDIAN=1.58489e-28;GQ_MEDIAN=99;AB_MEDIAN=3.47826e-01;AS_RF=7.79080e-01;AS_FilterStatus=PASS;CSQ=C|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr3 15927835 . 
C T 149.65 PASS AC=1;AF=4.22297e-05;AN=23680;BaseQRankSum=1.22000e+00;ClippingRankSum=-1.07000e-01;DP=287265;FS=0.00000e+00;InbreedingCoeff=-3.20000e-03;MQ=3.78300e+01;MQRankSum=-1.07000e-01;QD=1.06900e+01;ReadPosRankSum=9.67000e-01;SOR=3.50000e-01;VQSLOD=-2.44000e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1;DP_HIST_ALT=0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0;GQ_HIST_ALL=64|220|294|926|1565|1084|2265|2217|996|1671|1225|488|1101|152|440|148|259|14|143|221;DP_HIST_ALL=374|3167|5435|3621|1823|620|325|106|16|5|1|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=1;AC_OTH=0;AC_Male=1;AC_Female=0;AN_AFR=6446;AN_AMR=636;AN_ASJ=220;AN_EAS=1394;AN_FIN=2274;AN_NFE=11990;AN_OTH=720;AN_Male=13182;AN_Female=10498;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=8.34028e-05;AF_OTH=0.00000e+00;AF_Male=7.58610e-05;AF_Female=0.00000e+00;GC_AFR=3223,0,0;GC_AMR=318,0,0;GC_ASJ=110,0,0;GC_EAS=697,0,0;GC_FIN=1137,0,0;GC_NFE=5994,1,0;GC_OTH=360,0,0;GC_Male=6590,1,0;GC_Female=5249,0,0;AC_raw=1;AN_raw=30986;AF_raw=3.22726e-05;GC_raw=15492,1,0;GC=11839,1,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=NFE;AC_POPMAX=1;AN_POPMAX=11990;AF_POPMAX=8.34028e-05;DP_MEDIAN=14;DREF_MEDIAN=6.30957e-22;GQ_MEDIAN=99;AB_MEDIAN=5.71429e-01;AS_RF=6.44340e-01;AS_FilterStatus=PASS;CSQ=T|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr22 15927834 . G T 183.64 PASS AC=1;AF=4.21905e-05;AN=23702;BaseQRankSum=2.20000e+00;ClippingRankSum=-6.08000e-01;DP=287202;FS=0.00000e+00;InbreedingCoeff=-3.00000e-03;MQ=3.51700e+01;MQRankSum=2.36000e+00;QD=9.18000e+00;ReadPosRankSum=1.14000e+00;SOR=8.60000e-02;VQSLOD=-3.66200e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1;DP_HIST_ALT=0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=54|216|270|931|1532|1084|2270|2199|971|1682|1284|459|1155|157|434|133|260|21|158|223;DP_HIST_ALL=380|3164|5416|3618|1841|618|328|106|16|5|1|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0;AC_AFR=1;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=1;AC_Female=0;AN_AFR=6452;AN_AMR=638;AN_ASJ=222;AN_EAS=1398;AN_FIN=2270;AN_NFE=12010;AN_OTH=712;AN_Male=13204;AN_Female=10498;AF_AFR=1.54991e-04;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=7.57346e-05;AF_Female=0.00000e+00;GC_AFR=3225,1,0;GC_AMR=319,0,0;GC_ASJ=111,0,0;GC_EAS=699,0,0;GC_FIN=1135,0,0;GC_NFE=6005,0,0;GC_OTH=356,0,0;GC_Male=6601,1,0;GC_Female=5249,0,0;AC_raw=1;AN_raw=30986;AF_raw=3.22726e-05;GC_raw=15492,1,0;GC=11850,1,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=AFR;AC_POPMAX=1;AN_POPMAX=6452;AF_POPMAX=1.54991e-04;DP_MEDIAN=20;DREF_MEDIAN=2.51189e-25;GQ_MEDIAN=99;AB_MEDIAN=4.50000e-01;AS_RF=8.93688e-01;AS_FilterStatus=PASS;CSQ=T|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr22 15927834 rs199856444 G C 1458410.68 PASS 
AC=5232;AF=2.00721e-01;AN=26066;BaseQRankSum=9.10000e-02;ClippingRankSum=0.00000e+00;DB;DP=379518;FS=1.42950e+01;InbreedingCoeff=-1.97800e-01;MQ=3.43800e+01;MQRankSum=5.53000e-01;QD=9.27000e+00;ReadPosRankSum=5.81000e-01;SOR=1.93600e+00;VQSLOD=-3.58400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=35|58|54|77|62|73|105|121|121|134|123|107|124|140|135|130|112|141|175|4217;DP_HIST_ALT=75|462|1286|1606|1270|749|409|209|87|54|23|10|4|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|96|323|534|762|776|697|765|409|526|278|241|166|133|147|151|77|13|0;GQ_HIST_ALL=449|206|192|333|415|337|693|898|568|1088|1019|567|1363|338|675|319|555|199|455|4792;DP_HIST_ALL=342|1228|2964|3911|3340|1799|1048|584|146|60|23|10|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|96|323|534|762|776|697|765|409|526|278|241|166|133|147|151|77|13|0;AC_AFR=1195;AC_AMR=199;AC_ASJ=48;AC_EAS=462;AC_FIN=539;AC_NFE=2634;AC_OTH=155;AC_Male=2860;AC_Female=2372;AN_AFR=7838;AN_AMR=630;AN_ASJ=216;AN_EAS=1372;AN_FIN=2596;AN_NFE=12638;AN_OTH=776;AN_Male=14358;AN_Female=11708;AF_AFR=1.52462e-01;AF_AMR=3.15873e-01;AF_ASJ=2.22222e-01;AF_EAS=3.36735e-01;AF_FIN=2.07627e-01;AF_NFE=2.08419e-01;AF_OTH=1.99742e-01;AF_Male=1.99192e-01;AF_Female=2.02597e-01;GC_AFR=2733,1177,9;GC_AMR=120,191,4;GC_ASJ=60,48,0;GC_EAS=228,454,4;GC_FIN=767,523,8;GC_NFE=3709,2586,24;GC_OTH=235,151,2;GC_Male=4350,2798,31;GC_Female=3502,2332,20;AC_raw=6394;AN_raw=30922;AF_raw=2.06778e-01;GC_raw=9217,6094,150;GC=7852,5130,51;Hom_AFR=9;Hom_AMR=4;Hom_ASJ=0;Hom_EAS=4;Hom_FIN=8;Hom_NFE=24;Hom_OTH=2;Hom_Male=31;Hom_Female=20;Hom_raw=150;Hom=51;POPMAX=EAS;AC_POPMAX=462;AN_POPMAX=1372;AF_POPMAX=3.36735e-01;DP_MEDIAN=18;DREF_MEDIAN=6.30957e-17;GQ_MEDIAN=99;AB_MEDIAN=3.84615e-01;AS_RF=5.10467e-01;AS_FilterStatus=PASS;CSQ=T|intergenic_variant|MODIFIER|||||||||||||||rs199856444|1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chrX_random 15927759 . 
T A,C 692.49 PASS AC=1,7;AF=3.55290e-05,2.48703e-04;AN=28146;BaseQRankSum=-1.66300e+00;ClippingRankSum=-6.10000e-02;DP=405801;FS=2.74010e+01;InbreedingCoeff=2.60000e-03;MQ=3.41700e+01;MQRankSum=-1.61300e+00;QD=1.10000e+00;ReadPosRankSum=-3.22000e-01;SOR=6.70000e-02;VQSLOD=-6.27400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1,1|1|1|3|1|0|0|0|2|3|1|1|0|1|1|1|0|0|0|2;DP_HIST_ALT=0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|0|4|5|2|5|1|2|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0,0|0|5|7|2|3|2|0|0|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=75|210|161|374|461|353|797|1090|677|1437|1528|731|2217|413|1197|417|966|122|653|1602;DP_HIST_ALL=347|986|2229|3523|3708|2138|1370|889|199|50|27|9|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|5|7|2|3|2|0|1|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=0,1;AC_AMR=1,0;AC_ASJ=0,1;AC_EAS=0,0;AC_FIN=0,3;AC_NFE=0,1;AC_OTH=0,1;AC_Male=0,3;AC_Female=1,4;AN_AFR=8282;AN_AMR=744;AN_ASJ=226;AN_EAS=1590;AN_FIN=2918;AN_NFE=13538;AN_OTH=848;AN_Male=15510;AN_Female=12636;AF_AFR=0.00000e+00,1.20744e-04;AF_AMR=1.34409e-03,0.00000e+00;AF_ASJ=0.00000e+00,4.42478e-03;AF_EAS=0.00000e+00,0.00000e+00;AF_FIN=0.00000e+00,1.02810e-03;AF_NFE=0.00000e+00,7.38662e-05;AF_OTH=0.00000e+00,1.17925e-03;AF_Male=0.00000e+00,1.93424e-04;AF_Female=7.91390e-05,3.16556e-04;GC_AFR=4140,0,0,1,0,0;GC_AMR=371,1,0,0,0,0;GC_ASJ=112,0,0,1,0,0;GC_EAS=795,0,0,0,0,0;GC_FIN=1456,0,0,3,0,0;GC_NFE=6768,0,0,1,0,0;GC_OTH=423,0,0,1,0,0;GC_Male=7752,0,0,3,0,0;GC_Female=6313,1,0,4,0,0;AC_raw=1,19;AN_raw=30962;AF_raw=3.22977e-05,6.13655e-04;GC_raw=15461,1,0,19,0,0;GC=14065,1,0,7,0,0;Hom_AFR=0,0;Hom_AMR=0,0;Hom_ASJ=0,0;Hom_EAS=0,0;Hom_FIN=0,0;Hom_NFE=0,0;Hom_OTH=0,0;Hom_Male=0,0;Hom_Female=0,0;Hom_raw=0,0;Hom=0,0;POPMAX=AMR,ASJ;AC_POPMAX=1,1;AN_POPMAX=744,226;AF_POPMAX=1.34409e-03,4.42478e-03;DP_MEDIAN=19,22;DREF_MEDIAN=3.16228e-18,1.99522e-05;GQ_MEDIAN=99,47;AB_MEDIAN=4.21053e-01,1.73913e-01;AS_RF=7.14240e-01,1.47824e-02;AS_FilterStatus=PASS,RF;AS_RF_NEGATIVE_TRAIN=1;CSQ=A|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|||||||||||||||||||||||||||||||||||||||||||||,C|intergenic_variant|MODIFIER||||||||||||||||2||||SNV||||||||||||||||||||||||||||||||||||||||||||| -chr22 15927755 . 
T G 296.53 NON_PASS AC=2;AF=7.06764e-05;AN=28298;BaseQRankSum=-4.54000e-01;ClippingRankSum=-7.13000e-01;DP=415781;FS=1.91800e+00;InbreedingCoeff=-2.80000e-03;MQ=3.76000e+01;MQRankSum=9.30000e-02;QD=6.18000e+00;ReadPosRankSum=6.51000e-01;SOR=1.19500e+00;VQSLOD=-2.46100e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|2;DP_HIST_ALT=0|0|0|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=77|197|151|386|424|329|796|995|663|1392|1445|753|2211|438|1213|415|1061|122|696|1718;DP_HIST_ALL=325|927|2029|3382|3793|2212|1494|1006|220|51|28|9|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=2;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=1;AC_Female=1;AN_AFR=8322;AN_AMR=748;AN_ASJ=226;AN_EAS=1598;AN_FIN=2966;AN_NFE=13580;AN_OTH=858;AN_Male=15596;AN_Female=12702;AF_AFR=2.40327e-04;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=6.41190e-05;AF_Female=7.87278e-05;GC_AFR=4159,2,0;GC_AMR=374,0,0;GC_ASJ=113,0,0;GC_EAS=799,0,0;GC_FIN=1483,0,0;GC_NFE=6790,0,0;GC_OTH=429,0,0;GC_Male=7797,1,0;GC_Female=6350,1,0;AC_raw=2;AN_raw=30964;AF_raw=6.45911e-05;GC_raw=15480,2,0;GC=14147,2,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=AFR;AC_POPMAX=2;AN_POPMAX=8322;AF_POPMAX=2.40327e-04;DP_MEDIAN=24;DREF_MEDIAN=1.25594e-15;GQ_MEDIAN=99;AB_MEDIAN=3.57391e-01;AS_RF=5.53853e-01;AS_FilterStatus=PASS;CSQ=G|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1||||||||||||||||||||||||||||||||||||||||||||
-chr22 15927745 . A C 718.20 PASS AC=2;AF=6.93049e-05;AN=28858;BaseQRankSum=-1.40100e+00;ClippingRankSum=3.61000e-01;DP=440172;FS=3.22100e+00;InbreedingCoeff=-1.50000e-03;MQ=3.87000e+01;MQRankSum=-7.30000e-02;QD=7.48000e+00;ReadPosRankSum=0.00000e+00;SOR=1.09300e+00;VQSLOD=-2.17400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|1|2;DP_HIST_ALT=0|1|0|0|0|2|1|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|1|0|0|0|0|1|0|1|1|0|0|0|0|0|0|0|0;GQ_HIST_ALL=30|139|117|300|352|256|578|857|572|1259|1323|742|2332|396|1303|408|1234|111|867|2309;DP_HIST_ALL=235|803|1714|3101|3801|2363|1856|1262|252|55|28|9|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|0|1|0|0|0|0|1|0|1|1|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=2;AC_OTH=0;AC_Male=1;AC_Female=1;AN_AFR=8454;AN_AMR=782;AN_ASJ=232;AN_EAS=1606;AN_FIN=3132;AN_NFE=13774;AN_OTH=878;AN_Male=15900;AN_Female=12958;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=1.45201e-04;AF_OTH=0.00000e+00;AF_Male=6.28931e-05;AF_Female=7.71724e-05;GC_AFR=4227,0,0;GC_AMR=391,0,0;GC_ASJ=116,0,0;GC_EAS=803,0,0;GC_FIN=1566,0,0;GC_NFE=6885,2,0;GC_OTH=439,0,0;GC_Male=7949,1,0;GC_Female=6478,1,0;AC_raw=4;AN_raw=30970;AF_raw=1.29157e-04;GC_raw=15481,4,0;GC=14427,2,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=NFE;AC_POPMAX=2;AN_POPMAX=13774;AF_POPMAX=1.45201e-04;DP_MEDIAN=27;DREF_MEDIAN=2.50594e-10;GQ_MEDIAN=96;AB_MEDIAN=4.72222e-01;AS_RF=5.74840e-01;AS_FilterStatus=PASS;CSQ=C|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1||||||||||||||||||||||||||||||||||||||||||||
diff --git a/perl/t/tracks/vcf/raw/gnomad.genomes.scrambled/test_split_part1.vcf b/perl/t/tracks/vcf/raw/gnomad.genomes.scrambled/test_split_part1.vcf
deleted file mode 100644
index d6c15ef52..000000000
--- a/perl/t/tracks/vcf/raw/gnomad.genomes.scrambled/test_split_part1.vcf
+++ /dev/null
@@ -1,198 +0,0 @@
-##fileformat=VCFv4.2
[... ~190 deleted VCF meta-information lines (##FORMAT, ##FILTER, ##INFO, ##contig) omitted: their angle-bracket bodies were lost during text extraction ...]
-##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta
-#CHROM POS ID REF ALT QUAL FILTER INFO
-chr22 15927834 rs199856444 G C 1458410.68 PASS
AC=5232;AF=2.00721e-01;AN=26066;BaseQRankSum=9.10000e-02;ClippingRankSum=0.00000e+00;DB;DP=379518;FS=1.42950e+01;InbreedingCoeff=-1.97800e-01;MQ=3.43800e+01;MQRankSum=5.53000e-01;QD=9.27000e+00;ReadPosRankSum=5.81000e-01;SOR=1.93600e+00;VQSLOD=-3.58400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=35|58|54|77|62|73|105|121|121|134|123|107|124|140|135|130|112|141|175|4217;DP_HIST_ALT=75|462|1286|1606|1270|749|409|209|87|54|23|10|4|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|96|323|534|762|776|697|765|409|526|278|241|166|133|147|151|77|13|0;GQ_HIST_ALL=449|206|192|333|415|337|693|898|568|1088|1019|567|1363|338|675|319|555|199|455|4792;DP_HIST_ALL=342|1228|2964|3911|3340|1799|1048|584|146|60|23|10|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|96|323|534|762|776|697|765|409|526|278|241|166|133|147|151|77|13|0;AC_AFR=1195;AC_AMR=199;AC_ASJ=48;AC_EAS=462;AC_FIN=539;AC_NFE=2634;AC_OTH=155;AC_Male=2860;AC_Female=2372;AN_AFR=7838;AN_AMR=630;AN_ASJ=216;AN_EAS=1372;AN_FIN=2596;AN_NFE=12638;AN_OTH=776;AN_Male=14358;AN_Female=11708;AF_AFR=1.52462e-01;AF_AMR=3.15873e-01;AF_ASJ=2.22222e-01;AF_EAS=3.36735e-01;AF_FIN=2.07627e-01;AF_NFE=2.08419e-01;AF_OTH=1.99742e-01;AF_Male=1.99192e-01;AF_Female=2.02597e-01;GC_AFR=2733,1177,9;GC_AMR=120,191,4;GC_ASJ=60,48,0;GC_EAS=228,454,4;GC_FIN=767,523,8;GC_NFE=3709,2586,24;GC_OTH=235,151,2;GC_Male=4350,2798,31;GC_Female=3502,2332,20;AC_raw=6394;AN_raw=30922;AF_raw=2.06778e-01;GC_raw=9217,6094,150;GC=7852,5130,51;Hom_AFR=9;Hom_AMR=4;Hom_ASJ=0;Hom_EAS=4;Hom_FIN=8;Hom_NFE=24;Hom_OTH=2;Hom_Male=31;Hom_Female=20;Hom_raw=150;Hom=51;POPMAX=EAS;AC_POPMAX=462;AN_POPMAX=1372;AF_POPMAX=3.36735e-01;DP_MEDIAN=18;DREF_MEDIAN=6.30957e-17;GQ_MEDIAN=99;AB_MEDIAN=3.84615e-01;AS_RF=5.10467e-01;AS_FilterStatus=PASS;CSQ=T|intergenic_variant|MODIFIER|||||||||||||||rs199856444|1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr22 15927834 rsFake G A 1458410.68 PASS 
AC=5232;AF=2.00721e-01;AN=26066;BaseQRankSum=9.10000e-02;ClippingRankSum=0.00000e+00;DB;DP=379518;FS=1.42950e+01;InbreedingCoeff=-1.97800e-01;MQ=3.43800e+01;MQRankSum=5.53000e-01;QD=9.27000e+00;ReadPosRankSum=5.81000e-01;SOR=1.93600e+00;VQSLOD=-3.58400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=35|58|54|77|62|73|105|121|121|134|123|107|124|140|135|130|112|141|175|4217;DP_HIST_ALT=75|462|1286|1606|1270|749|409|209|87|54|23|10|4|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|96|323|534|762|776|697|765|409|526|278|241|166|133|147|151|77|13|0;GQ_HIST_ALL=449|206|192|333|415|337|693|898|568|1088|1019|567|1363|338|675|319|555|199|455|4792;DP_HIST_ALL=342|1228|2964|3911|3340|1799|1048|584|146|60|23|10|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|96|323|534|762|776|697|765|409|526|278|241|166|133|147|151|77|13|0;AC_AFR=1195;AC_AMR=199;AC_ASJ=48;AC_EAS=462;AC_FIN=539;AC_NFE=2634;AC_OTH=155;AC_Male=2860;AC_Female=2372;AN_AFR=7838;AN_AMR=630;AN_ASJ=216;AN_EAS=1372;AN_FIN=2596;AN_NFE=12638;AN_OTH=776;AN_Male=14358;AN_Female=11708;AF_AFR=1.52462e-01;AF_AMR=3.15873e-01;AF_ASJ=2.22222e-01;AF_EAS=3.36735e-01;AF_FIN=2.07627e-01;AF_NFE=2.08419e-01;AF_OTH=1.99742e-01;AF_Male=1.99192e-01;AF_Female=2.02597e-01;GC_AFR=2733,1177,9;GC_AMR=120,191,4;GC_ASJ=60,48,0;GC_EAS=228,454,4;GC_FIN=767,523,8;GC_NFE=3709,2586,24;GC_OTH=235,151,2;GC_Male=4350,2798,31;GC_Female=3502,2332,20;AC_raw=6394;AN_raw=30922;AF_raw=2.06778e-01;GC_raw=9217,6094,150;GC=7852,5130,51;Hom_AFR=9;Hom_AMR=4;Hom_ASJ=0;Hom_EAS=4;Hom_FIN=8;Hom_NFE=24;Hom_OTH=2;Hom_Male=31;Hom_Female=20;Hom_raw=150;Hom=51;POPMAX=EAS;AC_POPMAX=462;AN_POPMAX=1372;AF_POPMAX=3.36735e-01;DP_MEDIAN=18;DREF_MEDIAN=6.30957e-17;GQ_MEDIAN=99;AB_MEDIAN=3.84615e-01;AS_RF=5.10467e-01;AS_FilterStatus=PASS;CSQ=T|intergenic_variant|MODIFIER|||||||||||||||rs199856444|1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chrX_random 130000000 . 
T A,C 692.49 PASS AC=1,7;AF=3.55290e-05,2.48703e-04;AN=28146;BaseQRankSum=-1.66300e+00;ClippingRankSum=-6.10000e-02;DP=405801;FS=2.74010e+01;InbreedingCoeff=2.60000e-03;MQ=3.41700e+01;MQRankSum=-1.61300e+00;QD=1.10000e+00;ReadPosRankSum=-3.22000e-01;SOR=6.70000e-02;VQSLOD=-6.27400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1,1|1|1|3|1|0|0|0|2|3|1|1|0|1|1|1|0|0|0|2;DP_HIST_ALT=0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|0|4|5|2|5|1|2|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0,0|0|5|7|2|3|2|0|0|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=75|210|161|374|461|353|797|1090|677|1437|1528|731|2217|413|1197|417|966|122|653|1602;DP_HIST_ALL=347|986|2229|3523|3708|2138|1370|889|199|50|27|9|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|5|7|2|3|2|0|1|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=0,1;AC_AMR=1,0;AC_ASJ=0,1;AC_EAS=0,0;AC_FIN=0,3;AC_NFE=0,1;AC_OTH=0,1;AC_Male=0,3;AC_Female=1,4;AN_AFR=8282;AN_AMR=744;AN_ASJ=226;AN_EAS=1590;AN_FIN=2918;AN_NFE=13538;AN_OTH=848;AN_Male=15510;AN_Female=12636;AF_AFR=0.00000e+00,1.20744e-04;AF_AMR=1.34409e-03,0.00000e+00;AF_ASJ=0.00000e+00,4.42478e-03;AF_EAS=0.00000e+00,0.00000e+00;AF_FIN=0.00000e+00,1.02810e-03;AF_NFE=0.00000e+00,7.38662e-05;AF_OTH=0.00000e+00,1.17925e-03;AF_Male=0.00000e+00,1.93424e-04;AF_Female=7.91390e-05,3.16556e-04;GC_AFR=4140,0,0,1,0,0;GC_AMR=371,1,0,0,0,0;GC_ASJ=112,0,0,1,0,0;GC_EAS=795,0,0,0,0,0;GC_FIN=1456,0,0,3,0,0;GC_NFE=6768,0,0,1,0,0;GC_OTH=423,0,0,1,0,0;GC_Male=7752,0,0,3,0,0;GC_Female=6313,1,0,4,0,0;AC_raw=1,19;AN_raw=30962;AF_raw=3.22977e-05,6.13655e-04;GC_raw=15461,1,0,19,0,0;GC=14065,1,0,7,0,0;Hom_AFR=0,0;Hom_AMR=0,0;Hom_ASJ=0,0;Hom_EAS=0,0;Hom_FIN=0,0;Hom_NFE=0,0;Hom_OTH=0,0;Hom_Male=0,0;Hom_Female=0,0;Hom_raw=0,0;Hom=0,0;POPMAX=AMR,ASJ;AC_POPMAX=1,1;AN_POPMAX=744,226;AF_POPMAX=1.34409e-03,4.42478e-03;DP_MEDIAN=19,22;DREF_MEDIAN=3.16228e-18,1.99522e-05;GQ_MEDIAN=99,47;AB_MEDIAN=4.21053e-01,1.73913e-01;AS_RF=7.14240e-01,1.47824e-02;AS_FilterStatus=PASS,RF;AS_RF_NEGATIVE_TRAIN=1;CSQ=A|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|||||||||||||||||||||||||||||||||||||||||||||,C|intergenic_variant|MODIFIER||||||||||||||||2||||SNV||||||||||||||||||||||||||||||||||||||||||||| -chr22 15927759 . 
T A,C 692.49 PASS AC=1,7;AF=3.55290e-05,2.48703e-04;AN=28146;BaseQRankSum=-1.66300e+00;ClippingRankSum=-6.10000e-02;DP=405801;FS=2.74010e+01;InbreedingCoeff=2.60000e-03;MQ=3.41700e+01;MQRankSum=-1.61300e+00;QD=1.10000e+00;ReadPosRankSum=-3.22000e-01;SOR=6.70000e-02;VQSLOD=-6.27400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1,1|1|1|3|1|0|0|0|2|3|1|1|0|1|1|1|0|0|0|2;DP_HIST_ALT=0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|0|4|5|2|5|1|2|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0,0|0|5|7|2|3|2|0|0|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=75|210|161|374|461|353|797|1090|677|1437|1528|731|2217|413|1197|417|966|122|653|1602;DP_HIST_ALL=347|986|2229|3523|3708|2138|1370|889|199|50|27|9|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|5|7|2|3|2|0|1|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=0,1;AC_AMR=1,0;AC_ASJ=0,1;AC_EAS=0,0;AC_FIN=0,3;AC_NFE=0,1;AC_OTH=0,1;AC_Male=0,3;AC_Female=1,4;AN_AFR=8282;AN_AMR=744;AN_ASJ=226;AN_EAS=1590;AN_FIN=2918;AN_NFE=13538;AN_OTH=848;AN_Male=15510;AN_Female=12636;AF_AFR=0.00000e+00,1.20744e-04;AF_AMR=1.34409e-03,0.00000e+00;AF_ASJ=0.00000e+00,4.42478e-03;AF_EAS=0.00000e+00,0.00000e+00;AF_FIN=0.00000e+00,1.02810e-03;AF_NFE=0.00000e+00,7.38662e-05;AF_OTH=0.00000e+00,1.17925e-03;AF_Male=0.00000e+00,1.93424e-04;AF_Female=7.91390e-05,3.16556e-04;GC_AFR=4140,0,0,1,0,0;GC_AMR=371,1,0,0,0,0;GC_ASJ=112,0,0,1,0,0;GC_EAS=795,0,0,0,0,0;GC_FIN=1456,0,0,3,0,0;GC_NFE=6768,0,0,1,0,0;GC_OTH=423,0,0,1,0,0;GC_Male=7752,0,0,3,0,0;GC_Female=6313,1,0,4,0,0;AC_raw=1,19;AN_raw=30962;AF_raw=3.22977e-05,6.13655e-04;GC_raw=15461,1,0,19,0,0;GC=14065,1,0,7,0,0;Hom_AFR=0,0;Hom_AMR=0,0;Hom_ASJ=0,0;Hom_EAS=0,0;Hom_FIN=0,0;Hom_NFE=0,0;Hom_OTH=0,0;Hom_Male=0,0;Hom_Female=0,0;Hom_raw=0,0;Hom=0,0;POPMAX=AMR,ASJ;AC_POPMAX=1,1;AN_POPMAX=744,226;AF_POPMAX=1.34409e-03,4.42478e-03;DP_MEDIAN=19,22;DREF_MEDIAN=3.16228e-18,1.99522e-05;GQ_MEDIAN=99,47;AB_MEDIAN=4.21053e-01,1.73913e-01;AS_RF=7.14240e-01,1.47824e-02;AS_FilterStatus=PASS,RF;AS_RF_NEGATIVE_TRAIN=1;CSQ=A|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|||||||||||||||||||||||||||||||||||||||||||||,C|intergenic_variant|MODIFIER||||||||||||||||2||||SNV||||||||||||||||||||||||||||||||||||||||||||| -chr1_ALT 131000000 . 
T A,C 692.49 PASS AC=1,7;AF=3.55290e-05,2.48703e-04;AN=28146;BaseQRankSum=-1.66300e+00;ClippingRankSum=-6.10000e-02;DP=405801;FS=2.74010e+01;InbreedingCoeff=2.60000e-03;MQ=3.41700e+01;MQRankSum=-1.61300e+00;QD=1.10000e+00;ReadPosRankSum=-3.22000e-01;SOR=6.70000e-02;VQSLOD=-6.27400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1,1|1|1|3|1|0|0|0|2|3|1|1|0|1|1|1|0|0|0|2;DP_HIST_ALT=0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|0|4|5|2|5|1|2|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0,0|0|5|7|2|3|2|0|0|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=75|210|161|374|461|353|797|1090|677|1437|1528|731|2217|413|1197|417|966|122|653|1602;DP_HIST_ALL=347|986|2229|3523|3708|2138|1370|889|199|50|27|9|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|5|7|2|3|2|0|1|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=0,1;AC_AMR=1,0;AC_ASJ=0,1;AC_EAS=0,0;AC_FIN=0,3;AC_NFE=0,1;AC_OTH=0,1;AC_Male=0,3;AC_Female=1,4;AN_AFR=8282;AN_AMR=744;AN_ASJ=226;AN_EAS=1590;AN_FIN=2918;AN_NFE=13538;AN_OTH=848;AN_Male=15510;AN_Female=12636;AF_AFR=0.00000e+00,1.20744e-04;AF_AMR=1.34409e-03,0.00000e+00;AF_ASJ=0.00000e+00,4.42478e-03;AF_EAS=0.00000e+00,0.00000e+00;AF_FIN=0.00000e+00,1.02810e-03;AF_NFE=0.00000e+00,7.38662e-05;AF_OTH=0.00000e+00,1.17925e-03;AF_Male=0.00000e+00,1.93424e-04;AF_Female=7.91390e-05,3.16556e-04;GC_AFR=4140,0,0,1,0,0;GC_AMR=371,1,0,0,0,0;GC_ASJ=112,0,0,1,0,0;GC_EAS=795,0,0,0,0,0;GC_FIN=1456,0,0,3,0,0;GC_NFE=6768,0,0,1,0,0;GC_OTH=423,0,0,1,0,0;GC_Male=7752,0,0,3,0,0;GC_Female=6313,1,0,4,0,0;AC_raw=1,19;AN_raw=30962;AF_raw=3.22977e-05,6.13655e-04;GC_raw=15461,1,0,19,0,0;GC=14065,1,0,7,0,0;Hom_AFR=0,0;Hom_AMR=0,0;Hom_ASJ=0,0;Hom_EAS=0,0;Hom_FIN=0,0;Hom_NFE=0,0;Hom_OTH=0,0;Hom_Male=0,0;Hom_Female=0,0;Hom_raw=0,0;Hom=0,0;POPMAX=AMR,ASJ;AC_POPMAX=1,1;AN_POPMAX=744,226;AF_POPMAX=1.34409e-03,4.42478e-03;DP_MEDIAN=19,22;DREF_MEDIAN=3.16228e-18,1.99522e-05;GQ_MEDIAN=99,47;AB_MEDIAN=4.21053e-01,1.73913e-01;AS_RF=7.14240e-01,1.47824e-02;AS_FilterStatus=PASS,RF;AS_RF_NEGATIVE_TRAIN=1;CSQ=A|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|||||||||||||||||||||||||||||||||||||||||||||,C|intergenic_variant|MODIFIER||||||||||||||||2||||SNV||||||||||||||||||||||||||||||||||||||||||||| -chr22 15927755 . 
T G 296.53 NON_PASS AC=2;AF=7.06764e-05;AN=28298;BaseQRankSum=-4.54000e-01;ClippingRankSum=-7.13000e-01;DP=415781;FS=1.91800e+00;InbreedingCoeff=-2.80000e-03;MQ=3.76000e+01;MQRankSum=9.30000e-02;QD=6.18000e+00;ReadPosRankSum=6.51000e-01;SOR=1.19500e+00;VQSLOD=-2.46100e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|2;DP_HIST_ALT=0|0|0|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=77|197|151|386|424|329|796|995|663|1392|1445|753|2211|438|1213|415|1061|122|696|1718;DP_HIST_ALL=325|927|2029|3382|3793|2212|1494|1006|220|51|28|9|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=2;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=1;AC_Female=1;AN_AFR=8322;AN_AMR=748;AN_ASJ=226;AN_EAS=1598;AN_FIN=2966;AN_NFE=13580;AN_OTH=858;AN_Male=15596;AN_Female=12702;AF_AFR=2.40327e-04;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=6.41190e-05;AF_Female=7.87278e-05;GC_AFR=4159,2,0;GC_AMR=374,0,0;GC_ASJ=113,0,0;GC_EAS=799,0,0;GC_FIN=1483,0,0;GC_NFE=6790,0,0;GC_OTH=429,0,0;GC_Male=7797,1,0;GC_Female=6350,1,0;AC_raw=2;AN_raw=30964;AF_raw=6.45911e-05;GC_raw=15480,2,0;GC=14147,2,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=AFR;AC_POPMAX=2;AN_POPMAX=8322;AF_POPMAX=2.40327e-04;DP_MEDIAN=24;DREF_MEDIAN=1.25594e-15;GQ_MEDIAN=99;AB_MEDIAN=3.57391e-01;AS_RF=5.53853e-01;AS_FilterStatus=PASS;CSQ=G|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr22 15927745 . A C 718.20 PASS AC=2;AF=6.93049e-05;AN=28858;BaseQRankSum=-1.40100e+00;ClippingRankSum=3.61000e-01;DP=440172;FS=3.22100e+00;InbreedingCoeff=-1.50000e-03;MQ=3.87000e+01;MQRankSum=-7.30000e-02;QD=7.48000e+00;ReadPosRankSum=0.00000e+00;SOR=1.09300e+00;VQSLOD=-2.17400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|1|2;DP_HIST_ALT=0|1|0|0|0|2|1|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|1|0|0|0|0|1|0|1|1|0|0|0|0|0|0|0|0;GQ_HIST_ALL=30|139|117|300|352|256|578|857|572|1259|1323|742|2332|396|1303|408|1234|111|867|2309;DP_HIST_ALL=235|803|1714|3101|3801|2363|1856|1262|252|55|28|9|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|0|1|0|0|0|0|1|0|1|1|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=2;AC_OTH=0;AC_Male=1;AC_Female=1;AN_AFR=8454;AN_AMR=782;AN_ASJ=232;AN_EAS=1606;AN_FIN=3132;AN_NFE=13774;AN_OTH=878;AN_Male=15900;AN_Female=12958;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=1.45201e-04;AF_OTH=0.00000e+00;AF_Male=6.28931e-05;AF_Female=7.71724e-05;GC_AFR=4227,0,0;GC_AMR=391,0,0;GC_ASJ=116,0,0;GC_EAS=803,0,0;GC_FIN=1566,0,0;GC_NFE=6885,2,0;GC_OTH=439,0,0;GC_Male=7949,1,0;GC_Female=6478,1,0;AC_raw=4;AN_raw=30970;AF_raw=1.29157e-04;GC_raw=15481,4,0;GC=14427,2,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=NFE;AC_POPMAX=2;AN_POPMAX=13774;AF_POPMAX=1.45201e-04;DP_MEDIAN=27;DREF_MEDIAN=2.50594e-10;GQ_MEDIAN=96;AB_MEDIAN=4.72222e-01;AS_RF=5.74840e-01;AS_FilterStatus=PASS;CSQ=C|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| \ No newline at end of file diff --git a/perl/t/tracks/vcf/raw/gnomad.genomes.scrambled/test_split_part2.vcf.gz b/perl/t/tracks/vcf/raw/gnomad.genomes.scrambled/test_split_part2.vcf.gz deleted file mode 100644 index 
060c4fe17d756b24c2d8e0df72c334ea09af30c1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7257 (base85-encoded gzip payload abridged; its closing lines follow)
zGa+DL;E{3V3*A(|uWw!q&&uf`aOFGF+6AXFpG=Q^L&Rg)l> z^Sa>KJ;XyE%96U^VLW|tP5N0yBhCJjh=?B0>h2-!AQ^aMv>#}|BW>^~lutrU>Em~L zEXz1S=^rYo1`qL7{Q8s)9to?H<-kKox{GJPqY4#`!Um6@J*73^;Uf1~R)F(JC}zMz z3*b>q6~P0Ef(;&KzZsT3mc{zbBrXd%osV^T7=ee8-w{0d=dw0_ diff --git a/perl/t/tracks/vcf/raw/gnomad.genomes/test.vcf b/perl/t/tracks/vcf/raw/gnomad.genomes/test.vcf deleted file mode 100644 index aff5d7fca..000000000 --- a/perl/t/tracks/vcf/raw/gnomad.genomes/test.vcf +++ /dev/null @@ -1,200 +0,0 @@ -##fileformat=VCFv4.2 -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FILTER= -##FILTER== 20, DP >= 10, AB => 0.2 for het calls))"> -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta -#CHROM POS ID REF ALT QUAL FILTER INFO -chr22 15927888 . 
C CACA,G 1164.92 PASS AC=3,0;AF=1.81378e-04,0.00000e+00;AN=16540;BaseQRankSum=-2.24000e-01;ClippingRankSum=-2.63000e-01;DP=230703;FS=8.86700e+00;InbreedingCoeff=5.80000e-03;MQ=3.46800e+01;MQRankSum=1.43000e-01;QD=1.01300e+01;ReadPosRankSum=-2.13000e-01;SOR=3.30000e-02;VQSLOD=-1.37800e+00;VQSR_culprit=SOR;VQSR_NEGATIVE_TRAIN_SITE;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|1|0|0|4,0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0;DP_HIST_ALT=0|3|2|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|2|0|1|1|1|0|0|0|0|1|0|0|0|0,0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0;GQ_HIST_ALL=104|464|608|1985|2741|1441|2526|1859|721|1069|690|260|538|62|159|47|92|8|46|68;DP_HIST_ALL=1018|6123|5177|2097|791|161|97|24|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|2|0|1|1|1|0|1|0|0|1|0|0|0|0;AC_AFR=0,0;AC_AMR=0,0;AC_ASJ=0,0;AC_EAS=0,0;AC_FIN=0,0;AC_NFE=3,0;AC_OTH=0,0;AC_Male=3,0;AC_Female=0,0;AN_AFR=3614;AN_AMR=528;AN_ASJ=180;AN_EAS=892;AN_FIN=2256;AN_NFE=8466;AN_OTH=604;AN_Male=9308;AN_Female=7232;AF_AFR=0.00000e+00,0.00000e+00;AF_AMR=0.00000e+00,0.00000e+00;AF_ASJ=0.00000e+00,0.00000e+00;AF_EAS=0.00000e+00,0.00000e+00;AF_FIN=0.00000e+00,0.00000e+00;AF_NFE=3.54359e-04,0.00000e+00;AF_OTH=0.00000e+00,0.00000e+00;AF_Male=3.22303e-04,0.00000e+00;AF_Female=0.00000e+00,0.00000e+00;GC_AFR=1807,0,0,0,0,0;GC_AMR=264,0,0,0,0,0;GC_ASJ=90,0,0,0,0,0;GC_EAS=446,0,0,0,0,0;GC_FIN=1128,0,0,0,0,0;GC_NFE=4230,3,0,0,0,0;GC_OTH=302,0,0,0,0,0;GC_Male=4651,3,0,0,0,0;GC_Female=3616,0,0,0,0,0;AC_raw=6,1;AN_raw=30976;AF_raw=1.93698e-04,3.22831e-05;GC_raw=15481,6,0,1,0,0;GC=8267,3,0,0,0,0;Hom_AFR=0,0;Hom_AMR=0,0;Hom_ASJ=0,0;Hom_EAS=0,0;Hom_FIN=0,0;Hom_NFE=0,0;Hom_OTH=0,0;Hom_Male=0,0;Hom_Female=0,0;Hom_raw=0,0;Hom=0,0;POPMAX=NFE,.;AC_POPMAX=3,.;AN_POPMAX=8466,.;AF_POPMAX=3.54359e-04,.;DP_MEDIAN=10,8;DREF_MEDIAN=3.15558e-20,2.51189e-13;GQ_MEDIAN=99,72;AB_MEDIAN=4.41558e-01,6.25000e-01;AS_RF=5.16800e-01,3.29197e-01;AS_FilterStatus=PASS,RF|AC0;CSQ=C|intergenic_variant|MODIFIER||||||||||||||||2||||insertion|1|||||||||||||||||||||||||||||||||||||||||||| -chr22 15927876 . 
G A 108.57 PASS AC=1;AF=5.09632e-05;AN=19622;BaseQRankSum=-7.51000e-01;ClippingRankSum=-1.43000e-01;DP=255559;FS=0.00000e+00;InbreedingCoeff=-4.70000e-03;MQ=3.40100e+01;MQRankSum=-3.32000e-01;QD=9.87000e+00;ReadPosRankSum=6.60000e-02;SOR=9.90000e-02;VQSLOD=-3.45100e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1;DP_HIST_ALT=0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=90|295|443|1443|2398|1347|2546|2110|845|1318|842|356|727|79|237|59|159|7|90|99;DP_HIST_ALL=626|4935|5582|2741|1103|289|169|45|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=1;AC_OTH=0;AC_Male=1;AC_Female=0;AN_AFR=4696;AN_AMR=588;AN_ASJ=206;AN_EAS=1092;AN_FIN=2392;AN_NFE=9986;AN_OTH=662;AN_Male=10956;AN_Female=8666;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=1.00140e-04;AF_OTH=0.00000e+00;AF_Male=9.12742e-05;AF_Female=0.00000e+00;GC_AFR=2348,0,0;GC_AMR=294,0,0;GC_ASJ=103,0,0;GC_EAS=546,0,0;GC_FIN=1196,0,0;GC_NFE=4992,1,0;GC_OTH=331,0,0;GC_Male=5477,1,0;GC_Female=4333,0,0;AC_raw=1;AN_raw=30980;AF_raw=3.22789e-05;GC_raw=15489,1,0;GC=9810,1,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=NFE;AC_POPMAX=1;AN_POPMAX=9986;AF_POPMAX=1.00140e-04;DP_MEDIAN=11;DREF_MEDIAN=7.94328e-18;GQ_MEDIAN=99;AB_MEDIAN=5.45455e-01;AS_RF=4.14273e-01;AS_FilterStatus=PASS;CSQ=A|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr22 15927837 . A C 215.56 PASS AC=1;AF=4.23837e-05;AN=23594;BaseQRankSum=1.45000e+00;ClippingRankSum=9.70000e-02;DP=287254;FS=0.00000e+00;InbreedingCoeff=-3.60000e-03;MQ=3.28600e+01;MQRankSum=2.26000e-01;QD=9.37000e+00;ReadPosRankSum=-4.20000e-01;SOR=2.75000e-01;VQSLOD=-3.70600e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1;DP_HIST_ALT=0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=74|217|284|953|1552|1124|2262|2199|980|1697|1212|502|1089|163|422|118|256|20|147|222;DP_HIST_ALL=374|3170|5440|3653|1801|617|309|107|16|5|1|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=1;AC_Male=1;AC_Female=0;AN_AFR=6398;AN_AMR=642;AN_ASJ=220;AN_EAS=1378;AN_FIN=2270;AN_NFE=11958;AN_OTH=728;AN_Male=13148;AN_Female=10446;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=1.37363e-03;AF_Male=7.60572e-05;AF_Female=0.00000e+00;GC_AFR=3199,0,0;GC_AMR=321,0,0;GC_ASJ=110,0,0;GC_EAS=689,0,0;GC_FIN=1135,0,0;GC_NFE=5979,0,0;GC_OTH=363,1,0;GC_Male=6573,1,0;GC_Female=5223,0,0;AC_raw=1;AN_raw=30986;AF_raw=3.22726e-05;GC_raw=15492,1,0;GC=11796,1,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=.;AC_POPMAX=.;AN_POPMAX=.;AF_POPMAX=.;DP_MEDIAN=23;DREF_MEDIAN=1.58489e-28;GQ_MEDIAN=99;AB_MEDIAN=3.47826e-01;AS_RF=7.79080e-01;AS_FilterStatus=PASS;CSQ=C|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr22 15927835 . 
C T 149.65 PASS AC=1;AF=4.22297e-05;AN=23680;BaseQRankSum=1.22000e+00;ClippingRankSum=-1.07000e-01;DP=287265;FS=0.00000e+00;InbreedingCoeff=-3.20000e-03;MQ=3.78300e+01;MQRankSum=-1.07000e-01;QD=1.06900e+01;ReadPosRankSum=9.67000e-01;SOR=3.50000e-01;VQSLOD=-2.44000e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1;DP_HIST_ALT=0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0;GQ_HIST_ALL=64|220|294|926|1565|1084|2265|2217|996|1671|1225|488|1101|152|440|148|259|14|143|221;DP_HIST_ALL=374|3167|5435|3621|1823|620|325|106|16|5|1|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=1;AC_OTH=0;AC_Male=1;AC_Female=0;AN_AFR=6446;AN_AMR=636;AN_ASJ=220;AN_EAS=1394;AN_FIN=2274;AN_NFE=11990;AN_OTH=720;AN_Male=13182;AN_Female=10498;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=8.34028e-05;AF_OTH=0.00000e+00;AF_Male=7.58610e-05;AF_Female=0.00000e+00;GC_AFR=3223,0,0;GC_AMR=318,0,0;GC_ASJ=110,0,0;GC_EAS=697,0,0;GC_FIN=1137,0,0;GC_NFE=5994,1,0;GC_OTH=360,0,0;GC_Male=6590,1,0;GC_Female=5249,0,0;AC_raw=1;AN_raw=30986;AF_raw=3.22726e-05;GC_raw=15492,1,0;GC=11839,1,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=NFE;AC_POPMAX=1;AN_POPMAX=11990;AF_POPMAX=8.34028e-05;DP_MEDIAN=14;DREF_MEDIAN=6.30957e-22;GQ_MEDIAN=99;AB_MEDIAN=5.71429e-01;AS_RF=6.44340e-01;AS_FilterStatus=PASS;CSQ=T|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr22 15927834 . G T 183.64 PASS AC=1;AF=4.21905e-05;AN=23702;BaseQRankSum=2.20000e+00;ClippingRankSum=-6.08000e-01;DP=287202;FS=0.00000e+00;InbreedingCoeff=-3.00000e-03;MQ=3.51700e+01;MQRankSum=2.36000e+00;QD=9.18000e+00;ReadPosRankSum=1.14000e+00;SOR=8.60000e-02;VQSLOD=-3.66200e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1;DP_HIST_ALT=0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=54|216|270|931|1532|1084|2270|2199|971|1682|1284|459|1155|157|434|133|260|21|158|223;DP_HIST_ALL=380|3164|5416|3618|1841|618|328|106|16|5|1|0|0|0|0|0|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0;AC_AFR=1;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=1;AC_Female=0;AN_AFR=6452;AN_AMR=638;AN_ASJ=222;AN_EAS=1398;AN_FIN=2270;AN_NFE=12010;AN_OTH=712;AN_Male=13204;AN_Female=10498;AF_AFR=1.54991e-04;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=7.57346e-05;AF_Female=0.00000e+00;GC_AFR=3225,1,0;GC_AMR=319,0,0;GC_ASJ=111,0,0;GC_EAS=699,0,0;GC_FIN=1135,0,0;GC_NFE=6005,0,0;GC_OTH=356,0,0;GC_Male=6601,1,0;GC_Female=5249,0,0;AC_raw=1;AN_raw=30986;AF_raw=3.22726e-05;GC_raw=15492,1,0;GC=11850,1,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=AFR;AC_POPMAX=1;AN_POPMAX=6452;AF_POPMAX=1.54991e-04;DP_MEDIAN=20;DREF_MEDIAN=2.51189e-25;GQ_MEDIAN=99;AB_MEDIAN=4.50000e-01;AS_RF=8.93688e-01;AS_FilterStatus=PASS;CSQ=T|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr22 15927834 rs199856444 G C 1458410.68 PASS 
AC=5232;AF=2.00721e-01;AN=26066;BaseQRankSum=9.10000e-02;ClippingRankSum=0.00000e+00;DB;DP=379518;FS=1.42950e+01;InbreedingCoeff=-1.97800e-01;MQ=3.43800e+01;MQRankSum=5.53000e-01;QD=9.27000e+00;ReadPosRankSum=5.81000e-01;SOR=1.93600e+00;VQSLOD=-3.58400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=35|58|54|77|62|73|105|121|121|134|123|107|124|140|135|130|112|141|175|4217;DP_HIST_ALT=75|462|1286|1606|1270|749|409|209|87|54|23|10|4|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|96|323|534|762|776|697|765|409|526|278|241|166|133|147|151|77|13|0;GQ_HIST_ALL=449|206|192|333|415|337|693|898|568|1088|1019|567|1363|338|675|319|555|199|455|4792;DP_HIST_ALL=342|1228|2964|3911|3340|1799|1048|584|146|60|23|10|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|96|323|534|762|776|697|765|409|526|278|241|166|133|147|151|77|13|0;AC_AFR=1195;AC_AMR=199;AC_ASJ=48;AC_EAS=462;AC_FIN=539;AC_NFE=2634;AC_OTH=155;AC_Male=2860;AC_Female=2372;AN_AFR=7838;AN_AMR=630;AN_ASJ=216;AN_EAS=1372;AN_FIN=2596;AN_NFE=12638;AN_OTH=776;AN_Male=14358;AN_Female=11708;AF_AFR=1.52462e-01;AF_AMR=3.15873e-01;AF_ASJ=2.22222e-01;AF_EAS=3.36735e-01;AF_FIN=2.07627e-01;AF_NFE=2.08419e-01;AF_OTH=1.99742e-01;AF_Male=1.99192e-01;AF_Female=2.02597e-01;GC_AFR=2733,1177,9;GC_AMR=120,191,4;GC_ASJ=60,48,0;GC_EAS=228,454,4;GC_FIN=767,523,8;GC_NFE=3709,2586,24;GC_OTH=235,151,2;GC_Male=4350,2798,31;GC_Female=3502,2332,20;AC_raw=6394;AN_raw=30922;AF_raw=2.06778e-01;GC_raw=9217,6094,150;GC=7852,5130,51;Hom_AFR=9;Hom_AMR=4;Hom_ASJ=0;Hom_EAS=4;Hom_FIN=8;Hom_NFE=24;Hom_OTH=2;Hom_Male=31;Hom_Female=20;Hom_raw=150;Hom=51;POPMAX=EAS;AC_POPMAX=462;AN_POPMAX=1372;AF_POPMAX=3.36735e-01;DP_MEDIAN=18;DREF_MEDIAN=6.30957e-17;GQ_MEDIAN=99;AB_MEDIAN=3.84615e-01;AS_RF=5.10467e-01;AS_FilterStatus=PASS;CSQ=T|intergenic_variant|MODIFIER|||||||||||||||rs199856444|1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr22 15927759 . 
T A,C 692.49 PASS AC=1,7;AF=3.55290e-05,2.48703e-04;AN=28146;BaseQRankSum=-1.66300e+00;ClippingRankSum=-6.10000e-02;DP=405801;FS=2.74010e+01;InbreedingCoeff=2.60000e-03;MQ=3.41700e+01;MQRankSum=-1.61300e+00;QD=1.10000e+00;ReadPosRankSum=-3.22000e-01;SOR=6.70000e-02;VQSLOD=-6.27400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1,1|1|1|3|1|0|0|0|2|3|1|1|0|1|1|1|0|0|0|2;DP_HIST_ALT=0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|0|4|5|2|5|1|2|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0,0|0|5|7|2|3|2|0|0|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=75|210|161|374|461|353|797|1090|677|1437|1528|731|2217|413|1197|417|966|122|653|1602;DP_HIST_ALL=347|986|2229|3523|3708|2138|1370|889|199|50|27|9|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|5|7|2|3|2|0|1|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=0,1;AC_AMR=1,0;AC_ASJ=0,1;AC_EAS=0,0;AC_FIN=0,3;AC_NFE=0,1;AC_OTH=0,1;AC_Male=0,3;AC_Female=1,4;AN_AFR=8282;AN_AMR=744;AN_ASJ=226;AN_EAS=1590;AN_FIN=2918;AN_NFE=13538;AN_OTH=848;AN_Male=15510;AN_Female=12636;AF_AFR=0.00000e+00,1.20744e-04;AF_AMR=1.34409e-03,0.00000e+00;AF_ASJ=0.00000e+00,4.42478e-03;AF_EAS=0.00000e+00,0.00000e+00;AF_FIN=0.00000e+00,1.02810e-03;AF_NFE=0.00000e+00,7.38662e-05;AF_OTH=0.00000e+00,1.17925e-03;AF_Male=0.00000e+00,1.93424e-04;AF_Female=7.91390e-05,3.16556e-04;GC_AFR=4140,0,0,1,0,0;GC_AMR=371,1,0,0,0,0;GC_ASJ=112,0,0,1,0,0;GC_EAS=795,0,0,0,0,0;GC_FIN=1456,0,0,3,0,0;GC_NFE=6768,0,0,1,0,0;GC_OTH=423,0,0,1,0,0;GC_Male=7752,0,0,3,0,0;GC_Female=6313,1,0,4,0,0;AC_raw=1,19;AN_raw=30962;AF_raw=3.22977e-05,6.13655e-04;GC_raw=15461,1,0,19,0,0;GC=14065,1,0,7,0,0;Hom_AFR=0,0;Hom_AMR=0,0;Hom_ASJ=0,0;Hom_EAS=0,0;Hom_FIN=0,0;Hom_NFE=0,0;Hom_OTH=0,0;Hom_Male=0,0;Hom_Female=0,0;Hom_raw=0,0;Hom=0,0;POPMAX=AMR,ASJ;AC_POPMAX=1,1;AN_POPMAX=744,226;AF_POPMAX=1.34409e-03,4.42478e-03;DP_MEDIAN=19,22;DREF_MEDIAN=3.16228e-18,1.99522e-05;GQ_MEDIAN=99,47;AB_MEDIAN=4.21053e-01,1.73913e-01;AS_RF=7.14240e-01,1.47824e-02;AS_FilterStatus=PASS,RF;AS_RF_NEGATIVE_TRAIN=1;CSQ=A|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|||||||||||||||||||||||||||||||||||||||||||||,C|intergenic_variant|MODIFIER||||||||||||||||2||||SNV||||||||||||||||||||||||||||||||||||||||||||| -chr22 15927755 . 
T G 296.53 NON_PASS AC=2;AF=7.06764e-05;AN=28298;BaseQRankSum=-4.54000e-01;ClippingRankSum=-7.13000e-01;DP=415781;FS=1.91800e+00;InbreedingCoeff=-2.80000e-03;MQ=3.76000e+01;MQRankSum=9.30000e-02;QD=6.18000e+00;ReadPosRankSum=6.51000e-01;SOR=1.19500e+00;VQSLOD=-2.46100e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|2;DP_HIST_ALT=0|0|0|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0|0;GQ_HIST_ALL=77|197|151|386|424|329|796|995|663|1392|1445|753|2211|438|1213|415|1061|122|696|1718;DP_HIST_ALL=325|927|2029|3382|3793|2212|1494|1006|220|51|28|9|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0|0;AC_AFR=2;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=0;AC_OTH=0;AC_Male=1;AC_Female=1;AN_AFR=8322;AN_AMR=748;AN_ASJ=226;AN_EAS=1598;AN_FIN=2966;AN_NFE=13580;AN_OTH=858;AN_Male=15596;AN_Female=12702;AF_AFR=2.40327e-04;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=0.00000e+00;AF_OTH=0.00000e+00;AF_Male=6.41190e-05;AF_Female=7.87278e-05;GC_AFR=4159,2,0;GC_AMR=374,0,0;GC_ASJ=113,0,0;GC_EAS=799,0,0;GC_FIN=1483,0,0;GC_NFE=6790,0,0;GC_OTH=429,0,0;GC_Male=7797,1,0;GC_Female=6350,1,0;AC_raw=2;AN_raw=30964;AF_raw=6.45911e-05;GC_raw=15480,2,0;GC=14147,2,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=AFR;AC_POPMAX=2;AN_POPMAX=8322;AF_POPMAX=2.40327e-04;DP_MEDIAN=24;DREF_MEDIAN=1.25594e-15;GQ_MEDIAN=99;AB_MEDIAN=3.57391e-01;AS_RF=5.53853e-01;AS_FilterStatus=PASS;CSQ=G|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| -chr22 15927745 . A C 718.20 PASS AC=2;AF=6.93049e-05;AN=28858;BaseQRankSum=-1.40100e+00;ClippingRankSum=3.61000e-01;DP=440172;FS=3.22100e+00;InbreedingCoeff=-1.50000e-03;MQ=3.87000e+01;MQRankSum=-7.30000e-02;QD=7.48000e+00;ReadPosRankSum=0.00000e+00;SOR=1.09300e+00;VQSLOD=-2.17400e+01;VQSR_culprit=MQ;GQ_HIST_ALT=0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|1|2;DP_HIST_ALT=0|1|0|0|0|2|1|0|0|0|0|0|0|0|0|0|0|0|0|0;AB_HIST_ALT=0|0|0|1|0|0|0|0|1|0|1|1|0|0|0|0|0|0|0|0;GQ_HIST_ALL=30|139|117|300|352|256|578|857|572|1259|1323|742|2332|396|1303|408|1234|111|867|2309;DP_HIST_ALL=235|803|1714|3101|3801|2363|1856|1262|252|55|28|9|4|1|0|1|0|0|0|0;AB_HIST_ALL=0|0|0|1|0|0|0|0|1|0|1|1|0|0|0|0|0|0|0|0;AC_AFR=0;AC_AMR=0;AC_ASJ=0;AC_EAS=0;AC_FIN=0;AC_NFE=2;AC_OTH=0;AC_Male=1;AC_Female=1;AN_AFR=8454;AN_AMR=782;AN_ASJ=232;AN_EAS=1606;AN_FIN=3132;AN_NFE=13774;AN_OTH=878;AN_Male=15900;AN_Female=12958;AF_AFR=0.00000e+00;AF_AMR=0.00000e+00;AF_ASJ=0.00000e+00;AF_EAS=0.00000e+00;AF_FIN=0.00000e+00;AF_NFE=1.45201e-04;AF_OTH=0.00000e+00;AF_Male=6.28931e-05;AF_Female=7.71724e-05;GC_AFR=4227,0,0;GC_AMR=391,0,0;GC_ASJ=116,0,0;GC_EAS=803,0,0;GC_FIN=1566,0,0;GC_NFE=6885,2,0;GC_OTH=439,0,0;GC_Male=7949,1,0;GC_Female=6478,1,0;AC_raw=4;AN_raw=30970;AF_raw=1.29157e-04;GC_raw=15481,4,0;GC=14427,2,0;Hom_AFR=0;Hom_AMR=0;Hom_ASJ=0;Hom_EAS=0;Hom_FIN=0;Hom_NFE=0;Hom_OTH=0;Hom_Male=0;Hom_Female=0;Hom_raw=0;Hom=0;POPMAX=NFE;AC_POPMAX=2;AN_POPMAX=13774;AF_POPMAX=1.45201e-04;DP_MEDIAN=27;DREF_MEDIAN=2.50594e-10;GQ_MEDIAN=96;AB_MEDIAN=4.72222e-01;AS_RF=5.74840e-01;AS_FilterStatus=PASS;CSQ=C|intergenic_variant|MODIFIER||||||||||||||||1||||SNV|1|||||||||||||||||||||||||||||||||||||||||||| diff --git a/perl/t/tracks/vcf/test.hg38.chr22.yml b/perl/t/tracks/vcf/test.hg38.chr22.yml deleted file mode 100644 index 07530ebef..000000000 --- a/perl/t/tracks/vcf/test.hg38.chr22.yml +++ /dev/null @@ -1,101 +0,0 @@ ---- -assembly: hg38 
-build_author: ec2-user -build_date: 2017-08-08T03:49:00 -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA - program: bystro-vcf -chromosomes: - - chr22 -database_dir: t/tracks/vcf/index/ -files_dir: t/tracks/vcf/raw/ -statistics: - dbSNPnameField: dbSNP.name - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tab - tab: .statistics.tab - programPath: bystro-stats - refTrackField: ref - siteTypeField: refSeq.siteType -temp_dir: "~" -tracks: - tracks: - - name: ref - type: reference - - features: - - alt - - id - - trTv: number - - ac: number - - af: number - - an: number - - ac_afr: number - - ac_amr: number - - ac_asj: number - - ac_eas: number - - ac_fin: number - - ac_nfe: number - - ac_oth: number - - ac_male: number - - ac_female: number - - an_afr: number - - an_amr: number - - an_asj: number - - an_eas: number - - an_fin: number - - an_nfe: number - - an_oth: number - - an_male: number - - an_female: number - - af_afr: number - - af_amr: number - - af_asj: number - - af_eas: number - - af_fin: number - - af_nfe: number - - af_oth: number - - af_male: number - - af_female: number - build_row_filters: - AS_FilterStatus: == PASS - fieldMap: - AC: ac - AF: af - AN: an - AC_AFR: ac_afr - AC_AMR: ac_amr - AC_ASJ: ac_asj - AC_EAS: ac_eas - AC_FIN: ac_fin - AC_NFE: ac_nfe - AC_OTH: ac_oth - AC_Male: ac_male - AC_Female: ac_female - AN_AFR: an_afr - AN_AMR: an_amr - AN_ASJ: an_asj - AN_EAS: an_eas - AN_FIN: an_fin - AN_NFE: an_nfe - AN_OTH: an_oth - AN_Male: an_male - AN_Female: an_female - AF_AFR: af_afr - AF_AMR: af_amr - AF_ASJ: af_asj - AF_EAS: af_eas - AF_FIN: af_fin - AF_NFE: af_nfe - AF_OTH: af_oth - AF_Male: af_male - AF_Female: af_female - local_files: - - test.vcf - name: gnomad.genomes - type: vcf diff --git a/perl/t/tracks/vcf/test.scrambled_multiple_files.yml b/perl/t/tracks/vcf/test.scrambled_multiple_files.yml deleted file mode 100644 index 7b9962a02..000000000 --- a/perl/t/tracks/vcf/test.scrambled_multiple_files.yml +++ /dev/null @@ -1,101 +0,0 @@ ---- -assembly: hg38 -build_author: ec2-user -build_date: 2017-08-08T03:49:00 -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA - program: bystro-vcf -chromosomes: - - chr22 -database_dir: t/tracks/vcf/index/ -files_dir: t/tracks/vcf/raw/ -statistics: - dbSNPnameField: dbSNP.name - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tab - tab: .statistics.tab - programPath: bystro-stats - refTrackField: ref - siteTypeField: refSeq.siteType -temp_dir: "~" -tracks: - tracks: - - name: ref - type: reference - - features: - - alt - - id - - trTv: number - - ac: number - - af: number - - an: number - - ac_afr: number - - ac_amr: number - - ac_asj: number - - ac_eas: number - - ac_fin: number - - ac_nfe: number - - ac_oth: number - - ac_male: number - - ac_female: number - - an_afr: number - - an_amr: number - - an_asj: number - - an_eas: number - - an_fin: number - - an_nfe: number - - an_oth: number - - an_male: number - - an_female: number - - af_afr: number - - af_amr: number - - af_asj: number - - af_eas: number - - af_fin: number - - af_nfe: number - - af_oth: number - - af_male: number - - af_female: number - build_row_filters: - AS_FilterStatus: == PASS - fieldMap: - AC: ac - AF: af - AN: an - AC_AFR: ac_afr - AC_AMR: ac_amr - AC_ASJ: ac_asj - AC_EAS: ac_eas - AC_FIN: 
ac_fin - AC_NFE: ac_nfe - AC_OTH: ac_oth - AC_Male: ac_male - AC_Female: ac_female - AN_AFR: an_afr - AN_AMR: an_amr - AN_ASJ: an_asj - AN_EAS: an_eas - AN_FIN: an_fin - AN_NFE: an_nfe - AN_OTH: an_oth - AN_Male: an_male - AN_Female: an_female - AF_AFR: af_afr - AF_AMR: af_amr - AF_ASJ: af_asj - AF_EAS: af_eas - AF_FIN: af_fin - AF_NFE: af_nfe - AF_OTH: af_oth - AF_Male: af_male - AF_Female: af_female - local_files: - - test_split_part*.vcf* - name: gnomad.genomes.scrambled - type: vcf diff --git a/perl/t/utils/dbsnp2FormatInfo.t b/perl/t/utils/dbsnp2FormatInfo.t deleted file mode 100644 index 42d85a9ee..000000000 --- a/perl/t/utils/dbsnp2FormatInfo.t +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use Test::More; - -use Path::Tiny; -use YAML::XS qw/DumpFile/; - -use Utils::DbSnp2FormatInfo; - -# create temp directories -my $db_dir = Path::Tiny->tempdir(); -my $raw_dir = Path::Tiny->tempdir(); - -my $vcf_path = $raw_dir->child('test.vcf')->stringify; -my $expected_output_vcf_path = - $raw_dir->child('dbSNP/test_processed.vcf')->stringify; - -my $config = { - 'assembly' => 'hg38', - 'chromosomes' => ['chr1'], - 'database_dir' => $db_dir->stringify, - 'files_dir' => $raw_dir->stringify, - 'tracks' => { - 'tracks' => [ - { - 'local_files' => [$vcf_path], - 'name' => 'dbSNP', - 'sorted' => 1, - 'type' => 'vcf', - 'utils' => [ { 'name' => 'DbSnp2FormatInfo' } ] - } - ] - } -}; - -# write temporary config file -my $config_file = $raw_dir->child('filterCadd.yml'); -DumpFile( $config_file, $config ); - -# Prepare a sample VCF for testing -my $vcf_data = "##fileformat=VCFv4.1\n"; -$vcf_data .= "##INFO=\n"; -$vcf_data .= - "##INFO=\n"; -$vcf_data .= - "##INFO=\n"; -$vcf_data .= "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; -$vcf_data .= - "NC_000001.11\t10001\trs1570391677\tT\tA,C\t.\t.\tRS=1570391677;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;FREQ=KOREAN:0.9891,0.0109,.|SGDP_PRJ:0,1,.|dbGaP_PopFreq:1,.,0\n"; -$vcf_data .= - "NC_000001.11\t10002\trs1570391692\tA\tC\t.\t.\tRS=1570391692;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;FREQ=KOREAN:0.9944,0.005597\n"; -# What happens if we get a field after the freq field? 
-$vcf_data .= - "NC_000001.11\t10002\trs1570391692\tA\tC\t.\t.\tRS=1570391692;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;FREQ=SOMEOTHER:0.99,0.01;dbSNPBuildID=154"; -# Write sample VCF to a temporary file -open my $fh, '>', $vcf_path or die "Could not open $vcf_path: $!"; -say $fh $vcf_data; -close $fh; - -# Initialize the utility and process the VCF -my $utility = Utils::DbSnp2FormatInfo->new( - { - config => $config_file, - name => 'dbSNP', - utilName => 'DbSnp2FormatInfo' - } -); - -$utility->go($vcf_path); - -# Check that the processed file exists and is correctly formatted -ok( -e $expected_output_vcf_path, "Processed VCF file exists" ); - -# Read the processed file to check the INFO field -$fh = path($expected_output_vcf_path)->openr; - -my @lines = <$fh>; - -ok( $lines[0] eq "##fileformat=VCFv4.1\n", 'VCF fileformat is correctly processed' ); -ok( $lines[1] eq "##INFO=\n", - 'RS population is correctly processed' ); -ok( - $lines[2] eq - "##INFO=\n", - 'dbSNPBuildID population is correctly processed' -); -ok( - $lines[3] eq - "##INFO=\n", - 'SSR population is correctly processed' -); -ok( - $lines[4] eq - "##INFO=\n", - 'KOREAN population is correctly processed' -); -ok( - $lines[5] eq - "##INFO=\n", - 'SGDP_PRJ population is correctly processed' -); -ok( - $lines[6] eq - "##INFO=\n", - 'dbGaP_PopFreq population is correctly processed' -); -ok( - $lines[7] eq - "##INFO=\n", - 'SOMEOTHER population is correctly processed' -); -ok( $lines[8] eq "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" ); - -ok( - $lines[9] eq - "NC_000001.11\t10001\trs1570391677\tT\tA,C\t.\t.\tRS=1570391677;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;KOREAN=0.0109,.;SGDP_PRJ=1,.;dbGaP_PopFreq=.,0\n", - '1st data row with KOREAN, SGDP_PRJ, dbGap freqs are correctly processed' -); -ok( - $lines[10] eq - "NC_000001.11\t10002\trs1570391692\tA\tC\t.\t.\tRS=1570391692;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;KOREAN=0.005597\n", - '2nd data row with KOREAN freq is correctly processed' -); -ok( - $lines[11] eq - "NC_000001.11\t10002\trs1570391692\tA\tC\t.\t.\tRS=1570391692;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;SOMEOTHER=0.01;dbSNPBuildID=154\n", - '2nd data row with SOMEOTHER freq is correctly processed' -); - -done_testing(); diff --git a/perl/t/utils/filterCadd.t b/perl/t/utils/filterCadd.t deleted file mode 100644 index a5858603a..000000000 --- a/perl/t/utils/filterCadd.t +++ /dev/null @@ -1,254 +0,0 @@ -use strict; -use warnings; -use 5.10.0; - -use Test::More; -use lib 't/lib'; -use TestUtils qw/ CopyAll /; - -use Path::Tiny; -use YAML::XS qw/LoadFile DumpFile/; - -use Seq::DBManager; -use Seq::Tracks::Cadd; -use Seq::Tracks::Reference; -use Seq::Tracks::Score::Build::Round; -use Utils::FilterCadd; - -# create temp directories -my $db_dir = Path::Tiny->tempdir(); -my $base_dir = Path::Tiny->tempdir(); -my $raw_dir = $base_dir->child('raw'); - -$raw_dir->mkpath; - -# Copy test contents to temporary directory -CopyAll( 't/utils/raw', $raw_dir->stringify ); - -my $config = { - 'assembly' => 'hg38', - 'chromosomes' => [ 'chr22', 'chr1', 'chr2' ], - 'database_dir' => $db_dir->stringify, - 'files_dir' => $raw_dir->stringify, - 'tracks' => { - 'tracks' => [ - { - 'assembly' => 'hg38', - 'chromosomes' => [ 'chr22', 'chr1', 'chr2' ], - 'name' => 'ref', - 'type' => 'reference' - }, - { - 'assembly' => 'hg38', - 'build_date' => '2017-04-22T05:22:00', - 'caddToBed_date' => '2017-01-19T04:37:00', - 'chromosomes' => [ 'chr22', 'chr1', 'chr2' ], - 
'fetch_completed' => '2023-10-17T21:55:00', - 'filterCadd_completed' => '2023-05-26T14:05:00', - 'filterCadd_date' => '2017-09-12T19:18:00', - 'local_files' => [ - 'test.filterCadd.cadd.chr22.txt', 'test.filterCadd.cadd.chr1.txt.gz', - 'test.filterCadd.cadd.chr2.txt' - ], - 'name' => 'cadd', - 'sortCadd_date' => '2017-01-20T23:53:00', - 'sorted' => 1, - 'type' => 'cadd' - } - ] - } -}; - -# write temporary config file -my $config_file = $base_dir->child('filterCadd.yml'); -DumpFile( $config_file, $config ); - -Seq::DBManager::initialize( { databaseDir => $config->{database_dir}, } ); - -my $db = Seq::DBManager->new(); - -my $ref = Seq::Tracks::Reference->new( $config->{tracks}{tracks}[0] ); -my $cadd = Seq::Tracks::Cadd->new( $config->{tracks}{tracks}[1] ); - -my $rounder = Seq::Tracks::Score::Build::Round->new( { scalingFactor => 10 } ); - -# chr22 10584987 10584988 C A -0.003351 2.554 -# chr22 10584987 10584988 C G -0.145476 1.416 -# chr22 10584987 10584988 C T -0.050851 2.124 -$db->dbPatch( 'chr22', $cadd->dbName, 10584987, - [ $rounder->round(2.554), $rounder->round(1.416), $rounder->round(2.124) ] ); - -# chr1 10005 10006 C A 0.185685 4.528 -# chr1 10005 10006 C G -0.025782 2.345 -# chr1 10005 10006 C T 0.089343 3.494 -$db->dbPatch( 'chr1', $cadd->dbName, 10005, - [ $rounder->round(4.528), $rounder->round(2.345), $rounder->round(3.494) ] ); - -# chr2 10002 10003 T A 0.370069 6.349 -# chr2 10002 10003 T C 0.094635 3.551 -# chr2 10002 10003 T G 0.210401 4.788 -$db->dbPatch( 'chr2', $cadd->dbName, 10002, - [ $rounder->round(6.349), $rounder->round(3.551), $rounder->round(4.788) ] ); - -# $db->dbPatch('chr22', $ref->dbName, 10584987, 2); -# $db->dbPatch('chr22', $ref->dbName, 10584988, 3); -# $db->dbPatch('chr22', $ref->dbName, 10584989, 1); -# $db->dbPatch('chr22', $ref->dbName, 10584990, 3); -# $db->dbPatch('chr22', $ref->dbName, 10584991, 4); -# $db->dbPatch('chr22', $ref->dbName, 10584992, 4); -# $db->dbPatch('chr22', $ref->dbName, 10584993, 3); - -my $filter = Utils::FilterCadd->new( - { - config => $config_file, - name => 'cadd', - maxThreads => 1, - utilName => 'fetch', - compress => 0 - } -); - -my $success = $filter->go(); - -my $fh = $raw_dir->child('cadd/test.filterCadd.cadd.chr22.chr22.filtered.txt') - ->filehandle('<'); - -my $header = <$fh>; -$header .= <$fh>; - -my $count = 0; -while (<$fh>) { - chomp; - - my @fields = split '\t', $_; - - ok( $fields[0] eq 'chr22', "maintains chrom" ); - ok( $fields[1] == 10584987, "maintains chromStart" ); - ok( $fields[-4] eq "C", "maintains ref" ); - ok( $fields[-5] == 10584988, "maintains chromEnd" ); - - if ( $. == 3 ) { - ok( $fields[-1] == 2.554, "keeps 1st allele score order" ); - ok( $fields[-3] eq "A", "keeps 1st allele order" ); - } - elsif ( $. == 4 ) { - ok( $fields[-1] == 1.416, "keeps 2nd allele score order" ); - ok( $fields[-3] eq "G", "keeps 2nd allele order" ); - } - elsif ( $. 
== 5 ) { - ok( $fields[-1] == 2.124, "keeps 3rd allele score order" ); - ok( $fields[-3] eq "T", "keeps 3rd allele order" ); - } - - $count = $.; -} - -close($fh); - -ok( $count == 5, "found expected number of lines" ); - -ok( $success == 1, "exited cleanly" ); - -$config = LoadFile( $config_file->stringify ); - -my $caddTrack = $config->{tracks}{tracks}[1]; - -$fh = $raw_dir->child('cadd/test.filterCadd.cadd.chr1.chr1.filtered.txt') - ->filehandle('<'); - -$header = <$fh>; -$header .= <$fh>; - -$count = 0; -while (<$fh>) { - chomp; - - my @fields = split '\t', $_; - - ok( $fields[0] eq 'chr1', "maintains chrom" ); - ok( $fields[1] == 10005, "maintains chromStart" ); - ok( $fields[-4] eq "C", "maintains ref" ); - ok( $fields[-5] == 10006, "maintains chromEnd" ); - - if ( $. == 3 ) { - ok( $fields[-1] == 4.528, "keeps 1st allele score order" ); - ok( $fields[-3] eq "A", "keeps 1st allele order" ); - } - elsif ( $. == 4 ) { - ok( $fields[-1] == 2.345, "keeps 2nd allele score order" ); - ok( $fields[-3] eq "G", "keeps 2nd allele order" ); - } - elsif ( $. == 5 ) { - ok( $fields[-1] == 3.494, "keeps 3rd allele score order" ); - ok( $fields[-3] eq "T", "keeps 3rd allele order" ); - } - - $count = $.; -} - -close($fh); - -ok( $count == 5, "found expected number of lines" ); - -ok( $success == 1, "exited cleanly" ); - -$config = LoadFile( $config_file->stringify ); - -$caddTrack = $config->{tracks}{tracks}[1]; - -$fh = $raw_dir->child('cadd/test.filterCadd.cadd.chr2.chr2.filtered.txt') - ->filehandle('<'); - -$header = <$fh>; -$header .= <$fh>; - -$count = 0; -while (<$fh>) { - chomp; - - my @fields = split '\t', $_; - - ok( $fields[0] eq 'chr2', "maintains chrom" ); - ok( $fields[1] == 10002, "maintains chromStart" ); - ok( $fields[-4] eq "T", "maintains ref" ); - ok( $fields[-5] == 10003, "maintains chromEnd" ); - - if ( $. == 3 ) { - ok( $fields[-1] == 6.349, "keeps 1st allele score order" ); - ok( $fields[-3] eq "A", "keeps 1st allele order" ); - } - elsif ( $. == 4 ) { - ok( $fields[-1] == 3.551, "keeps 2nd allele score order" ); - ok( $fields[-3] eq "C", "keeps 2nd allele order" ); - } - elsif ( $. == 5 ) { - ok( $fields[-1] == 4.788, "keeps 3rd allele score order" ); - ok( $fields[-3] eq "G", "keeps 3rd allele order" ); - } - - $count = $.; -} - -close($fh); - -ok( $count == 5, "found expected number of lines" ); - -ok( $success == 1, "exited cleanly" ); - -$config = LoadFile( $config_file->stringify ); - -$caddTrack = $config->{tracks}{tracks}[1]; - -ok( $caddTrack->{filterCadd_date}, "has non-null filterCadd_date property" ); - -is_deeply( - $caddTrack->{local_files}, - [ - "test.filterCadd.cadd.chr22.chr22.filtered.txt", - "test.filterCadd.cadd.chr1.chr1.filtered.txt", - "test.filterCadd.cadd.chr2.chr2.filtered.txt" - ], - "expected filtered CADD files."
-); - -done_testing(); diff --git a/perl/t/utils/raw/cadd/test.filterCadd.cadd.chr1.txt.gz b/perl/t/utils/raw/cadd/test.filterCadd.cadd.chr1.txt.gz deleted file mode 100644 index ad016d9515288f9e8a73eaca7943ac1a8d2c1b53..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 466 (base85 payload truncated in source) diff --git a/perl/t/utils/scripts/split_vcf_by_chr.t b/perl/t/utils/scripts/split_vcf_by_chr.t deleted file mode 100644 --- a/perl/t/utils/scripts/split_vcf_by_chr.t +++ /dev/null [...] -my $test_script_dir = path(__FILE__)->absolute->parent; - -# Calculate the path back to the 'bystro/' directory -my $bystro_dir = $test_script_dir->parent->parent->parent; - -# Get the script path using Path::Tiny, which is in lib/Utils/scripts -my $script_path = - $bystro_dir->child( 'lib', 'Utils', 'scripts', 'split_vcf_by_chr.pl' ); - -my $vcf_file_to_split = $test_script_dir->child('vcf_example.vcf'); - -my $split_command = "perl " . $script_path->stringify . " " . $vcf_file_to_split; -system($split_command) == 0 or die "Failed to execute $split_command: $!"; - -my @expected_chromosomes = ( '1', '2', '3' ); -my $counts_of_entries = { - '1' => 3, - '2' => 2, - '3' => 1 -}; - -# Verify each output file -my @expected_header_without_contig = ( - '##fileformat=VCFv4.2', - "##INFO=" -); -foreach my $chrom (@expected_chromosomes) { - my $output_file = $vcf_file_to_split->parent->child("vcf_example.vcf.$chrom.vcf"); - my @expected_header_with_contig = @expected_header_without_contig; - push( @expected_header_with_contig, "##contig=" ); - push( @expected_header_with_contig, "#CHROM POS ID REF ALT QUAL FILTER INFO" ); - - ok( -e $output_file, "$output_file exists" ); - - # Open the file and verify its contents - # This is a basic check; you'll need to adjust it based on your expected data - open( my $fh, '<', $output_file ) - or die "Could not open file '$output_file' $!"; - my $header_tested; - my @headers; - - my $entry_count = 0; - while ( my $line = <$fh> ) { - chomp $line; - - # Accumulate headers - if ( $line =~ /^#/ ) { - push( @headers, $line ); - next; - } - - if ( !$header_tested ) { - is_deeply( - \@headers, - \@expected_header_with_contig, - "Header for $chrom matches expected" - ); - $header_tested = 1; - } - - my ($chrom_from_file) = split( /\t/, $line ); - is( $chrom_from_file, $chrom, "Line chromosome matches expected: $chrom" ); - - $entry_count += 1; - } - - is( - $entry_count, - $counts_of_entries->{$chrom}, - "Entry count for $chrom matches expected" - ); - close $fh; - - # Clean up file - unlink($output_file); -} - -done_testing(); diff --git a/perl/t/utils/scripts/vcf_example.vcf b/perl/t/utils/scripts/vcf_example.vcf deleted file mode 100644 index 6cacd7aa4..000000000 --- a/perl/t/utils/scripts/vcf_example.vcf +++ /dev/null @@ -1,9 +0,0 @@ -##fileformat=VCFv4.2 -##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO -2 200 . A T . . AF=0.01 -1 100 . G C . . AF=0.02 -3 300 . T G . . AF=0.03 -2 5000 . C G . . AF=0.02 -1 1000 . G C . . AF=0.02 -1 1001 . G C . .
AF=0.02 \ No newline at end of file diff --git a/perl/t/utils/sqlWriter.t b/perl/t/utils/sqlWriter.t deleted file mode 100644 index bf398cd8e..000000000 --- a/perl/t/utils/sqlWriter.t +++ /dev/null @@ -1,147 +0,0 @@ -use 5.10.0; -use strict; -use warnings; - -use Test::More; - -use Path::Tiny; - -use Utils::SqlWriter; - -my $out_dir = Path::Tiny->tempdir(); -my $db = 'hg19'; - -my %config = ( - sql => "SELECT r.*, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.kgID, '')) SEPARATOR - ';') FROM kgXref x WHERE x.refseq=r.name) AS kgID, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.description, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS description, - (SELECT GROUP_CONCAT(DISTINCT(NULLIF(e.value, '')) SEPARATOR ';') FROM knownToEnsembl - e JOIN kgXref x ON x.kgID = e.name WHERE x.refseq = r.name) AS ensemblID, - (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.tRnaName, '')) SEPARATOR ';') FROM - kgXref x WHERE x.refseq=r.name) AS tRnaName, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.spID, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS spID, (SELECT - GROUP_CONCAT(DISTINCT(NULLIF(x.spDisplayID, '')) SEPARATOR ';') FROM kgXref - x WHERE x.refseq=r.name) AS spDisplayID, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.protAcc, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS protAcc, (SELECT - GROUP_CONCAT(DISTINCT(NULLIF(x.mRNA, '')) SEPARATOR ';') FROM kgXref x WHERE - x.refseq=r.name) AS mRNA, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.rfamAcc, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS rfamAcc FROM - refGene r WHERE r.name='NM_019046' OR r.name='NM_001009943' OR r.name='NM_001009941';", - connection => { - database => $db, - host => 'genome-mysql.soe.ucsc.edu', - user => 'genome', - port => '3306' - }, - outputDir => $out_dir->stringify, - compress => 0, -); - -my $sqlWriter = Utils::SqlWriter->new( \%config ); - -$sqlWriter->go(); - -my $exp = $out_dir->child("$db.kgXref.fetch.txt")->stringify; - -open( my $fh, '<', $exp ); - -my @stuff = <$fh>; - -ok( @stuff == 4, "Ok, got the expected number of rows" ); - -chomp @stuff; - -my @rows; - -for my $r (@stuff) { - push @rows, [ split '\t', $r ]; -} - -my @head = @{ $rows[0] }; - -# We expect -# [ -# [0] "bin", -# [1] "name", -# [2] "chrom", -# [3] "strand", -# [4] "txStart", -# [5] "txEnd", -# [6] "cdsStart", -# [7] "cdsEnd", -# [8] "exonCount", -# [9] "exonStarts", -# [10] "exonEnds", -# [11] "score", -# [12] "name2", -# [13] "cdsStartStat", -# [14] "cdsEndStat", -# [15] "exonFrames", -# [16] "kgID", -# [17] "description", -# [18] "ensemblID", -# [19] "tRnaName", -# [20] "spID", -# [21] "spDisplayID", -# [22] "protAcc", -# [23] "mRNA", -# [24] "rfamAcc" -# ] - -ok( @head == 25, "The first line is a header" ); - -my $idx = 0; -for my $f (@head) { - if ( $f eq 'name' ) { - last; - } - - $idx++; -} - -my @tx = sort { $a cmp $b } ( $rows[1][$idx], $rows[2][$idx], $rows[3][$idx] ); -my @exp = sort { $a cmp $b } ( 'NM_019046', 'NM_001009943', 'NM_001009941' ); - -ok( join( "\t", @tx ) eq join( "\t", @exp ), "Find expected tx" ); - -$out_dir->remove_tree(); - -close $fh; - -%config = ( - sql => "SELECT r.*, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.kgID, '')) SEPARATOR - ';') FROM kgXref x WHERE x.refseq=r.name) AS kgID, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.description, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS description, - (SELECT GROUP_CONCAT(DISTINCT(NULLIF(e.value, '')) SEPARATOR ';') FROM knownToEnsembl - e JOIN kgXref x ON x.kgID = e.name WHERE x.refseq = r.name) AS ensemblID, - (SELECT 
GROUP_CONCAT(DISTINCT(NULLIF(x.tRnaName, '')) SEPARATOR ';') FROM - kgXref x WHERE x.refseq=r.name) AS tRnaName, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.spID, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS spID, (SELECT - GROUP_CONCAT(DISTINCT(NULLIF(x.spDisplayID, '')) SEPARATOR ';') FROM kgXref - x WHERE x.refseq=r.name) AS spDisplayID, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.protAcc, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS protAcc, (SELECT - GROUP_CONCAT(DISTINCT(NULLIF(x.mRNA, '')) SEPARATOR ';') FROM kgXref x WHERE - x.refseq=r.name) AS mRNA, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.rfamAcc, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS rfamAcc FROM - refGene r WHERE r.name='Ndjfalkjsdlkajf';", - connection => { - database => $db, - host => 'genome-mysql.soe.ucsc.edu', - user => 'genome', - port => '3306' - }, - outputDir => $out_dir->stringify, - compress => 0, -); - -$sqlWriter = Utils::SqlWriter->new( \%config ); - -$sqlWriter->go(); - -$exp = $out_dir->child("$db.kgXref.fetch.txt")->stringify; - -ok( !-e $exp, "No file generated when empty query" ); - -done_testing(); - -1; diff --git a/python/python/bystro/ancestry/__init__.py b/python/python/bystro/ancestry/__init__.py deleted file mode 100644 index 580f272f5..000000000 --- a/python/python/bystro/ancestry/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Provide ancestry module for classifying ancestry from genotype. - -Module's functionality to cover: - -- Training models and storing model artifacts -- Loading model artifacts and performing inference -- Orchestrating inference jobs within bystro -""" - -__version__ = "0.0.1" diff --git a/python/python/bystro/ancestry/adversarial_autoencoder.py b/python/python/bystro/ancestry/adversarial_autoencoder.py deleted file mode 100644 index 40b9f9aed..000000000 --- a/python/python/bystro/ancestry/adversarial_autoencoder.py +++ /dev/null @@ -1,419 +0,0 @@ -""" -Background ----------- -Adversarial autoencoders (AAEs) represent a novel approach to -unsupervised learning by combining the principles of autoencoders -with adversarial networks. Traditional autoencoders aim to compress -input data into a lower-dimensional latent space and then reconstruct -the original input from this compressed representation. AAEs introduce -an adversarial training component by incorporating a generative -adversarial network (GAN) into the autoencoder architecture. The -adversarial network's role is to discriminate between the encoded -latent representations and samples drawn from a predefined distribution. -This adversarial process encourages the autoencoder to produce latent -representations that closely resemble samples from the specified -distribution, thereby promoting the learning of a more structured and -meaningful latent space. - -The adversarial component in AAEs helps overcome some limitations of -standard autoencoders, such as mode collapse and lack of diversity in -the generated samples. By introducing adversarial training, AAEs can -learn a more robust and continuous latent space that captures the -underlying structure of the input data. This combination of autoencoder -and GAN principles makes adversarial autoencoders a powerful tool for -tasks like data generation, anomaly detection, and representation -learning, where learning a meaningful and compact latent representation -is crucial for effective performance. 
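Concretely, training alternates two objectives. The following is a plain restatement of the losses actually computed in the fit() method below, where lamb is the reconstruction/adversarial trade-off from training_options and z_prior is drawn from the target latent distribution (a Gaussian mixture in this implementation):

# generator/autoencoder step:
#   G_loss = lamb * MSE(decoder(encoder(x)), x) + (1 - lamb) * BCE(D(encoder(x)), 1)
# discriminator step:
#   D_loss = 0.5 * (BCE(D(z_prior), 1) + BCE(D(encoder(x).detach()), 0))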
- -This implements an adversarial encoder and the objects - -Objects -------- -Encoder(nn.Module) - This provides a deterministic function - latent variables = encoder(data) - -Decoder(nn.Module) - This provides a deterministic function - reconstructed data = decoder(latent_variables) - -Discriminator(nn.Module) - This defines a neural network that distinguishes - latent variables computed from our encoder on observed - data and data from a synthetic distribution - - -AdversarialAutoencoder - This fits an adversarial autoencoder given data - - -Methods -------- -None -""" -from typing import Any -import numpy as np -from numpy.typing import NDArray - -import torch -from torch import tensor -import torch.nn as nn -from torch.autograd import Variable -from itertools import chain - -from tqdm import trange - -from sklearn.mixture import GaussianMixture - -Tensor = torch.FloatTensor - - -class Encoder(nn.Module): - """ - This provides a deterministic function - - latent variables = encoder(data) - - Unlike a VAE, this is a deterministic rather than stochastic - mapping. However, it is forced to approximate a distribution - due to the adversarial loss with training - """ - - def __init__(self, observation_dimension, n_components, encoder_options): - super().__init__() - eo = encoder_options - - self.layers = nn.Sequential( - nn.Linear(observation_dimension, eo["n_nodes"]), - nn.LeakyReLU(0.2), - nn.Linear(eo["n_nodes"], eo["n_nodes"]), - nn.BatchNorm1d(eo["n_nodes"]), - nn.LeakyReLU(0.2), - nn.Linear(eo["n_nodes"], n_components), - ) - - def forward(self, x): - z = self.layers(x) - return z - - -class Decoder(nn.Module): - """ - This provides a deterministic function - - reconstructed data = decoder(latent_variables) - """ - - def __init__(self, observation_dimension, n_components, decoder_options): - super().__init__() - do = decoder_options - - self.model = nn.Sequential( - nn.Linear(n_components, do["n_nodes"]), - nn.LeakyReLU(0.2, inplace=True), - nn.Linear(do["n_nodes"], do["n_nodes"]), - nn.BatchNorm1d(do["n_nodes"]), - nn.LeakyReLU(0.2, inplace=True), - nn.Linear(do["n_nodes"], observation_dimension), - nn.Tanh(), - ) - - def forward(self, z): - x = self.model(z) - return x - - -class Discriminator(nn.Module): - """ - This defines a neural network that distinguishes - latent variables computed from our encoder on observed - data and data from a synthetic distribution - """ - - def __init__(self, n_components, discriminator_options): - super().__init__() - do = discriminator_options - - self.model = nn.Sequential( - nn.Linear(n_components, do["n_nodes"]), - nn.LeakyReLU(0.2), - nn.Linear(do["n_nodes"], do["n_nodes2"]), - nn.LeakyReLU(0.2), - nn.Linear(do["n_nodes2"], 1), - nn.Sigmoid(), - ) - - def forward(self, z): - predictions = self.model(z) - return predictions - - -class AdversarialAutoencoder: - """ - This implements an adversarial autoencoder - """ - - def __init__( - self, - n_components, - training_options: dict[str, Any] | None = None, - latent_distribution_options: dict[str, Any] | None = None, - encoder_options: dict[str, Any] | None = None, - decoder_options: dict[str, Any] | None = None, - discriminator_options: dict[str, Any] | None = None, - ): - self.n_components = int(n_components) - - self.encoder: Encoder | None = None - self.decoder: Decoder | None = None - - if training_options is None: - training_options = {} - if encoder_options is None: - encoder_options = {} - if decoder_options is None: - decoder_options = {} - if discriminator_options is None: - 
discriminator_options = {} - if latent_distribution_options is None: - latent_distribution_options = {} - - self._fill_training_options(training_options) - self._fill_model_options( - latent_distribution_options, - encoder_options, - decoder_options, - discriminator_options, - ) - - def fit(self, X, seed=2021): - N, self.p = X.shape - rng = np.random.default_rng(int(seed)) - X_ = tensor(X, dtype=torch.float) - lamb = self.training_options["lambda"] - - n_iterations = int(self.training_options["n_iterations"]) - batch_size = int(self.training_options["batch_size"]) - - encoder = Encoder(self.p, self.n_components, self.encoder_options) - decoder = Decoder(self.p, self.n_components, self.decoder_options) - discriminator = Discriminator( - self.n_components, self.discriminator_options - ) - - adversarial_loss = nn.BCELoss() - generative_loss = nn.MSELoss() - - # Using chain to combine parameters from both models - trainable_variables_g = chain( - encoder.parameters(), decoder.parameters() - ) - trainable_variables_d = discriminator.parameters() - - optimizer_G = torch.optim.Adam( - trainable_variables_g, - lr=self.training_options["learning_rate"], - betas=(self.training_options["b1"], self.training_options["b2"]), - ) - optimizer_D = torch.optim.Adam( - trainable_variables_d, - lr=self.training_options["learning_rate"], - betas=(self.training_options["b1"], self.training_options["b2"]), - ) - - ones = Variable( - Tensor(batch_size, 1).fill_(1.0), - requires_grad=False, - ) - zeros = Variable( - Tensor(batch_size, 1).fill_(0.0), - requires_grad=False, - ) - - self.losses_generative = np.zeros(n_iterations) - self.losses_discriminative = np.zeros(n_iterations) - - XX = rng.normal(scale=.3,size=(10000,self.n_components)) - XX[:3300,0] += 5 - XX[3300:6600,0] += -5 - gmm = GaussianMixture(3) - gmm.fit(XX) - - for i in trange(n_iterations): - idx = rng.choice( - N, size=batch_size, replace=False - ) - X_batch = X_[tensor(idx)] - Z = encoder(X_batch) - X_recon = decoder(Z) - - optimizer_G.zero_grad() - optimizer_D.zero_grad() - - prediction_real_data = discriminator(Z) - gloss = generative_loss(X_recon, X_batch) - dloss = adversarial_loss(prediction_real_data, ones) - G_loss = lamb * gloss + (1 - lamb) * dloss - G_loss.backward() - optimizer_G.step() - - samples,_ = gmm.sample(n_samples=batch_size) - real_z = Variable(tensor(samples.astype(np.float32))) - - real_loss = adversarial_loss(discriminator(real_z), ones) - fake_loss = adversarial_loss(discriminator(Z.detach()), zeros) - - D_loss = 0.5 * (real_loss + fake_loss) - - D_loss.backward() - optimizer_D.step() - - self.losses_generative[i] = gloss.item() - self.losses_discriminative[i] = dloss.item() - - self.encoder = encoder - self.decoder = decoder - self.discriminator = discriminator - - return self - - def transform(self, X: NDArray) -> NDArray[np.float_]: - """ - This returns the latent variable estimates given X - - Parameters - ---------- - X : NDArray,(N_samples,p) - The data to transform. 
- - Returns - ------- - S : NDArray,(N_samples,n_components) - The factor estimates - """ - if self.encoder is None: - raise ValueError("The model has not been fit yet") - - X_ = tensor(X) - S_ = self.encoder(X_) - S = S_.detach().numpy() - return S - - def inverse_transform(self, S: NDArray) -> NDArray[np.float_]: - """ - This returns the reconstruction given latent variables - - Parameters - ---------- - S : NDArray,(N_samples,n_components) - The factor estimates - - Returns - ------- - X_recon : np array-like,(N_samples,p) - The reconstruction - """ - if self.decoder is None: - raise ValueError("The model has not been fit yet") - - S_ = tensor(S) - X_ = self.decoder(S_) - X_recon = X_.detach().numpy() - - return X_recon - - def _fill_training_options(self, training_options: dict[str, Any]) -> None: - """ - This sets the default parameters for stochastic gradient descent, - our inference strategy for the model. - - Parameters - ---------- - training_options : dict - The original options set by the user passed as a dictionary - - Options - ------- - n_iterations : int, default=3000 - Number of iterations to train using stochastic gradient descent - - learning_rate : float, default=1e-4 - Learning rate of gradient descent - - batch_size : int, default=None - The number of observations to use at each iteration. If none - corresponds to batch learning - """ - default_options = { - "n_iterations": 3000, - "learning_rate": 1e-2, - "batch_size": 100, - "b1": 0.5, - "b2": 0.999, - "lambda": 0.1, - } - tops = {**default_options, **training_options} - - default_keys = set(default_options.keys()) - final_keys = set(tops.keys()) - - expected_but_missing_keys = default_keys - final_keys - unexpected_but_present_keys = final_keys - default_keys - if expected_but_missing_keys: - raise ValueError( - "the following training options were expected but not found..." - ) - if unexpected_but_present_keys: - raise ValueError( - "the following training options were unrecognized but provided..." 
- ) - - self.training_options = tops - - def _fill_model_options( - self, - latent_distribution_options: dict[str, Any], - encoder_options: dict[str, Any], - decoder_options: dict[str, Any], - discriminator_options: dict[str, Any], - ) -> None: - """ - This sets the default parameters for our encoder, decoder and discriminator - - Parameters - ---------- - latent_distribution_options: dict[str, Any] - Latent distribution options - - encoder_options: dict[str, Any] - Encoder parameters - - decoder_options: dict[str, Any] - Decoder parameters - - discriminator_options: dict[str, Any] - Discriminator parameters - """ - default_latent_distribution_options = { - "n_iterations": 3000, - } - default_encoder_options = { - "n_nodes": 128, - } - default_decoder_options = { - "n_nodes": 128, - } - default_discriminator_options = { - "n_nodes": 64, - "n_nodes2": 16, - } - self.latent_distribution_options = { - **default_latent_distribution_options, - **latent_distribution_options, - } - self.encoder_options = {**default_encoder_options, **encoder_options} - self.decoder_options = {**default_decoder_options, **decoder_options} - self.discriminator_options = { - **default_discriminator_options, - **discriminator_options, - } diff --git a/python/python/bystro/ancestry/ancestry_model_products/.gitignore b/python/python/bystro/ancestry/ancestry_model_products/.gitignore deleted file mode 100644 index eb09848fa..000000000 --- a/python/python/bystro/ancestry/ancestry_model_products/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.txt -*.skop -*.csv \ No newline at end of file diff --git a/python/python/bystro/ancestry/ancestry_types.py b/python/python/bystro/ancestry/ancestry_types.py deleted file mode 100644 index 2814878f3..000000000 --- a/python/python/bystro/ancestry/ancestry_types.py +++ /dev/null @@ -1,139 +0,0 @@ -"""Classes for common shapes of data in ancestry.""" -from msgspec import Struct - -LOWER_UNIT_BOUND = 0.0 -UPPER_UNIT_BOUND = 1.0 - -class ProbabilityInterval(Struct, frozen=True, rename="camel"): - """Represent an interval of probabilities.""" - - lower_bound: float - upper_bound: float - - # Currently msgspec constraint bounds don't seem to integrate with - # mypy or pyright, so we'll have to do this manually. - # See tracking issue https://github.com/jcrist/msgspec/issues/177 - def __post_init__(self): - # Due to PEP 484, types checkers do not distinguish between float and int - if not isinstance(self.lower_bound, float): - raise TypeError(f"lower_bound must be a float, not {type(self.lower_bound)}") - if not isinstance(self.upper_bound, float): - raise TypeError(f"upper_bound must be a float, not {type(self.upper_bound)}") - - if self.lower_bound < LOWER_UNIT_BOUND: - raise TypeError(f"lower_bound must be >= {LOWER_UNIT_BOUND}") - - if self.upper_bound > UPPER_UNIT_BOUND: - raise TypeError(f"upper_bound must be <= {UPPER_UNIT_BOUND}") - - -# NB: We might consider that a vector of ProbabilityIntervals should -# have additional validation properties, like that the sums of the -# lower bounds, upper bounds, or midpoints should be close to one. -# But constraints on the bounds don't hold in general (consider the -# vector of intervals [(0.4, 0.6), (0.4, 0.6)]), and we can't know how -# well the midpoints of the intervals reflect the point estimate in -# general, so we'll punt on this and assume it's the ML model's -# responsibility to give us scientifically sensible results. - - -class PopulationVector(Struct, frozen=True, kw_only=True): - """A vector of probability intervals over populations. 
- - Represents model estimates of an individual's similarity to - reference HapMap populations, with upper and lower bounds for each - population. - """ - - ACB: ProbabilityInterval - ASW: ProbabilityInterval - BEB: ProbabilityInterval - CDX: ProbabilityInterval - CEU: ProbabilityInterval - CHB: ProbabilityInterval - CHS: ProbabilityInterval - CLM: ProbabilityInterval - ESN: ProbabilityInterval - FIN: ProbabilityInterval - GBR: ProbabilityInterval - GIH: ProbabilityInterval - GWD: ProbabilityInterval - IBS: ProbabilityInterval - ITU: ProbabilityInterval - JPT: ProbabilityInterval - KHV: ProbabilityInterval - LWK: ProbabilityInterval - MSL: ProbabilityInterval - MXL: ProbabilityInterval - PEL: ProbabilityInterval - PJL: ProbabilityInterval - PUR: ProbabilityInterval - STU: ProbabilityInterval - TSI: ProbabilityInterval - YRI: ProbabilityInterval - - -class SuperpopVector(Struct, frozen=True, kw_only=True): - """A vector of probability intervals for superpopulations. - - Represents model estimates of an individual's similarity to - reference HapMap superpopulations, with upper and lower bounds for - each population. - - """ - - AFR: ProbabilityInterval - AMR: ProbabilityInterval - EAS: ProbabilityInterval - EUR: ProbabilityInterval - SAS: ProbabilityInterval - - -class AncestryTopHit(Struct, frozen=True, rename="camel"): - """ - The top hit for a sample, with the max value (a probability) and the population(s) corresponding - """ - - probability: float - populations: list[str] - - def __post_init__(self): - if not isinstance(self.probability, float): - raise TypeError(f"probability must be a float, not {type(self.probability)}") - - if self.probability < LOWER_UNIT_BOUND or self.probability > UPPER_UNIT_BOUND: - raise TypeError(f"probability must be between {LOWER_UNIT_BOUND} and {UPPER_UNIT_BOUND}") - - -class AncestryScoresOneSample(Struct, frozen=True, rename="camel"): - """An ancestry result for a sample. - - Represents ancestry model output for an individual study - participant (identified by sample_id) with estimates for - populations and superpopulations, and the overall number of snps - retained for calculating ancestry - """ - - sample_id: str - top_hit: AncestryTopHit - populations: PopulationVector - superpops: SuperpopVector - n_snps: int - - def __post_init__(self): - if not isinstance(self.n_snps, int): - raise TypeError(f"n_snps must be an int, not {type(self.n_snps)}") - - if self.n_snps < 0: - raise TypeError("n_snps must be non-negative") - -class AncestryResults(Struct, frozen=True, rename="camel"): - """An outgoing response from the ancestry worker. - - Represents ancestry model output for an entire study as a list of - individual AncestryResults. - - """ - - results: list[AncestryScoresOneSample] - pcs: dict[str, list[float]] \ No newline at end of file diff --git a/python/python/bystro/ancestry/asserts.py b/python/python/bystro/ancestry/asserts.py deleted file mode 100644 index 0451ba9c7..000000000 --- a/python/python/bystro/ancestry/asserts.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Convenience methods for safe asserts.""" - -from typing import Any - - -def assert_true( - description: str, - condition: Any, # noqa: ANN401 Any is actually appropriate here - comment: str = "", -) -> None: - """Check that condition holds, raising AssertionError if not.""" - if not condition: - msg = f"Expected {description}." 
- if comment: - msg += " " + comment - raise AssertionError(msg) - - -def assert_equals( - expected_description: str, - expected_value: Any, # noqa: ANN401 - actual_description: str, - actual_value: Any, # noqa: ANN401 - comment: str = "", -) -> None: - """Check that expected_value equals actual_value, raising AssertionError if not.""" - comparison = expected_value == actual_value - success = all(comparison) if hasattr(comparison, "__len__") else comparison - if success: - return - msg = ( - f"Expected {expected_description} ({expected_value}) " - f"to equal {actual_description} ({actual_value})." - ) - if comment: - msg += " " + comment - raise AssertionError(msg) diff --git a/python/python/bystro/ancestry/data/.gitignore b/python/python/bystro/ancestry/data/.gitignore deleted file mode 100644 index 562038556..000000000 --- a/python/python/bystro/ancestry/data/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -20130606_sample_info.txt -gnomad.v3.1.pca_loadings.tsv.gz -Human660W-Quad_v1_H.csv -Axiom_PMRA.na35.annot.csv \ No newline at end of file diff --git a/python/python/bystro/ancestry/data/kgp_vcfs/.gitignore b/python/python/bystro/ancestry/data/kgp_vcfs/.gitignore deleted file mode 100644 index 210d59cdb..000000000 --- a/python/python/bystro/ancestry/data/kgp_vcfs/.gitignore +++ /dev/null @@ -1,22 +0,0 @@ -ALL.chr1.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr10.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr11.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr12.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr13.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr14.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr15.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr16.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr17.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr18.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr19.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr2.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr20.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr21.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr22.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr3.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr4.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr5.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr6.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr7.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr8.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz -ALL.chr9.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz \ No newline at end of file diff --git a/python/python/bystro/ancestry/define_callset.py b/python/python/bystro/ancestry/define_callset.py deleted file mode 100644 index c48902177..000000000 --- a/python/python/bystro/ancestry/define_callset.py +++ /dev/null @@ -1,154 +0,0 @@ -"""Calculate intersection of Illumina and Affymetrix chips.""" - -import logging -import re - -import pandas as pd -import tqdm -from liftover import get_lifter - -from bystro.ancestry.asserts import assert_equals, assert_true -from bystro.ancestry.train import DATA_DIR, INTERMEDIATE_DATA_DIR -from bystro.ancestry.train_utils import is_autosomal_variant - -logger = logging.getLogger(__name__) - -pd.options.future.infer_string = True # type: ignore - -ILLUMINA_FILEPATH = DATA_DIR / "Human660W-Quad_v1_H.csv" -AFFYMETRIX_FILEPATH 
= DATA_DIR / "Axiom_PMRA.na35.annot.csv" - -# TODO: harmonize variant chromosomal coordinates with rsIDs. -pd.options.future.infer_string = True # type: ignore - - -def get_watson_crick_complement(base: str) -> str: - """Calculate Watson-Crick Complement.""" - wc_dict = {"A": "T", "T": "A", "G": "C", "C": "G"} - return wc_dict[base] - - -def liftover_38_from_37(variant: str) -> str | None: - """Liftover a variant to genome build 38 from 37.""" - chrom, pos, ref, alt = variant.split(":") - # liftover doesn't deal gracefully with MT variants, but we don't need them anyway - converter = get_lifter("hg19", "hg38") - locations = converter[chrom][int(pos)] - if locations is None or len(locations) != 1: - logger.debug("Variant %s had a non-unique location, couldn't lift over", variant) - return None - chrom38, pos38, _strand38 = locations[0] - variant38 = ":".join([chrom38, str(pos38), ref, alt]) - assert_true("lifted-over variant starts with 'chr'", variant38.startswith("chr")) - return variant38 - - -def _load_illumina_df() -> pd.DataFrame: - comment_rows = 7 - columns_to_keep = ["Chr", "MapInfo", "SNP", "RefStrand"] - illumina_df = pd.read_csv(ILLUMINA_FILEPATH, skiprows=comment_rows) - assert_equals( - "Set of genome builds", - {37.1}, - "actual set of genome builds", - set(illumina_df.GenomeBuild.dropna()), - ) - return illumina_df[columns_to_keep].dropna() - - -def load_illumina_variants() -> pd.Series: - """Load list of variants for illumina Human660W-Quad_v1 chip.""" - illumina_df = _load_illumina_df() - illumina_variants = _get_variants_from_illumina_df(illumina_df) - assert_equals( - "number of autosomal illumina variants after liftover", - 578822, - "recovered number of variants", - len(illumina_variants), - ) - return illumina_variants - - -def _get_variants_from_illumina_df(illumina_df: pd.DataFrame) -> pd.Series: - """Extract illumina variants and lift over.""" - variants38 = [] - for _i, row in tqdm.tqdm(illumina_df.iterrows(), total=len(illumina_df)): - chromosome = str(row.Chr) - position = str(int(row.MapInfo)) - if match := re.match(r"\[([ACGT])/([ACGT])\]", str(row.SNP)): - allele1, allele2 = match.groups() - else: - continue - if row.RefStrand == "-": - allele1 = get_watson_crick_complement(allele1) - allele2 = get_watson_crick_complement(allele2) - variant37 = ":".join(["chr" + chromosome, position, allele1, allele2]) - if not is_autosomal_variant(variant37): - continue - variants38.append(liftover_38_from_37(variant37)) - illumina_variants = pd.Series(variants38) - liftover_failure_rate = illumina_variants.isna().mean() - logger.info("liftover failure rate: %1.2f%%", liftover_failure_rate * 100) - return illumina_variants.dropna() - - -def _load_affymetrix_df() -> pd.DataFrame: - affymetrix_df = pd.read_csv(AFFYMETRIX_FILEPATH, comment="#", index_col=0, dtype={"Chromosome": str}) - columns_to_keep = ["Chromosome", "Physical Position", "Ref Allele", "Alt Allele"] - assert_equals("positive strand", {"+"}, "actual set of strands", set(affymetrix_df.Strand)) - assert_equals( - "differences between Physical Position and Position End", - {0}, - "actual differences", - set(affymetrix_df["Physical Position"] - affymetrix_df["Position End"]), - ) - - return affymetrix_df[columns_to_keep] - - -def _get_variants_from_affymetrix_df(affymetrix_df: pd.DataFrame) -> pd.Series: - """Extract affymetrix variants and lift over.""" - variants38 = [] - for _i, row in tqdm.tqdm(affymetrix_df.iterrows(), total=len(affymetrix_df)): - variant = ":".join( - [ - "chr" + (row.Chromosome), - 
str(row["Physical Position"]), - row["Ref Allele"], - row["Alt Allele"], - ] - ) - if not is_autosomal_variant(variant): - continue - variant38 = liftover_38_from_37(variant) - variants38.append(variant38) - affymetrix_variants = pd.Series(variants38) - liftover_failure_rate = affymetrix_variants.isna().mean() - logger.info("liftover failure rate: %1.2f%%", liftover_failure_rate * 100) - return affymetrix_variants.dropna() - - -def load_affymetrix_variants() -> pd.Series: - """Load list of variants for Affymetrix Axiom PMRA chip.""" - affymetrix_variants = _get_variants_from_affymetrix_df(_load_affymetrix_df()) - assert_equals( - "number of autosomal affymetrix variants after liftover", - 820710, - "recovered number of variants", - len(affymetrix_variants), - ) - return affymetrix_variants - - -def calculate_shared_illumina_affymetrix_variants() -> pd.DataFrame: - """Calculate intersection of illumina, affymetrix variants and write result to disk.""" - illumina_variants = load_illumina_variants() - affymetrix_variants = load_affymetrix_variants() - shared_variants = pd.DataFrame(sorted(set(illumina_variants).intersection(affymetrix_variants))) - assert_equals( - "number of shared variants", 34319, "number of shared variants obtained", len(shared_variants) - ) - shared_variants.to_csv( - INTERMEDIATE_DATA_DIR / "shared_illumina_affy_variants.csv", index=False, header=False - ) - return shared_variants diff --git a/python/python/bystro/ancestry/gmm_ancestry.py b/python/python/bystro/ancestry/gmm_ancestry.py deleted file mode 100644 index c6fc96246..000000000 --- a/python/python/bystro/ancestry/gmm_ancestry.py +++ /dev/null @@ -1,436 +0,0 @@ -""" -This implements the following model: - - p(z) ~ GMM({mu}_k,I) - p(x|z) ~ N(Wz,sigma^2I) - -This is different for a standard mixture model and from -probabilistic PCA in that it ties the covariance and means -together in a very specific format. This corresponds to -fitting a high dimensional GMM such that the latent -distribution is interpretable. No guarantees on functionality, -just on correctness. - -Objects -------- -GaussianMixturePPCA - The model implementation of a Gaussian mixture model - closely corresponding to probabilistic PCA. - -Methods -------- -None -""" -import numpy as np -from numpy.typing import NDArray -import numpy.linalg as la - -from sklearn.decomposition import PCA # type: ignore -from sklearn.mixture import GaussianMixture # type: ignore - -import torch -from torch import nn -from torch.distributions.multivariate_normal import MultivariateNormal - -from tqdm import trange - -from bystro._template_sgd_np import BaseSGDModel # type: ignore - - -class GaussianMixturePPCA(BaseSGDModel): - """ - This fits the following generative model - - p(z) ~ GMM({mu}_k,I) - p(x|z) ~ N(Wz,sigma^2I) - - using stochastic gradient descent on the - marginal likelihood. - - """ - - def __init__( - self, - n_clusters, - n_components, - training_options=None, - prior_options=None, - ): - """ - This is a Gaussian mixture model with a shared low - rank covariance structure. 
- - Paramters - --------- - n_clusters : int - Number of groups in the latent space - - n_components : int - PPCA dimensionality - """ - self.n_clusters = n_clusters - self.n_components = n_components - if training_options is None: - training_options = {} - if prior_options is None: - prior_options = {} - - self.training_options = self._fill_training_options(training_options) - self.prior_options = self._fill_prior_options(prior_options) - - def _fill_training_options(self, training_options): - """ - This sets the default parameters for stochastic gradient descent, - our inference strategy for the model. - - Parameters - ---------- - training_options : dict - The original options set by the user passed as a dictionary - - Options - ------- - n_iterations : int, default=3000 - Number of iterations to train using stochastic gradient descent - - learning_rate : float, default=1e-4 - Learning rate of gradient descent - - batch_size : int, default=None - The number of observations to use at each iteration. If none - corresponds to batch learning - """ - default_options = { - "n_iterations": 3000, - "learning_rate": 1e-3, - "batch_size": 100, - "momentum": 0.9, - } - tops = {**default_options, **training_options} - - default_keys = set(default_options.keys()) - final_keys = set(tops.keys()) - - expected_but_missing_keys = default_keys - final_keys - unexpected_but_present_keys = final_keys - default_keys - if expected_but_missing_keys: - raise ValueError("Missing keys") - if unexpected_but_present_keys: - raise ValueError("Extra keys") - return tops - - def _fill_prior_options(self, prior_options): - """ - Fills in options for prior parameters on latent space - - Paramters - --------- - new_dict : dictionary - The prior parameters used to specify the prior - """ - default_dict = {"mu_l2": 1.0} - new_dict = {**default_dict, **prior_options} - return new_dict - - def fit(self, X, progress_bar=True, seed=2021): - """ - Fits a model given covariates X - - Parameters - ---------- - X : np.array-like,(n_samples,n_covariates) - The data - - progress_bar : bool,default=True - Whether to print the progress bar to monitor time - - Returns - ------- - self : object - The model - """ - X = X.astype(np.float32) - self._test_inputs(X) - training_options = self.training_options - N, p = X.shape - self.p = p - rng = np.random.default_rng(int(seed)) - K = self.n_clusters - - W_, sigmal_, pi_logits, mu_list = self._initialize_variables(X) - - X = self._transform_training_data(X)[0] - - trainable_variables = [W_, sigmal_, pi_logits] + mu_list - - optimizer = torch.optim.SGD( - trainable_variables, - lr=training_options["learning_rate"], - momentum=training_options["momentum"], - ) - - softplus = nn.Softplus() - mse = nn.MSELoss() - smax = nn.Softmax() - - eye = torch.tensor(np.eye(p).astype(np.float32)) - - for i in trange( - training_options["n_iterations"], disable=not progress_bar - ): - idx = rng.choice( - X.shape[0], size=training_options["batch_size"], replace=False - ) - - X_batch = X[idx] - - sigma2 = softplus(sigmal_) - - X_each = [X_batch - torch.matmul(mu_list[k], W_) for k in range(K)] - - Sigma = torch.matmul(torch.transpose(W_, 0, 1), W_) + sigma2 * eye - - m = MultivariateNormal(torch.zeros(p), Sigma) - - pi_ = smax(pi_logits) - loss_logits = 0.001 * mse(pi_logits, torch.zeros(K)) - - log_likelihood_each = [ - m.log_prob(X_each[k]) for k in range(K) - ] # List of batchsize x 1 - log_likelihood_stack = torch.stack( - log_likelihood_each - ) # matrix of batchsize x K - log_likelihood_components = 
torch.transpose( - log_likelihood_stack, 0, 1 - ) + torch.log( - pi_ - ) # Log component posterior - log_likelihood_marg = torch.logsumexp( - log_likelihood_components, dim=1 - ) # Log likelihood per component - loss_likelihood = -1 * torch.mean( - log_likelihood_marg - ) # Loss function of likelihood - - loss = loss_logits + loss_likelihood - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - self._store_instance_variables(trainable_variables) - return self - - def get_covariance(self): - """ - Gets the covariance matrix defined by the model parameters - - Parameters - ---------- - None - - Returns - ------- - covariance : np.array-like(p,p) - The covariance matrix - """ - covariance = np.dot(self.W_.T, self.W_) + np.diag(self.sigma2_) - return covariance - - def get_precision(self): - """ - Gets the precision matrix defined as the inverse of the covariance - - Parameters - ---------- - None - - Returns - ------- - precision : np.array-like(p,p) - The inverse of the covariance matrix - """ - covariance = self.get_covariance() - precision = la.inv(covariance) - return precision - - def transform(self, X): - """ - This returns the latent variable estimates given X - - Parameters - ---------- - X : np array-like,(N_samples,p - The data to transform. - - Returns - ------- - S : np.array-like,(N_samples,n_components) - The factor estimates - """ - prec = self.get_precision() - coefs = np.dot(self.W_, prec) - S = np.dot(X, coefs.T) - return S - - def _initialize_save_losses(self): - """ - This method initializes the arrays to track relevant variables - during training at each iteration - - Sets - ---- - losses_likelihood : np.array(n_iterations) - The log likelihood - - losses_prior : np.array(n_iterations) - The log prior - - losses_posterior : np.array(n_iterations) - The log posterior - """ - n_iterations = self.training_options["n_iterations"] - self.losses_likelihood = np.empty(n_iterations) - self.losses_prior = np.empty(n_iterations) - self.losses_posterior = np.empty(n_iterations) - - def _initialize_variables(self, X): - """ - Initializes the variables of the model. Right now fits a PCA model - in sklearn, uses the loadings and sets sigma^2 to be unexplained - variance for each group. 
- - Parameters - ---------- - X : np.array-like,(n_samples,p) - The data - - Returns - ------- - W_ : torch.tensor-like,(n_components,p) - The loadings of our latent factor model - - sigmal_ : list - A list of the isotropic noises for each group - """ - model_pca = PCA(self.n_components) - S_ = model_pca.fit_transform(X) - model_gmm = GaussianMixture(self.n_clusters, covariance_type="tied") - model_gmm.fit(S_) - W_init = model_pca.components_ - W_ = torch.tensor(W_init.astype(np.float32), requires_grad=True) - X_recon = np.dot(S_, W_init) - diff = np.mean((X - X_recon) ** 2) - sinv = softplus_inverse_np(diff * np.ones(1).astype(np.float32)) - sigmal_ = torch.tensor(sinv, requires_grad=True) - - pi_logits = torch.tensor( - np.log(model_gmm.weights_ + 1e-3).astype(np.float32), - requires_grad=True, - ) - mean_list = [ - torch.tensor( - model_gmm.means_[k].astype(np.float32), requires_grad=True - ) - for k in range(self.n_clusters) - ] - return W_, sigmal_, pi_logits, mean_list - - def _save_losses(self, i, log_likelihood, log_prior, log_posterior): - """ - Saves the values of the losses at each iteration - - Parameters - ----------- - i : int - Current training iteration - - losses_likelihood : torch.tensor - The log likelihood - - losses_prior : torch.tensor - The log prior - - losses_posterior : torch.tensor - The log posterior - """ - self.losses_likelihood[i] = log_likelihood.detach().numpy() - if isinstance(log_prior, np.ndarray): - self.losses_prior[i] = log_prior - else: - self.losses_prior[i] = log_prior.detach().numpy() - self.losses_posterior[i] = log_posterior.detach().numpy() - - def _store_instance_variables(self, trainable_variables): - """ - Saves the learned variables - - Parameters - ---------- - trainable_variables : list - List of tensorflow variables saved - - Sets - ---- - W_ : np.array-like,(n_components,p) - The loadings - - sigma2_ : float - The isotropic variance - """ - self.W_ = trainable_variables[0].detach().numpy() - self.sigma2_ = nn.Softplus()(trainable_variables[1]).detach().numpy() - self.pi_ = nn.Softmax()(trainable_variables[2]).detach().numpy() - self.mu_ = np.zeros((self.n_clusters, self.n_components)) - for i in range(self.n_clusters): - self.mu_[i] = trainable_variables[3 + i].detach().numpy() - - def _test_inputs(self, X: NDArray[np.float_]) -> None: - """ - Just tests to make sure data is numpy array - """ - if not isinstance(X, np.ndarray): - raise ValueError("Data is numpy array") - if self.training_options["batch_size"] > X.shape[0]: - raise ValueError("Batch size exceeds number of samples") - - def _transform_training_data(self, *args): - """ - Convert a list of numpy arrays to tensors - """ - out = [] - for arg in args: - out.append(torch.tensor(arg)) - return out - - -def softplus_inverse_np(y): - """ - Computes the inverse of the softplus activation of x in a - numerically stable way - - Softplus: y = log(exp(x) + 1) - Softplus^{-1}: y = np.log(np.exp(x) - 1) - - Parameters - ---------- - x : np.array - Original array - - Returns - ------- - x : np.array - Transformed array - """ - min_threshold = 10 ** -15 - max_threshold = 500 - safe_y = np.clip( - y, min_threshold, max_threshold - ) # we can safely pass this to the reference inverse_softplus below - safe_x = np.log(np.exp(safe_y) - 1) - - # if y_i was below (respectively: above) the min (max) threshold, replace with log(y_i) (y_i) - x = np.where(y < min_threshold, np.log(y), safe_x) - x = np.where(y > max_threshold, y, x) - return x diff --git a/python/python/bystro/ancestry/inference.py 
b/python/python/bystro/ancestry/inference.py deleted file mode 100644 index 44a099c09..000000000 --- a/python/python/bystro/ancestry/inference.py +++ /dev/null @@ -1,382 +0,0 @@ -"""Classify genotypes at inference time.""" - -import logging -import gc -import os -import psutil -import warnings - -from msgspec import Struct -import numpy as np -import pandas as pd - -import pyarrow as pa # type: ignore -import pyarrow.compute as pc # type: ignore -from pyarrow.dataset import Dataset # type: ignore - -from sklearn.ensemble import RandomForestClassifier # type: ignore - -from bystro.ancestry.ancestry_types import ( - AncestryResults, - AncestryScoresOneSample, - AncestryTopHit, - PopulationVector, - ProbabilityInterval, - SuperpopVector, -) -from bystro.ancestry.asserts import assert_equals -from bystro.ancestry.train import POPS, SUPERPOP_FROM_POP, SUPERPOPS -from bystro.utils.timer import Timer - -logger = logging.getLogger(__name__) -warnings.simplefilter(action="ignore", category=FutureWarning) - -pd.options.future.infer_string = True # type: ignore - -ANCESTRY_SCORE_SAMPLE_CHUNK_SIZE = int(os.getenv("ANCESTRY_SCORE_SAMPLE_CHUNK_SIZE", 200)) - -DEBUG = False -DEBUG_MISSINGNESS_IMPOSED = 0.0 - - -class AncestryModel(Struct, frozen=True, forbid_unknown_fields=True, rename="camel"): - """Bundle together PCA and RFC models for bookkeeping purposes.""" - - pca_loadings_df: pd.DataFrame - rfc: RandomForestClassifier - - def __post_init__(self) -> None: - """Ensure that PCA and RFC features line up correctly.""" - pca_cols = self.pca_loadings_df.columns - rfc_features = self.rfc.feature_names_in_ - if not (len(pca_cols) == len(rfc_features) and (pca_cols == rfc_features).all()): - err_msg = ( - f"PC loadings columns:{self.pca_loadings_df.columns} must equal " - f"rfc.feature_names_in: {self.rfc.feature_names_in_}" - ) - raise ValueError(err_msg) - - def predict_proba(self, genotypes: pd.DataFrame) -> tuple[dict[str, list[float]], pd.DataFrame]: - """ - Predict population probabilities from dosage matrix. - - Args: - genotypes: pd.DataFrame, shape (n_variants, m_samples) - A dosage matrix with samples as rows and variants as columns. 
- """ - logger.debug("computing PCA transformation") - genotypes = genotypes.fillna(0) - - with Timer() as timer: - Xpc = genotypes.T @ self.pca_loadings_df.loc[genotypes.index, :] - - # get memory usage - logger.info( - "Memory usage after PCA transformation: %s (MB)", - psutil.Process(os.getpid()).memory_info().rss / 1024**2, - ) - - logger.debug("finished computing PCA transformation in %f seconds", timer.elapsed_time) - logger.debug("computing RFC classification") - - with Timer() as timer: - probs = self.rfc.predict_proba(Xpc) - - logger.debug("finished computing RFC classification in %f seconds", timer.elapsed_time) - - Xpc_dict = Xpc.T.to_dict(orient="list") - - return Xpc_dict, pd.DataFrame(probs, index=genotypes.columns, columns=POPS) - - -def _package_ancestry_response_from_pop_probs( - pcs_for_plotting: dict[str, list[float]], - pop_probs_df: pd.DataFrame, - n_snps: int, -) -> AncestryResults: - """Fill out AncestryResults using filepath, numerical model output and sample-wise missingnesses.""" - superpop_probs_df = _superpop_probs_from_pop_probs(pop_probs_df) - ancestry_results = [] - - for (sample_id, sample_pop_probs), (_sample_id2, sample_superpop_probs) in zip( - pop_probs_df.iterrows(), superpop_probs_df.iterrows(), strict=True - ): - if not isinstance(sample_id, str): - # just spoonfeeding mypy here-- this should never raise - err_msg = ( - f"Expected sample_id of type str, got {sample_id} of type({type(sample_id)}) instead" - ) - raise TypeError(err_msg) - - pop_probs_dict = dict(sample_pop_probs) - max_value = float(max(pop_probs_dict.values())) - top_pops = [pop for pop, value in pop_probs_dict.items() if value == max_value] - - pop_vector = PopulationVector( - **{ - pop: _make_trivial_probability_interval(value) - for (pop, value) in dict(sample_pop_probs).items() - } - ) - superpop_vector = SuperpopVector( - **{ - superpop: _make_trivial_probability_interval(value) - for (superpop, value) in dict(sample_superpop_probs).items() - } - ) - ancestry_results.append( - AncestryScoresOneSample( - sample_id=sample_id, - top_hit=AncestryTopHit(probability=max_value, populations=top_pops), - populations=pop_vector, - superpops=superpop_vector, - n_snps=n_snps, - ) - ) - - return AncestryResults(results=ancestry_results, pcs=pcs_for_plotting) - - -class AncestryModels(Struct, frozen=True, forbid_unknown_fields=True, rename="camel"): - """ - A Struct of trained models for predicting ancestry, - using either gnomAD PC's or genotyping array PC's, depending on which has lower missingness. - """ - - gnomad_model: AncestryModel - array_model: AncestryModel - - -def infer_ancestry( - ancestry_models: AncestryModels, genotypes: Dataset, model: str | None = None -) -> AncestryResults: - """ - Infer ancestry from genotypes using a trained model. - - Parameters - ---------- - ancestry_models: AncestryModels - A Struct of trained models for predicting ancestry, - - genotypes: Arrow Dataset, shape (n_variants, m_samples) - A dataset containing genotypes to be classified. - - model: str, optional - The model to use for ancestry inference. - The model name must be one of "array" or "gnomad" - If None, the model with the lowest missingness will be used. - - Returns - ------- - AncestryResults - A Struct of ancestry results for each sample in the dataset. - AncestryResults.results is a list of AncestryScoresOneSample objects and - AncestryResults.pcs is a dictionary of principal components for each sample. 
- - AncestryScoresOneSample is a Struct with the following fields: - - sample_id: str - The ID of the sample. This is the same as the sample ID in the input dataset. - - top_hit: AncestryTopHit - The top hit for a sample, with the max value (a probability) and the list of population(s) - corresponding to that probability, typically a single population. - - populations: PopulationVector - A Struct of population probabilities for the sample. For instance, if the sample is - 80% Finnish and 20% Nigerian, the PopulationVector - would be {"FIN": 0.8, "YRI": 0.2}. - - superpops: SuperpopVector - A Struct of super population probabilities for the sample. For instance, if the sample is - 80% European and 20% African, the PopulationVector would be {"AFR": 0.2, "EUR": 0.8}. - - num_snps_selected: int - The number of SNPs used to infer ancestry for the sample. - """ - - logger.debug("Beginning ancestry inference") - - logger.info( - "Memory usage before dosage matrix filtered row counting: %s (MB)", - psutil.Process(os.getpid()).memory_info().rss / 1024**2, - ) - - pool = pa.default_memory_pool() - - with Timer() as timer: - gnomad_model = ancestry_models.gnomad_model - array_model = ancestry_models.array_model - - mask_gnomad = pc.field("locus").isin(gnomad_model.pca_loadings_df.index) - mask_array = pc.field("locus").isin(array_model.pca_loadings_df.index) - - scanner_gnomad = genotypes.filter(mask_gnomad) - scanner_array = genotypes.filter(mask_array) - - gnomad_matching_row_count = scanner_gnomad.count_rows(memory_pool=pool) - array_matching_row_count = scanner_array.count_rows(memory_pool=pool) - - logger.debug( - "Found %d rows in genotypes matching gnomAD PCA loadings", - gnomad_matching_row_count, - ) - logger.debug( - "Found %d rows in genotypes matching array PCA loadings", - array_matching_row_count, - ) - - logger.info( - "Memory usage after dosage matrix filtered row counting: %s (MB)", - psutil.Process(os.getpid()).memory_info().rss / 1024**2, - ) - - gnomad_completeness = gnomad_matching_row_count / len(gnomad_model.pca_loadings_df.index) - array_completeness = array_matching_row_count / len(array_model.pca_loadings_df.index) - - if model is None: - logger.debug("No model specified, selecting model with lowest missing") - - # give slight preference to array model if missingness is very simliar - # because array model performs better - if array_completeness >= (gnomad_completeness * 0.95): - scanner = scanner_array - ancestry_model = array_model - - logger.debug("Using array PCA loadings for ancestry inference due to lower missingness") - num_snps_selected = array_matching_row_count - del scanner_gnomad - del gnomad_model - else: - scanner = scanner_gnomad - ancestry_model = gnomad_model - - logger.debug("Using gnomAD PCA loadings for ancestry inference due to lower missingness") - num_snps_selected = gnomad_matching_row_count - del scanner_array - del array_model - else: - if model == "gnomad": - scanner = scanner_gnomad - ancestry_model = gnomad_model - - logger.debug("Using gnomAD PCA loadings for ancestry inference") - num_snps_selected = gnomad_matching_row_count - del scanner_array - del array_model - elif model == "array": - scanner = scanner_array - ancestry_model = array_model - - logger.debug("Using array PCA loadings for ancestry inference") - num_snps_selected = array_matching_row_count - del scanner_gnomad - del gnomad_model - else: - raise ValueError(f"Invalid model specified: {model}") - - logger.info("Completed ancestry model selection in %f seconds", timer.elapsed_time) 
- - samples = [name for name in genotypes.schema.names if name != "locus"] - - # Take chunks of up to 500 samples - chunk_size = ANCESTRY_SCORE_SAMPLE_CHUNK_SIZE - num_samples = len(samples) - start = 0 - - all_pcs_for_plotting = {} - all_pop_probs_df = [] - while start < num_samples: - with Timer() as timer: - end = min(start + chunk_size, num_samples) - chunk_samples = samples[start:end] - - genotypes_chunk_table = scanner.to_table(["locus", *chunk_samples], memory_pool=pool) - - logger.info( - "Memory usage after dosage matrix filtering for samples %d to %d: %s (MB)", - start, - end, - psutil.Process(os.getpid()).memory_info().rss / 1024**2, - ) - - genotypes_df = genotypes_chunk_table.to_pandas().set_index("locus") - - # If there are duplicates, log them and remove them - if genotypes_df.index.duplicated().any(): - logger.warning("Found duplicate loci in genotypes, removing duplicates") - genotypes_df = genotypes_df[~genotypes_df.index.duplicated(keep="first")] - - logger.info( - "Memory usage after converting table to Pandas dataframe, for samples %d to %d: %s (MB)", - start, - end, - psutil.Process(os.getpid()).memory_info().rss / 1024**2, - ) - - if DEBUG and DEBUG_MISSINGNESS_IMPOSED > 0: - # drop a random DEBUG_MISSINGNESS_IMPOSED number of rows - genotypes_df = genotypes_df.sample( - frac=1 - DEBUG_MISSINGNESS_IMPOSED, random_state=42, axis=0 - ) - - logger.info( - "DEBUG: imposed missingness, genotypes_df shape after: %s", genotypes_df.shape - ) - - pcs_for_plotting, pop_probs_df = ancestry_model.predict_proba(genotypes_df) - - all_pcs_for_plotting.update(pcs_for_plotting) - all_pop_probs_df.append(pop_probs_df) - - # We must manually free memory, because - # otherwise it appears Python has a tough time coping - # with the size of allocations, which can reach into gigabytes per loop iteration - del genotypes_df - del genotypes_chunk_table - pool.release_unused() - gc.collect() - - logger.info( - "Completed ancestry inference for samples %d to %d in %f seconds. RSS: %s (MB)", - start, - end, - timer.elapsed_time, - psutil.Process(os.getpid()).memory_info().rss / 1024**2, - ) - - start = end - - return _package_ancestry_response_from_pop_probs( - all_pcs_for_plotting, pd.concat(all_pop_probs_df), num_snps_selected - ) - - -def _superpop_probs_from_pop_probs(pop_probs: pd.DataFrame) -> pd.DataFrame: - """Given a matrix of population probabilities, convert to matrix of superpop probabilities.""" - N = len(pop_probs) - pops = sorted(SUPERPOP_FROM_POP.keys()) - superpops = sorted(set(SUPERPOP_FROM_POP.values())) - superpop_projection_matrix = pd.DataFrame( - np.array([[int(superpop == SUPERPOP_FROM_POP[pop]) for superpop in superpops] for pop in pops]), - index=POPS, - columns=SUPERPOPS, - ) - superpop_probs = pop_probs @ superpop_projection_matrix - assert_equals( - "Expected superpop_probs shape (N x |superpops|):", - superpop_probs.shape, - "Actual shape", - (N, len(superpops)), - ) - return superpop_probs - - -def _make_trivial_probability_interval(x: float) -> ProbabilityInterval: - """Promote a value to a trivial ProbabilityInterval with equal lower, upper bounds.""" - # The ancestry calculations come out as np.float64, which is not JSON serializable in msgspec. 
- - # Allow for floating point precision error - if x > 1.005: - raise ValueError(f"Expected value between 0 and 1, got {x}") - if x < -0.005: - raise ValueError(f"Expected value between 0 and 1, got {x}") - - x = min(1, max(0, x)) - - return ProbabilityInterval(float(x), float(x)) diff --git a/python/python/bystro/ancestry/intermediate_data/.gitignore b/python/python/bystro/ancestry/intermediate_data/.gitignore deleted file mode 100644 index e2b939795..000000000 --- a/python/python/bystro/ancestry/intermediate_data/.gitignore +++ /dev/null @@ -1 +0,0 @@ -shared_illumina_affy_variants.csv \ No newline at end of file diff --git a/python/python/bystro/ancestry/model.py b/python/python/bystro/ancestry/model.py deleted file mode 100644 index dfac9642d..000000000 --- a/python/python/bystro/ancestry/model.py +++ /dev/null @@ -1,265 +0,0 @@ -"""Provide a worker for the ancestry model.""" - -import logging -import os -from pathlib import Path -import requests - -import pandas as pd -from skops.io import load as skops_load # type: ignore - -from bystro.ancestry.inference import AncestryModel, AncestryModels - -from bystro.utils.timer import Timer - -logging.basicConfig( - filename="ancestry_model.log", - level=logging.DEBUG, - format="%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", -) -logger = logging.getLogger() - -pd.options.future.infer_string = True # type: ignore - -ARRAYSET_MODEL_KEY = "array" -GNOMADSET_MODEL_KEY = "gnomad" - -ANCESTRY_BUCKET = os.getenv("ANCESTRY_BUCKET", "bystro-ancestry-public") -ANCESTRY_MODEL_DIR = os.getenv("ANCESTRY_MODEL_DIR", str(Path(__file__).parent / "data")) -GNOMAD_PCA_FILE = "gnomadset_pca_sklearn_151.csv" -GNOMAD_RFC_FILE = "gnomadset_rfc_sklearn_151.skop" -ARRAY_PCA_FILE = "arrayset_pca_sklearn_151.csv" -ARRAY_RFC_FILE = "arrayset_rfc_sklearn_151.skop" - -models_cache: dict[str, AncestryModels] = {} - - -def download_file(bucket: str, key: str, filename: str): - """ - Download a file from the given URL to the local path. - - Args: - url (str): The URL to download the file from. - local_path (str): The local path to save the file. - """ - url = f"https://{bucket}.s3.amazonaws.com/{key}" - logger.info("Downloading file from %s to %s", url, filename) - response = requests.get(url) - response.raise_for_status() # Raise an error for bad status codes - - with open(filename, "wb") as f: - f.write(response.content) - - -def get_one_model( - pca_local_path: str, rfc_local_path: str, pca_file_key: str, rfc_file_key: str -) -> AncestryModel: - """ - Load an ancestry model from S3. - - Args: - pca_local_path (str): The local path to save the PCA file. - rfc_local_path (str): The local path to save the RFC file. - pca_file_key (str): The remove path to the PCA file. - rfc_file_key (str): The remove path to the RFC file. - - Raises: - ValueError: If the PCA or RFC file is not found. - - Returns: - AncestryModel: The loaded ancestry model. - """ - logger.info( - "Downloading PCA file %s and RFC file %s to %s and %s", - pca_file_key, - rfc_file_key, - pca_local_path, - rfc_local_path, - ) - - with Timer() as timer: - try: - download_file(bucket=ANCESTRY_BUCKET, key=pca_file_key, filename=pca_local_path) - except requests.HTTPError as e: - raise ValueError( - f"{pca_file_key} not found in bucket {ANCESTRY_BUCKET}. " - "This assembly is not supported." 
- ) from e - - try: - download_file(bucket=ANCESTRY_BUCKET, key=rfc_file_key, filename=rfc_local_path) - except requests.HTTPError as e: - raise ValueError( - f"{rfc_file_key} not found in bucket {ANCESTRY_BUCKET}. " - "This assembly is not supported." - ) from e - - logger.debug("Downloaded PCA file and RFC file in %f seconds", timer.elapsed_time) - - return get_one_model_from_file_system(pca_local_path, rfc_local_path) - - -def get_models(assembly: str) -> AncestryModels: - """ - Load the ancestry models for the given assembly from S3. - - Args: - assembly (str): The genome assembly to load the models for. - Example: "hg38" or "hg19" - - Returns: - AncestryModels: The loaded models for the given assembly. - """ - if assembly in models_cache: - logger.info("Model for assembly %s found in cache.", assembly) - return models_cache[assembly] - - paths = _get_local_paths(assembly) - - if ( - Path(paths[GNOMADSET_MODEL_KEY]["pca_local_path"]).exists() - and Path(paths[GNOMADSET_MODEL_KEY]["rfc_local_path"]).exists() - and Path(paths[ARRAYSET_MODEL_KEY]["pca_local_path"]).exists() - and Path(paths[ARRAYSET_MODEL_KEY]["rfc_local_path"]).exists() - ): - logger.info("Loading models from disk.") - gnomad_model = get_one_model_from_file_system( - paths[GNOMADSET_MODEL_KEY]["pca_local_path"], paths[GNOMADSET_MODEL_KEY]["rfc_local_path"] - ) - array_model = get_one_model_from_file_system( - paths[ARRAYSET_MODEL_KEY]["pca_local_path"], paths[ARRAYSET_MODEL_KEY]["rfc_local_path"] - ) - models = AncestryModels(gnomad_model, array_model) - else: - gnomad_model = get_one_model( - paths[GNOMADSET_MODEL_KEY]["pca_local_path"], - paths[GNOMADSET_MODEL_KEY]["rfc_local_path"], - paths[GNOMADSET_MODEL_KEY]["pca_remote_path"], - paths[GNOMADSET_MODEL_KEY]["rfc_remote_path"], - ) - array_model = get_one_model( - paths[ARRAYSET_MODEL_KEY]["pca_local_path"], - paths[ARRAYSET_MODEL_KEY]["rfc_local_path"], - paths[ARRAYSET_MODEL_KEY]["pca_remote_path"], - paths[ARRAYSET_MODEL_KEY]["rfc_remote_path"], - ) - models = AncestryModels(gnomad_model, array_model) - - # Update the cache with the new model - if len(models_cache) >= 1: - # Remove the oldest loaded model to maintain cache size - oldest_assembly = next(iter(models_cache)) - del models_cache[oldest_assembly] - models_cache[assembly] = models - - return models - - -def get_one_model_from_file_system(pca_path: str, rfc_path: str) -> AncestryModel: - """ - Load an ancestry model from the local file system. - - Args: - pca_path (str): The path to the PCA file. - rfc_path (str): The path to the RFC file. - - Returns: - AncestryModel: The loaded ancestry model. - """ - with Timer() as timer: - logger.info("Loading PCA file %s", pca_path) - pca_loadings_df = pd.read_csv(pca_path, index_col=0) - - logger.info("Loading RFC file %s", rfc_path) - rfc = skops_load(rfc_path) - - logger.debug("Loaded PCA and RFC files in %f seconds", timer.elapsed_time) - - return AncestryModel(pca_loadings_df, rfc) - - -def get_models_from_file_system(assembly: str) -> AncestryModels: - """ - Load the ancestry models for the given assembly from the local file system. - - Args: - model_dir (str): The local directory where the models are stored. - We expect the models to be stored in the following format: - ``` - model_dir/ - assembly/ - gnomadset_pca.csv - gnomadset_rfc.skop - arrayset_pca.csv - arrayset_rfc.skop - ``` - assembly (str): The genome assembly to load the models for. - Example: "hg38" or "hg19" - - Returns: - AncestryModels: The loaded models for the given assembly. 
- """ - if assembly in models_cache: - logger.info("Model for assembly %s found in cache.", assembly) - return models_cache[assembly] - - paths = _get_local_paths(assembly) - - gnomad_model = get_one_model_from_file_system( - paths[GNOMADSET_MODEL_KEY]["pca_local_path"], paths[GNOMADSET_MODEL_KEY]["rfc_local_path"] - ) - array_model = get_one_model_from_file_system( - paths[ARRAYSET_MODEL_KEY]["pca_local_path"], paths[ARRAYSET_MODEL_KEY]["rfc_local_path"] - ) - - models = AncestryModels(gnomad_model, array_model) - - # Update the cache with the new model - if len(models_cache) >= 1: - # Remove the oldest loaded model to maintain cache size - oldest_assembly = next(iter(models_cache)) - del models_cache[oldest_assembly] - models_cache[assembly] = models - - return models - - -def _get_local_paths(assembly: str) -> dict[str, dict[str, str]]: - local_dir = Path(ANCESTRY_MODEL_DIR) / assembly - local_dir.mkdir(exist_ok=True, parents=True) - - gnomad_pca_basename = f"{assembly}_{GNOMAD_PCA_FILE}" - gnomad_rfc_basename = f"{assembly}_{GNOMAD_RFC_FILE}" - - array_pca_basename = f"{assembly}_{ARRAY_PCA_FILE}" - array_rfc_basename = f"{assembly}_{ARRAY_RFC_FILE}" - - pca_local_path_gnomad = local_dir / gnomad_pca_basename - rfc_local_path_gnomad = local_dir / gnomad_rfc_basename - - pca_local_path_array = local_dir / array_pca_basename - rfc_local_path_array = local_dir / array_rfc_basename - pca_remote_path_gnomad = f"{assembly}/{gnomad_pca_basename}" - rfc_remote_path_gnomad = f"{assembly}/{gnomad_rfc_basename}" - pca_remote_path_array = f"{assembly}/{array_pca_basename}" - rfc_remote_path_array = f"{assembly}/{array_rfc_basename}" - - return { - GNOMADSET_MODEL_KEY: { - "pca_local_path": str(pca_local_path_gnomad), - "rfc_local_path": str(rfc_local_path_gnomad), - "pca_remote_path": pca_remote_path_gnomad, - "rfc_remote_path": rfc_remote_path_gnomad, - "pca_basename": gnomad_pca_basename, - "rfc_basename": gnomad_rfc_basename, - }, - ARRAYSET_MODEL_KEY: { - "pca_local_path": str(pca_local_path_array), - "rfc_local_path": str(rfc_local_path_array), - "pca_remote_path": pca_remote_path_array, - "rfc_remote_path": rfc_remote_path_array, - "pca_basename": array_pca_basename, - "rfc_basename": array_rfc_basename, - }, - } diff --git a/python/python/bystro/ancestry/preprocess_1kgp_using_gnomad_loadings.sh b/python/python/bystro/ancestry/preprocess_1kgp_using_gnomad_loadings.sh deleted file mode 100644 index 0fc47ca94..000000000 --- a/python/python/bystro/ancestry/preprocess_1kgp_using_gnomad_loadings.sh +++ /dev/null @@ -1,75 +0,0 @@ -#Extract gnomad loadings variant list from 1kgp genomes using plink - -#TODO convert this process to one using our own vcf parser - -# Plink2 is needed for this pre-process -# If installed in current directory, add to path -export PATH=$PATH:./ -# Check if plink2 is installed or not in path -program_name="plink2" -if ! command -v "$program_name" &> /dev/null; then - echo "Error: '$program_name' is not installed or not present on system PATH." - echo "Please install '$program_name' or make sure it is added to the PATH. You can install from: https://www.cog-genomics.org/plink/2.0/" - exit 1 -fi -echo "'$program_name' is installed and present on the system's PATH." 
- -# Download 1kgp manifest that has list of vcf files with checksums - make sure this is the most recent version of 1kgp -wget 'ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/20220804_manifest.txt' -# Download 1kgp genomes -wget 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr'* - -# Check if downloaded files were properly downloaded using md5 checksums -MANIFEST_FILE="20220804_manifest.txt" -# Variable to keep track of the checksum verification status -verification_status="PASSED" -# Verify checksums for each file in the manifest and print pass/fail -while read -r file_name expected_checksum; do - # Calculate md5sum checksum of the downloaded file - actual_checksum=$(md5sum "$file_name" | awk '{print $1}') - - # Compare the calculated checksum with the expected checksum and mark if failure - if [ "$actual_checksum" != "$expected_checksum" ]; then - verification_status="FAILED" - echo "Checksum verification FAILED for $file_name" - fi -done < "$MANIFEST_FILE" - -if [ "$verification_status" = "PASSED" ]; then - echo "All checksum verifications PASSED" -else - echo "One or more checksum verifications FAILED" -fi - -# Gnomad loadings have been preprocessed to extract the variant list only as gnomadvariantlist.txt - -# Extract from each autosomal chromosome for ancestry -for ((chr=1; chr<=22; chr++)) -do - # Input and output paths for each chromosome - input_vcf="1kGP_high_coverage_Illumina.chr${chr}.filtered.SNV_INDEL_SV_phased_panel.vcf.gz" - output_base="tempchr${chr}" - - # Run plink2 with current chromosome - ./plink2 --extract gnomadvariantlist.txt \ - --make-pgen \ - --out "$output_base" \ - --vcf "$input_vcf" - # Append output file name to the merge list, excluding 'chr1' - if [ "$chr" -ne 1 ]; then - echo "$output_base" >> chr_merge_list.txt - fi -done - -# Merge the files together using list of output file names -./plink2 --export vcf \ - --out 1kgpGnomadList \ - --pfile tempchr1 \ - --pmerge-list chr_merge_list.txt - -#Delete temp files that are no longer needed -echo "Clean up temp files" -rm "1kGP_high_coverage_Illumina.chr"* -rm "tempchr"* -rm "chr_merge_list.txt" -rm "$MANIFEST_FILE" \ No newline at end of file diff --git a/python/python/bystro/ancestry/preprocess_vcfs.sh b/python/python/bystro/ancestry/preprocess_vcfs.sh deleted file mode 100644 index 69242b395..000000000 --- a/python/python/bystro/ancestry/preprocess_vcfs.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -set -e - -FTP_PREFIX="ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/" - -if [[ $(ls chr*_filtered.vcf.gz) ]] -then - echo "Found temp files of form: chr*_filtered.vcf.gz, please remove before continuing" - exit 1 -fi - -for chr in {1..22};do - echo "Processing chromosome $chr at" `date` - vcf_filename=ALL.chr$chr.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz - #vcf_filename=chr$chr.vcf.gz - if [ ! 
-f $vcf_filename ] - then - echo "didn't find $vcf_filename in directory; downloading" - wget $FTP_PREFIX$vcf_filename - else - echo "Found $vcf_filename in directory" - fi - - - output_filename="chr${chr}_filtered.vcf.gz" - echo "writing to temp file:" $output_filename - bcftools view -I $vcf_filename -Ou | # exclude indels - bcftools view -i 'MAF > 0.01' -Ou | # exclude MAF < 0.01 - bcftools view -c 1 -Ou | # exclude monomorphic sites - bcftools norm -d all -Ou | # include only bi-allelic sites - bcftools +prune -l 0.2 -w 1000 | # last output must be human-readable - gzip -c > $output_filename -done - -final_outfile=1KGP_final_variants_1percent.vcf.gz - -echo "Writing final variants to: " $final_outfile -vcf-concat chr*_filtered.vcf.gz | gzip -c > $final_outfile - -echo "Cleaning up" -rm chr*_filtered.vcf.gz diff --git a/python/python/bystro/ancestry/tests/__init__.py b/python/python/bystro/ancestry/tests/__init__.py deleted file mode 100644 index 2da476115..000000000 --- a/python/python/bystro/ancestry/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for ancestry module.""" diff --git a/python/python/bystro/ancestry/tests/test_adversarial_autoencoder.py b/python/python/bystro/ancestry/tests/test_adversarial_autoencoder.py deleted file mode 100644 index 2387e6bbd..000000000 --- a/python/python/bystro/ancestry/tests/test_adversarial_autoencoder.py +++ /dev/null @@ -1,32 +0,0 @@ -import numpy as np -import scipy.stats as st # type: ignore -from bystro.ancestry.adversarial_autoencoder import AdversarialAutoencoder - - -def generate_data(N = 10000, p = 100, L = 3, sigma = 1.0): - rng = np.random.default_rng(2021) - - W_base = st.ortho_group.rvs(p) - W = W_base[:L] - lamb = rng.gamma(1, 1, size=L) + 1 - for ll in range(L): - W[ll] = W[ll] * lamb[ll] - S_train = rng.normal(size=(N, L)) - X_hat = np.dot(S_train, W) - X_noise = sigma * rng.normal(size=(N, p)) - X_train = X_hat + X_noise - return W, sigma, X_train, S_train - - -def test_adversarial_autoencoder(): - W, sigma, X, S_train = generate_data() - model = AdversarialAutoencoder(2, training_options={"n_iterations": 100}) - model.fit(X) - - S_est = model.transform(X.astype(np.float32)) - X_recon = model.inverse_transform(S_est.astype(np.float32)) - assert X_recon is not None - - -if __name__ == "__main__": - test_adversarial_autoencoder() diff --git a/python/python/bystro/ancestry/tests/test_ancestry_gmm.py b/python/python/bystro/ancestry/tests/test_ancestry_gmm.py deleted file mode 100644 index 16a8a4547..000000000 --- a/python/python/bystro/ancestry/tests/test_ancestry_gmm.py +++ /dev/null @@ -1,47 +0,0 @@ -import pytest -import numpy as np -from bystro.ancestry.gmm_ancestry import GaussianMixturePPCA - - -@pytest.fixture -def example_data(): - # Provide an example dataset for testing - rng = np.random.default_rng(2021) - - return rng.normal( - size=(100, 10) - ) # Example data with 100 samples and 10 features - - -def test_gaussian_mixture_ppca_fit(example_data): - # Test the fit method of GaussianMixturePPCA - - # Create an instance of the model with minimal configuration for testing - model = GaussianMixturePPCA(n_clusters=2, n_components=5) - - # Perform fit on the example data - model.fit(example_data, progress_bar=False) - - # Assert that the model has been trained (you can add more specific assertions) - assert hasattr(model, "W_") - assert hasattr(model, "sigma2_") - assert hasattr(model, "pi_") - assert hasattr(model, "mu_") - - -""" -def test_gaussian_mixture_ppca_transform(example_data): - # Test the transform method of 
GaussianMixturePPCA - - # Create an instance of the model with minimal configuration for testing - model = GaussianMixturePPCA(n_clusters=2, n_components=5) - - # Perform fit on the example data - model.fit(example_data, progress_bar=False) - - # Perform transformation on the example data - transformed_data = model.transform(example_data) - - # Assert that the transformed_data has the correct shape - assert transformed_data.shape == (len(example_data), model.n_components) -""" diff --git a/python/python/bystro/ancestry/tests/test_ancestry_types.py b/python/python/bystro/ancestry/tests/test_ancestry_types.py deleted file mode 100644 index e257047b3..000000000 --- a/python/python/bystro/ancestry/tests/test_ancestry_types.py +++ /dev/null @@ -1,213 +0,0 @@ -"""Test ancestry_types.py.""" - -import re - -import msgspec -import pytest - -from bystro.ancestry.ancestry_types import ( - AncestryResults, - AncestryScoresOneSample, - AncestryTopHit, - PopulationVector, - ProbabilityInterval, - SuperpopVector, -) -from bystro.ancestry.train import POPS, SUPERPOPS - - -# ruff: noqa: E721 - -# In several tests we explicitly check that a value `is float` rather -# than use the more pythonic `isinstance(value, float)`. We make -# these explicit checks in order to ensure that such values are raw -# floats and not np.float64's, which can't easily be deserialized. -# But the former method of checking raises E71 errors, which are -# exempted from the linter on a file-wide basis above. - -prob_int = ProbabilityInterval(lower_bound=0.0, upper_bound=1.0) - - -pop_kwargs = {pop: prob_int for pop in POPS} -superpop_kwargs = {pop: prob_int for pop in SUPERPOPS} - - -def test_expected_population_vector(): - """Ensure that the expected populations are found.""" - expected = { - "ACB": prob_int, - "ASW": prob_int, - "BEB": prob_int, - "CDX": prob_int, - "CEU": prob_int, - "CHB": prob_int, - "CHS": prob_int, - "CLM": prob_int, - "ESN": prob_int, - "FIN": prob_int, - "GBR": prob_int, - "GIH": prob_int, - "GWD": prob_int, - "IBS": prob_int, - "ITU": prob_int, - "JPT": prob_int, - "KHV": prob_int, - "LWK": prob_int, - "MSL": prob_int, - "MXL": prob_int, - "PEL": prob_int, - "PJL": prob_int, - "PUR": prob_int, - "STU": prob_int, - "TSI": prob_int, - "YRI": prob_int, - } - assert msgspec.structs.asdict(PopulationVector(**pop_kwargs)) == expected - - -def test_ProbabilityInterval_accepts_valid_bounds() -> None: - """Ensure we can instantiate, validate ProbabilityInterval correctly.""" - prob_int = ProbabilityInterval(lower_bound=0.1, upper_bound=0.9) - assert type(prob_int.lower_bound) is float - assert type(prob_int.upper_bound) is float - - -def test_ProbabilityInterval_rejects_invalid_lower_bound() -> None: - with pytest.raises(TypeError, match="lower_bound must be >= 0.0"): - ProbabilityInterval(lower_bound=-0.1, upper_bound=0.9) - - -def test_ProbabilityInterval_rejects_invalid_upper_bound() -> None: - with pytest.raises(TypeError, match="upper_bound must be <= 1.0"): - ProbabilityInterval(lower_bound=0.1, upper_bound=1.1) - - -def test_ProbabilityInterval_rejects_ints() -> None: - with pytest.raises(TypeError, match="lower_bound must be a float, not "): - ProbabilityInterval(lower_bound=int(0), upper_bound=1.0) - - with pytest.raises(TypeError, match="upper_bound must be a float, not "): - ProbabilityInterval(lower_bound=0.0, upper_bound=int(1)) - - -def test_PopulationVector_accepts_valid_args() -> None: - """Ensure we can instantiate, validate PopulationVector correctly.""" - PopulationVector(**pop_kwargs) - - -def 
test_PopulationVector_rejects_missing_key() -> None: - pop_kwargs_with_missing_key = pop_kwargs.copy() - del pop_kwargs_with_missing_key["ACB"] - with pytest.raises(TypeError, match="Missing required argument 'ACB'"): - PopulationVector(**pop_kwargs_with_missing_key) - - -def test_PopulationVector_rejects_extra_key() -> None: - pop_kwargs_with_extra_key = pop_kwargs.copy() - pop_kwargs_with_extra_key["FOO"] = prob_int - with pytest.raises(TypeError, match="Unexpected keyword argument 'FOO'"): - PopulationVector(**pop_kwargs_with_extra_key) - - -def test_SuperpopVector_rejects_missing_key() -> None: - with pytest.raises(TypeError): - SuperpopVector( # type: ignore - AFR=prob_int, - AMR=prob_int, - EAS=prob_int, - EUR=prob_int, - ) - - -def test_SuperpopVector_extra_key() -> None: - with pytest.raises(TypeError): - SuperpopVector( # type: ignore - AFR=prob_int, - AMR=prob_int, - EAS=prob_int, - EUR=prob_int, - SAS=prob_int, - FOO=prob_int, - ) - - -def test_AncestryScoresOneSample_accepts_valid_args() -> None: - ancestry_result = AncestryScoresOneSample( - sample_id="my_sample_id", - top_hit=AncestryTopHit(probability=0.6, populations=["SAS"]), - populations=PopulationVector(**pop_kwargs), - superpops=SuperpopVector(**superpop_kwargs), - n_snps=10, - ) - assert type(ancestry_result.n_snps) is int - - -def test_AncestryScoresOneSample_rejects_invalid_n_snps() -> None: - with pytest.raises(TypeError, match="n_snps must be non-negative"): - AncestryScoresOneSample( - sample_id="my_sample_id", - top_hit=AncestryTopHit(probability=0.6, populations=["SAS"]), - populations=PopulationVector(**pop_kwargs), - superpops=SuperpopVector(**superpop_kwargs), - n_snps=-10, - ) - - with pytest.raises(TypeError, match="n_snps must be an int, not "): - AncestryScoresOneSample( - sample_id="my_sample_id", - top_hit=AncestryTopHit(probability=0.6, populations=["SAS"]), - populations=PopulationVector(**pop_kwargs), - superpops=SuperpopVector(**superpop_kwargs), - n_snps=float(10), # type: ignore - ) - - -def test_AncestryResults_accepts_valid_args() -> None: - ancestry_response = AncestryResults( - results=[ - AncestryScoresOneSample( - sample_id="foo", - top_hit=AncestryTopHit(probability=0.6, populations=["EAS"]), - populations=PopulationVector(**pop_kwargs), - superpops=SuperpopVector(**superpop_kwargs), - n_snps=5 - ), - AncestryScoresOneSample( - sample_id="bar", - top_hit=AncestryTopHit(probability=0.7, populations=["EUR"]), - populations=PopulationVector(**pop_kwargs), - superpops=SuperpopVector(**superpop_kwargs), - n_snps=5 - ), - AncestryScoresOneSample( - sample_id="baz", - top_hit=AncestryTopHit(probability=0.5, populations=["AFR", "AMR"]), - populations=PopulationVector(**pop_kwargs), - superpops=SuperpopVector(**superpop_kwargs), - n_snps=5 - ), - ], - pcs={"SampleID1": [0.1, 0.2, 0.3], "SampleID2": [0.4, 0.5, 0.6], "SampleID3": [0.7, 0.8, 0.9]}, - ) - ancestry_response_json = msgspec.json.encode(ancestry_response) - msgspec.json.decode(ancestry_response_json, type=AncestryResults) - - -def test_AncestryResults_rejects_invalid_pcs() -> None: - with pytest.raises( - msgspec.ValidationError, match=re.escape("Expected `object`, got `array` - at `$.pcs`") - ): - ancestry_response = AncestryResults( - results=[ - AncestryScoresOneSample( - sample_id="foo", - top_hit=AncestryTopHit(probability=0.6, populations=["EAS"]), - populations=PopulationVector(**pop_kwargs), - superpops=SuperpopVector(**superpop_kwargs), - n_snps=0, - ), - ], - pcs=({"SampleID1": [0.1]}, {"SampleID2": [0.4]}, {"SampleID3": [0.7]}), 
# type: ignore - ) - ancestry_response_json = msgspec.json.encode(ancestry_response) - msgspec.json.decode(ancestry_response_json, type=AncestryResults) diff --git a/python/python/bystro/ancestry/tests/test_define_callset.py b/python/python/bystro/ancestry/tests/test_define_callset.py deleted file mode 100644 index bd534c710..000000000 --- a/python/python/bystro/ancestry/tests/test_define_callset.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Test define_callset.py.""" - -import pandas as pd -import pytest - -from bystro.ancestry.define_callset import ( - _get_variants_from_affymetrix_df, - _get_variants_from_illumina_df, - liftover_38_from_37, -) - -pd.options.future.infer_string = True # type: ignore - -@pytest.mark.parametrize( - ("test_input", "expected"), - [ - ("chr22:51156732:G:A", "chr22:50718304:G:A"), - ("chr12:126890980:G:A", "chr12:126406434:G:A"), - ("chrX:81328:A:G", "chrX:31328:A:G"), - ("chr1:900000000:G:A", None), - ], -) -@pytest.mark.skip(reason="UCSC liftover service may be down.") -def test_liftover_38_from_37(test_input: str, expected: str | None): - assert expected == liftover_38_from_37(test_input) - - -@pytest.mark.skip(reason="UCSC liftover service may be down.") -def test__process_affymetrix_df(): - affymetrix_df = pd.DataFrame( - { - "Chromosome": {"AFFX-SP-000001": "10", "AFFX-SP-000002": "12", "AFFX-SP-000003": "10"}, - "Physical Position": { - "AFFX-SP-000001": 123096468, - "AFFX-SP-000002": 23201352, - "AFFX-SP-000003": 33545464, - }, - "Ref Allele": {"AFFX-SP-000001": "C", "AFFX-SP-000002": "C", "AFFX-SP-000003": "G"}, - "Alt Allele": {"AFFX-SP-000001": "G", "AFFX-SP-000002": "G", "AFFX-SP-000003": "C"}, - } - ) - expected_output = pd.Series( - { - 0: "chr10:121336954:C:G", - 1: "chr12:23048418:C:G", - 2: "chr10:33256536:G:C", - } - ) - - assert (expected_output == _get_variants_from_affymetrix_df(affymetrix_df)).all() - - -@pytest.mark.skip(reason="UCSC liftover service may be down.") -def test__process_illumina_df(): - illumina_df = pd.DataFrame( - { - "Chr": {1: "9", 3: "2", 5: "2"}, - "MapInfo": {1: 139926402.0, 3: 220089685.0, 5: 220075045.0}, - "SNP": {1: "[A/G]", 3: "[C/G]", 5: "[T/C]"}, - "RefStrand": {1: "-", 3: "-", 5: "+"}, - } - ) - expected_output = pd.Series( - {0: "chr9:137031950:T:C", 1: "chr2:219224963:G:C", 2: "chr2:219210323:T:C"} - ) - assert (expected_output == _get_variants_from_illumina_df(illumina_df)).all() diff --git a/python/python/bystro/ancestry/tests/test_inference.py b/python/python/bystro/ancestry/tests/test_inference.py deleted file mode 100644 index 7188a66cf..000000000 --- a/python/python/bystro/ancestry/tests/test_inference.py +++ /dev/null @@ -1,128 +0,0 @@ -"""Test Ancestry Model inference code.""" - -import numpy as np -import pandas as pd -import pytest -from sklearn.ensemble import RandomForestClassifier # type: ignore - -import pyarrow as pa # type: ignore -import pyarrow.dataset as ds # type: ignore - -from bystro.ancestry.inference import AncestryModel, AncestryModels, infer_ancestry -from bystro.ancestry.train import POPS -from bystro.ancestry.model import get_models - -pd.options.future.infer_string = True # type: ignore - -np.random.seed(0) # noqa: NPY002 - -SAMPLES = [f"sample{i}" for i in range(len(POPS))] -VARIANTS = ["variant1", "variant2", "variant3"] -PC_COLUMNS = ["pc1", "pc2", "pc3", "pc4"] -FAKE_GENOTYPES = pd.DataFrame( - np.random.random((len(VARIANTS), len(SAMPLES))), index=VARIANTS, columns=SAMPLES -) -FAKE_GENOTYPES_DOSAGE_MATRIX = ds.dataset( - pa.Table.from_pandas( - 
FAKE_GENOTYPES.reset_index().rename(columns={"index": "locus"}), preserve_index=False - ).to_batches() -) - - -def _infer_ancestry(): - samples = [f"sample{i}" for i in range(len(POPS))] - variants = ["variant1", "variant2", "variant3"] - pc_columns = ["pc1", "pc2", "pc3", "pc4"] - pca_loadings_df = pd.DataFrame( - np.random.random((len(variants), len(pc_columns))), index=variants, columns=pc_columns - ) - genotypes = pd.DataFrame( - np.random.random((len(variants), len(samples))), index=variants, columns=samples - ) - - train_Xpc = genotypes.T @ pca_loadings_df - train_y = POPS - rfc = RandomForestClassifier(n_estimators=1, max_depth=1).fit(train_Xpc, train_y) - ancestry_model = AncestryModel(pca_loadings_df, rfc) - - genotypes = genotypes.reset_index() - genotypes = genotypes.rename(columns={"index": "locus"}) - genotypes = ds.dataset(pa.Table.from_pandas(genotypes, preserve_index=False).to_batches()) - - return infer_ancestry(AncestryModels(ancestry_model, ancestry_model), genotypes), samples - - -def _make_ancestry_model() -> AncestryModel: - # one population per sample so that we can include all populations in train_y. - pca_loadings_df = pd.DataFrame( - np.random.random((len(VARIANTS), len(PC_COLUMNS))), index=VARIANTS, columns=PC_COLUMNS - ) - train_Xpc = FAKE_GENOTYPES.T @ pca_loadings_df - train_y = POPS - rfc = RandomForestClassifier(n_estimators=1, max_depth=1).fit(train_Xpc, train_y) - return AncestryModel(pca_loadings_df, rfc) - - -ANCESTRY_MODEL = _make_ancestry_model() - - -def test_Ancestry_Model(): - pcs_for_plotting, pop_probs = ANCESTRY_MODEL.predict_proba(FAKE_GENOTYPES) - assert (pop_probs.index == SAMPLES).all() - assert (pop_probs.columns == POPS).all() - - -def test_Ancestry_Model_missing_pca_col(): - pca_loadings_df = ANCESTRY_MODEL.pca_loadings_df - bad_pca_loadings_df = pca_loadings_df[pca_loadings_df.columns[:-1]] - - with pytest.raises(ValueError, match="must equal"): - AncestryModel(bad_pca_loadings_df, ANCESTRY_MODEL.rfc) - - -def test_infer_ancestry(): - ancestry_response, samples = _infer_ancestry() - - assert len(samples) == len(ancestry_response.results) - - -def test_infer_ancestry_from_model(): - ancestry_models = get_models("hg38") - - # Generate an arrow table that contains genotype dosages for 1000 samples - variants = list(ancestry_models.gnomad_model.pca_loadings_df.index) - samples = [f"sample{i}" for i in range(1000)] - genotypes = pd.DataFrame( - np.random.randint(0, 2, (len(variants), len(samples))), # noqa: NPY002 - index=variants, - columns=samples, # noqa: NPY002 - ) - # randomly set 10% of the genotypes to missing to ensure we test missing data handling - drop_snps_n = int(0.1 * len(genotypes)) - retained_snps_n = len(genotypes) - drop_snps_n - drop_indices = np.random.choice(genotypes.index, size=drop_snps_n, replace=False) # noqa: NPY002 - genotypes = genotypes.drop(list(drop_indices)) - - genotypes = genotypes.reset_index() - genotypes = genotypes.rename(columns={"index": "locus"}) - - genotypes = ds.dataset(pa.Table.from_pandas(genotypes, preserve_index=False).to_batches()) - - ancestry_response = infer_ancestry(ancestry_models, genotypes) - assert len(samples) == len(ancestry_response.results) - - top_hits = set() - top_probs = set() - samples_seen = set() - sample_set = set(samples) - for result in ancestry_response.results: - top_hits.add(result.top_hit.populations[0]) - top_probs.add(result.top_hit.probability) - - samples_seen.add(result.sample_id) - - assert result.n_snps == retained_snps_n - - assert samples_seen == sample_set - 
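
This test closes with set-cardinality assertions (below) that guard against a degenerate classifier: with 1000 randomly dosed samples, predictions should not collapse onto a single population or a single probability. A toy illustration of the failure mode being rejected, using a hypothetical results list:

```python
# If the model ignored its input, every sample would share one top hit and one
# probability, so both sets would collapse to a single element.
results = [("EUR", 0.99)] * 1000        # hypothetical degenerate output
top_hits = {pop for pop, _ in results}  # -> {"EUR"}
top_probs = {p for _, p in results}     # -> {0.99}
assert len(top_hits) == 1 and len(top_probs) == 1
```
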
assert len(top_hits) > 1 - assert len(top_probs) > 1 diff --git a/python/python/bystro/ancestry/tests/test_train.py b/python/python/bystro/ancestry/tests/test_train.py deleted file mode 100644 index bfffe8811..000000000 --- a/python/python/bystro/ancestry/tests/test_train.py +++ /dev/null @@ -1,288 +0,0 @@ -"""Tests for ancestry model training code.""" - -import re - -import numpy as np -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal - -from bystro.ancestry.train import ( - POPS, - SUPERPOPS, - _parse_vcf_from_file_stream, - superpop_predictions_from_pop_probs, - superpop_probs_from_pop_probs, -) - -pd.options.future.infer_string = True # type: ignore - - -def test__parse_vcf_from_file_stream(): - file_stream = [ - "##Some comment", - "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 sample3", - "chr1 1 . T G . PASS i;n;f;o GT 0|1 1|0 0|0", - "chr1 123 . T G . PASS i;n;f;o GT 0|0 1|1 1|1", - "chr1 123456 . T G . PASS i;n;f;o GT 0|1 1|0 0|0", - ] - expected_df = pd.DataFrame( - [[1.0, 0.0, 1.0], [1.0, 2.0, 1.0], [0.0, 2.0, 0.0]], - index=["sample1", "sample2", "sample3"], - columns=["chr1:1:T:G", "chr1:123:T:G", "chr1:123456:T:G"], - ) - actual_df = _parse_vcf_from_file_stream( - file_stream, - [ - "chr1:1:T:G", - "chr1:123:T:G", - "chr1:123456:T:G", - ], - return_exact_variants=True, - ) - assert_frame_equal(expected_df, actual_df) - - -def test__parse_vcf_from_file_stream_missing_data(): - file_stream = [ - "##Some comment", - "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 sample3", - "chr1 1 . T G . PASS i;n;f;o GT .|. 1|0 0|0", - "chr1 123 . T G . PASS i;n;f;o GT 0|0 .|1 1|1", - "chr1 123456 . T G . PASS i;n;f;o GT 0|1 1|0 0|.", - ] - expected_df = pd.DataFrame( - [[np.nan, 0.0, 1.0], [1.0, np.nan, 1.0], [0.0, 2.0, np.nan]], - index=["sample1", "sample2", "sample3"], - columns=["chr1:1:T:G", "chr1:123:T:G", "chr1:123456:T:G"], - ) - actual_df = _parse_vcf_from_file_stream( - file_stream, - [ - "chr1:1:T:G", - "chr1:123:T:G", - "chr1:123456:T:G", - ], - return_exact_variants=True, - ) - assert_frame_equal(expected_df, actual_df) - - -def test__parse_vcf_from_file_stream_no_chr_prefix(): - file_stream = [ - "##Some comment", - "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 sample3", - "1 1 . T G . PASS i;n;f;o GT 0|1 1|0 0|0", - ] - expected_df = pd.DataFrame( - [[1.0], [1.0], [0.0]], - index=["sample1", "sample2", "sample3"], - columns=["chr1:1:T:G"], - ) - actual_df = _parse_vcf_from_file_stream( - file_stream, - [ - "chr1:1:T:G", - ], - return_exact_variants=True, - ) - assert_frame_equal(expected_df, actual_df) - - -def test__parse_vcf_from_file_stream_bad_metadata_fields(): - file_stream = [ - "##Some comment", - "#CHROM POS ID REF ALT FILTER INFO sample1 sample2 sample3", - "chr1 1 . T G PASS i;n;f;o 0|1 1|0 0|0", - "chr1 123 . T G PASS i;n;f;o 0|0 1|1 1|1", - "chr1 123456 . T G PASS i;n;f;o 0|1 1|0 0|0", - ] - - expected_err_msg = re.escape( - "vcf does not contain expected metadata columns. " - "Expected: ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'], " - "got: ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'INFO', 'sample1', 'sample2'] instead." 
- ) - with pytest.raises(ValueError, match=expected_err_msg): - _parse_vcf_from_file_stream( - file_stream, - [ - "chr1:1:T:G", - "chr1:123:T:G", - "chr1:123456:T:G", - ], - return_exact_variants=True, - ) - - -def test__parse_vcf_from_file_stream_wrong_chromosome(): - file_stream = [ - "##Some comment", - "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 sample3", - "chr2 1 . T G . PASS i;n;f;o GT 0|1 1|0 0|0", - "chr2 123 . T G . PASS i;n;f;o GT 0|0 1|1 1|1", - "chr2 123456 . T G . PASS i;n;f;o GT 0|1 1|0 0|0", - ] - expected_df = pd.DataFrame( - [], - index=["sample1", "sample2", "sample3"], - columns=[], - ) - - actual_df = _parse_vcf_from_file_stream( - file_stream, - [ - "chr1:1:T:G", - "chr1:123:T:G", - "chr1:123456:T:G", - ], - return_exact_variants=False, - ) - assert_frame_equal(expected_df, actual_df) - - expected_df_missing_data = pd.DataFrame( - np.zeros((3, 3)) * np.nan, - index=["sample1", "sample2", "sample3"], - columns=[ - "chr1:1:T:G", - "chr1:123:T:G", - "chr1:123456:T:G", - ], - ) - - actual_df_missing_data = _parse_vcf_from_file_stream( - file_stream, - [ - "chr1:1:T:G", - "chr1:123:T:G", - "chr1:123456:T:G", - ], - return_exact_variants=True, - ) - # check frame equality up to column ordering, which may differ if some variants were missing. - assert_frame_equal(expected_df_missing_data, actual_df_missing_data, check_like=True) - - -def test__parse_vcf_from_file_stream_ragged_rows(): - file_stream = [ - "##Some comment", - "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 sample3", - "chr1 1 . T G . PASS i;n;f;o GT 0|1 1|0 0|0", - "chr1 123 . T G . PASS i;n;f;o GT 0|0 1|1", - "chr1 123456 . T G . PASS i;n;f;o GT 0|1 1|0 0|0", - ] - - with pytest.raises(ValueError, match="do all genotype rows have the same number of fields?"): - _parse_vcf_from_file_stream( - file_stream, - [ - "chr1:1:T:G", - "chr1:123:T:G", - "chr1:123456:T:G", - ], - return_exact_variants=False, - ) - - -def test__parse_vcf_from_file_stream_bad_filter_values(): - file_stream = [ - "##Some comment", - "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 sample3", - "chr1 1 . T G . PASS i;n;f;o GT 0|1 1|0 0|0", - "chr1 123 . T G . . i;n;f;o GT 0|0 1|1 1|1", - "chr1 123456 . T G . foo i;n;f;o GT 0|1 1|0 0|0", - ] - - expected_df = pd.DataFrame( - [[1.0, 0.0], [1.0, 2.0], [0.0, 2.0]], - index=["sample1", "sample2", "sample3"], - columns=["chr1:1:T:G", "chr1:123:T:G"], - ) - - actual_df = _parse_vcf_from_file_stream( - file_stream, - [ - "chr1:1:T:G", - "chr1:123:T:G", - "chr1:123456:T:G", - ], - return_exact_variants=False, - ) - assert_frame_equal(expected_df, actual_df) - - -def test_superpop_probs_from_pop_probs(): - samples = [f"sample{i}" for i in range(len(POPS))] - # input array is identity matrix, i.e. 
one 100% prediction per population - pop_probs = pd.DataFrame(np.eye(len(POPS)), index=samples, columns=POPS) - superpop_probs = superpop_probs_from_pop_probs(pop_probs) - # expected output is matrix mapping each population to its superpop - expected_superpop_probs = pd.DataFrame( - [ - [1.0, 0.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 1.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 1.0, 0.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 1.0, 0.0], - [0.0, 0.0, 0.0, 1.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 1.0], - [1.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 1.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 1.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 1.0], - [0.0, 1.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 1.0], - [0.0, 0.0, 0.0, 1.0, 0.0], - [1.0, 0.0, 0.0, 0.0, 0.0], - ], - index=samples, - columns=SUPERPOPS, - ) - assert_frame_equal(expected_superpop_probs, superpop_probs) - - -def test_superpop_predictions_from_pop_probs(): - samples = [f"sample{i}" for i in range(len(POPS))] - # input array is identity matrix, i.e. one 100% prediction per population - pop_probs = pd.DataFrame(np.eye(len(POPS)), index=samples, columns=POPS) - superpop_predictions = superpop_predictions_from_pop_probs(pop_probs) - expected_superpop_predictions = [ - "AFR", - "AFR", - "SAS", - "EAS", - "EUR", - "EAS", - "EAS", - "AMR", - "AFR", - "EUR", - "EUR", - "SAS", - "AFR", - "EUR", - "SAS", - "EAS", - "EAS", - "AFR", - "AFR", - "AMR", - "AMR", - "SAS", - "AMR", - "SAS", - "EUR", - "AFR", - ] - assert expected_superpop_predictions == superpop_predictions diff --git a/python/python/bystro/ancestry/tests/test_train_utils.py b/python/python/bystro/ancestry/tests/test_train_utils.py deleted file mode 100644 index dad813d04..000000000 --- a/python/python/bystro/ancestry/tests/test_train_utils.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Test train utilities.""" -import pytest - -from bystro.ancestry.train_utils import is_autosomal_variant - - -def test_is_autosomal_variant(): - assert is_autosomal_variant("chr1:123456:A:T") - assert is_autosomal_variant("chr22:1:T:A") - assert not is_autosomal_variant("22:1:A:G") - assert not is_autosomal_variant("chrX:1:A:G") - assert not is_autosomal_variant("chr23:1:G:C") - assert not is_autosomal_variant("chr22:1:A:") - assert not is_autosomal_variant("chr22:1:A:AT") - assert not is_autosomal_variant("chr22:1:GC:AT") - assert not is_autosomal_variant("chr22:1:X:Y") - with pytest.raises(ValueError, match="cannot have identical ref and alt alleles"): - is_autosomal_variant("chr22:1:A:A") diff --git a/python/python/bystro/ancestry/train.py b/python/python/bystro/ancestry/train.py deleted file mode 100644 index a022b1a01..000000000 --- a/python/python/bystro/ancestry/train.py +++ /dev/null @@ -1,615 +0,0 @@ -"""Training code for Global Ancestry Model. - -This script takes a vcf of preprocessed variants (see preprocess_vcfs.sh) and generates: - -1. A list of variants to be used for inference. - -2. PCA loadings mapping the list of variants in (1) to PC space. - -3. Classifiers mapping PC space to the 26 HapMap populations as well as 5 continent-level -superpopulations. 
- -Training for current models occurs in train_chip_model.py and train_gnomad_model.py -""" - -import dataclasses -import gzip -import logging -import random -import sys -from collections.abc import Collection, Container, Iterable -from pathlib import Path -from typing import Any, Literal, TypeVar, get_args - -import allel -import numpy as np -import pandas as pd -import sklearn -import tqdm -from sklearn.decomposition import PCA -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import accuracy_score -from sklearn.model_selection import train_test_split -from skops.io import dump as skops_dump - -from bystro.ancestry.asserts import assert_equals, assert_true -from bystro.ancestry.train_utils import get_variant_ids_from_callset, head - -logger = logging.getLogger(__name__) - -pd.options.future.infer_string = True # type: ignore - -ANCESTRY_DIR = Path(__file__).parent -DATA_DIR = ANCESTRY_DIR / "data" -KGP_VCF_DIR = DATA_DIR / "kgp_vcfs" -INTERMEDIATE_DATA_DIR = ANCESTRY_DIR / "intermediate_data" -VCF_PATH = DATA_DIR / "1kgp_gnomadset_unrelated.vcf.gz" -ANCESTRY_MODEL_PRODUCTS_DIR = ANCESTRY_DIR / "ancestry_model_products" -PCA_FPATH = ANCESTRY_MODEL_PRODUCTS_DIR / "hg38_gnomadset_pca.csv" -RFC_FPATH = ANCESTRY_MODEL_PRODUCTS_DIR / "hg38_gnomadset_rfc.skop" -# TODO Set up download of gnomad loadings in preprocess step -GNOMAD_LOADINGS_PATH = "gnomadloadings.tsv" -# TODO Set up preprocess of this file that doesn't include dependency like plink or bcftools -KGP_VCF_FILTERED_TO_GNOMAD_LOADINGS_FILEPATH = "1kgpGnomadList.vcf" - - -ANCESTRY_INFO_PATH = DATA_DIR / "KGP_ancestry.csv" -ROWS, COLS = 0, 1 -QUALITY_CUTOFF = 100 # for variant quality filtering -PLOIDY = 2 -AUTOSOMAL_CHROMOSOMES = set(range(1, 22 + 1)) -EXPECTED_NUM_POPULATIONS = 26 -FST_THRESHOLD = 0.3 -MI_THRESHOLD = 0 -PCA_DIMS = 50 -EXPLAINED_VARIANCE_THRESHOLD = 0.1 -RFC_TRAIN_ACCURACY_THRESHOLD = 0.9 -RFC_TEST_ACCURACY_THRESHOLD = 0.75 -RFC_TRAIN_SUPERPOP_ACCURACY_THRESHOLD = 0.99 -RFC_TEST_SUPERPOP_ACCURACY_THRESHOLD = 0.99 - -VCF_METADATA_COLUMNS = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] -FILTER_FIELD_IDX = VCF_METADATA_COLUMNS.index("FILTER") -NUM_VCF_METADATA_COLUMNS = len(VCF_METADATA_COLUMNS) - -# superpop definitions taken from ensembl -SUPERPOP_FROM_POP = { - "CHB": "EAS", - "JPT": "EAS", - "CHS": "EAS", - "CDX": "EAS", - "KHV": "EAS", - "CEU": "EUR", - "TSI": "EUR", - "FIN": "EUR", - "GBR": "EUR", - "IBS": "EUR", - "YRI": "AFR", - "LWK": "AFR", - "MSL": "AFR", - "ESN": "AFR", - "ASW": "AFR", - "ACB": "AFR", - "GWD": "AFR", - "MXL": "AMR", - "PUR": "AMR", - "CLM": "AMR", - "PEL": "AMR", - "GIH": "SAS", - "PJL": "SAS", - "BEB": "SAS", - "STU": "SAS", - "ITU": "SAS", -} -POPS = np.array(sorted(SUPERPOP_FROM_POP.keys())) -SUPERPOPS = np.array(sorted(set(SUPERPOP_FROM_POP.values()))) - -rng = np.random.RandomState(1337) - -Variant = str - - -def _load_callset() -> dict[str, Any]: - """Load callset and perform as many checks as can be done without processing it.""" - callset = allel.read_vcf(str(VCF_PATH), log=sys.stdout) - genotypes = callset["calldata/GT"] - num_variants, num_samples = genotypes.shape[0], genotypes.shape[1] - assert_equals( - f"chromosomes in {VCF_PATH}", - {int(chrom) for chrom in callset["variants/CHROM"]}, - "autosomal chromosomes", - AUTOSOMAL_CHROMOSOMES, - ) - assert_true( - "variant and sample dimensions", - num_variants > 33000 and num_samples >= 2504, - comment=( - f"VCF file {VCF_PATH} had unexpected dimensions. 
" - f"Expected more than 33,000 variants and 2504 or more samples, " - f"but got {num_variants} variants and {num_samples} samples." - ), - ) - return callset - - -def load_callset_for_variants(variants: set[str]) -> pd.DataFrame: - """Load merged 1000 genomes data filtered to specified variants.""" - logger.info("Starting to load callset") - genotype_df = parse_vcf(VCF_PATH, variants) - logger.info("Got genotype_df of shape: %s", genotype_df.shape) - return genotype_df - - -def _parse_vcf_line_for_dosages( - line: str, variants_to_keep: Container[Variant] -) -> tuple[Variant, list[float]] | None: - # We want to determine if we care about the variant on this line - # before we parse it in full. So we'll parse just enough of it to - # read the variant and filter info: if we want the variant and it - # passes the filter checks, then we'll parse the rest of the line - # for the sample genotypes, because `line.split` is the bottleneck - # here. Under this approach, file IO is 75% of the walltime of - # parse_vcf and parsing the other 25%. - i = 0 - tab_count = 0 - while tab_count <= FILTER_FIELD_IDX: - if line[i] == "\t": - tab_count += 1 - i += 1 - fixed_fields = line[:i].split() - if fixed_fields[FILTER_FIELD_IDX] not in ["PASS", "."]: - return None - variant = ":".join([fixed_fields[0], fixed_fields[1], fixed_fields[3], fixed_fields[4]]) - variant = variant if variant.startswith("chr") else "chr" + variant - if variant in variants_to_keep: - fields = line.split() # now we can parse the full line - variant_dosages = [ - _parse_genotype_field(field) for field in fields[NUM_VCF_METADATA_COLUMNS:] - ] # genotype fields take the form e.g. '0|1', '0/1', './1 or './.' - return variant, variant_dosages - return None - - -def _parse_genotype_field(psa: str) -> float: - """Parse a field of the form '0|1', '0/1' or './.' as a dosage.""" - try: - return float(psa[0]) + float(psa[2]) - except ValueError: - return np.nan - - -def _get_chromosome_from_variant(variant: Variant) -> str: - return variant.split(":")[0] - - -T = TypeVar("T") - - -def _calculate_recovery_rate( - found_variants: Collection[Variant], variants_to_keep: Collection[Variant] -) -> float: - if len(found_variants) == 0: - return 0.0 - found_chromosomes = {_get_chromosome_from_variant(v) for v in found_variants} - # We might have parsed a vcf containing variants for many - # chromosomes, or for one only. if for one chromosome only, we - # calculate the recovery rate only for variants belonging to that - # chromosome. Otherwise, we calculate the recovery rate over all - # chromosomes. 
- if len(found_chromosomes) == 1: - relevant_chromosome = head(found_chromosomes) - relevant_variants_to_keep = { - v for v in variants_to_keep if _get_chromosome_from_variant(v) == relevant_chromosome - } - return len(found_variants) / len(relevant_variants_to_keep) - return len(found_variants) / len(variants_to_keep) - - -def parse_vcf( - vcf_fpath: str | Path, variants_to_keep: Collection[str], *, return_exact_variants: bool = False -) -> pd.DataFrame: - """Parse vcf_fpath for selected variants, returning dosage matrix as DataFrame.""" - with gzip.open(vcf_fpath, "rt") as f: - return _parse_vcf_from_file_stream( - f, variants_to_keep, return_exact_variants=return_exact_variants - ) - - -def _check_fields_for_metadata_columns(fields: list[str]) -> None: - """Assert that VCF contains expected metadata columns.""" - metadata_fields = fields[:NUM_VCF_METADATA_COLUMNS] - if metadata_fields != VCF_METADATA_COLUMNS: - err_msg = ( - "vcf does not contain expected metadata columns. " - f"Expected: {VCF_METADATA_COLUMNS}, " - f"got: {metadata_fields} instead." - ) - raise ValueError(err_msg) - - -def _parse_vcf_from_file_stream( - file_stream: Iterable[str], variants_to_keep: Collection[str], *, return_exact_variants: bool -) -> pd.DataFrame: - found_variants = [] - dosage_data = [] - sample_ids = None - total_lines = 0 - for line in tqdm.tqdm(file_stream): - total_lines += 1 - if line.startswith("##"): - continue - if line.startswith("#CHROM"): - fields = line.lstrip("#").split() - _check_fields_for_metadata_columns(fields) - sample_ids = fields[NUM_VCF_METADATA_COLUMNS:] - elif variant_dosages := _parse_vcf_line_for_dosages(line, variants_to_keep): - variant, dosages = variant_dosages - found_variants.append(variant) - dosage_data.append(dosages) - else: - continue - if sample_ids is None: - msg = "Sample ids not set during VCF processing: does your VCF contain a valid header?" - raise AssertionError(msg) - found_chromosomes = {_get_chromosome_from_variant(v) for v in found_variants} - logger.info( - "processed %s lines, retaining %s variants from %s chromosomes", - total_lines, - len(found_variants), - len(found_chromosomes), - ) - try: - df_values: np.ndarray | list[list[float]] = np.array(dosage_data).T if dosage_data else [] - logger.info(df_values) - except ValueError as val_err: - err_msg = ( - "Couldn't convert dosage data to np.array, " - "do all genotype rows have the same number of fields?" 
-        )
-        raise ValueError(err_msg) from val_err
-    dosage_df = pd.DataFrame(df_values, index=sample_ids, columns=found_variants)
-    # we assume each vcf file contains variants for a single chromosome
-    recovery_rate = _calculate_recovery_rate(found_variants, variants_to_keep)
-    logger.info("recovery rate: %s", recovery_rate)
-
-    if return_exact_variants:
-        missing_variants = set(variants_to_keep) - set(found_variants)
-        logger.info("adding NaNs for %s variants not found in VCF", len(missing_variants))
-        missing_dosages = pd.DataFrame(
-            np.nan * np.ones((len(dosage_df), len(missing_variants))),
-            index=dosage_df.index,
-            columns=list(missing_variants),
-        )
-        dosage_df = pd.concat([dosage_df, missing_dosages], axis="columns")
-
-    return dosage_df
-
-
-def _load_genotypes() -> pd.DataFrame:
-    """Read variants from disk, return as count matrix with dimensions (samples X variants)."""
-    logger.info("loading callset")
-    callset = _load_callset()
-    logger.info("finished loading callset")
-    samples = callset["samples"]
-    genotypes = allel.GenotypeArray(callset["calldata/GT"])
-    variant_ids = get_variant_ids_from_callset(callset)
-    logger.info("starting with GenotypeArray of shape: %s", genotypes.shape)
-    ref_is_snp = np.array([len(ref) == 1 for ref in callset["variants/REF"]])
-    logger.info("found %s variants where ref is mononucleotide", sum(ref_is_snp))
-    # first alt is SNP and rest are empty...
-    alt_is_snp = np.array(
-        [len(row[0]) == 1 and set(row[1:]) == {""} for row in callset["variants/ALT"]],
-    )
-    logger.info("found %s variants where alt is mononucleotide", sum(alt_is_snp))
-    allele_counts = genotypes.count_alleles()[:]
-    is_single_allele_snp = allele_counts.max_allele() == 1
-    logger.info("found %s single allele snps", sum(is_single_allele_snp))
-    is_non_singleton = (
-        allele_counts[:, :2].min(axis=COLS) > 1
-    )  # minimum count over the reference and first alt allele must be greater than one
-    logger.info("found %s non-singleton alleles", sum(is_non_singleton))
-    is_high_quality = callset["variants/QUAL"] >= QUALITY_CUTOFF
-    logger.info("found %s high quality alleles", sum(is_high_quality))
-
-    mask = ref_is_snp & alt_is_snp & is_single_allele_snp & is_non_singleton & is_high_quality
-    logger.info("keeping %s alleles", sum(mask))
-    filtered_genotypes = genotypes.compress(mask, axis=ROWS).to_n_alt().T
-    variant_ids = variant_ids[mask]
-    assert_equals(
-        "filtered genotype matrix shape",
-        filtered_genotypes.shape,
-        "sample and variant id sizes",
-        (len(samples), len(variant_ids)),
-    )
-    return pd.DataFrame(filtered_genotypes, index=samples, columns=variant_ids)
-
-
-def _load_ancestry_df() -> pd.DataFrame:
-    ancestry_df = pd.read_csv(ANCESTRY_INFO_PATH, sep=",")
-    assert_equals("number of rows", 3195, "actual number of rows", len(ancestry_df))
-    expected_samples = ["NA12865", "HG03930", "NA19171"]
-    assert_true(
-        "Sample name column passes spot checks",
-        all(sample in ancestry_df["Sample name"].to_numpy() for sample in expected_samples),
-    )
-    ancestry_df = ancestry_df.set_index("Sample name")
-    return ancestry_df
-
-
-def load_label_data(samples: pd.Index) -> pd.DataFrame:
-    """Load dataframe of population, superpop labels for samples."""
-    ancestry_df = _load_ancestry_df()
-    missing_samples = set(samples) - set(ancestry_df.index)
-    if missing_samples:
-        msg = f"Ancestry dataframe is missing samples: {missing_samples}"
-        raise AssertionError(msg)
-    populations = sorted(ancestry_df["Population elastic ID"].unique())
-    if EXPECTED_NUM_POPULATIONS != len(populations):
-        msg = (
-            f"Found wrong number of
populations ({len(populations)}) in ancestry df, " - f"expected {EXPECTED_NUM_POPULATIONS}" - ) - raise ValueError(msg) - get_pop_from_sample = ancestry_df["Population elastic ID"].to_dict() - labels = pd.DataFrame( - [get_pop_from_sample[s] for s in samples], - index=samples, - columns=["Population elastic ID"], - ) - labels = labels.rename(columns={"Population elastic ID": "population"}) - labels["superpop"] = labels["population"].apply(SUPERPOP_FROM_POP.get) - - assert_true("no missing data in labels", labels.notna().all().all()) - assert_equals( - "number of populations", - EXPECTED_NUM_POPULATIONS, - "number of populations found in labels", - len((labels["population"]).unique()), - ) - assert_equals( - "number of superpopulations", - len(set(SUPERPOP_FROM_POP.values())), - "superpopulations found in labels", - len((labels["superpop"]).unique()), - ) - assert_equals("samples", samples, "labels", labels.index) - return labels - - -def assert_genotypes_and_label_agree(genotypes: pd.DataFrame, labels: pd.DataFrame) -> None: - """Check that genotypes, labels agree on indices.""" - assert_equals("genotypes index", genotypes.index, "labels index", labels.index) - - -def _perform_pca(train_X: pd.DataFrame, test_X: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, PCA]: - """Perform PCA, checking for good compression.""" - logger.info("Beginning PCA") - minimum_variant_std = train_X.std(axis="index").min() - assert_true( - "minimum variant standard deviation greater than zero", - minimum_variant_std > 0, - comment="Have you excluded all monomorphic alleles?", - ) - train_Xpc, pca = allel.pca( - train_X.T, - n_components=PCA_DIMS, - scaler="patterson", - ) # must be transposed for allel pca - logger.info( - "Cumulative explained variance ratio for %s dimensions: %s", - len(pca.explained_variance_ratio_), - np.cumsum(pca.explained_variance_ratio_), - ) - test_Xpc = pca.transform(test_X.T) - - assert_true( - f"Explained variance ratio > {EXPLAINED_VARIANCE_THRESHOLD}%", - np.sum(pca.explained_variance_ratio_) > EXPLAINED_VARIANCE_THRESHOLD, - ) - return train_Xpc, test_Xpc, pca - - -def make_train_test_split( - genotypes: pd.DataFrame, - labels: pd.DataFrame, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """Make train / test splits, stratifying on population.""" - train_X, test_X, train_y, test_y = train_test_split( - genotypes, labels, stratify=labels.population, random_state=1337 - ) - assert_equals( - "train features", - train_X.shape[1], - "test features", - test_X.shape[1], - comment="Did you mix up the return values?", - ) - return train_X, test_X, train_y, test_y - - -@dataclasses.dataclass(frozen=True) -class RFCParamChoices: - """Param Choices for RFC Randomized Hyperparameter Tuning.""" - - n_estimators: Literal[1000] - max_depth: Literal[5, 10, 20, None] - min_samples_leaf: Literal[1, 2, 5, 10] - criterion: Literal["gini", "entropy", "log_loss"] - max_features: Literal["sqrt", "log2", None] - pca_dims: Literal[30] - - @classmethod - def sample(cls) -> "RFCParamChoices": - """Construct a randomized ParamChoice from values of Literal type annotations.""" - kwargs = {} - for param_name, typ in cls.__annotations__.items(): - values = get_args(typ) - kwargs[param_name] = random.choice(values) - return cls(**kwargs) - - -@dataclasses.dataclass(frozen=True) -class AccuracyReport: - """Represent model accuracy scores at population and superpopulation levels.""" - - train_pop_accuracy: float - test_pop_accuracy: float - train_superpop_accuracy: float - 
test_superpop_accuracy: float
-
-
-def _compute_accuracy_report(
-    clf: sklearn.base.ClassifierMixin,
-    train_Xpc: pd.DataFrame,
-    test_Xpc: pd.DataFrame,
-    train_y: pd.DataFrame,
-    test_y: pd.DataFrame,
-) -> AccuracyReport:
-    """Compute accuracy scores for population and superpop classification on train and test."""
-    train_yhat_pop_probs = clf.predict_proba(train_Xpc)
-    test_yhat_pop_probs = clf.predict_proba(test_Xpc)
-
-    train_yhat_pops = POPS[np.argmax(train_yhat_pop_probs, axis=1)]
-    test_yhat_pops = POPS[np.argmax(test_yhat_pop_probs, axis=1)]
-    train_yhat_superpops = superpop_predictions_from_pop_probs(train_yhat_pop_probs)
-    test_yhat_superpops = superpop_predictions_from_pop_probs(test_yhat_pop_probs)
-
-    return AccuracyReport(
-        accuracy_score(train_y.population, train_yhat_pops),
-        accuracy_score(test_y.population, test_yhat_pops),
-        accuracy_score(train_y.superpop, train_yhat_superpops),
-        accuracy_score(test_y.superpop, test_yhat_superpops),
-    )
-
-
-def make_rfc(
-    train_Xpc: pd.DataFrame,
-    test_Xpc: pd.DataFrame,
-    train_y: pd.DataFrame,
-    test_y: pd.DataFrame,
-    trials: int = 10,
-) -> RandomForestClassifier:
-    """Build population-level RFC using randomized hyperparameter search."""
-    tuning_results: dict[RFCParamChoices, AccuracyReport] = {}
-
-    for _trial in tqdm.trange(trials):
-        param_choices = RFCParamChoices.sample()
-        rfc_params = {k: v for (k, v) in dataclasses.asdict(param_choices).items() if k != "pca_dims"}
-        cols_to_use = train_Xpc.columns[: param_choices.pca_dims]
-        rfc = RandomForestClassifier(**rfc_params, random_state=1337)
-        rfc.fit(train_Xpc[cols_to_use], train_y.population)
-
-        accuracy_report = _compute_accuracy_report(
-            rfc, train_Xpc[cols_to_use], test_Xpc[cols_to_use], train_y, test_y
-        )
-        tuning_results[param_choices] = accuracy_report
-        logger.info(
-            "RFC param choices: %s, accuracies: %s", param_choices, tuning_results[param_choices]
-        )
-
-    best_params = max(tuning_results, key=lambda params: tuning_results[params].test_pop_accuracy)
-    rfc_params = {k: v for (k, v) in dataclasses.asdict(best_params).items() if k != "pca_dims"}
-    cols_to_use = train_Xpc.columns[: best_params.pca_dims]
-
-    rfc = RandomForestClassifier(**rfc_params, random_state=1337)
-    rfc.fit(train_Xpc[cols_to_use], train_y.population)
-    # recompute accuracy report to ensure we didn't just get lucky...
-    accuracy_report = _compute_accuracy_report(
-        rfc, train_Xpc[cols_to_use], test_Xpc[cols_to_use], train_y, test_y
-    )
-    logger.info("best_params: %s", best_params)
-    logger.info("accuracies: %s", accuracy_report)
-
-    threshold_checks = [
-        ("train population", RFC_TRAIN_ACCURACY_THRESHOLD, accuracy_report.train_pop_accuracy),
-        ("test population", RFC_TEST_ACCURACY_THRESHOLD, accuracy_report.test_pop_accuracy),
-        (
-            "train superpop",
-            RFC_TRAIN_SUPERPOP_ACCURACY_THRESHOLD,
-            accuracy_report.train_superpop_accuracy,
-        ),
-        ("test superpop", RFC_TEST_SUPERPOP_ACCURACY_THRESHOLD, accuracy_report.test_superpop_accuracy),
-    ]
-
-    for description, expected, actual in threshold_checks:
-        if not actual >= expected:
-            logger.warning("Expected %s accuracy >= %s, got: %s instead", description, expected, actual)
-
-    return rfc
-
-
-def superpop_probs_from_pop_probs(pop_probs: pd.DataFrame) -> pd.DataFrame:
-    """Given a matrix of population probabilities, convert to matrix of superpop probabilities."""
-    N = len(pop_probs)
-    pops = sorted(SUPERPOP_FROM_POP.keys())
-    superpops = sorted(set(SUPERPOP_FROM_POP.values()))
-    superpop_projection_matrix = pd.DataFrame(
-        np.array([[int(superpop == SUPERPOP_FROM_POP[pop]) for superpop in superpops] for pop in pops]),
-        index=POPS,
-        columns=SUPERPOPS,
-    )
-    superpop_probs = pop_probs @ superpop_projection_matrix
-    assert_equals(
-        "Expected superpop_probs shape (N x |superpops|):",
-        superpop_probs.shape,
-        "Actual shape",
-        (N, len(superpops)),
-    )
-    return superpop_probs
-
-
-def superpop_predictions_from_pop_probs(pop_probs: pd.DataFrame) -> list[str]:
-    """Given a matrix of population probabilities, convert to superpop predictions."""
-    superpops = sorted(set(SUPERPOP_FROM_POP.values()))
-    superpop_probs = superpop_probs_from_pop_probs(pop_probs)
-    return [superpops[np.argmax(ps)] for i, ps in superpop_probs.iterrows()]
-
-
-def serialize_model_products(pca_df: pd.DataFrame, rfc: RandomForestClassifier) -> None:
-    """Serialize pca loadings and rfc to disk as .csv and .skop files."""
-    pca_df.to_csv(PCA_FPATH)
-    skops_dump(rfc, RFC_FPATH)
-
-def filter_samples_for_relatedness(
-    genotypes: pd.DataFrame,
-    labels: pd.DataFrame,
-) -> tuple[pd.DataFrame, pd.DataFrame]:
-    """Filter samples for relatedness, returning subset of unrelated individuals."""
-    logger.info("Filtering samples for relatedness")
-    ancestry_df = _load_ancestry_df()
-    assert_equals("genotype samples", genotypes.index, "label samples", labels.index)
-    samples = genotypes.index
-    ancestry_df = ancestry_df[ancestry_df.index.isin(samples)]
-    family_ids = ancestry_df["Family ID"].unique()
-    logger.info("Found %s unique families", len(family_ids))
-    unrelated_samples = []
-    random.seed(1337)
-    for family_id in family_ids:
-        family_members = ancestry_df[ancestry_df["Family ID"] == family_id].index
-        # grab one member per family to keep only unrelated samples...
-        family_member = random.choice(family_members)
-        unrelated_samples.append(family_member)
-    unrelated_sample_idx = np.array(unrelated_samples)
-    genotypes, labels = (
-        genotypes.loc[unrelated_sample_idx],
-        labels.loc[unrelated_sample_idx],
-    )
-    # we did this earlier but removing samples could make more variants monomorphic
-    genotypes = _filter_variants_for_monomorphism(genotypes)
-    return genotypes, labels
-
-def _filter_variants_for_monomorphism(genotypes: pd.DataFrame) -> pd.DataFrame:
-    """Exclude monomorphic variants, i.e. those with no variation in dataset."""
-    monomorphic_mask = genotypes.std(axis="index") > 0
-    num_excluded_monomorphic_variants = np.sum(~monomorphic_mask)
-    logger.info("Removing %s monomorphic variants", num_excluded_monomorphic_variants)
-    monomorphic_fraction = num_excluded_monomorphic_variants / len(monomorphic_mask)
-    assert_true("fraction of excluded monomorphic variants less than 1%", monomorphic_fraction < 1 / 100)
-    return genotypes[genotypes.columns[monomorphic_mask]]
diff --git a/python/python/bystro/ancestry/train_chip_model.py b/python/python/bystro/ancestry/train_chip_model.py
deleted file mode 100644
index 83289d80a..000000000
--- a/python/python/bystro/ancestry/train_chip_model.py
+++ /dev/null
@@ -1,56 +0,0 @@
-"""Implement PCA + RF for shared Illumina + Affymetrix variants."""
-
-import pandas as pd
-from sklearn.decomposition import PCA
-
-from bystro.ancestry.train import (
-    INTERMEDIATE_DATA_DIR,
-    filter_samples_for_relatedness,
-    load_callset_for_variants,
-    load_label_data,
-    make_rfc,
-    make_train_test_split,
-    serialize_model_products,
-)
-
-
-pd.options.future.infer_string = True  # type: ignore
-
-def load_illumina_affy_variants() -> set[str]:
-    """Get previously computed intersection of illumina and affymetrix variants."""
-    return set(pd.read_csv(INTERMEDIATE_DATA_DIR / "shared_illumina_affy_variants.csv").variant)
-
-
-def load_kgp_genotypes_for_shared_variants() -> pd.DataFrame:
-    """Get KGP genotypes filtered to shared variants."""
-    variants = load_illumina_affy_variants()
-    return load_callset_for_variants(variants)
-
-
-def pca_transform_df(pca: PCA, X: pd.DataFrame) -> pd.DataFrame:
-    """PCA transform dataframe, retaining index and labeling columns appropriately."""
-    pc_columns = ["pc" + str(i) for i in range(1, pca.n_components_ + 1)]
-    return pd.DataFrame(pca.transform(X), index=X.index, columns=pc_columns)
-
-
-def main() -> None:
-    """Train PCA, RF for Illumina and Affymetrix variants, save model products to disk."""
-    kgp_genotypes = load_kgp_genotypes_for_shared_variants()
-    labels = load_label_data(kgp_genotypes.index)
-    kgp_genotypes, labels = filter_samples_for_relatedness(kgp_genotypes, labels)
-    train_X, test_X, train_y, test_y = make_train_test_split(
-        kgp_genotypes,
-        labels,
-    )
-    PCA_DIMS = 30
-    pca = PCA(n_components=PCA_DIMS).fit(train_X)
-    pc_columns = [f"pc{i}" for i in range(1, PCA_DIMS + 1)]
-    loadings_df = pd.DataFrame(pca.components_.T, index=train_X.columns, columns=pc_columns)
-    train_Xpc = train_X @ loadings_df
-    test_Xpc = test_X @ loadings_df
-    rfc = make_rfc(train_Xpc, test_Xpc, train_y, test_y)
-    serialize_model_products(loadings_df, rfc)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/python/bystro/ancestry/train_gnomad_model.py b/python/python/bystro/ancestry/train_gnomad_model.py
deleted file mode 100644
index c737c4c02..000000000
--- a/python/python/bystro/ancestry/train_gnomad_model.py
+++ /dev/null
@@ -1,55 +0,0 @@
-"""Implement PCA + RF for variants gnomad uses for
ancestry.""" - -import pandas as pd -from sklearn.decomposition import PCA - -from bystro.ancestry.train import ( - load_callset_for_variants, - load_label_data, - make_rfc, - make_train_test_split, - serialize_model_products, -) - -VARIANT_PATH = "hg38_gnomad_snpset.csv" - -pd.options.future.infer_string = True # type: ignore - - -def load_model_variants() -> set[str]: - """Get set of variants to train ancestry model.""" - try: - variants_df = pd.read_csv(VARIANT_PATH) - except FileNotFoundError: - raise FileNotFoundError(f"Variant file '{VARIANT_PATH}' not found.") - except pd.errors.ParserError as e: - raise Exception(f"Error parsing variant file '{VARIANT_PATH}': {e}") - return set(variants_df.variants) - - -def load_kgp_genotypes_for_shared_variants() -> pd.DataFrame: - """Get KGP genotypes filtered to shared variants.""" - variants = load_model_variants() - return load_callset_for_variants(variants) - - -def main() -> None: - """Train PCA, RF for gnomad variants, save model products to disk.""" - kgp_genotypes = load_kgp_genotypes_for_shared_variants() - labels = load_label_data(kgp_genotypes.index) - train_X, test_X, train_y, test_y = make_train_test_split( - kgp_genotypes, - labels, - ) - PCA_DIMS = 30 - pca = PCA(n_components=PCA_DIMS).fit(train_X) - pc_columns = [f"pc{i}" for i in range(1, PCA_DIMS + 1)] - loadings_df = pd.DataFrame(pca.components_.T, index=train_X.columns, columns=pc_columns) - train_Xpc = train_X @ loadings_df - test_Xpc = test_X @ loadings_df - rfc = make_rfc(train_Xpc, test_Xpc, train_y, test_y) - serialize_model_products(loadings_df, rfc) - - -if __name__ == "__main__": - main() diff --git a/python/python/bystro/ancestry/train_utils.py b/python/python/bystro/ancestry/train_utils.py deleted file mode 100644 index 3219460c1..000000000 --- a/python/python/bystro/ancestry/train_utils.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Utilities for ancestry model training.""" -import re -from collections.abc import Iterable -from typing import Any, TypeVar - -import numpy as np - -T = TypeVar("T") - - -def get_variant_ids_from_callset(callset: dict[str, Any]) -> np.ndarray: - """Given a callset generated from scikit.allel, return variant ids in Broad notation.""" - # see https://illumina.github.io/NirvanaDocumentation/core-functionality/variant-ids/ - return np.array( - [ - "-".join([chrom, str(pos), ref, alt]) - for chrom, pos, ref, alt in zip( - callset["variants/CHROM"], - callset["variants/POS"], - callset["variants/REF"], - callset["variants/ALT"][:, 0], - # matrix contains all alts for position-- we only want the first because we're - # excluding all multi-allelic variants - strict=True, - ) - ], - ) - - -def head(xs: Iterable[T]) -> T: - """Get first element of xs.""" - return next(iter(xs)) - - -VARIANT_REGEX = re.compile( - r""" - ^ - chr(1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22) # (autosomal) chromosome - : - [0-9]+ # position - : - [ACGT] # ref allele - : - [ACGT] # alt allele - $ - """, - re.VERBOSE, -) - - -def is_autosomal_variant(potential_variant: str) -> bool: - """Determine whether string is a syntactically valid autochromosomal variant.""" - if not VARIANT_REGEX.match(potential_variant): - return False - chromosome, position, ref, alt = potential_variant.split(":") - if ref == alt: - err_msg = f"Variant {potential_variant} cannot have identical ref and alt alleles" - raise ValueError(err_msg) - return True diff --git a/python/python/bystro/ancestry/upload_model.py b/python/python/bystro/ancestry/upload_model.py deleted file mode 100644 index 
diff --git a/python/python/bystro/ancestry/upload_model.py b/python/python/bystro/ancestry/upload_model.py
deleted file mode 100644
index 85404a8f3..000000000
--- a/python/python/bystro/ancestry/upload_model.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""Upload PCA and RFC models to the ancestry s3 bucket."""
-
-import logging
-import os
-import sys
-from pathlib import Path
-
-import boto3
-
-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-
-ANCESTRY_BUCKET = "bystro-ancestry"
-ANCESTRY_MODEL_PRODUCTS_DIR = Path("ancestry_model_products")
-PCA_FILE = "pca.csv"
-RFC_FILE = "rfc.skop"
-
-try:
-    AWS_DEFAULT_PROFILE = os.environ["AWS_DEFAULT_PROFILE"]
-    logger.info("found AWS_DEFAULT_PROFILE: %s", AWS_DEFAULT_PROFILE)
-except KeyError as key_err:
-    err_msg = (
-        "AWS_DEFAULT_PROFILE not found in environment variables; "
-        "check that it's defined and explicitly exported."
-        "\n\n"
-        "If you are running this script in an ipython process, os.environ may not see "
-        "environment variables defined in another shell. Try running from the command line instead."
-    )
-    raise RuntimeError(err_msg) from key_err
-
-
-def upload_to_ancestry_bucket(filename: str) -> None:
-    """Upload filename from the ancestry model products dir to the ancestry bucket."""
-    logger.info("uploading %s...", filename)
-    try:
-        s3_client.upload_file(str(ANCESTRY_MODEL_PRODUCTS_DIR / filename), ANCESTRY_BUCKET, filename)
-        logger.info("%s uploaded successfully", filename)
-    except Exception:
-        logger.exception("Couldn't upload %s", filename)
-
-
-if __name__ == "__main__":
-    session = boto3.Session(profile_name=AWS_DEFAULT_PROFILE)
-    s3_client = session.client("s3")
-    logger.info("instantiated s3 client")
-
-    upload_to_ancestry_bucket(PCA_FILE)
-    upload_to_ancestry_bucket(RFC_FILE)
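The removed uploader logs failures but never verifies that the objects actually landed in the bucket. A hedged sketch of a post-upload existence check, not part of the removed script; the bucket name and keys mirror the constants above, and the profile name is a placeholder:

```python
# Hypothetical post-upload verification; head_object raises ClientError on a missing key.
import boto3
from botocore.exceptions import ClientError

session = boto3.Session(profile_name="default")  # assumes a configured AWS profile
s3 = session.client("s3")

for key in ("pca.csv", "rfc.skop"):
    try:
        s3.head_object(Bucket="bystro-ancestry", Key=key)  # metadata-only request
        print(f"{key} is present in s3://bystro-ancestry")
    except ClientError:
        print(f"{key} missing or inaccessible in s3://bystro-ancestry")
```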
diff --git a/python/python/bystro/api/ancestry.py b/python/python/bystro/api/ancestry.py
deleted file mode 100644
index 48b1ad542..000000000
--- a/python/python/bystro/api/ancestry.py
+++ /dev/null
@@ -1,180 +0,0 @@
-import csv
-import json
-from os import system
-from pathlib import Path
-import tempfile
-
-import msgspec
-from pyarrow import dataset as ds  # type: ignore
-
-from bystro.utils.compress import get_decompress_to_pipe_cmd
-from bystro.ancestry.model import get_models
-from bystro.ancestry.inference import AncestryResults, infer_ancestry
-
-
-def calculate_ancestry_scores(
-    vcf: str, assembly: str, dosage: bool = False, out_dir: str | None = None
-) -> AncestryResults:
-    """Calculate ancestry scores from a VCF file and write the results to a file.
-
-    Args:
-        vcf (str): The input VCF file path
-        assembly (str): The assembly version (hg19 or hg38)
-        dosage (bool, optional):
-            Whether or not to write the dosage matrix output to the output directory.
-            If `out_dir` is not provided, this option cannot be True. Defaults to False.
-        out_dir (str, optional): If not provided, the results are not written to a file.
-            Defaults to None.
-
-    Raises:
-        RuntimeError: If the bystro-vcf command fails
-        ValueError: If `dosage` is True and `out_dir` is not provided
-
-    Returns:
-        AncestryResults: The ancestry results
-    """
-    path_out_dir = None
-    if out_dir is not None:
-        path_out_dir = Path(out_dir)
-        path_out_dir.mkdir(parents=True, exist_ok=True)
-
-    if not dosage:
-        dosage_matrix_path = tempfile.mktemp(suffix=".feather")
-    else:
-        if path_out_dir is None:
-            raise ValueError("If `dosage` is True, `out_dir` must be provided")
-        dosage_matrix_path = str(path_out_dir / "dosage_matrix.feather")
-
-    # Input command
-    bystro_vcf_command = f"bystro-vcf --noOut --dosageOutput {dosage_matrix_path}"
-    cmd = get_decompress_to_pipe_cmd(vcf, bystro_vcf_command)
-    res = system(cmd)
-    if res != 0:
-        raise RuntimeError(f"Failed to run bystro-vcf command: {cmd}")
-
-    return calculate_ancestry_scores_from_dosage(dosage_matrix_path, assembly, out_dir)
-
-
-def calculate_ancestry_scores_from_dosage(
-    dosage_matrix_path: str, assembly: str, out_dir: str | None = None
-) -> AncestryResults:
-    """Calculate ancestry scores from a Bystro dosage Arrow feather file and write the results.
-
-    Args:
-        dosage_matrix_path (str): The input dosage matrix Arrow feather file path
-        assembly (str): The assembly version (hg19 or hg38)
-        out_dir (str, optional): If not provided, the results are not written to a file.
-            Defaults to None.
-
-    Returns:
-        AncestryResults: The ancestry results
-    """
-    path_out_dir = None
-    if out_dir is not None:
-        path_out_dir = Path(out_dir)
-        path_out_dir.mkdir(parents=True, exist_ok=True)
-
-    # Ancestry command
-    dataset = ds.dataset(dosage_matrix_path, format="arrow")
-
-    ancestry_models = get_models(assembly)
-
-    results = infer_ancestry(ancestry_models, dataset)
-
-    # Write results to output file
-    json_data = msgspec.json.encode(results)
-
-    if path_out_dir is not None:
-        with open(str(path_out_dir / "ancestry_results.json"), "w") as f:
-            f.write(str(json_data, "utf-8"))
-
-    return results
-
-
-def ancestry_json_to_format(input_json_path, output_path, output_format="tsv"):
-    """
-    Parse the JSON output from the Ancestry Inference tool and write it to a TSV or CSV file.
-
-    Arguments
-    ---------
-    input_json_path (str): Path to the JSON file to parse.
-    output_path (str): Path to the output TSV or CSV file.
-    output_format (str): Either "tsv" or "csv". Defaults to "tsv".
- - Returns - ------- - None - """ - if output_format != "tsv" and output_format != "csv": - raise ValueError("output_format must be either 'tsv' or 'csv'") - - with open(input_json_path, "r") as json_file: - data = json.load(json_file) - - results = data["results"] - - delimiter = "\t" if output_format == "tsv" else "," - # Make output directory if needed - output_path = Path(output_path) - output_path.parent.mkdir(parents=True, exist_ok=True) - - # Open TSV file for writing - with open(output_path, "w", newline="") as file: - writer = csv.writer(file, delimiter=delimiter) - - # Write header - header = [ - "sample_id", - "top_population", - "top_population_probability", - "top_superpopulation", - "top_superpopulation_probability", - ] - populations = list(results[0]["populations"].keys()) - superpops = list(results[0]["superpops"].keys()) - - header += populations + superpops - writer.writerow(header) - - # Write data rows - for result in results: - row = [ - result["sampleId"], - ";".join(result["topHit"]["populations"]), - result["topHit"]["probability"], - ] - - top_hit_superpop = "" - top_hit_superpop_probability = 0 - for superpop, vals in result["superpops"].items(): - mean = (vals["lowerBound"] + vals["upperBound"]) / 2 - - if mean > top_hit_superpop_probability: - top_hit_superpop_probability = mean - top_hit_superpop = superpop - - row.append(top_hit_superpop) - row.append(top_hit_superpop_probability) - - for population in populations: - row.append( - ( - result["populations"][population]["lowerBound"] - + result["populations"][population]["upperBound"] - ) - / 2 - ) - - for superpop in superpops: - row.append( - ( - result["superpops"][superpop]["lowerBound"] - + result["superpops"][superpop]["upperBound"] - ) - / 2 - ) - - writer.writerow(row) diff --git a/python/python/bystro/api/tests/ancestry_expected_output.tsv b/python/python/bystro/api/tests/ancestry_expected_output.tsv deleted file mode 100644 index ce75a25ed..000000000 --- a/python/python/bystro/api/tests/ancestry_expected_output.tsv +++ /dev/null @@ -1,4 +0,0 @@ -sample_id top_population top_population_probability top_superpopulation top_superpopulation_probability ACB ASW BEB CDX CEU CHB CHS CLM ESN FIN GBR GIH GWD IBS ITU JPT KHV LWK MSL MXL PEL PJL PUR STU TSI YRI AFR AMR EAS EUR SAS -1805 ACB 0.1842833333333333 AFR 0.4796166666666667 0.1842833333333333 0.07058333333333335 0.001 0.005833333333333333 0.10033333333333334 0.0375 0.04518333333333333 0.04148333333333334 0.023266666666666668 0.06673333333333334 0.05596666666666667 0.001 0.026033333333333332 0.030133333333333335 0.001 0.0032666666666666664 0.011166666666666667 0.002333333333333333 0.08 0.014983333333333333 0.04161666666666667 0.00075 0.054366666666666674 0.0005833333333333333 0.007483333333333332 0.0931166666666667 0.4796166666666667 0.15245000000000003 0.10295 0.26065000000000005 0.004333333333333333 -1847 ACB 0.16088333333333332 AFR 0.4528333333333333 0.16088333333333332 0.07253333333333332 0.002 0.005833333333333333 0.08646666666666665 0.04221666666666666 0.05151666666666667 0.033900000000000007 0.020516666666666666 0.07631666666666666 0.07106666666666668 0.0 0.023699999999999995 0.031966666666666664 0.00025 0.0036 0.013250000000000001 0.002 0.09240000000000001 0.011450000000000002 0.048816666666666675 0.0002 0.062150000000000004 0.0 0.006166666666666667 0.08080000000000001 0.4528333333333333 0.1563166666666667 0.11641666666666667 0.2719833333333333 0.0024500000000000004 -4805 ACB 0.1915690476190476 AFR 0.49455238095238097 0.1915690476190476 
0.06688333333333335 0.0017142857142857144 0.008333333333333333 0.09278333333333333 0.03266666666666666 0.04435 0.04215 0.03260000000000001 0.06356666666666667 0.06285 0.0 0.02228333333333334 0.03113333333333333 0.001 0.004266666666666667 0.010333333333333332 0.0031666666666666666 0.08020000000000001 0.015283333333333335 0.03621666666666667 0.0014166666666666666 0.05148333333333333 0.0005833333333333333 0.005316666666666667 0.09784999999999999 0.49455238095238097 0.14513333333333334 0.09995 0.25565000000000004 0.004714285714285714 diff --git a/python/python/bystro/api/tests/ancestry_input.json b/python/python/bystro/api/tests/ancestry_input.json deleted file mode 100644 index 81e973af8..000000000 --- a/python/python/bystro/api/tests/ancestry_input.json +++ /dev/null @@ -1 +0,0 @@ -{"results":[{"sampleId":"1805","topHit":{"probability":0.1842833333333333,"populations":["ACB"]},"populations":{"ACB":{"lowerBound":0.1842833333333333,"upperBound":0.1842833333333333},"ASW":{"lowerBound":0.07058333333333335,"upperBound":0.07058333333333335},"BEB":{"lowerBound":0.001,"upperBound":0.001},"CDX":{"lowerBound":0.005833333333333333,"upperBound":0.005833333333333333},"CEU":{"lowerBound":0.10033333333333334,"upperBound":0.10033333333333334},"CHB":{"lowerBound":0.0375,"upperBound":0.0375},"CHS":{"lowerBound":0.04518333333333333,"upperBound":0.04518333333333333},"CLM":{"lowerBound":0.04148333333333334,"upperBound":0.04148333333333334},"ESN":{"lowerBound":0.023266666666666668,"upperBound":0.023266666666666668},"FIN":{"lowerBound":0.06673333333333334,"upperBound":0.06673333333333334},"GBR":{"lowerBound":0.05596666666666667,"upperBound":0.05596666666666667},"GIH":{"lowerBound":0.001,"upperBound":0.001},"GWD":{"lowerBound":0.026033333333333332,"upperBound":0.026033333333333332},"IBS":{"lowerBound":0.030133333333333335,"upperBound":0.030133333333333335},"ITU":{"lowerBound":0.001,"upperBound":0.001},"JPT":{"lowerBound":0.0032666666666666664,"upperBound":0.0032666666666666664},"KHV":{"lowerBound":0.011166666666666667,"upperBound":0.011166666666666667},"LWK":{"lowerBound":0.002333333333333333,"upperBound":0.002333333333333333},"MSL":{"lowerBound":0.08,"upperBound":0.08},"MXL":{"lowerBound":0.014983333333333333,"upperBound":0.014983333333333333},"PEL":{"lowerBound":0.04161666666666667,"upperBound":0.04161666666666667},"PJL":{"lowerBound":0.00075,"upperBound":0.00075},"PUR":{"lowerBound":0.054366666666666674,"upperBound":0.054366666666666674},"STU":{"lowerBound":0.0005833333333333333,"upperBound":0.0005833333333333333},"TSI":{"lowerBound":0.007483333333333332,"upperBound":0.007483333333333332},"YRI":{"lowerBound":0.0931166666666667,"upperBound":0.0931166666666667}},"superpops":{"AFR":{"lowerBound":0.4796166666666667,"upperBound":0.4796166666666667},"AMR":{"lowerBound":0.15245000000000003,"upperBound":0.15245000000000003},"EAS":{"lowerBound":0.10295,"upperBound":0.10295},"EUR":{"lowerBound":0.26065000000000005,"upperBound":0.26065000000000005},"SAS":{"lowerBound":0.004333333333333333,"upperBound":0.004333333333333333}},"nSnps":1238},{"sampleId":"1847","topHit":{"probability":0.16088333333333332,"populations":["ACB"]},"populations":{"ACB":{"lowerBound":0.16088333333333332,"upperBound":0.16088333333333332},"ASW":{"lowerBound":0.07253333333333332,"upperBound":0.07253333333333332},"BEB":{"lowerBound":0.002,"upperBound":0.002},"CDX":{"lowerBound":0.005833333333333333,"upperBound":0.005833333333333333},"CEU":{"lowerBound":0.08646666666666665,"upperBound":0.08646666666666665},"CHB":{"lowerBound":0.04221666666666666,"upperBoun
d":0.04221666666666666},"CHS":{"lowerBound":0.05151666666666667,"upperBound":0.05151666666666667},"CLM":{"lowerBound":0.033900000000000007,"upperBound":0.033900000000000007},"ESN":{"lowerBound":0.020516666666666666,"upperBound":0.020516666666666666},"FIN":{"lowerBound":0.07631666666666666,"upperBound":0.07631666666666666},"GBR":{"lowerBound":0.07106666666666668,"upperBound":0.07106666666666668},"GIH":{"lowerBound":0.0,"upperBound":0.0},"GWD":{"lowerBound":0.023699999999999995,"upperBound":0.023699999999999995},"IBS":{"lowerBound":0.031966666666666664,"upperBound":0.031966666666666664},"ITU":{"lowerBound":0.00025,"upperBound":0.00025},"JPT":{"lowerBound":0.0036,"upperBound":0.0036},"KHV":{"lowerBound":0.013250000000000001,"upperBound":0.013250000000000001},"LWK":{"lowerBound":0.002,"upperBound":0.002},"MSL":{"lowerBound":0.09240000000000001,"upperBound":0.09240000000000001},"MXL":{"lowerBound":0.011450000000000002,"upperBound":0.011450000000000002},"PEL":{"lowerBound":0.048816666666666675,"upperBound":0.048816666666666675},"PJL":{"lowerBound":0.0002,"upperBound":0.0002},"PUR":{"lowerBound":0.062150000000000004,"upperBound":0.062150000000000004},"STU":{"lowerBound":0.0,"upperBound":0.0},"TSI":{"lowerBound":0.006166666666666667,"upperBound":0.006166666666666667},"YRI":{"lowerBound":0.08080000000000001,"upperBound":0.08080000000000001}},"superpops":{"AFR":{"lowerBound":0.4528333333333333,"upperBound":0.4528333333333333},"AMR":{"lowerBound":0.1563166666666667,"upperBound":0.1563166666666667},"EAS":{"lowerBound":0.11641666666666667,"upperBound":0.11641666666666667},"EUR":{"lowerBound":0.2719833333333333,"upperBound":0.2719833333333333},"SAS":{"lowerBound":0.0024500000000000004,"upperBound":0.0024500000000000004}},"nSnps":1238},{"sampleId":"4805","topHit":{"probability":0.1915690476190476,"populations":["ACB"]},"populations":{"ACB":{"lowerBound":0.1915690476190476,"upperBound":0.1915690476190476},"ASW":{"lowerBound":0.06688333333333335,"upperBound":0.06688333333333335},"BEB":{"lowerBound":0.0017142857142857144,"upperBound":0.0017142857142857144},"CDX":{"lowerBound":0.008333333333333333,"upperBound":0.008333333333333333},"CEU":{"lowerBound":0.09278333333333333,"upperBound":0.09278333333333333},"CHB":{"lowerBound":0.03266666666666666,"upperBound":0.03266666666666666},"CHS":{"lowerBound":0.04435,"upperBound":0.04435},"CLM":{"lowerBound":0.04215,"upperBound":0.04215},"ESN":{"lowerBound":0.03260000000000001,"upperBound":0.03260000000000001},"FIN":{"lowerBound":0.06356666666666667,"upperBound":0.06356666666666667},"GBR":{"lowerBound":0.06285,"upperBound":0.06285},"GIH":{"lowerBound":0.0,"upperBound":0.0},"GWD":{"lowerBound":0.02228333333333334,"upperBound":0.02228333333333334},"IBS":{"lowerBound":0.03113333333333333,"upperBound":0.03113333333333333},"ITU":{"lowerBound":0.001,"upperBound":0.001},"JPT":{"lowerBound":0.004266666666666667,"upperBound":0.004266666666666667},"KHV":{"lowerBound":0.010333333333333332,"upperBound":0.010333333333333332},"LWK":{"lowerBound":0.0031666666666666666,"upperBound":0.0031666666666666666},"MSL":{"lowerBound":0.08020000000000001,"upperBound":0.08020000000000001},"MXL":{"lowerBound":0.015283333333333335,"upperBound":0.015283333333333335},"PEL":{"lowerBound":0.03621666666666667,"upperBound":0.03621666666666667},"PJL":{"lowerBound":0.0014166666666666666,"upperBound":0.0014166666666666666},"PUR":{"lowerBound":0.05148333333333333,"upperBound":0.05148333333333333},"STU":{"lowerBound":0.0005833333333333333,"upperBound":0.0005833333333333333},"TSI":{"lowerBound":0.0053166666666666
67,"upperBound":0.005316666666666667},"YRI":{"lowerBound":0.09784999999999999,"upperBound":0.09784999999999999}},"superpops":{"AFR":{"lowerBound":0.49455238095238097,"upperBound":0.49455238095238097},"AMR":{"lowerBound":0.14513333333333334,"upperBound":0.14513333333333334},"EAS":{"lowerBound":0.09995,"upperBound":0.09995},"EUR":{"lowerBound":0.25565000000000004,"upperBound":0.25565000000000004},"SAS":{"lowerBound":0.004714285714285714,"upperBound":0.004714285714285714}},"nSnps":1238}],"pcs":{"1805":[36.212301500214,8.880032713523068,6.4685387614609855,-6.986385401265155,-1.2161613876301929,0.18122864180355114,4.879164992147747,-1.3825319483843006,1.6565815085801594,-0.2675033562678496,1.0175223661530621,0.21156874538204273,-0.6495907158974807,-0.2821412378140518,0.7163513449873625,-0.5081614053538704,0.5534507488802398,0.7250995630790414,1.8478559588452843,-0.41131091128929353,1.3430161327952719,0.7620345619116838,0.8952018177255077,-0.9965641407374166,-1.3199735059355553,1.207583395188399,1.1471490913707127,-0.47110379079023673,0.13513724898869475,0.5731936570971211],"1847":[36.500668853503285,9.21175844402723,6.489695579364607,-7.0987434050667275,-0.9170605817472359,-0.18288530585392385,4.598064470504398,-1.1524714768307327,1.824371775247836,-0.7814874802943742,1.4082146489336538,0.38113983051284483,-0.847996044464877,-0.031326150163679484,0.025513420963202617,-0.2208609689549743,0.6158780987169225,0.5725914764665309,2.1346566622768015,-0.6881400644796526,1.561344573906901,0.7638499726751277,1.1429226260963772,-1.1833817808213616,-1.7262934237362275,1.0951965629327574,0.9762991188035195,-0.5167494466476656,0.4546841024660949,0.5413555563537119],"4805":[36.38356953633935,9.049844922483388,6.606472163818057,-7.121242841719361,-1.3311904755475088,-0.21900502784821532,5.094634966680147,-1.1802023866269407,1.840744991156444,-0.4652734710264729,1.0421481054171768,0.6203247400812968,-0.6751251146908167,0.005451107621077693,0.8429898311101504,-0.4080694653016031,0.426548206491673,0.39051646850281563,1.882225273026346,-0.4742827051359152,1.236524912410367,0.7139498156842751,1.0381049460009655,-1.0272491852331194,-1.62808045617317,0.862198891418773,1.1098383415758883,-0.49625681787895415,0.32663479605029433,0.5764836860331063]}} \ No newline at end of file diff --git a/python/python/bystro/api/tests/test_ancestry_api.py b/python/python/bystro/api/tests/test_ancestry_api.py deleted file mode 100644 index d8344d747..000000000 --- a/python/python/bystro/api/tests/test_ancestry_api.py +++ /dev/null @@ -1,63 +0,0 @@ -from pathlib import Path -import os - -import pandas as pd - -import pytest - -from bystro.ancestry.ancestry_types import AncestryResults -from bystro.ancestry.inference import AncestryModels -from bystro.ancestry.tests.test_inference import ( - ANCESTRY_MODEL, -) - -from bystro.api.ancestry import calculate_ancestry_scores, ancestry_json_to_format - - -pd.options.future.infer_string = True # type: ignore - - -@pytest.mark.integration("Requires bystro-vcf to be installed as well as AWS credentials.") -def test_calculate_ancestry_scores_happy_path(mocker, tmpdir): - mocker.patch( - "bystro.ancestry.model.get_models", - return_value=AncestryModels(ANCESTRY_MODEL, ANCESTRY_MODEL), - ) - - VCF_PATH = Path(__file__).parent / "trio.trim.vep.vcf.gz" - ancestry_response = calculate_ancestry_scores( - str(VCF_PATH), "hg19", dosage=False, out_dir=str(tmpdir) - ) - - assert isinstance(ancestry_response, AncestryResults) - - # Demonstrate that all expected sample_ids are accounted for - samples_seen = 
set()
-    expected_samples = set(["1805", "4805", "1847"])
-    for result in ancestry_response.results:
-        samples_seen.add(result.sample_id)
-
-    assert samples_seen == expected_samples
-
-def test_ancestry_tsv(tmp_path):
-    pwd = os.path.dirname(os.path.abspath(__file__))
-    ancestry_file_path = Path(pwd) / "ancestry_input.json"
-    expected_results_path = Path(pwd) / "ancestry_expected_output.tsv"
-
-    expected = pd.read_csv(expected_results_path, sep="\t")
-    print("expected", expected)
-
-    # create tmp files
-    output_tsv_path = tmp_path / "output.tsv"
-    output_csv_path = tmp_path / "output.csv"
-
-    # Run the conversion for TSV
-    ancestry_json_to_format(ancestry_file_path, output_tsv_path, "tsv")
-    df1 = pd.read_csv(output_tsv_path, sep="\t")
-
-    # Run the conversion for CSV
-    ancestry_json_to_format(ancestry_file_path, output_csv_path, "csv")
-    df2 = pd.read_csv(output_csv_path)
-
-    assert expected.equals(df1), "TSV files do not match"
-    assert expected.equals(df2), "CSV files do not match"
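The converter exercised by this test has a small surface. A minimal usage sketch, assuming `bystro.api.ancestry` is importable at the pre-removal revision; the input and output paths are hypothetical examples:

```python
# Hypothetical invocation of the removed converter; paths are examples only.
from bystro.api.ancestry import ancestry_json_to_format

# Writes one row per sample: top population and superpopulation hits plus the
# midpoint of each population's [lowerBound, upperBound] probability interval.
ancestry_json_to_format("ancestry_results.json", "out/ancestry.tsv", output_format="tsv")
ancestry_json_to_format("ancestry_results.json", "out/ancestry.csv", output_format="csv")
```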