From fde77707c2b8f7a1dabf832981db3498bf1cc4ae Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 27 Sep 2024 19:06:44 +0200 Subject: [PATCH] Update docs and tutorials for hictk [no ci] --- docs/balancing_matrices.rst | 28 ++-- docs/creating_cool_and_hic_files.rst | 104 +++++------- docs/creating_multires_files.rst | 70 +++----- docs/downloading_test_datasets.rst | 2 +- docs/file_validation.rst | 133 +++++++++------ docs/format_conversion.rst | 101 ++++++------ docs/installation.rst | 29 ++-- docs/installation_src.rst | 132 ++++++--------- docs/quickstart_api.rst | 8 +- docs/quickstart_cli.rst | 11 +- docs/reading_interactions.rst | 3 + .../dump_interactions_to_cool_hic_file.rst | 117 ++++++------- docs/tutorials/reordering_chromosomes.rst | 156 +++++++++--------- src/hictk/validate/validate.cpp | 10 +- 14 files changed, 435 insertions(+), 469 deletions(-) diff --git a/docs/balancing_matrices.rst b/docs/balancing_matrices.rst index 3c7b7611e..b90d1d720 100644 --- a/docs/balancing_matrices.rst +++ b/docs/balancing_matrices.rst @@ -27,20 +27,22 @@ The following is an example showing how to balance a .cool file using ICE. user@dev:/tmp$ hictk balance ice 4DNFIZ1ZVXC8.mcool::/resolutions/1000 - [2023-10-01 13:18:02.119] [info]: Running hictk v0.0.2-f83f93e - [2023-10-01 13:18:02.130] [info]: Writing interactions to temporary file /tmp/4DNFIZ1ZVXC8.tmp0... - [2023-10-01 13:18:05.098] [info]: Initializing bias vector... - [2023-10-01 13:18:05.099] [info]: Masking rows with fewer than 10 nnz entries... - [2023-10-01 13:18:06.298] [info]: Masking rows using mad_max=5... 
- [2023-10-01 13:18:06.971] [info]: Iteration 1: 36874560.192587376 - [2023-10-01 13:18:07.634] [info]: Iteration 2: 21347543.04950776 - [2023-10-01 13:18:08.307] [info]: Iteration 3: 7819314.542541969 + [2024-09-26 16:02:19.731] [info]: Running hictk v1.0.0-fbdcb591 + [2024-09-26 16:02:19.731] [info]: balancing using ICE (GW_ICE) + [2024-09-26 16:02:19.734] [info]: Writing interactions to temporary file /tmp/hictk-tmp-XXXX1ZC9FF/4DNFIZ1ZVXC8.mcool.tmp... + [2024-09-26 16:02:22.480] [info]: Initializing bias vector... + [2024-09-26 16:02:22.482] [info]: Masking rows with fewer than 10 nnz entries... + [2024-09-26 16:02:23.392] [info]: Masking rows using mad_max=5... + [2024-09-26 16:02:23.860] [info]: Iteration 1: 36452362.243888594 + [2024-09-26 16:02:24.327] [info]: Iteration 2: 21649057.88060747 + [2024-09-26 16:02:24.792] [info]: Iteration 3: 7890065.688497526 ... - [2023-10-01 13:19:20.365] [info]: Iteration 105: 2.1397932757529552e-05 - [2023-10-01 13:19:21.146] [info]: Iteration 106: 1.6604770462001875e-05 - [2023-10-01 13:19:21.870] [info]: Iteration 107: 1.2885285040054778e-05 - [2023-10-01 13:19:22.608] [info]: Iteration 108: 9.99900768769869e-06 - [2023-10-01 13:19:22.619] [info]: Writing weights to 4DNFIZ1ZVXC8.mcool::/resolutions/1000/bins/weight... + [2024-09-26 16:03:12.285] [info]: Iteration 107: 2.0533518142916073e-05 + [2024-09-26 16:03:12.752] [info]: Iteration 108: 1.601698258037195e-05 + [2024-09-26 16:03:13.216] [info]: Iteration 109: 1.2493901433163442e-05 + [2024-09-26 16:03:13.681] [info]: Iteration 110: 9.745791018854495e-06 + [2024-09-26 16:03:13.707] [info]: Writing weights to 4DNFIZ1ZVXC8.mcool::/resolutions/1000/bins/GW_ICE... + [2024-09-26 16:03:13.708] [info]: Linking weights to 4DNFIZ1ZVXC8.mcool::/resolutions/1000/bins/weight... When balancing files in .mcool or .hic formats, all resolutions are balanced. 
diff --git a/docs/creating_cool_and_hic_files.rst b/docs/creating_cool_and_hic_files.rst index 711b08e5b..fe194a2e8 100644 --- a/docs/creating_cool_and_hic_files.rst +++ b/docs/creating_cool_and_hic_files.rst @@ -17,44 +17,42 @@ File requirements: * ``dm6.chrom.sizes`` - `download `__ * ``4DNFIKNWM36K.pairs.gz`` - `download `__ + +Ingesting pairwise interactions into a 10kbp .cool file +------------------------------------------------------- + +Loading interactions in pairs (4DN-DCIC) format into a .cool/hic file is straightforward: + .. code-block:: console - # Create a 10kbp .cool file using dm6 as reference - user@dev:/tmp$ zcat 4DNFIKNWM36K.pairs.gz | hictk load --format 4dn --assembly dm6 --bin-size 10000 dm6.chrom.sizes 4DNFIKNWM36K.10000.cool - - [2024-01-23 15:15:00.520] [info]: Running hictk v0.0.6-45c36af-dirty - [2024-01-23 15:15:00.531] [info]: writing chunk #1 to intermediate file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp"... - [2024-01-23 15:15:23.762] [info]: done writing chunk #1 to tmp file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp". - [2024-01-23 15:15:23.762] [info]: writing chunk #2 to intermediate file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp"... - [2024-01-23 15:15:49.042] [info]: done writing chunk #2 to tmp file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp". - [2024-01-23 15:15:49.042] [info]: writing chunk #3 to intermediate file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp"... - [2024-01-23 15:15:49.834] [info]: done writing chunk #3 to tmp file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp". - [2024-01-23 15:15:49.836] [info]: merging 3 chunks into "4DNFIKNWM36K.10000.cool"... - [2024-01-23 15:15:55.118] [info]: processing chr3L:15100000-15110000 chr3L:16230000-16240000 at 4789272 pixels/s... - [2024-01-23 15:15:59.718] [info]: ingested 119208613 interactions (18122865 nnz) in 59.197723453s! 
- - # Create a 10kbp .hic file using dm6 as reference - user@dev:/tmp$ zcat 4DNFIKNWM36K.pairs.gz | hictk load --format 4dn --assembly dm6 --bin-size 10000 dm6.chrom.sizes 4DNFIKNWM36K.10000.hic - - [2024-01-23 15:45:19.969] [info]: Running hictk v0.0.6-570037c-dirty - [2024-01-23 15:45:42.439] [info]: preprocessing chunk #1 at 452919 pixels/s... - [2024-01-23 15:46:09.182] [info]: preprocessing chunk #2 at 303750 pixels/s... - [2024-01-23 15:46:11.184] [info]: writing header at offset 0 - [2024-01-23 15:46:11.184] [info]: begin writing interaction blocks to file "4DNFIKNWM36K.10000.hic"... - [2024-01-23 15:46:11.184] [info]: [10000 bp] writing pixels for chr3R:chr3R matrix at offset 50632... - [2024-01-23 15:46:13.295] [info]: [10000 bp] written 2264963 pixels for chr3R:chr3R matrix - [2024-01-23 15:46:13.295] [info]: [10000 bp] writing pixels for chr3R:chr3L matrix at offset 4235718... - [2024-01-23 15:46:14.611] [info]: [10000 bp] written 1610264 pixels for chr3R:chr3L matrix - ... - [2024-01-23 15:46:44.065] [info]: [10000 bp] initializing expected value vector - [2024-01-23 15:46:50.531] [info]: [10000 bp] computing expected vector density - [2024-01-23 15:46:51.157] [info]: writing 1 expected value vectors at offset 32065110... - [2024-01-23 15:46:51.158] [info]: writing 0 normalized expected value vectors at offset 32078017... - [2024-01-23 15:46:51.194] [info]: ingested 119208613 interactions (18122865 nnz) in 91.225341628s! + user@dev:/tmp$ hictk load --format 4dn --bin-size 10000 4DNFIKNWM36K.pairs.gz 4DNFIKNWM36K.10000.cool + + [2024-09-26 16:51:28.059] [info]: Running hictk v1.0.0-fbdcb591 + [2024-09-26 16:51:28.068] [info]: begin loading pairwise interactions into a .cool file... + [2024-09-26 16:51:28.137] [info]: writing chunk #1 to intermediate file "/tmp/hictk-tmp-XXXXQPdOSn/4DNFIKNWM36K.10000.cool.tmp"... + [2024-09-26 16:51:45.281] [info]: done writing chunk #1 to tmp file "/tmp/hictk-tmp-XXXXQPdOSn/4DNFIKNWM36K.10000.cool.tmp". 
+ [2024-09-26 16:51:45.281] [info]: writing chunk #2 to intermediate file "/tmp/hictk-tmp-XXXXQPdOSn/4DNFIKNWM36K.10000.cool.tmp"... + [2024-09-26 16:52:04.969] [info]: done writing chunk #2 to tmp file "/tmp/hictk-tmp-XXXXQPdOSn/4DNFIKNWM36K.10000.cool.tmp". + [2024-09-26 16:52:04.970] [info]: merging 2 chunks into "4DNFIKNWM36K.10000.cool"... + [2024-09-26 16:52:06.430] [info]: processing chr3L:1030000-1040000 chr3R:30240000-30250000 at 6882312 pixels/s... + [2024-09-26 16:52:08.478] [info]: ingested 119208613 interactions (18122865 nnz) in 40.418916003s! + +To ingest interactions into a .hic file, simply change the extension of the output file (or use the ``--output-fmt`` option). + +By default, the list of chromosomes is read from the file header. +The reference genome used to build the .cool or .hic file can be provided explicitly using the ``--chrom-sizes`` option. +Note that ``--chrom-sizes`` is a mandatory option when ingesting interactions in formats other than ``--format=4dn``. +In case the input file contains interactions mapping on chromosomes missing from the reference genome provided through ``--chrom-sizes``, the ``--drop-unknown-chroms`` flag can be used to instruct hictk to ignore said interactions. + +When loading interactions using ``--format=pairs`` or ``--format=validPairs`` into a .cool file, tables of variable bins are supported. +To load interactions into a .cool file with a variable bin size, provide the table of bins using the ``--bin-table`` option. **Tips:** -* When creating large .hic files, ``hictk`` needs to create potentially large temporary files. When this is the case, use option ``--tmpdir`` to set the temporary folder to a path with sufficient space. +* When creating large .cool/hic files, ``hictk`` needs to create potentially large temporary files. When this is the case, use option ``--tmpdir`` to set the temporary folder to a path with sufficient space.
+* When loading interactions into .hic files, some of the steps can be run in parallel by increasing the number of processing threads using the ``--threads`` option. +* When loading pre-binned interactions into a .cool file, if the interactions are already sorted by genomic coordinates, the ``--assume-sorted`` option can be used to load interactions at once, without using temporary files. +* Interaction loading performance can be improved by processing interactions in larger chunks. This can be controlled using the ``--chunk-size`` option. In fact, when ``--chunk-size`` is greater than the number of interactions to be loaded, .hic and .cool files can be created without the use of temporary files. Merging multiple files @@ -66,35 +64,17 @@ Multiple .cool and .hic files using the same reference genome and resolution can # Merge multiple cooler files - user@dev:/tmp$ hictk merge data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 -o 4DNFIZ1ZVXC8.merged.cool - - [2023-09-29 19:24:49.479] [info]: Running hictk v0.0.2 - [2023-09-29 19:24:49.479] [info]: begin merging 2 coolers... - [2023-09-29 19:24:52.032] [info]: processing chr2R:11267000-11268000 chr4:1052000-1053000 at 3976143 pixels/s... - [2023-09-29 19:24:55.157] [info]: processing chr3R:5812000-5813000 chr3R:23422000-23423000 at 3201024 pixels/s... - [2023-09-29 19:24:57.992] [info]: DONE! Merging 2 coolers took 8.51s! - [2023-09-29 19:24:57.992] [info]: 4DNFIZ1ZVXC8.merged.cool size: 36.23 MB - - # Merge multiple .hic files - - user@dev:/tmp$ hictk merge data/4DNFIZ1ZVXC8.hic9 data/4DNFIZ1ZVXC8.hic9 -o 4DNFIZ1ZVXC8.10000.merged.hic --resolution 10000 - - [2024-01-23 15:49:23.248] [info]: Running hictk v0.0.6-570037c-dirty - [2024-01-23 15:49:23.248] [info]: begin merging 2 .hic files... - [2024-01-23 15:49:31.101] [info]: ingesting pixels at 1352814 pixels/s...
- [2024-01-23 15:49:37.777] [info]: writing header at offset 0 - [2024-01-23 15:49:37.777] [info]: begin writing interaction blocks to file "4DNFIZ1ZVXC8.10000.merged.hic"... - [2024-01-23 15:49:37.777] [info]: [10000 bp] writing pixels for chr2L:chr2L matrix at offset 212... - [2024-01-23 15:49:39.060] [info]: [10000 bp] written 1433133 pixels for chr2L:chr2L matrix - [2024-01-23 15:49:39.060] [info]: [10000 bp] writing pixels for chr2L:chr2R matrix at offset 2619165... - ... - [2024-01-23 15:49:58.624] [info]: [10000 bp] initializing expected value vector - [2024-01-23 15:50:05.276] [info]: [10000 bp] computing expected vector density - [2024-01-23 15:50:05.276] [info]: writing 1 expected value vectors at offset 31936601... - [2024-01-23 15:50:05.276] [info]: writing 0 normalized expected value vectors at offset 31949508... - [2024-01-23 15:50:05.299] [info]: DONE! Merging 2 files took 42.05s! - [2024-01-23 15:50:05.299] [info]: 4DNFIZ1ZVXC8.10000.merged.hic size: 31.95 MB + user@dev:/tmp$ hictk merge 4DNFIZ1ZVXC8.mcool::/resolutions/10000 4DNFIZ1ZVXC8.mcool::/resolutions/10000 -o 4DNFIZ1ZVXC8.merged.10000.cool + + [2024-09-26 17:07:57.101] [info]: Running hictk v1.0.0-fbdcb591 + [2024-09-26 17:07:57.101] [info]: begin merging 2 files into one .cool file... + [2024-09-26 17:07:58.978] [info]: processing chr3L:1030000-1040000 chr3R:29720000-29730000 at 5571031 pixels/s... + [2024-09-26 17:08:01.224] [info]: DONE! Merging 2 files took 4.12s! + [2024-09-26 17:08:01.224] [info]: 4DNFIZ1ZVXC8.merged.10000.cool size: 19.64 MB + +Merging .hic files as well as a mix of .hic and .cool files is also supported (as long as all files have the same resolution and reference genome). +When one or more of the input files are in .hic format, the ``--resolution`` option is mandatory. **Tips:** -* When merging many, large .hic files, ``hictk`` needs to create potentially large temporary files. 
When this is the case, use option ``--tmpdir`` to set the temporary folder to a path with sufficient space. +See the list of Tips for hictk load. diff --git a/docs/creating_multires_files.rst b/docs/creating_multires_files.rst index 3122f24b1..dcd6d8b3b 100644 --- a/docs/creating_multires_files.rst +++ b/docs/creating_multires_files.rst @@ -14,28 +14,30 @@ Interactions from a single-resolution Cooler file (.cool) can be used to generat user@dev:/tmp$ hictk zoomify data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 out.mcool - [2023-09-29 19:28:39.926] [info]: Running hictk v0.0.2 - [2023-09-29 19:28:39.929] [info]: coarsening cooler at data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 13 times (1000 -> 1000 -> 2000 -> 5000 -> 10000 -> 20000 -> 50000 -> 100000 -> 200000 -> 500000 -> 1000000 -> 2000000 -> 5000000 -> 10000000) - [2023-09-29 19:28:39.929] [info]: copying 1000 resolution from data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 - [2023-09-29 19:28:40.119] [info]: generating 2000 resolution from 1000 (2x) - [2023-09-29 19:28:40.343] [info]: [1000 -> 2000] processing chr2L:1996000-1998000 at 4484305 pixels/s... - [2023-09-29 19:28:40.663] [info]: [1000 -> 2000] processing chr2L:4932000-4934000 at 3125000 pixels/s... - [2023-09-29 19:28:40.973] [info]: [1000 -> 2000] processing chr2L:7986000-7988000 at 3236246 pixels/s... - ... - [2023-09-29 19:29:12.513] [info]: generating 10000000 resolution from 5000000 (2x) - [2023-09-29 19:29:12.519] [info]: DONE! Processed 13 resolution(s) in 32.59s! 
+ [2024-09-26 17:21:21.792] [info]: Running hictk v1.0.0-fbdcb591 + [2024-09-26 17:21:21.795] [info]: coarsening cooler at 4DNFIZ1ZVXC8.mcool::/resolutions/1000 13 times (1000 -> 1000 -> 2000 -> 5000 -> 10000 -> 20000 -> 50000 -> 100000 -> 200000 -> 500000 -> 1000000 -> 2000000 -> 5000000 -> 10000000) + [2024-09-26 17:21:21.795] [info]: copying 1000 resolution from 4DNFIZ1ZVXC8.mcool::/resolutions/1000 + [2024-09-26 17:21:21.959] [info]: generating 2000 resolution from 1000 (2x) + [2024-09-26 17:21:22.134] [info]: [1000 -> 2000] processing chr2L:1996000-1998000 at 5747126 pixels/s... + [2024-09-26 17:21:22.355] [info]: [1000 -> 2000] processing chr2L:4932000-4934000 at 4545455 pixels/s... + [2024-09-26 17:21:22.563] [info]: [1000 -> 2000] processing chr2L:7986000-7988000 at 4830918 pixels/s... + ... + [2024-09-26 17:21:42.886] [info]: generating 2000000 resolution from 1000000 (2x) + [2024-09-26 17:21:42.892] [info]: generating 5000000 resolution from 1000000 (5x) + [2024-09-26 17:21:42.898] [info]: generating 10000000 resolution from 5000000 (2x) + [2024-09-26 17:21:42.902] [info]: DONE! Processed 13 resolution(s) in 21.11s! # Coarsen a single resolution user@dev:/tmp$ hictk zoomify data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 out.cool --resolutions 50000 - [2023-09-29 19:30:52.476] [info]: Running hictk v0.0.2 - [2023-09-29 19:30:52.482] [info]: coarsening cooler at data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 2 times (1000 -> 1000 -> 50000) - [2023-09-29 19:30:52.482] [info]: copying 1000 resolution from data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 - [2023-09-29 19:30:52.668] [info]: generating 50000 resolution from 1000 (50x) - [2023-09-29 19:30:53.789] [info]: [1000 -> 50000] processing chr2L:23000000-23050000 at 896057 pixels/s... - [2023-09-29 19:30:55.005] [info]: [1000 -> 50000] processing chr3L:4600000-4650000 at 822368 pixels/s... - [2023-09-29 19:30:56.440] [info]: [1000 -> 50000] processing chr3R:32050000-32079331 at 696864 pixels/s... 
- [2023-09-29 19:30:56.863] [info]: DONE! Processed 2 resolution(s) in 4.39s! + [2024-09-26 17:22:22.203] [info]: Running hictk v1.0.0-fbdcb591 + [2024-09-26 17:22:22.206] [info]: coarsening cooler at 4DNFIZ1ZVXC8.mcool::/resolutions/1000 2 times (1000 -> 1000 -> 50000) + [2024-09-26 17:22:22.206] [info]: copying 1000 resolution from 4DNFIZ1ZVXC8.mcool::/resolutions/1000 + [2024-09-26 17:22:22.364] [info]: generating 50000 resolution from 1000 (50x) + [2024-09-26 17:22:23.165] [info]: [1000 -> 50000] processing chr2L:23000000-23050000 at 1253133 pixels/s... + [2024-09-26 17:22:23.939] [info]: [1000 -> 50000] processing chr3L:4600000-4650000 at 1293661 pixels/s... + [2024-09-26 17:22:24.878] [info]: [1000 -> 50000] processing chr3R:32050000-32079331 at 1064963 pixels/s... + [2024-09-26 17:22:25.151] [info]: DONE! Processed 2 resolution(s) in 2.95s! Converting a single-resolution .hic to a multi-resolution .hic ______________________________________________________________ @@ -44,36 +46,6 @@ Interactions from a .hic file (like the one generated by ``hictk load``) can be hictk will copy interactions for resolutions that are available in the input file. Interactions at resolutions missing from the input file will be generated by iterative coarsening. -.. 
code-block:: console - - user@dev:/tmp$ hictk zoomify 4DNFIZ1ZVXC8.hic9 4DNFIZ1ZVXC8.zoomified.hic --threads 8 - - [2024-01-23 16:59:57.369] [info]: Running hictk v0.0.6-570037c-dirty - [2024-01-23 16:59:57.369] [info]: copying resolution 1000 from "4DNFIZ1ZVXC8.hic9" - [2024-01-23 16:59:57.369] [info]: generating 2000 resolution from 1000 (2x) - [2024-01-23 16:59:57.369] [info]: copying resolution 5000 from "4DNFIZ1ZVXC8.hic9" - [2024-01-23 16:59:57.369] [info]: copying resolution 10000 from "4DNFIZ1ZVXC8.hic9" - [2024-01-23 16:59:57.369] [info]: generating 20000 resolution from 10000 (2x) - [2024-01-23 16:59:57.369] [info]: copying resolution 50000 from "4DNFIZ1ZVXC8.hic9" - [2024-01-23 16:59:57.369] [info]: copying resolution 100000 from "4DNFIZ1ZVXC8.hic9" - [2024-01-23 16:59:57.369] [info]: generating 200000 resolution from 100000 (2x) - [2024-01-23 16:59:57.369] [info]: copying resolution 500000 from "4DNFIZ1ZVXC8.hic9" - [2024-01-23 16:59:57.369] [info]: copying resolution 1000000 from "4DNFIZ1ZVXC8.hic9" - [2024-01-23 16:59:57.369] [info]: generating 2000000 resolution from 1000000 (2x) - [2024-01-23 16:59:57.369] [info]: generating 5000000 resolution from 1000000 (5x) - [2024-01-23 16:59:57.369] [info]: generating 10000000 resolution from 5000000 (2x) - [2024-01-23 16:59:57.379] [info]: [1000 bp] ingesting interactions... - [2024-01-23 17:00:02.183] [info]: ingesting pixels at 2157032 pixels/s... - [2024-01-23 17:00:07.271] [info]: ingesting pixels at 1965795 pixels/s... - ... - [2024-01-23 17:02:04.842] [info]: [1000 bp] computing expected vector density - [2024-01-23 17:02:05.325] [info]: [2000 bp] computing expected vector density - [2024-01-23 17:02:06.291] [info]: [5000 bp] computing expected vector density - [2024-01-23 17:02:06.292] [info]: writing 13 expected value vectors at offset 193918320... - [2024-01-23 17:02:06.293] [info]: writing 0 normalized expected value vectors at offset 194161639... - [2024-01-23 17:02:06.318] [info]: DONE! 
Processed 13 resolution(s) in 128.95s! - - **Tips:** -* When zoomifying large .hic files, ``hictk`` may need to create large temporary files. When this is the case, use option ``--tmpdir`` to set the temporary folder to a path with sufficient space. +See the list of Tips for hictk load. diff --git a/docs/downloading_test_datasets.rst b/docs/downloading_test_datasets.rst index e289a9c8d..53b31ab34 100644 --- a/docs/downloading_test_datasets.rst +++ b/docs/downloading_test_datasets.rst @@ -13,7 +13,7 @@ After downloading the data, move to a folder with at least ~1 GB of free space a :class: no-copybutton user@dev:/tmp$ mkdir data/ - user@dev:/tmp$ tar -xf hictk_test_data.tar.xz \ + user@dev:/tmp$ tar -xf hictk_test_data.tar.zst \ -C data --strip-components=3 \ test/data/hic/4DNFIZ1ZVXC8.hic9 \ test/data/integration_tests/4DNFIZ1ZVXC8.mcool \ diff --git a/docs/file_validation.rst b/docs/file_validation.rst index 6912d8366..88156b1b8 100644 --- a/docs/file_validation.rst +++ b/docs/file_validation.rst @@ -8,7 +8,7 @@ File validation Why is this needed? ------------------- -``hictk validate`` can detect several types of data corruption in .hic and .cool files, from simple file truncation due to e.g. failed downloads to subtle index corruption in .cool files. +``hictk validate`` can detect several types of data corruption in .hic and .[ms]cool files, from simple file truncation due to e.g. failed downloads to subtle index corruption in .mcool files. .. _cooler-index-corruption-label: @@ -38,59 +38,94 @@ Perform a quick check to detect truncated or otherwise invalid files: .. code-block:: console # Validate a .hic file - user@dev:/tmp$ hictk validate test/data/hic/4DNFIZ1ZVXC8.hic8 - ### SUCCESS: "test/data/hic/4DNFIZ1ZVXC8.hic8" is a valid .hic file. 
- - # Validate a .cool file - user@dev:/tmp$ hictk validate test/data/integration_tests/4DNFIZ1ZVXC8.mcool - uri="test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/2500000" - is_hdf5=true - unable_to_open_file=false - file_was_properly_closed=true - missing_or_invalid_format_attr=false - missing_or_invalid_bin_type_attr=false - missing_groups=[] - is_valid_cooler=true - index_is_valid=not_checked - ### SUCCESS: "test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/2500000" is a valid Cooler. - uri="test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/1000000" - is_hdf5=true - unable_to_open_file=false - file_was_properly_closed=true - missing_or_invalid_format_attr=false - missing_or_invalid_bin_type_attr=false - missing_groups=[] - is_valid_cooler=true - index_is_valid=not_checked - ### SUCCESS: "test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/1000000" is a valid Cooler. - ... - uri="test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/1000" - is_hdf5=true - unable_to_open_file=false - file_was_properly_closed=true - missing_or_invalid_format_attr=false - missing_or_invalid_bin_type_attr=false - missing_groups=[] - is_valid_cooler=true - index_is_valid=not_checked - ### SUCCESS: "test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/1000" is a valid Cooler. - + user@dev:/tmp$ hictk validate 4DNFIZ1ZVXC8.hic8 + [2024-09-26 16:20:55.552] [info]: Running hictk v1.0.0-fbdcb591 + { + "format": "hic", + "is_valid_hic": true, + "uri": "4DNFIZ1ZVXC8.hic8" + } + ### SUCCESS: "4DNFIZ1ZVXC8.hic8" is a valid .hic file. 
+ + # Validate a .mcool file + user@dev:/tmp$ hictk validate 4DNFIZ1ZVXC8.mcool + [2024-09-26 16:22:47.348] [info]: Running hictk v1.0.0-fbdcb591 + { + "1000": { + "bin_table_dtypes_ok": true, + "bin_table_num_invalid_bins": 0, + "bin_table_shape_ok": true, + "file_was_properly_closed": true, + "index_is_valid": "not_checked", + "is_hdf5": true, + "is_valid_cooler": true, + "missing_groups": [], + "missing_or_invalid_bin_type_attr": false, + "missing_or_invalid_format_attr": false, + "unable_to_open_file": false + }, + "100000": { + ... + }, + "1000000": { + ... + }, + "25000": { + ... + }, + "250000": { + ... + }, + "2500000": { + ... + }, + "5000": { + ... + }, + "50000": { + ... + }, + "500000": { + ... + }, + "file_was_properly_closed": true, + "format": "mcool", + "is_hdf5": true, + "is_valid_mcool": true, + "missing_groups": [], + "missing_or_invalid_bin_type_attr": false, + "missing_or_invalid_format_attr": false, + "unable_to_open_file": false, + "uri": "4DNFIZ1ZVXC8.mcool" + } + ### SUCCESS: "4DNFIZ1ZVXC8.mcool" is a valid .mcool file. The quick check will not detect Cooler files with corrupted index, as this requires the ``--validate-index`` option: .. code-block:: console user@dev:/tmp$ hictk validate --validate-index 4DNFI9GMP2J8.mcool::/resolutions/1000000 - uri="4DNFI9GMP2J8.mcool::/resolutions/1000000" - is_hdf5=true - unable_to_open_file=false - file_was_properly_closed=true - missing_or_invalid_format_attr=false - missing_or_invalid_bin_type_attr=false - missing_groups=[] - is_valid_cooler=true - index_is_valid=false - ### FAILURE: "4DNFI9GMP2J8.mcool::/resolutions/1000000" is not a valid Cooler. 
+ [2024-09-26 16:26:32.671] [info]: Running hictk v1.0.0-fbdcb591 + { + "bin_table_dtypes_ok": true, + "bin_table_num_invalid_bins": 0, + "bin_table_shape_ok": true, + "file_was_properly_closed": true, + "format": "cool", + "index_is_valid": "pixels between 0-2850 are not sorted in ascending order (and very likely contain duplicate entries)", + "is_hdf5": true, + "is_valid_cooler": false, + "missing_groups": [], + "missing_or_invalid_bin_type_attr": false, + "missing_or_invalid_format_attr": false, + "unable_to_open_file": false, + "uri": "4DNFI9GMP2J8.mcool::/resolutions/100000" + } + ### FAILURE: "4DNFI9GMP2J8.mcool::/resolutions/100000" does not point to valid Cooler. + +When launched with default settings, hictk validate outputs its report in .json format. The output format can be changed using the ``--output-format`` option. +Output to stdout can be completely suppressed by providing the ``--quiet`` option (the outcome of file validation can still be determined based on hictk's exit code). +When processing multi-resolution or single-cell files, hictk validate returns as soon as the first validation failure is encountered. This behavior can be changed by specifying the ``--exhaustive`` flag. Restoring corrupted .mcool files -------------------------------- @@ -106,3 +141,5 @@ File restoration is automated with ``hictk fix-mcool``: ``hictk fix-mcool`` is basically a wrapper around ``hictk zoomify`` and ``hictk balance``. When balancing, ``hictk fix-mcool`` will try to use the same parameters used to balance the original .mcool file. When this is not possible, ``hictk fix-mcool`` will fall back to the default parameters used by ``hictk balance``. + +To improve performance, consider using the ``--in-memory`` and/or ``--threads`` CLI options when appropriate (see :doc:`/balancing_matrices` for more details). 
diff --git a/docs/format_conversion.rst b/docs/format_conversion.rst index 730b88bea..2e2c87063 100644 --- a/docs/format_conversion.rst +++ b/docs/format_conversion.rst @@ -16,7 +16,7 @@ Converting from .hic to .cool or .mcool formats consists of the following operat #. For each resolution to be converted: a. Copy all raw interactions present in the .hic file - b. Copy all known normalization vectors (currently these are VC, VC_SQRT, KR, and SCALE) + b. Copy all normalization vectors Interactions are copied using streams of data, so memory requirements remain quite modest even when converting very high resolutions. @@ -24,19 +24,19 @@ Interactions are copied using streams of data, so memory requirements remain qui user@dev:/tmp$ hictk convert data/4DNFIZ1ZVXC8.hic9 4DNFIZ1ZVXC8.mcool - [2023-09-29 17:12:08.983] [info]: Running hictk v0.0.2-f83f93e - [2023-09-29 17:12:08.983] [info]: Converting data/4DNFIZ1ZVXC8.hic9 to 4DNFIZ1ZVXC8.mcool (hic -> mcool)... - [2023-09-29 17:12:09.052] [info]: [1000] begin processing 1000bp matrix... - [2023-09-29 17:12:12.212] [info]: [1000] processing chr2R:11267000-11268000 at 3167564 pixels/s (cache hit rate 0.00%)... - [2023-09-29 17:12:15.346] [info]: [1000] processing chr3R:5672000-5673000 at 3190810 pixels/s (cache hit rate 0.00%)... - [2023-09-29 17:12:18.204] [info]: [1000] processing SCALE normalization vector... - [2023-09-29 17:12:18.241] [info]: [1000] processing VC normalization vector... - [2023-09-29 17:12:18.285] [info]: [1000] processing VC_SQRT normalization vector... - [2023-09-29 17:12:19.123] [info]: [1000] DONE! Processed 26658348 pixels across 8 chromosomes in 10.07s + [2024-09-26 16:06:41.713] [info]: Running hictk v1.0.0-fbdcb591 + [2024-09-26 16:06:41.713] [info]: Converting data/4DNFIZ1ZVXC8.hic9 to 4DNFIZ1ZVXC8.mcool (hic -> mcool)... + [2024-09-26 16:06:41.943] [info]: [1000] begin processing 1000bp matrix... 
+ [2024-09-26 16:06:44.117] [info]: [1000] processing chr2R:11267000-11268000 at 4604052 pixels/s (cache hit rate 0.00%)... + [2024-09-26 16:06:46.026] [info]: [1000] processing chr3R:5812000-5813000 at 5238345 pixels/s (cache hit rate 0.10%)... + [2024-09-26 16:06:47.842] [info]: [1000] processing SCALE normalization vector... + [2024-09-26 16:06:47.873] [info]: [1000] processing VC normalization vector... + [2024-09-26 16:06:47.907] [info]: [1000] processing VC_SQRT normalization vector... + [2024-09-26 16:06:48.411] [info]: [1000] DONE! Processed 26682908 pixels across 8 chromosomes in 6.47s ... - [2023-09-29 17:12:37.412] [info]: DONE! Processed 10 resolution(s) in 28.43s! - [2023-09-29 17:12:37.412] [info]: data/4DNFIZ1ZVXC8.hic9 size: 133.68 MB - [2023-09-29 17:12:37.412] [info]: 4DNFIZ1ZVXC8.mcool size: 100.00 MB + [2024-09-26 16:06:58.265] [info]: DONE! Processed 10 resolution(s) in 16.55s! + [2024-09-26 16:06:58.265] [info]: data/4DNFIZ1ZVXC8.hic9 size: 133.68 MB + [2024-09-26 16:06:58.265] [info]: 4DNFIZ1ZVXC8.mcool size: 99.86 MB It is also possible to convert only a subset of available resolutions by specifying resolutions to be converted with the ``--resolutions`` option. @@ -47,19 +47,18 @@ When specifying a single resolution, the resulting file will be in .cool format. user@dev:/tmp$ hictk convert data/4DNFIZ1ZVXC8.hic9 4DNFIZ1ZVXC8.1000.cool --resolutions 1000 - [2023-09-29 17:42:47.917] [info]: Running hictk v0.0.2-f83f93e - [2023-09-29 17:42:47.917] [info]: Converting data/4DNFIZ1ZVXC8.hic9 to 4DNFIZ1ZVXC8.cool (hic -> cool)... - [2023-09-29 17:42:47.982] [info]: [1000] begin processing 1000bp matrix... - [2023-09-29 17:42:49.982] [info]: [1000] processing chr2R:11267000-11268000 at 5005005 pixels/s (cache hit rate 93.05%)... - [2023-09-29 17:42:52.339] [info]: [1000] processing chr3R:5672000-5673000 at 4242681 pixels/s (cache hit rate 92.66%)... - [2023-09-29 17:42:54.071] [info]: [1000] processing SCALE normalization vector... 
- [2023-09-29 17:42:54.109] [info]: [1000] processing VC normalization vector... - [2023-09-29 17:42:54.150] [info]: [1000] processing VC_SQRT normalization vector... - [2023-09-29 17:42:54.931] [info]: [1000] DONE! Processed 26658348 pixels across 8 chromosomes in 6.95s - [2023-09-29 17:42:54.931] [info]: DONE! Processed 1 resolution(s) in 7.01s! - [2023-09-29 17:42:54.931] [info]: data/4DNFIZ1ZVXC8.hic9 size: 133.68 MB - [2023-09-29 17:42:54.931] [info]: 4DNFIZ1ZVXC8.cool size: 36.74 MB - + [2024-09-26 16:08:09.827] [info]: Running hictk v1.0.0-fbdcb591 + [2024-09-26 16:08:09.827] [info]: Converting data/4DNFIZ1ZVXC8.hic9 to 4DNFIZ1ZVXC8.cool (hic -> cool)... + [2024-09-26 16:08:10.043] [info]: [1000] begin processing 1000bp matrix... + [2024-09-26 16:08:11.216] [info]: [1000] processing chr2R:11267000-11268000 at 8539710 pixels/s (cache hit rate 93.05%)... + [2024-09-26 16:08:12.462] [info]: [1000] processing chr3R:5812000-5813000 at 8032129 pixels/s (cache hit rate 93.11%)... + [2024-09-26 16:08:13.423] [info]: [1000] processing SCALE normalization vector... + [2024-09-26 16:08:13.453] [info]: [1000] processing VC normalization vector... + [2024-09-26 16:08:13.485] [info]: [1000] processing VC_SQRT normalization vector... + [2024-09-26 16:08:13.968] [info]: [1000] DONE! Processed 26682908 pixels across 8 chromosomes in 3.92s + [2024-09-26 16:08:13.968] [info]: DONE! Processed 1 resolution(s) in 4.14s! + [2024-09-26 16:08:13.968] [info]: data/4DNFIZ1ZVXC8.hic9 size: 133.68 MB + [2024-09-26 16:08:13.968] [info]: 4DNFIZ1ZVXC8.cool size: 36.69 MB Converting from .[m]cool to .hic @@ -67,37 +66,37 @@ Converting from .[m]cool to .hic ``hictk convert`` can also be used to convert .[m]cool files to .hic format. -The conversion steps are similar to those carried out to convert .hic to .[m]cool +The conversion steps are similar to those carried out to convert .hic to .[m]cool. 
+The main difference is that in this case hictk computes the raw and normalized expected values for each resolution. .. code-block:: console user@dev:/tmp$ hictk convert data/4DNFIZ1ZVXC8.mcool 4DNFIZ1ZVXC8.hic - [2024-01-23 17:19:34.045] [info]: Running hictk v0.0.6-570037c-dirty - [2024-01-23 17:19:34.045] [info]: Converting 4DNFIZ1ZVXC8.mcool to 4DNFIZ1ZVXC8.hic (mcool -> hic)... - [2024-01-23 17:19:37.808] [info]: ingesting pixels at 2700513 pixels/s... - [2024-01-23 17:19:41.916] [info]: ingesting pixels at 2434275 pixels/s... - [2024-01-23 17:19:48.685] [info]: ingesting pixels at 2500000 pixels/s... - [2024-01-23 17:19:52.753] [info]: ingesting pixels at 2458815 pixels/s... - [2024-01-23 17:19:59.034] [info]: ingesting pixels at 2805049 pixels/s... - [2024-01-23 17:20:07.190] [info]: writing header at offset 0 - [2024-01-23 17:20:07.190] [info]: begin writing interaction blocks to file "4DNFIZ1ZVXC8.hic"... - [2024-01-23 17:20:07.190] [info]: [1000 bp] writing pixels for chr2L:chr2L matrix at offset 248... - [2024-01-23 17:20:07.595] [info]: [1000 bp] written 2676654 pixels for chr2L:chr2L matrix - [2024-01-23 17:20:07.651] [info]: [5000 bp] writing pixels for chr2L:chr2L matrix at offset 4303035... - [2024-01-23 17:20:08.257] [info]: [5000 bp] written 2676654 pixels for chr2L:chr2L matrix - [2024-01-23 17:20:08.366] [info]: [10000 bp] writing pixels for chr2L:chr2L matrix at offset 9144982... - [2024-01-23 17:20:08.821] [info]: [10000 bp] written 1433133 pixels for chr2L:chr2L matrix + [2024-09-26 16:10:58.066] [info]: Running hictk v1.0.0-fbdcb591 + [2024-09-26 16:10:58.066] [info]: Converting data/4DNFIZ1ZVXC8.mcool to 4DNFIZ1ZVXC8.hic (mcool -> hic)... + [2024-09-26 16:11:02.124] [info]: ingesting pixels at 2472799 pixels/s... + [2024-09-26 16:11:06.328] [info]: ingesting pixels at 2379253 pixels/s... + [2024-09-26 16:11:13.161] [info]: ingesting pixels at 2479544 pixels/s... + [2024-09-26 16:11:17.436] [info]: ingesting pixels at 2339729 pixels/s... 
+ [2024-09-26 16:11:24.176] [info]: ingesting pixels at 2472188 pixels/s... + [2024-09-26 16:11:32.941] [info]: writing header at offset 0 + [2024-09-26 16:11:32.941] [info]: begin writing interaction blocks to file "4DNFIZ1ZVXC8.hic"... + [2024-09-26 16:11:32.941] [info]: [1000 bp] writing pixels for chr2L:chr2L matrix at offset 249... + [2024-09-26 16:11:35.129] [info]: [1000 bp] written 2676654 pixels for chr2L:chr2L matrix + [2024-09-26 16:11:35.159] [info]: [5000 bp] writing pixels for chr2L:chr2L matrix at offset 4075891... + [2024-09-26 16:11:37.035] [info]: [5000 bp] written 2676654 pixels for chr2L:chr2L matrix + [2024-09-26 16:11:37.096] [info]: [10000 bp] writing pixels for chr2L:chr2L matrix at offset 8697885... + [2024-09-26 16:11:38.094] [info]: [10000 bp] written 1433133 pixels for chr2L:chr2L matrix ... - [2024-01-23 17:21:30.092] [info]: [5000 bp] computing expected vector density - [2024-01-23 17:21:30.240] [info]: [5000 bp] computing expected vector density - [2024-01-23 17:21:30.297] [info]: [1000 bp] computing expected vector density - [2024-01-23 17:21:30.784] [info]: [5000 bp] computing expected vector density - [2024-01-23 17:21:30.784] [info]: writing 50 normalized expected value vectors at offset 142822186... - [2024-01-23 17:21:30.785] [info]: writing 400 normalization vectors at offset 143709792... - [2024-01-23 17:21:30.839] [info]: DONE! Processed 10 resolution(s) in 116.79s! - [2024-01-23 17:21:30.839] [info]: 4DNFIZ1ZVXC8.mcool size: 139.38 MB - [2024-01-23 17:21:30.839] [info]: 4DNFIZ1ZVXC8.hic size: 147.52 MB + [2024-09-26 16:13:20.981] [info]: [2500000 bp] initializing expected value vector + [2024-09-26 16:13:20.981] [info]: [2500000 bp] computing expected vector density + [2024-09-26 16:13:20.982] [info]: [500000 bp] computing expected vector density + [2024-09-26 16:13:20.982] [info]: writing 50 normalized expected value vectors at offset 135622984... 
+ [2024-09-26 16:13:20.983] [info]: writing 400 normalization vectors at offset 136510590... + [2024-09-26 16:13:21.027] [info]: DONE! Processed 10 resolution(s) in 142.96s! + [2024-09-26 16:13:21.027] [info]: data/4DNFIZ1ZVXC8.mcool size: 139.37 MB + [2024-09-26 16:13:21.027] [info]: 4DNFIZ1ZVXC8.hic size: 140.32 MB **Tips:** diff --git a/docs/installation.rst b/docs/installation.rst index 7c8e5831f..c172102c2 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -20,12 +20,8 @@ hictk package for Linux and MacOS is available on bioconda and can be installed hictk: /home/user/.miniconda3/envs/hictk/bin/hictk (hictk) user@dev:/tmp$ hictk --version - hictk-v1.0.0-bioconda + hictk-v2.0.0-bioconda -If you are trying to install hictk on a Mac with an M chip, the above command may fail due to conda not being able to find a package for hictk. -You can workaround the above issue by prefixing conda commands with :code:`CONDA_SUBDIR=osx-64`. -Note that this will make hictk quite a bit slower, as the installed binary will be executed through Rosetta. -If performance is important, please consider :doc:`compiling hictk from source <./installation_src>` or using containers (see below). Containers (Docker or Singularity/Apptainer) ============================================ @@ -70,25 +66,28 @@ Downloading and running the latest stable release can be done as follows: .. code-block:: console # Using Docker, may require sudo - user@dev:/tmp$ docker run ghcr.io/paulsengroup/hictk:1.0.0 --help + user@dev:/tmp$ docker run ghcr.io/paulsengroup/hictk:2.0.0 --help # Using Singularity/Apptainer - user@dev:/tmp$ singularity run ghcr.io/paulsengroup/hictk:1.0.0 --help + user@dev:/tmp$ singularity run ghcr.io/paulsengroup/hictk:2.0.0 --help Blazing fast tools to work with .hic and .cool files. 
- Usage: /usr/local/bin/hictk [OPTIONS] SUBCOMMAND - + Usage: hictk [OPTIONS] SUBCOMMAND Options: -h,--help Print this help message and exit -V,--version Display program version information and exit - Subcommands: - convert Convert Hi-C matrices to a different format. - dump Dump data from .hic and Cooler files to stdout. - load Build .cool files from interactions in various text formats. - merge Merge coolers. + balance Balance Hi-C files using ICE, SCALE, or VC. + convert Convert Hi-C files between different formats. + dump Read interactions and other kinds of data from .hic and Cooler files and write them to stdout. + fix-mcool Fix corrupted .mcool files. + load Build .cool and .hic files from interactions in various text formats. + merge Merge multiple Cooler or .hic files into a single file. + metadata Print file metadata to stdout. + rename-chromosomes, rename-chroms + Rename chromosomes found in Cooler files. validate Validate .hic and Cooler files. - zoomify Convert single-resolution Cooler file to multi-resolution by coarsening. + zoomify Convert single-resolution Cooler and .hic files to multi-resolution by coarsening. The above will print hictk's help message, and is equivalent to running :code:`hictk --help` on the command line (assuming hictk is available on your machine). diff --git a/docs/installation_src.rst b/docs/installation_src.rst index 5e0f24bb0..eee5a0d15 100644 --- a/docs/installation_src.rst +++ b/docs/installation_src.rst @@ -61,7 +61,7 @@ Download from the `Release `_ pa .. code-block:: bash mkdir /tmp/hictk - curl -L 'https://github.com/paulsengroup/hictk/archive/refs/tags/v1.0.0.tar.gz' | tar --strip-components=1 -C /tmp/hictk -xzf - + curl -L 'https://github.com/paulsengroup/hictk/archive/refs/tags/v2.0.0.tar.gz' | tar --strip-components=1 -C /tmp/hictk -xzf - Using git. @@ -71,7 +71,7 @@ Using git. 
git clone https://github.com/paulsengroup/hictk.git /tmp/hictk cd /tmp/hictk - git checkout v1.0.0 # Skip this step if you want to build the latest commit from main + git checkout v2.0.0 # Skip this step if you want to build the latest commit from main Compiling hictk --------------- @@ -95,12 +95,14 @@ Compiling hictk --output-folder=./build/ \ . - # This may take a while, as CMake will run Conan to build hictk dependencies. # Do not pass -G Ninja if you want CMake to use make instead of ninja cmake -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_PREFIX_PATH="$PWD/build" \ -DHICTK_ENABLE_TESTING=ON \ + -DHICTK_ENABLE_FUZZY_TESTING=ON \ -DHICTK_BUILD_TOOLS=ON \ + -DHICTK_BUILD_BENCHMARKS=OFF \ + -DHICTK_BUILD_EXAMPLES=OFF \ -G Ninja \ -S /tmp/hictk \ -B /tmp/hictk/build @@ -138,18 +140,20 @@ A successful run of the test suite will produce an output like the following: user@dev:/tmp/hictk$ ctest --test-dir build/ ... ... - 63/70 Test #21: Cooler: init files - SHORT ....................................... Passed 0.02 sec - 64/70 Test #57: HiC: pixel selector fetch (observed NONE BP 10000) - LONG ........ Passed 1.53 sec - 65/70 Test #5: Cooler: index validation - SHORT ................................. Passed 3.83 sec - 66/70 Test #17: Cooler: index validation - SHORT ................................. Passed 3.62 sec - 67/70 Test #37: Cooler: utils merge - LONG ....................................... Passed 4.35 sec - 68/70 Test #67: Transformers (cooler) - SHORT .................................... Passed 4.11 sec - 69/70 Test #36: Cooler: dataset random iteration - MEDIUM ........................ Passed 5.50 sec - 70/70 Test #40: Cooler: dataset large read/write - LONG .......................... Passed 11.47 sec - - 100% tests passed, 0 tests failed out of 70 - - Total Test time (real) = 12.03 sec + 96/106 Test #62: Cooler: dataset linear iteration - LONG ........................... 
Passed 2.26 sec + 97/106 Test #104: Transformers (hic) - SHORT ........................................ Passed 2.81 sec + 98/106 Test #7: Balancing: SCALE (gw) - SHORT ..................................... Passed 2.49 sec + 99/106 Test #17: Balancing: SCALE (edge cases) - MEDIUM ............................ Passed 2.78 sec + 100/106 Test #15: Balancing: ICE (inter) - MEDIUM ................................... Passed 3.17 sec + 101/106 Test #6: Balancing: SCALE (inter) - SHORT .................................. Passed 3.06 sec + 102/106 Test #8: Balancing: AtomicBitSet - SHORT ................................... Passed 3.52 sec + 103/106 Test #66: Cooler: utils merge - LONG ........................................ Passed 3.88 sec + 104/106 Test #61: Cooler: dataset random iteration - MEDIUM ......................... Passed 10.41 sec + 105/106 Test #63: Cooler: dataset large read/write - LONG ........................... Passed 12.10 sec + 106/106 Test #92: HiC: HiCFileWriter - LONG ......................................... Passed 13.03 sec + 100% tests passed, 0 tests failed out of 106 + + Total Test time (real) = 101.97 sec **All tests are expected to pass. Do not ignore test failures!** @@ -166,7 +170,7 @@ If one or more tests fail, try the following troubleshooting steps before reachi #. Before running :code:`ctest`, create a temporary folder where your user has read-write permissions and where there are at least 100-200MB of space available. Then set variable :code:`TMPDIR` to that folder and re-run `ctest`. #. Checksum the test dataset located under :code:`test/data/` by running :code:`sha256sum -c checksums.sha256`. - If the checksumming fails or the folder doesn't exist, download and extract the :code:`.tar.xz` file listed in file :code:`cmake/FetchTestDataset.cmake`. Make sure you run :code:`tar -xf` from the root of the repository (:code:`/tmp/hictk` if you are following the instructions). 
+ If the checksumming fails or the folder doesn't exist, download and extract the :code:`.tar.zst` file listed in file :code:`cmake/FetchTestDataset.cmake`. Make sure you run :code:`tar -xf` from the root of the repository (:code:`/tmp/hictk` if you are following the instructions). Example: @@ -178,7 +182,7 @@ Example: cd /tmp/hictk # Make sure this is the URL listed in file cmake/FetchTestDataset.cmake - curl -L 'https://zenodo.org/records/10522583/files/hictk_test_data.tar.xz?download=1' | tar -xJf - + curl -L 'https://zenodo.org/records/13849053/files/hictk_test_data.tar.zst?download=1' | zstdcat | tar -xf - # This should print "OK" if the check is successful (cd test/data && sha256sum --quiet -c checksums.sha256 && 2>&1 echo OK) @@ -205,80 +209,36 @@ If after trying the above steps the tests are still failing, feel free to start Integration tests ----------------- -The integration test scripts depend on the following tools: - -* cooler>=0.9 -* java -* `juicer_tools `_ or `hic_tools `_ -* xz -* common UNIX shell commands - -cooler can be installed using pip: - -.. code-block:: bash - - /tmp/venv/bin/pip3 install 'cooler>=0.10.0' 'pyyaml' +The integration test suite is implemented in Python, requires Python 3.11 or newer, and can be installed using pip: -If not already installed, :code:`xz` can usually be installed with your system package manager (on some Linux distributions the relevant package is called :code:`xz-utils`). .. code-block:: bash # Activate venv .
/tmp/venv/bin/activate - cd /tmp/hictk - - # hictk balance - test/scripts/hictk_balance_ice.sh build/src/hictk/hictk - test/scripts/hictk_balance_scale.sh build/src/hictk/hictk - test/scripts/hictk_balance_vc.sh build/src/hictk/hictk - - # hictk convert - test/scripts/hictk_convert_cool2hic.sh build/src/hictk/hictk - test/scripts/hictk_convert_hic2cool.sh build/src/hictk/hictk + pip install test/integration - # hictk dump tables - test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk - test/scripts/hictk_dump_bins.sh build/src/hictk/hictk - test/scripts/hictk_dump_resolutions.sh build/src/hictk/hictk - test/scripts/hictk_dump_normalizations.sh build/src/hictk/hictk - test/scripts/hictk_dump_cells.sh build/src/hictk/hictk + hictk_integration_suite --help - # hictk dump pixels - test/scripts/hictk_dump_balanced.sh build/src/hictk/hictk - test/scripts/hictk_dump_bins.sh build/src/hictk/hictk - test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk - test/scripts/hictk_dump_cis.sh build/src/hictk/hictk - test/scripts/hictk_dump_gw.sh build/src/hictk/hictk - test/scripts/hictk_dump_trans.sh build/src/hictk/hictk +Once installed, the full integration suite can be run as follows: - # hictk fix-mcool - test/scripts/hictk_fix_mcool.sh build/src/hictk/hictk - - # hictk load (sorted) - test/scripts/hictk_load_4dn.sh build/src/hictk/hictk sorted - test/scripts/hictk_load_bg2.sh build/src/hictk/hictk sorted - test/scripts/hictk_load_coo.sh build/src/hictk/hictk sorted - - # hictk load (unsorted) - test/scripts/hictk_load_4dn.sh build/src/hictk/hictk unsorted - test/scripts/hictk_load_bg2.sh build/src/hictk/hictk unsorted - test/scripts/hictk_load_coo.sh build/src/hictk/hictk unsorted +.. code-block:: bash - # hictk merge - test/scripts/hictk_merge.sh build/src/hictk/hictk + # Activate venv + . 
/tmp/venv/bin/activate - # hictk metadata - test/scripts/hictk_metadata.sh build/src/hictk/hictk + cd /tmp/hictk - # hictk rename-chromosomes - test/scripts/hictk_rename_chromosomes.sh build/src/hictk/hictk + hictk_integration_suite \ + build/src/hictk/hictk \ + test/integration/config.toml \ + --data-dir test/data \ + --threads 8 \ + --result-file results.json - # hictk validate - test/scripts/hictk_validate.sh build/src/hictk/hictk + # To run specific parts of the integration suite, pass e.g. --suites=metadata,validate - # hictk zoomify - test/scripts/hictk_zoomify.sh build/src/hictk/hictk Installation ============ @@ -294,18 +254,28 @@ Once all tests have passed, :code:`hictk` can be installed as follows: user@dev:/tmp$ cmake --install /tmp/hictk/build -- Install configuration: "Release" -- Installing: /usr/local/bin/hictk - -- Set runtime path of "/usr/local/bin/hictk" to "" - -- Up-to-date: /usr/local/share/licenses/hictk/LICENSE + -- Set non-toolchain portion of runtime path of "/usr/local/bin/hictk" to "" + -- Installing: /usr/local/share/licenses/hictk/LICENSE + -- Installing: /usr/local/include/hictk ... # Alternatively, install to custom path user@dev:/tmp$ cmake --install /tmp/hictk/build --prefix "$HOME/.local/" -- Install configuration: "Release" -- Installing: /home/user/.local/bin/hictk - -- Set runtime path of "/home/user/.local/bin/hictk" to "" - -- Up-to-date: /home/user/.local/share/licenses/hictk/LICENSE + -- Set non-toolchain portion of runtime path of "/home/user/.local/bin/hictk" to "" + -- Installing: /home/user/.local/share/licenses/hictk/LICENSE + -- Installing: /home/user/.local/include/hictk ... + # Install the hictk binary only (i.e. 
without the header files required for development) + user@dev:/tmp$ cmake --install /tmp/hictk/build --component Runtime + -- Install configuration: "Release" + -- Installing: /usr/local/bin/hictk + -- Set non-toolchain portion of runtime path of "/usr/local/bin/hictk" to "" + -- Installing: /usr/local/share/licenses/hictk/LICENSE + + Cleaning build artifacts ======================== @@ -314,4 +284,4 @@ After successfully compiling hictk the following folders safely be removed: * Python virtualenv: :code:`/tmp/venv` * hictk source tree: :code:`/tmp/hictk` -If you are not using Conan in any other project feel free to also delete Conan's folder :code:`~/.conan2/` +If you are not using Conan in any other project feel free to also delete Conan's folder :code:`~/.conan2/`. diff --git a/docs/quickstart_api.rst b/docs/quickstart_api.rst index c1d4db2e3..e8764b97f 100644 --- a/docs/quickstart_api.rst +++ b/docs/quickstart_api.rst @@ -18,7 +18,7 @@ To install libhictk using Conan, first create a conanfile.txt like the following .. code-block:: [requires] - hictk/1.0.0 + hictk/2.0.0 [generators] CMakeDeps @@ -47,14 +47,14 @@ Installing using CMake FetchContent ----------------------------------- Before beginning, make sure all of hictk dependencies have been installed. -Refer to `conanfile.txt `_ for an up-to-date list of hictk dependencies. +Refer to `conanfile.txt `_ for an up-to-date list of hictk dependencies. To install and configure hictk using `FetchContent `_, first write a ``CMakeLists.txt`` file like the following: .. 
code-block:: cmake cmake_minimum_required(VERSION 3.25) - cmake_policy(VERSION 3.25...3.27) + cmake_policy(VERSION 3.25...3.30) project(myproject LANGUAGES C CXX) @@ -62,7 +62,7 @@ To install and configure hictk using `FetchContent ` -* :doc:`Creating .cool and .hic files <./creating_cool_and_hic_files>` -* :doc:`Converting single-resolution files to multi-resolution <./creating_multires_files>` * :doc:`Balancing Hi-C matrices <./balancing_matrices>` +* :doc:`Converting single-resolution files to multi-resolution <./creating_multires_files>` +* :doc:`Creating .cool and .hic files <./creating_cool_and_hic_files>` +* :doc:`Dumping tabular information to stdout <./reading_interactions>` +* :doc:`File validation <./file_validation>` +* :doc:`Format conversion <./format_conversion>` +* :doc:`Reading file metadata <./file_metadata>` API diff --git a/docs/reading_interactions.rst b/docs/reading_interactions.rst index 39bc828d5..a423386cf 100644 --- a/docs/reading_interactions.rst +++ b/docs/reading_interactions.rst @@ -108,6 +108,9 @@ Dump tables other than pixels: ... +See the ``hictk dump`` help message for the complete list of supported tables. + + Dump cis or trans interactions only: .. code-block:: console diff --git a/docs/tutorials/dump_interactions_to_cool_hic_file.rst b/docs/tutorials/dump_interactions_to_cool_hic_file.rst index bbfe4c2ad..e7f24412e 100644 --- a/docs/tutorials/dump_interactions_to_cool_hic_file.rst +++ b/docs/tutorials/dump_interactions_to_cool_hic_file.rst @@ -11,13 +11,14 @@ TLDR .. code-block:: console # Important!
--bin-size should be the same resolution as matrix.cool - user@dev:/tmp hictk load --format=bg2 \ - --bin-size=1000 \ - <(hictk dump --table=chroms matrix.cool) + user@dev:/tmp hictk load - \ output.cool \ + --chrom-sizes=<(hictk dump --table=chroms matrix.cool) \ + --format=bg2 \ + --bin-size=1000 \ < <(hictk dump --join - --range=chr2L:0-10,000,000 - --range2=chr3R:0-10,000,000 + --range=2L:0-10,000,000 + --range2=3R:0-10,000,000 matrix.cool) Why is this needed? @@ -32,77 +33,75 @@ This tutorial shows how this can be accomplished using ``hictk dump`` and ``hict Walkthrough ----------- -For this tutorial, we will use file ``4DNFIZ1ZVXC8.mcool`` as an example, which can be downloaded from `here `__. +For this tutorial, we will use file ``4DNFIOTPSS3L.hic`` as an example, which can be downloaded from `here `__. First, we extract the list of chromosomes from the input file: .. code-block:: console - user@dev:/tmp hictk dump 4DNFIZ1ZVXC8.mcool --table=chroms | tee chrom.sizes + user@dev:/tmp hictk dump 4DNFIOTPSS3L.hic --table=chroms | tee chrom.sizes - chr2L 23513712 - chr2R 25286936 - chr3L 28110227 - chr3R 32079331 - chr4 1348131 - chrX 23542271 - chrY 3667352 + 2L 23513712 + 2R 25286936 + 3L 28110227 + 3R 32079331 + 4 1348131 + X 23542271 + Y 3667352 Second, we dump pixels in bedGraph2 format (see below for how to make this step more efficient): .. 
code-block:: console - user@dev:/tmp hictk dump 4DNFIZ1ZVXC8.mcool \ + user@dev:/tmp hictk dump 4DNFIOTPSS3L.hic \ --join \ --resolution=1000 \ - --range=chr2L:5,000,000-10,000,000 \ - --range2=chr3R:7,500,000-10,000,000 > pixels.bg2 + --range=2L:5,000,000-10,000,000 \ + --range2=3R:7,500,000-10,000,000 > pixels.bg2 user@dev:/tmp head pixels.bg2 - chr2L 5000000 5001000 chr3R 7506000 7507000 1 - chr2L 5000000 5001000 chr3R 7624000 7625000 1 - chr2L 5000000 5001000 chr3R 7943000 7944000 1 - chr2L 5000000 5001000 chr3R 8014000 8015000 1 - chr2L 5000000 5001000 chr3R 8130000 8131000 1 - chr2L 5000000 5001000 chr3R 8245000 8246000 1 - chr2L 5000000 5001000 chr3R 8855000 8856000 1 - chr2L 5000000 5001000 chr3R 9032000 9033000 1 - chr2L 5000000 5001000 chr3R 9171000 9172000 1 - chr2L 5000000 5001000 chr3R 9380000 9381000 1 - + 2L 5000000 5001000 3R 7506000 7507000 1 + 2L 5000000 5001000 3R 7624000 7625000 1 + 2L 5000000 5001000 3R 7943000 7944000 1 + 2L 5000000 5001000 3R 8014000 8015000 1 + 2L 5000000 5001000 3R 8130000 8131000 1 + 2L 5000000 5001000 3R 8245000 8246000 1 + 2L 5000000 5001000 3R 8855000 8856000 1 + 2L 5000000 5001000 3R 9032000 9033000 1 + 2L 5000000 5001000 3R 9171000 9172000 1 + 2L 5000000 5001000 3R 9380000 9381000 1 Finally, we load pixels into a new .cool file .. code-block:: console - user@dev:/tmp hictk load --format=bg2 \ - --bin-size=1000 \ - chrom.sizes \ - output.cool < pixels.bg2 - - [2024-03-21 13:22:57.542] [info]: Running hictk v0.0.10-1c2bafd - [2024-03-21 13:22:57.542] [info]: begin loading unsorted pixels into a .cool file... - [2024-03-21 13:22:57.613] [info]: writing chunk #1 to intermediate file "/tmp/output.cool.tmp/output.cool.tmp"... - [2024-03-21 13:22:57.630] [info]: done writing chunk #1 to tmp file "/tmp/output.cool.tmp/output.cool.tmp". - [2024-03-21 13:22:57.630] [info]: writing chunk #2 to intermediate file "/tmp/output.cool.tmp/output.cool.tmp"... 
- [2024-03-21 13:22:57.634] [info]: done writing chunk #2 to tmp file "/tmp/output.cool.tmp/output.cool.tmp". - [2024-03-21 13:22:57.634] [info]: merging 2 chunks into "output.cool"... - [2024-03-21 13:22:57.676] [info]: ingested 26214 interactions (25085 nnz) in 0.133955616s! + user@dev:/tmp hictk load pixels.bg2 \ + output.cool \ + --chrom-sizes=chrom.sizes \ + --format=bg2 \ + --bin-size=1000 + + [2024-09-27 18:54:58.532] [info]: Running hictk v1.0.0-fbdcb591 + [2024-09-27 18:54:58.540] [info]: begin loading unsorted pixels into a .cool file... + [2024-09-27 18:54:58.629] [info]: writing chunk #1 to intermediate file "/tmp/hictk-tmp-XXXXatmfuM/output.cool.tmp"... + [2024-09-27 18:54:58.641] [info]: done writing chunk #1 to tmp file "/tmp/hictk-tmp-XXXXatmfuM/output.cool.tmp". + [2024-09-27 18:54:58.642] [info]: merging 1 chunks into "output.cool"... + [2024-09-27 18:54:58.672] [info]: ingested 26214 interactions (25085 nnz) in 0.139864314s! Removing empty chromosomes from the reference genome ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -This can be easily achieved by grepping ``chr2L`` and ``chr3R`` when generating the ``chrom.sizes`` file. +This can be easily achieved by grepping ``2L`` and ``3R`` when generating the ``chrom.sizes`` file. .. code-block:: console - user@dev:/tmp hictk dump 4DNFIZ1ZVXC8.mcool --table=chroms | - grep -e 'chr2L' -e 'chr3R' | + user@dev:/tmp hictk dump 4DNFIOTPSS3L.hic --table=chroms | + grep -e '2L' -e '3R' | tee chrom.sizes - chr2L 23513712 - chr3R 32079331 + 2L 23513712 + 3R 32079331 Tips and tricks @@ -113,30 +112,32 @@ Luckily, we can completely avoid generating this file by using output redirectio .. 
code-block:: console - user@dev:/tmp hictk load --format=bg2 \ - --bin-size=1000 \ - chrom.sizes \ + user@dev:/tmp hictk load - \ output.cool \ - < <(hictk dump 4DNFIZ1ZVXC8.mcool \ + --chrom-sizes=chrom.sizes \ + --format=bg2 \ + --bin-size=1000 \ + < <(hictk dump 4DNFIOTPSS3L.hic \ --join \ --resolution=1000 \ - --range=chr2L:0-10,000,000 \ - --range2=chr3R:0-10,000,000) + --range=2L:0-10,000,000 \ + --range2=3R:0-10,000,000) Note that hictk still needs to generate some temporary file to load interactions into a new .cool or .hic file. When processing large files, it is a good idea to specify custom folder where to create temporary files through the ``--tmpdir`` flag: .. code-block:: console - user@dev:/tmp hictk load --format=bg2 \ + user@dev:/tmp hictk load - \ + output.cool \ + --chrom-sizes=chrom.sizes \ + --format=bg2 \ --bin-size=1000 \ --tmpdir=/var/tmp/ \ - chrom.sizes.sorted \ - output.cool \ - < <(hictk dump 4DNFIZ1ZVXC8.mcool \ + < <(hictk dump 4DNFIOTPSS3L.hic \ --join \ --resolution=1000 \ - --range=chr2L:0-10,000,000 \ - --range2=chr3R:0-10,000,000) + --range=2L:0-10,000,000 \ + --range2=3R:0-10,000,000) Another option you may want to consider when working with .hic files, is the ``--threads`` option, which can significantly reduce the time required to load interactions into .hic files. diff --git a/docs/tutorials/reordering_chromosomes.rst b/docs/tutorials/reordering_chromosomes.rst index 6c6fd363c..6ffe356b4 100644 --- a/docs/tutorials/reordering_chromosomes.rst +++ b/docs/tutorials/reordering_chromosomes.rst @@ -11,12 +11,12 @@ TLDR .. code-block:: console # Important! 
--bin-size should be the same resolution as matrix.cool - user@dev:/tmp hictk load --format=bg2 \ - --bin-size=1000 \ - <(hictk dump --table=chroms matrix.cool | - sort -k2,2nr) \ + user@dev:/tmp hictk load <(hictk dump --join matrix.cool) \ output.cool \ - < <(hictk dump --join matrix.cool) + --chrom-sizes=<(hictk dump --table=chroms matrix.cool | sort -k2,2nr) \ + --format=bg2 \ + --bin-size=1000 \ + --transpose-lower-triangular-pixels Why is this needed? @@ -31,21 +31,21 @@ The same procedure can be applied to .hic files. Walkthrough ----------- -For this tutorial, we will use file ``4DNFIZ1ZVXC8.mcool`` as an example, which can be downloaded from `here `__. +For this tutorial, we will use file ``4DNFIOTPSS3L.hic`` as an example, which can be downloaded from `here `__. First, we extract the list of chromosomes from the input file: .. code-block:: console - user@dev:/tmp hictk dump 4DNFIZ1ZVXC8.mcool --table=chroms | tee chrom.sizes + user@dev:/tmp hictk dump 4DNFIOTPSS3L.hic --table=chroms | tee chrom.sizes - chr2L 23513712 - chr2R 25286936 - chr3L 28110227 - chr3R 32079331 - chr4 1348131 - chrX 23542271 - chrY 3667352 + 2L 23513712 + 2R 25286936 + 3L 28110227 + 3R 32079331 + 4 1348131 + X 23542271 + Y 3667352 Second, we re-order chromosomes: @@ -53,78 +53,75 @@ Second, we re-order chromosomes: user@dev:/tmp sort -k2,2nr chrom.sizes | tee chrom.sizes.sorted - chr3R 32079331 - chr3L 28110227 - chr2R 25286936 - chrX 23542271 - chr2L 23513712 - chrY 3667352 - chr4 1348131 + 3R 32079331 + 3L 28110227 + 2R 25286936 + X 23542271 + 2L 23513712 + Y 3667352 + 4 1348131 Next, we dump pixels in bedGraph2 format (see below for how to make this step more efficient): .. 
code-block:: console - user@dev:/tmp hictk dump 4DNFIZ1ZVXC8.mcool --join --resolution=1000 > pixels.bg2 + user@dev:/tmp hictk dump 4DNFIOTPSS3L.hic --join --resolution=1000 > pixels.bg2 user@dev:/tmp head pixels.bg2 - chr2L 5000 6000 chr2L 5000 6000 127 - chr2L 5000 6000 chr2L 6000 7000 129 - chr2L 5000 6000 chr2L 7000 8000 60 - chr2L 5000 6000 chr2L 8000 9000 77 - chr2L 5000 6000 chr2L 9000 10000 97 - chr2L 5000 6000 chr2L 10000 11000 3 - chr2L 5000 6000 chr2L 11000 12000 1 - chr2L 5000 6000 chr2L 12000 13000 66 - chr2L 5000 6000 chr2L 13000 14000 116 - chr2L 5000 6000 chr2L 14000 15000 64 + 2L 5000 6000 2L 5000 6000 41 + 2L 5000 6000 2L 6000 7000 126 + 2L 5000 6000 2L 7000 8000 60 + 2L 5000 6000 2L 8000 9000 77 + 2L 5000 6000 2L 9000 10000 97 + 2L 5000 6000 2L 10000 11000 3 + 2L 5000 6000 2L 11000 12000 1 + 2L 5000 6000 2L 12000 13000 66 + 2L 5000 6000 2L 13000 14000 116 + 2L 5000 6000 2L 14000 15000 64 -Finally, we load pixels into a new .cool file +Finally, we load pixels into a new .hic file: .. code-block:: console - user@dev:/tmp hictk load --format=bg2 \ - --bin-size=1000 \ - chrom.sizes.sorted \ - output.cool < pixels.bg2 - - [2024-03-21 12:27:16.998] [info]: Running hictk v0.0.10-1c2bafd - [2024-03-21 12:27:16.998] [info]: begin loading unsorted pixels into a .cool file... - [2024-03-21 12:27:17.077] [info]: writing chunk #1 to intermediate file "/tmp/output.cool.tmp/output.cool.tmp"... - [2024-03-21 12:27:20.945] [info]: done writing chunk #1 to tmp file "/tmp/output.cool.tmp/output.cool.tmp". - [2024-03-21 12:27:20.945] [info]: writing chunk #2 to intermediate file "/tmp/output.cool.tmp/output.cool.tmp"... - [2024-03-21 12:27:24.890] [info]: done writing chunk #2 to tmp file "/tmp/output.cool.tmp/output.cool.tmp". - [2024-03-21 12:27:24.890] [info]: writing chunk #3 to intermediate file "/tmp/output.cool.tmp/output.cool.tmp"... - [2024-03-21 12:27:28.823] [info]: done writing chunk #3 to tmp file "/tmp/output.cool.tmp/output.cool.tmp".
- [2024-03-21 12:27:28.823] [info]: writing chunk #4 to intermediate file "/tmp/output.cool.tmp/output.cool.tmp"... - [2024-03-21 12:27:32.668] [info]: done writing chunk #4 to tmp file "/tmp/output.cool.tmp/output.cool.tmp". - [2024-03-21 12:27:32.668] [info]: writing chunk #5 to intermediate file "/tmp/output.cool.tmp/output.cool.tmp"... - [2024-03-21 12:27:36.070] [info]: done writing chunk #5 to tmp file "/tmp/output.cool.tmp/output.cool.tmp". - [2024-03-21 12:27:36.070] [info]: writing chunk #6 to intermediate file "/tmp/output.cool.tmp/output.cool.tmp"... - [2024-03-21 12:27:36.079] [info]: done writing chunk #6 to tmp file "/tmp/output.cool.tmp/output.cool.tmp". - [2024-03-21 12:27:36.080] [info]: merging 6 chunks into "output.cool"... - [2024-03-21 12:27:38.572] [info]: processing chr3R:20786000-20787000 chr3R:20808000-20809000 at 4091653 pixels/s... - [2024-03-21 12:27:41.443] [info]: processing chr3L:7391000-7392000 chr3L:7417000-7418000 at 3484321 pixels/s... - [2024-03-21 12:27:44.292] [info]: processing chr2R:9278000-9279000 chrX:5993000-5994000 at 3510004 pixels/s... - [2024-03-21 12:27:47.062] [info]: processing chrX:14217000-14218000 chrX:17476000-17477000 at 3611412 pixels/s... - [2024-03-21 12:27:49.901] [info]: ingested 119208613 interactions (48469783 nnz) in 32.902465965s! - + user@dev:/tmp hictk load pixels.bg2 \ + output.hic \ + --chrom-sizes=chrom.sizes.sorted \ + --transpose-lower-triangular-pixels \ + --format=bg2 \ + --bin-size=1000 + + [2024-09-27 19:00:40.344] [info]: Running hictk v1.0.0-fbdcb591 + [2024-09-27 19:00:40.353] [info]: begin loading pixels into a .hic file... + [2024-09-27 19:00:42.504] [info]: preprocessing chunk #1 at 4847310 pixels/s... + [2024-09-27 19:00:45.244] [info]: preprocessing chunk #2 at 3649635 pixels/s... + [2024-09-27 19:00:48.180] [info]: preprocessing chunk #3 at 3407155 pixels/s... + [2024-09-27 19:00:50.616] [info]: preprocessing chunk #4 at 4105090 pixels/s... 
+ [2024-09-27 19:00:53.251] [info]: preprocessing chunk #5 at 3203434 pixels/s... + [2024-09-27 19:00:54.358] [info]: writing header at offset 0 + [2024-09-27 19:00:54.358] [info]: begin writing interaction blocks to file "output.hic"... + [2024-09-27 19:00:54.358] [info]: [1000 bp] writing pixels for 3R:3R matrix at offset 171... + [2024-09-27 19:01:01.039] [info]: [1000 bp] written 9571521 pixels for 3R:3R matrix + ... + [2024-09-27 19:01:26.831] [info]: [1000 bp] initializing expected value vector + [2024-09-27 19:01:32.649] [info]: [1000 bp] computing expected vector density + [2024-09-27 19:01:32.649] [info]: writing 1 expected value vectors at offset 93720080... + [2024-09-27 19:01:32.649] [info]: writing 0 normalized expected value vectors at offset 93848475... + [2024-09-27 19:01:32.682] [info]: ingested 114355295 interactions (48437845 nnz) in 52.337885908s! Lastly, we check that chromosomes are properly sorted: .. code-block:: console - user@dev:/tmp hictk dump 4DNFIZ1ZVXC8.mcool --table=chroms - - chr3R 32079331 - chr3L 28110227 - chr2R 25286936 - chrX 23542271 - chr2L 23513712 - chrY 3667352 - chr4 1348131 + user@dev:/tmp hictk dump output.hic --table=chroms + 3R 32079331 + 3L 28110227 + 2R 25286936 + X 23542271 + 2L 23513712 + Y 3667352 + 4 1348131 Tips and tricks --------------- @@ -134,22 +131,25 @@ Luckily, we can completely avoid generating this file by using output redirectio .. code-block:: console - user@dev:/tmp hictk load --format=bg2 \ - --bin-size=1000 \ - chrom.sizes.sorted \ - output.cool \ - < <(hictk dump 4DNFIZ1ZVXC8.mcool --join --resolution=1000) + user@dev:/tmp hictk load <(hictk dump 4DNFIOTPSS3L.hic --join --resolution=1000) \ + output.hic \ + --chrom-sizes=chrom.sizes.sorted \ + --transpose-lower-triangular-pixels \ + --format=bg2 \ + --bin-size=1000 + Note that hictk still needs to generate some temporary file to load interactions into a new .cool or .hic file. 
When processing large files, it is a good idea to specify custom folder where to create temporary files through the ``--tmpdir`` flag: .. code-block:: console - user@dev:/tmp hictk load --format=bg2 \ + user@dev:/tmp hictk load <(hictk dump 4DNFIOTPSS3L.hic --join --resolution=1000) \ + output.hic \ + --chrom-sizes=chrom.sizes.sorted \ + --transpose-lower-triangular-pixels \ + --format=bg2 \ --bin-size=1000 \ - --tmpdir=/var/tmp/ \ - chrom.sizes.sorted \ - output.cool \ - < <(hictk dump 4DNFIZ1ZVXC8.mcool --join --resolution=1000) + --tmpdir=/var/tmp/ Another option you may want to consider when working with .hic files, is the ``--threads`` option, which can significantly reduce the time required to load interactions into .hic files. diff --git a/src/hictk/validate/validate.cpp b/src/hictk/validate/validate.cpp index fc98a8a4b..9019a6a59 100644 --- a/src/hictk/validate/validate.cpp +++ b/src/hictk/validate/validate.cpp @@ -493,19 +493,19 @@ int validate_subcmd(const ValidateConfig& c) { if (!c.quiet) { print_report(status, c.output_format); if (is_hic) { - fmt::print(stderr, FMT_STRING("### {}: \"{}\" is {}a valid .hic file."), + fmt::print(stderr, FMT_STRING("### {}: \"{}\" is {}a valid .hic file.\n"), return_code == 0 ? "SUCCESS" : "FAILURE", c.uri, return_code == 0 ? "" : "not "); } else if (is_mcool) { - fmt::print(stderr, FMT_STRING("### {}: \"{}\" is {}a valid .mcool file."), + fmt::print(stderr, FMT_STRING("### {}: \"{}\" is {}a valid .mcool file.\n"), return_code == 0 ? "SUCCESS" : "FAILURE", c.uri, return_code == 0 ? "" : "not "); } else if (is_scool) { - fmt::print(stderr, FMT_STRING("### {}: \"{}\" is {}a valid .scool file."), + fmt::print(stderr, FMT_STRING("### {}: \"{}\" is {}a valid .scool file.\n"), return_code == 0 ? "SUCCESS" : "FAILURE", c.uri, return_code == 0 ? 
"" : "not "); } else if (std::filesystem::exists(c.uri)) { - fmt::print(stderr, FMT_STRING("### {}: \"{}\" is {}a valid .cool file."), + fmt::print(stderr, FMT_STRING("### {}: \"{}\" is {}a valid .cool file.\n"), return_code == 0 ? "SUCCESS" : "FAILURE", c.uri, return_code == 0 ? "" : "not "); } else { - fmt::print(stderr, FMT_STRING("### {}: \"{}\" {} to valid Cooler."), + fmt::print(stderr, FMT_STRING("### {}: \"{}\" {} to valid Cooler.\n"), return_code == 0 ? "SUCCESS" : "FAILURE", c.uri, return_code == 0 ? "points" : "does not point"); }