Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor cljam.io.util.bin #201

Merged
merged 1 commit into from
Jun 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 62 additions & 49 deletions src/cljam/io/util/bin.clj
Original file line number Diff line number Diff line change
@@ -1,83 +1,96 @@
(ns cljam.io.util.bin
(:require [cljam.io.util.chunk :as util-chunk]))

(defn max-pos ^long [^long min-shift ^long depth]
(defn max-pos
"Returns a maximum position of a binning index. The value is identical to the
width of bin 0."
^long [^long min-shift ^long depth]
(bit-shift-left 1 (+ min-shift (* 3 depth))))

(defn- reg->bins*
"Returns candidate bins for the specified region as a vector."
[^long beg ^long end ^long min-shift ^long depth]
(let [max-pos (max-pos min-shift depth)
beg (if (<= beg 0) 0 (min (dec beg) max-pos))
end (if (<= end 0) max-pos (min end max-pos))]
(into [0]
(mapcat
(fn [^long d]
(let [t
(apply + (map (fn [^long x] (bit-shift-left 1 (* x 3)))
(range (inc d))))
s (+ min-shift (* 3 (- depth d 1)))]
(range (+ t (bit-shift-right beg s))
(+ t 1 (bit-shift-right (dec end) s))))))
(range depth))))

(def ^:private reg->bins (memoize reg->bins*))

(defn first-bin-of-level ^long [^long level]
(defn first-bin-of-level
"Returns a left-most bin number of the given `level`."
^long [^long level]
(quot (bit-shift-left 1 (* 3 level)) 7))

(defn bin-width-of-level ^long [^long level ^long min-shift ^long depth]
(defn bin-width-of-level
"Returns a width shared by bins of the same given `level`."
^long [^long level ^long min-shift ^long depth]
(bit-shift-left 1 (+ min-shift (* 3 (- depth level)))))

(defn bin-level ^long [^long bin]
(defn bin-level
"Returns a level that the given `bin` belongs to."
^long [^long bin]
(let [x (inc (quot (- 64 (Long/numberOfLeadingZeros bin)) 3))]
(cond-> x (< bin (first-bin-of-level x)) dec)))

(defn bin-beg ^long [^long bin ^long min-shift ^long depth]
(defn bin-beg
"Returns a beginning position of the given `bin`. 1-based."
^long [^long bin ^long min-shift ^long depth]
(let [level (bin-level bin)]
(inc (* (- bin (first-bin-of-level level))
(bin-width-of-level level min-shift depth)))))

(defn max-bin ^long [^long depth]
(defn max-bin
"Returns a maximum bin number of a binning index with the given `depth`."
^long [^long depth]
(dec (first-bin-of-level (inc depth))))

(defprotocol IBinningIndex
(get-chunks [this ref-idx bins])
(get-min-offset [this ref-idx beg])
(get-min-shift [this])
(get-depth [this])
(get-chr-names [this]))
(defn leading-bins-at-level
"Returns the distance between the bin corresponding to `pos` and the first one
at the same level."
^long [^long pos ^long level ^long min-shift ^long depth]
(unsigned-bit-shift-right pos (+ min-shift (* (- depth level) 3))))

(defn pos->lidx-offset
"Returns an offset of a linear index that the given `pos` belongs to."
[^long pos ^long linear-index-shift]
(bit-shift-right (if (<= pos 0) 0 (dec pos)) linear-index-shift))

(defn get-spans
"Calculates span information for random access from index data such as tabix."
[index-data ^long ref-idx ^long beg ^long end]
(let [bins
(reg->bins beg end (get-min-shift index-data) (get-depth index-data))
chunks (get-chunks index-data ref-idx bins)
min-offset (get-min-offset index-data ref-idx beg)]
(->> (util-chunk/optimize-chunks chunks min-offset)
(map vals))))

(defn leading-bins-at-level
"Returns the distance between the binning corresponding to pos and
the first one at the passed depth"
^long [^long pos ^long level ^long min-shift ^long depth]
(unsigned-bit-shift-right pos (+ min-shift (* (- depth level) 3))))
(defn reg->bins
"Returns all overlapping bins for the specified region [`beg`, `end`] as a
vector."
[^long beg ^long end ^long min-shift ^long depth]
(let [max-pos (max-pos min-shift depth)
beg (dec (Math/min max-pos (Math/max 1 beg)))
end (dec (Math/min max-pos (Math/max 1 end)))]
(into [0]
(mapcat
(fn [^long d]
(let [t (long (transduce
(map (fn [^long x] (bit-shift-left 1 (* x 3))))
+ 0 (range (inc d))))
s (+ min-shift (* 3 (- depth d 1)))]
(range (+ t (bit-shift-right beg s))
(+ t 1 (bit-shift-right end s))))))
(range depth))))

(defn reg->bin
"Calculates bin given an alignment covering [beg, end]"
"Calculates the smallest bin containing the given region [`beg`, `end`]."
^long [^long beg ^long end ^long min-shift ^long depth]
(let [max-pos (max-pos min-shift depth)
beg (dec (Math/min max-pos (Math/max 1 beg)))
end (dec (Math/min max-pos (Math/max 1 end)))]
beg (dec (Math/min max-pos (Math/max 1 beg)))
end (dec (Math/min max-pos (Math/max 1 end)))]
(loop [level depth]
(if-not (neg? level)
(let [beg-bins (leading-bins-at-level beg level min-shift depth)]
(if (= beg-bins (leading-bins-at-level end level min-shift depth))
(+ (first-bin-of-level level) beg-bins)
(recur (dec level))))
0))))

(defprotocol IBinningIndex
(get-chunks [this ref-idx bins])
(get-min-offset [this ref-idx beg])
(get-min-shift [this])
(get-depth [this])
(get-chr-names [this]))

(defn get-spans
"Calculates span information for random access from index data such as tabix."
[index-data ^long ref-idx ^long beg ^long end]
(let [bins (reg->bins
beg end (get-min-shift index-data) (get-depth index-data))
chunks (get-chunks index-data ref-idx bins)
min-offset (get-min-offset index-data ref-idx beg)]
(->> (util-chunk/optimize-chunks chunks min-offset)
(map vals))))
152 changes: 117 additions & 35 deletions test/cljam/io/util/bin_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,41 +8,67 @@
[cljam.io.csi :as csi]
[cljam.io.util.bin :as util-bin]))

(deftest get-spans-returns-a-sequence-including-regions
(let [tabix* (tabix/read-index test-tabix-file)]
(is (= [[0 50872]] (util-bin/get-spans tabix* 0 1 100)))
(is (every? #(and (= 2 (count %))
(number? (first %))
(number? (second %)))
(util-bin/get-spans tabix* 0 1 100))))
(let [csi* (csi/read-index test-csi-file)]
(is (= [[3904 3973]] (util-bin/get-spans csi* 0 1 100000)))
(is (every? #(and (= 2 (count %))
(number? (first %))
(number? (second %)))
(util-bin/get-spans csi* 0 1 100000)))))
(deftest max-pos-test
(are [?min-shift ?depth ?max-pos]
(= ?max-pos (util-bin/max-pos ?min-shift ?depth))
14 5 536870912
15 5 1073741824
0 0 1
0 1 8
0 2 64
1 1 16
1 2 128))

(deftest reg->bins-test
(are [x] (= [0 1 9 73 585 4681]
(#'util-bin/reg->bins 1 (bit-shift-left 1 x) x 5))
13 14 15 16)
(are [x] (= [0 1 9 73 585 4681 4682]
(#'util-bin/reg->bins 1 (inc (bit-shift-left 1 x)) x 5))
13 14 15 16)
(is (= [0 2 17]
(#'util-bin/reg->bins 9 9 0 2)))
(is (= [0 1 9 73 585]
(#'util-bin/reg->bins 1 1 14 4)))
(is (= [0 1 9 73 585 4681]
(#'util-bin/reg->bins 1 1 14 5)))
(is (= [0 1 9 73 585 4681 37449]
(#'util-bin/reg->bins 1 1 14 6)))
(is (= [0 1 9 73 585 4681 37449 299593]
(#'util-bin/reg->bins 1 1 14 7)))
(is (= [0 1 9 73 585 4681 37449 37450]
(#'util-bin/reg->bins 1 32769 15 6))))
(deftest first-bin-of-level-test
(are [?level ?first-bin]
(= ?first-bin (util-bin/first-bin-of-level ?level))
0 0
1 1
2 (+ 1 8)
3 (+ 1 8 64)
4 (+ 1 8 64 512)
5 (+ 1 8 64 512 4096)
6 (+ 1 8 64 512 4096 32768)))

(deftest bin-width-of-level-test
(are [?level ?min-shift ?depth ?bin-width]
(= ?bin-width (util-bin/bin-width-of-level ?level ?min-shift ?depth))
0 14 5 536870912
1 14 5 67108864
2 14 5 8388608
3 14 5 1048576
4 14 5 131072
5 14 5 16384
0 15 5 1073741824
0 0 0 1
0 0 1 8
1 0 1 1
1 0 2 8))

(deftest bin-level-test
(are [?bin ?level]
(= ?level (util-bin/bin-level ?bin))
0 0
1 1
2 1
8 1
9 2
72 2
73 3
584 3
585 4
4680 4
4681 5
37448 5
37449 6))

(deftest bin-beg-test
(are [?bin ?min-shift ?depth ?bin-beg]
(= ?bin-beg (util-bin/bin-beg ?bin ?min-shift ?depth))
4680 14 5 536739841
4681 14 5 1
4682 14 5 16385
4683 14 5 32769)
(doseq [index-shift [12 14 16]]
(let [min-size (bit-shift-left 1 index-shift)]
(is (= 1 (util-bin/bin-beg 4681 index-shift 5)))
Expand All @@ -54,16 +80,56 @@
(is (= (+ 1 (* min-size 24)) (util-bin/bin-beg 588 index-shift 5))))))

(deftest max-bin-test
(are [depth ans] (= (util-bin/max-bin depth) ans)
(are [?depth ?ans] (= (util-bin/max-bin ?depth) ?ans)
1 8
2 72
3 584
5 37448))

(deftest leading-bins-at-level-test
(are [?pos ?level ?min-shift ?depth ?bins]
(= ?bins (util-bin/leading-bins-at-level ?pos ?level ?min-shift ?depth))
1 0 14 5 0
1 5 14 5 0
16385 5 14 5 1
536870912 5 14 5 32768))

(deftest pos->lidx-offset-test
(are [?pos ?linear-index-shift ?offset]
(= ?offset (util-bin/pos->lidx-offset ?pos ?linear-index-shift))
0 14 0
1 14 0
16384 14 0
16385 14 1
32768 14 1
32769 14 2))

(deftest reg->bins-test
(are [x] (= [0 1 9 73 585 4681]
(util-bin/reg->bins 1 (bit-shift-left 1 x) x 5))
13 14 15 16)
(are [x] (= [0 1 9 73 585 4681 4682]
(util-bin/reg->bins 1 (inc (bit-shift-left 1 x)) x 5))
13 14 15 16)
(is (= [0 2 17]
(util-bin/reg->bins 9 9 0 2)))
(is (= [0 1 9 73 585]
(util-bin/reg->bins 1 1 14 4)))
(is (= [0 1 9 73 585 4681]
(util-bin/reg->bins 1 1 14 5)))
(is (= [0 1 9 73 585 4681 37449]
(util-bin/reg->bins 1 1 14 6)))
(is (= [0 1 9 73 585 4681 37449 299593]
(util-bin/reg->bins 1 1 14 7)))
(is (= [0 1 9 73 585 4681 37449 37450]
(util-bin/reg->bins 1 32769 15 6))))

(deftest reg->bin-test
(testing "BAI-compatible"
(are [?start ?end ?bin]
(= ?bin (util-bin/reg->bin ?start ?end 14 5))
(= ?bin
(util-bin/reg->bin ?start ?end 14 5)
((set (util-bin/reg->bins ?start ?end 14 5)) ?bin))
0 0 4681
0 1 4681
1 1 4681
Expand All @@ -79,7 +145,9 @@
536870912 536870913 37448))
(testing "1-by-1"
(are [?start ?end ?bin]
(= ?bin (util-bin/reg->bin ?start ?end 0 2))
(= ?bin
(util-bin/reg->bin ?start ?end 0 2)
((set (util-bin/reg->bins ?start ?end 0 2)) ?bin))
0 1 9
1 1 9
2 2 10
Expand All @@ -96,3 +164,17 @@
(is (= (util-bin/reg->bin 1 1 14 7) 299593))
(is (= (util-bin/reg->bin 1 32769 15 6) 4681))
(is (= (util-bin/reg->bin 240877561 240877568 14 6) 52150))))

(deftest get-spans-returns-a-sequence-including-regions
(let [tabix* (tabix/read-index test-tabix-file)]
(is (= [[0 50872]] (util-bin/get-spans tabix* 0 1 100)))
(is (every? #(and (= 2 (count %))
(number? (first %))
(number? (second %)))
(util-bin/get-spans tabix* 0 1 100))))
(let [csi* (csi/read-index test-csi-file)]
(is (= [[3904 3973]] (util-bin/get-spans csi* 0 1 100000)))
(is (every? #(and (= 2 (count %))
(number? (first %))
(number? (second %)))
(util-bin/get-spans csi* 0 1 100000)))))