Skip to content

Commit

Permalink
Refactor cljam.io.util.bin
Browse files Browse the repository at this point in the history
  • Loading branch information
alumi committed Jun 15, 2020
1 parent 84a535d commit aa62440
Show file tree
Hide file tree
Showing 2 changed files with 179 additions and 84 deletions.
111 changes: 62 additions & 49 deletions src/cljam/io/util/bin.clj
Original file line number Diff line number Diff line change
@@ -1,83 +1,96 @@
(ns cljam.io.util.bin
(:require [cljam.io.util.chunk :as util-chunk]))

(defn max-pos ^long [^long min-shift ^long depth]
(defn max-pos
"Returns a maximum position of a binning index. The value is identical to the
width of bin 0."
^long [^long min-shift ^long depth]
(bit-shift-left 1 (+ min-shift (* 3 depth))))

(defn- reg->bins*
"Returns candidate bins for the specified region as a vector."
[^long beg ^long end ^long min-shift ^long depth]
(let [max-pos (max-pos min-shift depth)
beg (if (<= beg 0) 0 (min (dec beg) max-pos))
end (if (<= end 0) max-pos (min end max-pos))]
(into [0]
(mapcat
(fn [^long d]
(let [t
(apply + (map (fn [^long x] (bit-shift-left 1 (* x 3)))
(range (inc d))))
s (+ min-shift (* 3 (- depth d 1)))]
(range (+ t (bit-shift-right beg s))
(+ t 1 (bit-shift-right (dec end) s))))))
(range depth))))

(def ^:private reg->bins (memoize reg->bins*))

(defn first-bin-of-level ^long [^long level]
(defn first-bin-of-level
"Returns a left-most bin number of the given `level`."
^long [^long level]
(quot (bit-shift-left 1 (* 3 level)) 7))

(defn bin-width-of-level ^long [^long level ^long min-shift ^long depth]
(defn bin-width-of-level
"Returns a width shared by bins of the same given `level`."
^long [^long level ^long min-shift ^long depth]
(bit-shift-left 1 (+ min-shift (* 3 (- depth level)))))

(defn bin-level ^long [^long bin]
(defn bin-level
"Returns a level that the given `bin` belongs to."
^long [^long bin]
(let [x (inc (quot (- 64 (Long/numberOfLeadingZeros bin)) 3))]
(cond-> x (< bin (first-bin-of-level x)) dec)))

(defn bin-beg ^long [^long bin ^long min-shift ^long depth]
(defn bin-beg
"Returns a beginning position of the given `bin`. 1-based."
^long [^long bin ^long min-shift ^long depth]
(let [level (bin-level bin)]
(inc (* (- bin (first-bin-of-level level))
(bin-width-of-level level min-shift depth)))))

(defn max-bin ^long [^long depth]
(defn max-bin
"Returns a maximum bin number of a binning index with the given `depth`."
^long [^long depth]
(dec (first-bin-of-level (inc depth))))

(defprotocol IBinningIndex
(get-chunks [this ref-idx bins])
(get-min-offset [this ref-idx beg])
(get-min-shift [this])
(get-depth [this])
(get-chr-names [this]))
(defn leading-bins-at-level
"Returns the distance between the bin corresponding to `pos` and the first one
at the same level."
^long [^long pos ^long level ^long min-shift ^long depth]
(unsigned-bit-shift-right pos (+ min-shift (* (- depth level) 3))))

(defn pos->lidx-offset
"Returns an offset of a linear index that the given `pos` belongs to."
[^long pos ^long linear-index-shift]
(bit-shift-right (if (<= pos 0) 0 (dec pos)) linear-index-shift))

(defn get-spans
"Calculates span information for random access from index data such as tabix."
[index-data ^long ref-idx ^long beg ^long end]
(let [bins
(reg->bins beg end (get-min-shift index-data) (get-depth index-data))
chunks (get-chunks index-data ref-idx bins)
min-offset (get-min-offset index-data ref-idx beg)]
(->> (util-chunk/optimize-chunks chunks min-offset)
(map vals))))

(defn leading-bins-at-level
"Returns the distance between the binning corresponding to pos and
the first one at the passed depth"
^long [^long pos ^long level ^long min-shift ^long depth]
(unsigned-bit-shift-right pos (+ min-shift (* (- depth level) 3))))
(defn reg->bins
"Returns all overlapping bins for the specified region [`beg`, `end`] as a
vector."
[^long beg ^long end ^long min-shift ^long depth]
(let [max-pos (max-pos min-shift depth)
beg (dec (Math/min max-pos (Math/max 1 beg)))
end (dec (Math/min max-pos (Math/max 1 end)))]
(into [0]
(mapcat
(fn [^long d]
(let [t (long (transduce
(map (fn [^long x] (bit-shift-left 1 (* x 3))))
+ 0 (range (inc d))))
s (+ min-shift (* 3 (- depth d 1)))]
(range (+ t (bit-shift-right beg s))
(+ t 1 (bit-shift-right end s))))))
(range depth))))

(defn reg->bin
"Calculates bin given an alignment covering [beg, end]"
"Calculates the smallest bin containing the given region [`beg`, `end`]."
^long [^long beg ^long end ^long min-shift ^long depth]
(let [max-pos (max-pos min-shift depth)
beg (dec (Math/min max-pos (Math/max 1 beg)))
end (dec (Math/min max-pos (Math/max 1 end)))]
beg (dec (Math/min max-pos (Math/max 1 beg)))
end (dec (Math/min max-pos (Math/max 1 end)))]
(loop [level depth]
(if-not (neg? level)
(let [beg-bins (leading-bins-at-level beg level min-shift depth)]
(if (= beg-bins (leading-bins-at-level end level min-shift depth))
(+ (first-bin-of-level level) beg-bins)
(recur (dec level))))
0))))

(defprotocol IBinningIndex
(get-chunks [this ref-idx bins])
(get-min-offset [this ref-idx beg])
(get-min-shift [this])
(get-depth [this])
(get-chr-names [this]))

(defn get-spans
"Calculates span information for random access from index data such as tabix."
[index-data ^long ref-idx ^long beg ^long end]
(let [bins (reg->bins
beg end (get-min-shift index-data) (get-depth index-data))
chunks (get-chunks index-data ref-idx bins)
min-offset (get-min-offset index-data ref-idx beg)]
(->> (util-chunk/optimize-chunks chunks min-offset)
(map vals))))
152 changes: 117 additions & 35 deletions test/cljam/io/util/bin_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,41 +8,67 @@
[cljam.io.csi :as csi]
[cljam.io.util.bin :as util-bin]))

(deftest get-spans-returns-a-sequence-including-regions
(let [tabix* (tabix/read-index test-tabix-file)]
(is (= [[0 50872]] (util-bin/get-spans tabix* 0 1 100)))
(is (every? #(and (= 2 (count %))
(number? (first %))
(number? (second %)))
(util-bin/get-spans tabix* 0 1 100))))
(let [csi* (csi/read-index test-csi-file)]
(is (= [[3904 3973]] (util-bin/get-spans csi* 0 1 100000)))
(is (every? #(and (= 2 (count %))
(number? (first %))
(number? (second %)))
(util-bin/get-spans csi* 0 1 100000)))))
(deftest max-pos-test
(are [?min-shift ?depth ?max-pos]
(= ?max-pos (util-bin/max-pos ?min-shift ?depth))
14 5 536870912
15 5 1073741824
0 0 1
0 1 8
0 2 64
1 1 16
1 2 128))

(deftest reg->bins-test
(are [x] (= [0 1 9 73 585 4681]
(#'util-bin/reg->bins 1 (bit-shift-left 1 x) x 5))
13 14 15 16)
(are [x] (= [0 1 9 73 585 4681 4682]
(#'util-bin/reg->bins 1 (inc (bit-shift-left 1 x)) x 5))
13 14 15 16)
(is (= [0 2 17]
(#'util-bin/reg->bins 9 9 0 2)))
(is (= [0 1 9 73 585]
(#'util-bin/reg->bins 1 1 14 4)))
(is (= [0 1 9 73 585 4681]
(#'util-bin/reg->bins 1 1 14 5)))
(is (= [0 1 9 73 585 4681 37449]
(#'util-bin/reg->bins 1 1 14 6)))
(is (= [0 1 9 73 585 4681 37449 299593]
(#'util-bin/reg->bins 1 1 14 7)))
(is (= [0 1 9 73 585 4681 37449 37450]
(#'util-bin/reg->bins 1 32769 15 6))))
(deftest first-bin-of-level-test
(are [?level ?first-bin]
(= ?first-bin (util-bin/first-bin-of-level ?level))
0 0
1 1
2 (+ 1 8)
3 (+ 1 8 64)
4 (+ 1 8 64 512)
5 (+ 1 8 64 512 4096)
6 (+ 1 8 64 512 4096 32768)))

(deftest bin-width-of-level-test
(are [?level ?min-shift ?depth ?bin-width]
(= ?bin-width (util-bin/bin-width-of-level ?level ?min-shift ?depth))
0 14 5 536870912
1 14 5 67108864
2 14 5 8388608
3 14 5 1048576
4 14 5 131072
5 14 5 16384
0 15 5 1073741824
0 0 0 1
0 0 1 8
1 0 1 1
1 0 2 8))

(deftest bin-level-test
(are [?bin ?level]
(= ?level (util-bin/bin-level ?bin))
0 0
1 1
2 1
8 1
9 2
72 2
73 3
584 3
585 4
4680 4
4681 5
37448 5
37449 6))

(deftest bin-beg-test
(are [?bin ?min-shift ?depth ?bin-beg]
(= ?bin-beg (util-bin/bin-beg ?bin ?min-shift ?depth))
4680 14 5 536739841
4681 14 5 1
4682 14 5 16385
4683 14 5 32769)
(doseq [index-shift [12 14 16]]
(let [min-size (bit-shift-left 1 index-shift)]
(is (= 1 (util-bin/bin-beg 4681 index-shift 5)))
Expand All @@ -54,16 +80,56 @@
(is (= (+ 1 (* min-size 24)) (util-bin/bin-beg 588 index-shift 5))))))

(deftest max-bin-test
(are [depth ans] (= (util-bin/max-bin depth) ans)
(are [?depth ?ans] (= (util-bin/max-bin ?depth) ?ans)
1 8
2 72
3 584
5 37448))

(deftest leading-bins-at-level-test
(are [?pos ?level ?min-shift ?depth ?bins]
(= ?bins (util-bin/leading-bins-at-level ?pos ?level ?min-shift ?depth))
1 0 14 5 0
1 5 14 5 0
16385 5 14 5 1
536870912 5 14 5 32768))

(deftest pos->lidx-offset-test
(are [?pos ?linear-index-shift ?offset]
(= ?offset (util-bin/pos->lidx-offset ?pos ?linear-index-shift))
0 14 0
1 14 0
16384 14 0
16385 14 1
32768 14 1
32769 14 2))

(deftest reg->bins-test
(are [x] (= [0 1 9 73 585 4681]
(util-bin/reg->bins 1 (bit-shift-left 1 x) x 5))
13 14 15 16)
(are [x] (= [0 1 9 73 585 4681 4682]
(util-bin/reg->bins 1 (inc (bit-shift-left 1 x)) x 5))
13 14 15 16)
(is (= [0 2 17]
(util-bin/reg->bins 9 9 0 2)))
(is (= [0 1 9 73 585]
(util-bin/reg->bins 1 1 14 4)))
(is (= [0 1 9 73 585 4681]
(util-bin/reg->bins 1 1 14 5)))
(is (= [0 1 9 73 585 4681 37449]
(util-bin/reg->bins 1 1 14 6)))
(is (= [0 1 9 73 585 4681 37449 299593]
(util-bin/reg->bins 1 1 14 7)))
(is (= [0 1 9 73 585 4681 37449 37450]
(util-bin/reg->bins 1 32769 15 6))))

(deftest reg->bin-test
(testing "BAI-compatible"
(are [?start ?end ?bin]
(= ?bin (util-bin/reg->bin ?start ?end 14 5))
(= ?bin
(util-bin/reg->bin ?start ?end 14 5)
((set (util-bin/reg->bins ?start ?end 14 5)) ?bin))
0 0 4681
0 1 4681
1 1 4681
Expand All @@ -79,7 +145,9 @@
536870912 536870913 37448))
(testing "1-by-1"
(are [?start ?end ?bin]
(= ?bin (util-bin/reg->bin ?start ?end 0 2))
(= ?bin
(util-bin/reg->bin ?start ?end 0 2)
((set (util-bin/reg->bins ?start ?end 0 2)) ?bin))
0 1 9
1 1 9
2 2 10
Expand All @@ -96,3 +164,17 @@
(is (= (util-bin/reg->bin 1 1 14 7) 299593))
(is (= (util-bin/reg->bin 1 32769 15 6) 4681))
(is (= (util-bin/reg->bin 240877561 240877568 14 6) 52150))))

(deftest get-spans-returns-a-sequence-including-regions
(let [tabix* (tabix/read-index test-tabix-file)]
(is (= [[0 50872]] (util-bin/get-spans tabix* 0 1 100)))
(is (every? #(and (= 2 (count %))
(number? (first %))
(number? (second %)))
(util-bin/get-spans tabix* 0 1 100))))
(let [csi* (csi/read-index test-csi-file)]
(is (= [[3904 3973]] (util-bin/get-spans csi* 0 1 100000)))
(is (every? #(and (= 2 (count %))
(number? (first %))
(number? (second %)))
(util-bin/get-spans csi* 0 1 100000)))))

0 comments on commit aa62440

Please sign in to comment.