diff --git a/src/cljam/algo/vcf_indexer.clj b/src/cljam/algo/vcf_indexer.clj index f24fd017..708c4fe1 100644 --- a/src/cljam/algo/vcf_indexer.clj +++ b/src/cljam/algo/vcf_indexer.clj @@ -5,8 +5,8 @@ (defn create-index "Creates a CSI index file from the VCF file." - [in-bgziped-vcf out-csi {:keys [shift depth] :or {shift 14 depth 5}}] - (with-open [r (vcf/reader in-bgziped-vcf)] + [in-bgzipped-vcf out-csi {:keys [shift depth] :or {shift 14 depth 5}}] + (with-open [r (vcf/reader in-bgzipped-vcf)] (let [offsets (vcf/read-file-offsets r) csi (csi/offsets->index offsets shift depth)] (csi/write-index out-csi csi)))) diff --git a/src/cljam/io/csi.clj b/src/cljam/io/csi.clj index 5f32039c..8c44a7e8 100644 --- a/src/cljam/io/csi.clj +++ b/src/cljam/io/csi.clj @@ -7,6 +7,8 @@ (:import java.util.Arrays [java.io DataInputStream DataOutputStream IOException])) +(def SMALL-CHUNK-SIZE 0x10000) + (deftype CSI [n-ref min-shift depth bidx loffset] util-bin/IBinningIndex (get-chunks [_ ref-idx bins] @@ -52,27 +54,29 @@ l-aux (lsb/read-int rdr) _ (lsb/read-bytes rdr l-aux) n-ref (lsb/read-int rdr) - refs (range n-ref) bins (vec (repeatedly n-ref #(read-bin-index rdr))) max-bin (util-bin/max-bin depth) - bidx (zipmap refs - (map (fn [bin] - (into {} (comp (map (juxt :bin :chunks)) - (filter #(<= (first %) max-bin))) - bin)) - bins)) - loffset (zipmap refs - (map (fn [bin] - (into (sorted-map) - (comp (map (juxt :bin :loffset)) - (filter #(<= (first %) max-bin)) - (map (fn [[bin loffset]] - [(util-bin/bin-beg bin - min-shift - depth) - loffset]))) - bin)) - bins))] + bidx (->> bins + (map-indexed (fn [index bin] + [index + (into {} (comp (map (juxt :bin :chunks)) + (filter #(<= (first %) + max-bin))) + bin)])) + (into (sorted-map))) + loffset (->> bins + (map-indexed + (fn [index bin] + [index + (into (sorted-map) + (comp (map (juxt :bin :loffset)) + (filter #(<= (first %) max-bin)) + (map (fn [[bin loffset]] + [(util-bin/bin-beg bin + min-shift depth) + loffset]))) + bin)])) + (into (sorted-map)))] (->CSI n-ref min-shift depth bidx loffset))) (defn read-index @@ -80,70 +84,65 @@ (with-open [r (DataInputStream. (bgzf/bgzf-input-stream f))] (read-index* r))) +(defn- concatenate-offsets [offsets] + (reduce (fn [res chunk] + (if (= (:file-end (first res)) (:file-beg chunk)) + (cons (assoc (first res) :file-beg (:file-beg (first res)) + :file-end (:file-end chunk)) + (next res)) + (cons chunk res))) + nil + offsets)) + +(defn- compress-bidx [depth bidx] + (let [target-bins (filter #(= (util-bin/bin-level %) depth) (keys bidx))] + (->> target-bins + (map (fn [bin] + (let [offsets (get bidx bin) + parent-bin (bit-shift-right (dec bin) 3)] + (if (and (< (- (bit-shift-right (:file-end (last offsets)) + 16) + (bit-shift-right (:file-beg (first offsets)) + 16)) + SMALL-CHUNK-SIZE) + (get bidx parent-bin)) + [parent-bin offsets] + [bin offsets])))) + (concat (apply dissoc bidx target-bins)) + (group-by first) + (map (fn [[bin offset-array]] + [bin (->> (map second offset-array) + (apply concat) + (sort-by :file-beg))])) + (into {})))) + (defn- calc-bidx [file-offsets shift depth] (->> file-offsets (map #(assoc % :bin (util-bin/reg->bin (:beg %) (:end %) shift depth))) - (reduce (fn [res offset] - (if (and (= (:bin (first res)) (:bin offset)) - (= (:file-end (first res)) (:file-beg offset))) - (cons (assoc (first res) :file-end (:file-end offset)) - (next res)) - (cons offset res))) - nil) (group-by :bin) + ((apply comp (for [i (range depth)] + #(compress-bidx (inc i) %)))) (map (fn [[bin offsets]] - [bin (->> offsets + [bin (->> (concatenate-offsets offsets) (map #(chunk/->Chunk (:file-beg %) (:file-end %))) reverse)])) - sort (into (sorted-map)))) -(defn- fill-in-file-offsets [file-offsets shift] - (let [indexed-offsets - (->> file-offsets - (map-indexed #(assoc %2 - :index %1 - :beg (-> (:beg %2) - (bit-shift-right shift) - (bit-shift-left shift)))) - (into []))] - (letfn [(step [index last-end] - (when-let [offset (and (< index (count indexed-offsets)) - (nth indexed-offsets index))] - (if-let [next-index (->> (drop (+ index 1) indexed-offsets) - (drop-while #(< (:end %) - (:end offset))) - first - :index)] - (let [next-offset (nth indexed-offsets next-index) - end (if (< (:beg next-offset) (:end offset)) - (:end offset) - (:beg next-offset))] - (lazy-seq (cons (assoc offset - :end end - :beg last-end) - (step next-index end)))) - (list offset))))] - (step 0 0)))) - -(defn- calc-loffsets [begs file-offsets shift] - (let [offsets (fill-in-file-offsets file-offsets shift)] - (->> begs - (map (fn [beg] - [beg - (:file-beg - (first (drop-while #(< (:end %) beg) offsets)))])) - (into (sorted-map))))) +(defn- calc-loffsets [begs file-offsets] + (->> begs + (map (fn [beg] + [beg (->> (drop-while #(< (:end %) beg) file-offsets) + (map :file-beg) + first)])) + (into (sorted-map)))) (defn offsets->index "Calculates loffsets and bidx from offsets {:file-beg :file-end :beg :end :chr }" [offsets shift depth] - (let [chr-offsets (->> (partition-by :chr offsets) - (map #(vector (:chr (first %)) %)) - sort + (let [chr-offsets (->> (group-by :chr offsets) (into (sorted-map))) bidx (->> chr-offsets (map (fn [[chr offsets]] @@ -152,10 +151,9 @@ loffsets (->> chr-offsets (map (fn [[chr offsets]] [chr (calc-loffsets - (map #(util-bin/bin-beg % shift depth) - (keys (get bidx chr))) - offsets - shift)])) + (set (map #(util-bin/bin-beg % shift depth) + (keys (get bidx chr)))) + offsets)])) (into (sorted-map)))] (->CSI (count bidx) shift depth bidx loffsets))) diff --git a/src/cljam/io/util/bin.clj b/src/cljam/io/util/bin.clj index a48caa82..b30104bb 100644 --- a/src/cljam/io/util/bin.clj +++ b/src/cljam/io/util/bin.clj @@ -70,7 +70,7 @@ ^long [^long beg ^long end ^long min-shift ^long depth] (let [max-pos (max-pos min-shift depth) beg (Math/min max-pos (Math/max 0 (dec beg))) - end (Math/min max-pos (Math/max 0 end))] + end (Math/min max-pos (Math/max 0 (dec end)))] (loop [level depth] (if-not (neg? level) (let [beg-bins (leading-bins-at-level beg level min-shift depth)] diff --git a/test/cljam/algo/vcf_indexer_test.clj b/test/cljam/algo/vcf_indexer_test.clj index 01814fec..b1cf8739 100644 --- a/test/cljam/algo/vcf_indexer_test.clj +++ b/test/cljam/algo/vcf_indexer_test.clj @@ -1,7 +1,6 @@ (ns cljam.algo.vcf-indexer-test "Tests for cljam.algo.bam-indexer." (:require [clojure.test :refer :all] - [clojure.java.io :as cio] [cljam.test-common :refer :all] [cljam.io.csi :as csi] [cljam.io.vcf :as vcf] @@ -111,52 +110,63 @@ 1 {1 2046}, 2 {1 2155}})))))) -(deftest about-vcf-indexer-input-result - (with-before-after {:before (prepare-cache!) - :after (clean-cache!)} - (cio/copy (cio/file test-vcf-various-bins-gz-file) - (cio/file tmp-vcf-file)) - (vcf-indexer/create-index tmp-vcf-file tmp-vcf-csi-file - {:shift 14 :depth 6}) - (are [chr start end] - (let [vcf1* (vcf/vcf-reader tmp-vcf-file) - vcf2* (vcf/vcf-reader test-vcf-various-bins-gz-file)] - (= (vcf/read-variants-randomly vcf1* - {:chr chr :start start :end end} {}) - (vcf/read-variants-randomly vcf2* - {:chr chr :start start - :end end} {}))) - "chr1" 1 16384 - "chr1" 1 49153 - "chr1" 1 30000 - "chr1" 49153 147457 - "chr1" 32769 147457 - "chr1" 49153 1064952 - "chr1" 1048577 414826496 - "chr1" 1048577 414826497 - "chr1" 32769 414859265 - "chr1" 414859265 536608769))) +(defn- check-loffset [file-offsets file-offset-index beg file-offset] + ;;If "beg" which is the key of the loffset does not exist + ;;between the beg-end of the variant data, + ;;the selected data must be closest to "beg" + ;;among data with pos greater than "beg". + (let [index (get file-offset-index file-offset) + current-offset (nth file-offsets index) + previous-offset (nth file-offsets (dec index))] + (and (= (:chr current-offset) (:chr previous-offset)) + (< (:end previous-offset) beg (:beg current-offset))))) (deftest csi-comparison-test - (let [csi ^CSI (csi/read-index test-vcf-various-bins-csi-file) - computed (with-open [r (vcf/reader test-vcf-various-bins-gz-file)] - (csi/offsets->index (vcf/read-file-offsets r) 14 6))] - (is (= (.n-ref csi) (.n-ref ^CSI computed))) - (is (= (.min-shift csi) (.min-shift ^CSI computed))) - (is (= (.depth csi) (.depth ^CSI computed))) + (with-open [r (vcf/reader test-vcf-various-bins-gz-file)] + (let [file-offsets (vcf/read-file-offsets r) + computed (csi/offsets->index file-offsets 14 6) + csi ^CSI (csi/read-index test-vcf-various-bins-csi-file) + file-offset-index (->> file-offsets + (map-indexed #(vector (:file-beg %2) + %1)) + (into {}))] + (is (= (.n-ref csi) (.n-ref ^CSI computed))) + (is (= (.min-shift csi) (.min-shift ^CSI computed))) + (is (= (.depth csi) (.depth ^CSI computed))) ;;Exclude tail data ;;due to differences in implementation of HTSlib and bgzf4j. - (is (= (dissoc (get (.bidx csi) 0) 75) - (dissoc (get (.bidx ^CSI computed) 0) 75))) - (is (= (.loffset csi) (.loffset ^CSI computed))))) + (is (= (update (.bidx csi) 0 dissoc 75) + (update (.bidx ^CSI computed) 0 dissoc 75))) + + (doseq [chr (keys (.loffset csi))] + (doseq [beg (keys (get (.loffset csi) chr))] + (is (or (= (get-in (.loffset csi) [chr beg]) + (get-in (.loffset ^CSI computed) [chr beg])) + (check-loffset file-offsets + file-offset-index + beg (get-in (.loffset ^CSI computed) + [chr beg]))))))))) (deftest-remote large-csi-comparison-test (with-before-after {:before (prepare-cavia!)} - (let [csi ^CSI (csi/read-index test-large-vcf-csi-file) - computed (with-open [r (vcf/reader test-large-vcf-file)] - (csi/offsets->index (vcf/read-file-offsets r) 14 6))] - (is (= (.n-ref csi) (.n-ref ^CSI computed))) - (is (= (.min-shift csi) (.min-shift ^CSI computed))) - (is (= (.depth csi) (.depth ^CSI computed))) - (is (= (.bidx csi) (.bidx ^CSI computed))) - (is (= (.loffset csi) (.loffset ^CSI computed)))))) + (with-open [r (vcf/reader test-large-vcf-file)] + (let [file-offsets (vcf/read-file-offsets r) + csi ^CSI (csi/read-index test-large-vcf-csi-file) + computed (csi/offsets->index file-offsets 14 6) + file-offset-index (->> file-offsets + (map-indexed #(vector (:file-beg %2) + %1)) + (into {}))] + (is (= (.n-ref csi) (.n-ref ^CSI computed))) + (is (= (.min-shift csi) (.min-shift ^CSI computed))) + (is (= (.depth csi) (.depth ^CSI computed))) + (is (= (update (.bidx csi) 24 dissoc 37450) + (update (.bidx ^CSI computed) 24 dissoc 37450))) + (doseq [chr (keys (.loffset csi))] + (doseq [beg (keys (get (.loffset csi) chr))] + (is (or (= (get-in (.loffset csi) [chr beg]) + (get-in (.loffset ^CSI computed) [chr beg])) + (check-loffset file-offsets + file-offset-index + beg (get-in (.loffset ^CSI computed) + [chr beg])))))))))) diff --git a/test/cljam/io/csi_test.clj b/test/cljam/io/csi_test.clj index b8cc67e4..07455777 100644 --- a/test/cljam/io/csi_test.clj +++ b/test/cljam/io/csi_test.clj @@ -28,19 +28,6 @@ (cio/as-url (cio/file test-csi-file)) (cio/as-url (str (:uri server) "/csi/test.csi"))))) -(deftest calc-loffsets-test - (is (= (#'csi/calc-loffsets '(3) '({:beg 1 :end 2 :file-beg 1} - {:beg 5 :end 6 :file-beg 2}) 0) - {3 1})) - - (is (= (#'csi/calc-loffsets '(3) '({:beg 2 :end 3 :file-beg 1} - {:beg 5 :end 6 :file-beg 2}) 1) - {3 1})) - - (is (= (#'csi/calc-loffsets '(4) '({:beg 2 :end 3 :file-beg 1} - {:beg 5 :end 6 :file-beg 2}) 1) - {4 1}))) - (deftest-remote large-read-write-test (with-before-after {:before (do (prepare-cavia!) (prepare-cache!)), diff --git a/test/cljam/io/util/bin_test.clj b/test/cljam/io/util/bin_test.clj index a17b71f4..5f1c2077 100644 --- a/test/cljam/io/util/bin_test.clj +++ b/test/cljam/io/util/bin_test.clj @@ -62,4 +62,5 @@ (is (= (util-bin/reg->bin 1 1 14 6) 37449)) (is (= (util-bin/reg->bin 1 1 14 7) 299593)) (is (= (util-bin/reg->bin 1 32769 15 6) 4681)) - (is (= (util-bin/reg->bin 1 2 0 2) 1))) + (is (= (util-bin/reg->bin 1 2 0 2) 1)) + (is (= (util-bin/reg->bin 240877561 240877568 14 6) 52150))) diff --git a/test/cljam/test_common.clj b/test/cljam/test_common.clj index a2bda033..4564fdac 100644 --- a/test/cljam/test_common.clj +++ b/test/cljam/test_common.clj @@ -52,9 +52,6 @@ {:id "large.vcf.gz.csi" :url "https://test.chrov.is/data/cljam/example-500-10000.vcf.gz.csi" :sha1 "568a47f463de8df846e021640d38b8cf8f257e66"} - {:id "large.vcf.gz.csi" - :url "https://test.chrov.is/data/cljam/example-500-10000.vcf.gz.csi" - :sha1 "568a47f463de8df846e021640d38b8cf8f257e66"} {:id "large.bcf" :url "https://test.chrov.is/data/cljam/example-500-10000.bcf" :sha1 "f7f57ed9d21874c92331ef6d86d85b36959f4d16"}