Skip to content

Commit

Permalink
Fix bad expressions.
Browse files Browse the repository at this point in the history
  • Loading branch information
niyarin committed Mar 24, 2020
1 parent cddf8c2 commit 3bf6899
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 134 deletions.
4 changes: 2 additions & 2 deletions src/cljam/algo/vcf_indexer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

(defn create-index
"Creates a CSI index file from the VCF file."
[in-bgziped-vcf out-csi {:keys [shift depth] :or {shift 14 depth 5}}]
(with-open [r (vcf/reader in-bgziped-vcf)]
[in-bgzipped-vcf out-csi {:keys [shift depth] :or {shift 14 depth 5}}]
(with-open [r (vcf/reader in-bgzipped-vcf)]
(let [offsets (vcf/read-file-offsets r)
csi (csi/offsets->index offsets shift depth)]
(csi/write-index out-csi csi))))
140 changes: 69 additions & 71 deletions src/cljam/io/csi.clj
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
(:import java.util.Arrays
[java.io DataInputStream DataOutputStream IOException]))

(def SMALL-CHUNK-SIZE 0x10000)

(deftype CSI [n-ref min-shift depth bidx loffset]
util-bin/IBinningIndex
(get-chunks [_ ref-idx bins]
Expand Down Expand Up @@ -52,98 +54,95 @@
l-aux (lsb/read-int rdr)
_ (lsb/read-bytes rdr l-aux)
n-ref (lsb/read-int rdr)
refs (range n-ref)
bins (vec (repeatedly n-ref #(read-bin-index rdr)))
max-bin (util-bin/max-bin depth)
bidx (zipmap refs
(map (fn [bin]
(into {} (comp (map (juxt :bin :chunks))
(filter #(<= (first %) max-bin)))
bin))
bins))
loffset (zipmap refs
(map (fn [bin]
(into (sorted-map)
(comp (map (juxt :bin :loffset))
(filter #(<= (first %) max-bin))
(map (fn [[bin loffset]]
[(util-bin/bin-beg bin
min-shift
depth)
loffset])))
bin))
bins))]
bidx (->> bins
(map-indexed (fn [index bin]
[index
(into {} (comp (map (juxt :bin :chunks))
(filter #(<= (first %)
max-bin)))
bin)]))
(into (sorted-map)))
loffset (->> bins
(map-indexed
(fn [index bin]
[index
(into (sorted-map)
(comp (map (juxt :bin :loffset))
(filter #(<= (first %) max-bin))
(map (fn [[bin loffset]]
[(util-bin/bin-beg bin
min-shift depth)
loffset])))
bin)]))
(into (sorted-map)))]
(->CSI n-ref min-shift depth bidx loffset)))

(defn read-index
[f]
(with-open [r (DataInputStream. (bgzf/bgzf-input-stream f))]
(read-index* r)))

(defn- concatenate-offsets [offsets]
(reduce (fn [res chunk]
(if (= (:file-end (first res)) (:file-beg chunk))
(cons (assoc (first res) :file-beg (:file-beg (first res))
:file-end (:file-end chunk))
(next res))
(cons chunk res)))
nil
offsets))

(defn- compress-bidx [depth bidx]
(let [target-bins (filter #(= (util-bin/bin-level %) depth) (keys bidx))]
(->> target-bins
(map (fn [bin]
(let [offsets (get bidx bin)
parent-bin (bit-shift-right (dec bin) 3)]
(if (and (< (- (bit-shift-right (:file-end (last offsets))
16)
(bit-shift-right (:file-beg (first offsets))
16))
SMALL-CHUNK-SIZE)
(get bidx parent-bin))
[parent-bin offsets]
[bin offsets]))))
(concat (apply dissoc bidx target-bins))
(group-by first)
(map (fn [[bin offset-array]]
[bin (->> (map second offset-array)
(apply concat)
(sort-by :file-beg))]))
(into {}))))

(defn- calc-bidx [file-offsets shift depth]
(->> file-offsets
(map #(assoc %
:bin (util-bin/reg->bin (:beg %)
(:end %) shift depth)))
(reduce (fn [res offset]
(if (and (= (:bin (first res)) (:bin offset))
(= (:file-end (first res)) (:file-beg offset)))
(cons (assoc (first res) :file-end (:file-end offset))
(next res))
(cons offset res)))
nil)
(group-by :bin)
((apply comp (for [i (range depth)]
#(compress-bidx (inc i) %))))
(map (fn [[bin offsets]]
[bin (->> offsets
[bin (->> (concatenate-offsets offsets)
(map #(chunk/->Chunk (:file-beg %) (:file-end %)))
reverse)]))
sort
(into (sorted-map))))

(defn- fill-in-file-offsets [file-offsets shift]
(let [indexed-offsets
(->> file-offsets
(map-indexed #(assoc %2
:index %1
:beg (-> (:beg %2)
(bit-shift-right shift)
(bit-shift-left shift))))
(into []))]
(letfn [(step [index last-end]
(when-let [offset (and (< index (count indexed-offsets))
(nth indexed-offsets index))]
(if-let [next-index (->> (drop (+ index 1) indexed-offsets)
(drop-while #(< (:end %)
(:end offset)))
first
:index)]
(let [next-offset (nth indexed-offsets next-index)
end (if (< (:beg next-offset) (:end offset))
(:end offset)
(:beg next-offset))]
(lazy-seq (cons (assoc offset
:end end
:beg last-end)
(step next-index end))))
(list offset))))]
(step 0 0))))

(defn- calc-loffsets [begs file-offsets shift]
(let [offsets (fill-in-file-offsets file-offsets shift)]
(->> begs
(map (fn [beg]
[beg
(:file-beg
(first (drop-while #(< (:end %) beg) offsets)))]))
(into (sorted-map)))))
(defn- calc-loffsets [begs file-offsets]
(->> begs
(map (fn [beg]
[beg (->> (drop-while #(< (:end %) beg) file-offsets)
(map :file-beg)
first)]))
(into (sorted-map))))

(defn offsets->index
"Calculates loffsets and bidx
from offsets {:file-beg :file-end :beg :end :chr }"
[offsets shift depth]
(let [chr-offsets (->> (partition-by :chr offsets)
(map #(vector (:chr (first %)) %))
sort
(let [chr-offsets (->> (group-by :chr offsets)
(into (sorted-map)))
bidx (->> chr-offsets
(map (fn [[chr offsets]]
Expand All @@ -152,10 +151,9 @@
loffsets (->> chr-offsets
(map (fn [[chr offsets]]
[chr (calc-loffsets
(map #(util-bin/bin-beg % shift depth)
(keys (get bidx chr)))
offsets
shift)]))
(set (map #(util-bin/bin-beg % shift depth)
(keys (get bidx chr))))
offsets)]))
(into (sorted-map)))]
(->CSI (count bidx) shift depth bidx loffsets)))

Expand Down
2 changes: 1 addition & 1 deletion src/cljam/io/util/bin.clj
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
^long [^long beg ^long end ^long min-shift ^long depth]
(let [max-pos (max-pos min-shift depth)
beg (Math/min max-pos (Math/max 0 (dec beg)))
end (Math/min max-pos (Math/max 0 end))]
end (Math/min max-pos (Math/max 0 (dec end)))]
(loop [level depth]
(if-not (neg? level)
(let [beg-bins (leading-bins-at-level beg level min-shift depth)]
Expand Down
96 changes: 53 additions & 43 deletions test/cljam/algo/vcf_indexer_test.clj
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
(ns cljam.algo.vcf-indexer-test
"Tests for cljam.algo.bam-indexer."
(:require [clojure.test :refer :all]
[clojure.java.io :as cio]
[cljam.test-common :refer :all]
[cljam.io.csi :as csi]
[cljam.io.vcf :as vcf]
Expand Down Expand Up @@ -111,52 +110,63 @@
1 {1 2046},
2 {1 2155}}))))))

(deftest about-vcf-indexer-input-result
(with-before-after {:before (prepare-cache!)
:after (clean-cache!)}
(cio/copy (cio/file test-vcf-various-bins-gz-file)
(cio/file tmp-vcf-file))
(vcf-indexer/create-index tmp-vcf-file tmp-vcf-csi-file
{:shift 14 :depth 6})
(are [chr start end]
(let [vcf1* (vcf/vcf-reader tmp-vcf-file)
vcf2* (vcf/vcf-reader test-vcf-various-bins-gz-file)]
(= (vcf/read-variants-randomly vcf1*
{:chr chr :start start :end end} {})
(vcf/read-variants-randomly vcf2*
{:chr chr :start start
:end end} {})))
"chr1" 1 16384
"chr1" 1 49153
"chr1" 1 30000
"chr1" 49153 147457
"chr1" 32769 147457
"chr1" 49153 1064952
"chr1" 1048577 414826496
"chr1" 1048577 414826497
"chr1" 32769 414859265
"chr1" 414859265 536608769)))
(defn- check-loffset [file-offsets file-offset-index beg file-offset]
;;If "beg" which is the key of the loffset does not exist
;;between the beg-end of the variant data,
;;the selected data must be closest to "beg"
;;among data with pos greater than "beg".
(let [index (get file-offset-index file-offset)
current-offset (nth file-offsets index)
previous-offset (nth file-offsets (dec index))]
(and (= (:chr current-offset) (:chr previous-offset))
(< (:end previous-offset) beg (:beg current-offset)))))

(deftest csi-comparison-test
(let [csi ^CSI (csi/read-index test-vcf-various-bins-csi-file)
computed (with-open [r (vcf/reader test-vcf-various-bins-gz-file)]
(csi/offsets->index (vcf/read-file-offsets r) 14 6))]
(is (= (.n-ref csi) (.n-ref ^CSI computed)))
(is (= (.min-shift csi) (.min-shift ^CSI computed)))
(is (= (.depth csi) (.depth ^CSI computed)))
(with-open [r (vcf/reader test-vcf-various-bins-gz-file)]
(let [file-offsets (vcf/read-file-offsets r)
computed (csi/offsets->index file-offsets 14 6)
csi ^CSI (csi/read-index test-vcf-various-bins-csi-file)
file-offset-index (->> file-offsets
(map-indexed #(vector (:file-beg %2)
%1))
(into {}))]
(is (= (.n-ref csi) (.n-ref ^CSI computed)))
(is (= (.min-shift csi) (.min-shift ^CSI computed)))
(is (= (.depth csi) (.depth ^CSI computed)))
;;Exclude tail data
;;due to differences in implementation of HTSlib and bgzf4j.
(is (= (dissoc (get (.bidx csi) 0) 75)
(dissoc (get (.bidx ^CSI computed) 0) 75)))
(is (= (.loffset csi) (.loffset ^CSI computed)))))
(is (= (update (.bidx csi) 0 dissoc 75)
(update (.bidx ^CSI computed) 0 dissoc 75)))

(doseq [chr (keys (.loffset csi))]
(doseq [beg (keys (get (.loffset csi) chr))]
(is (or (= (get-in (.loffset csi) [chr beg])
(get-in (.loffset ^CSI computed) [chr beg]))
(check-loffset file-offsets
file-offset-index
beg (get-in (.loffset ^CSI computed)
[chr beg])))))))))

(deftest-remote large-csi-comparison-test
(with-before-after {:before (prepare-cavia!)}
(let [csi ^CSI (csi/read-index test-large-vcf-csi-file)
computed (with-open [r (vcf/reader test-large-vcf-file)]
(csi/offsets->index (vcf/read-file-offsets r) 14 6))]
(is (= (.n-ref csi) (.n-ref ^CSI computed)))
(is (= (.min-shift csi) (.min-shift ^CSI computed)))
(is (= (.depth csi) (.depth ^CSI computed)))
(is (= (.bidx csi) (.bidx ^CSI computed)))
(is (= (.loffset csi) (.loffset ^CSI computed))))))
(with-open [r (vcf/reader test-large-vcf-file)]
(let [file-offsets (vcf/read-file-offsets r)
csi ^CSI (csi/read-index test-large-vcf-csi-file)
computed (csi/offsets->index file-offsets 14 6)
file-offset-index (->> file-offsets
(map-indexed #(vector (:file-beg %2)
%1))
(into {}))]
(is (= (.n-ref csi) (.n-ref ^CSI computed)))
(is (= (.min-shift csi) (.min-shift ^CSI computed)))
(is (= (.depth csi) (.depth ^CSI computed)))
(is (= (update (.bidx csi) 24 dissoc 37450)
(update (.bidx ^CSI computed) 24 dissoc 37450)))
(doseq [chr (keys (.loffset csi))]
(doseq [beg (keys (get (.loffset csi) chr))]
(is (or (= (get-in (.loffset csi) [chr beg])
(get-in (.loffset ^CSI computed) [chr beg]))
(check-loffset file-offsets
file-offset-index
beg (get-in (.loffset ^CSI computed)
[chr beg]))))))))))
13 changes: 0 additions & 13 deletions test/cljam/io/csi_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,6 @@
(cio/as-url (cio/file test-csi-file))
(cio/as-url (str (:uri server) "/csi/test.csi")))))

(deftest calc-loffsets-test
(is (= (#'csi/calc-loffsets '(3) '({:beg 1 :end 2 :file-beg 1}
{:beg 5 :end 6 :file-beg 2}) 0)
{3 1}))

(is (= (#'csi/calc-loffsets '(3) '({:beg 2 :end 3 :file-beg 1}
{:beg 5 :end 6 :file-beg 2}) 1)
{3 1}))

(is (= (#'csi/calc-loffsets '(4) '({:beg 2 :end 3 :file-beg 1}
{:beg 5 :end 6 :file-beg 2}) 1)
{4 1})))

(deftest-remote large-read-write-test
(with-before-after {:before (do (prepare-cavia!)
(prepare-cache!)),
Expand Down
3 changes: 2 additions & 1 deletion test/cljam/io/util/bin_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,5 @@
(is (= (util-bin/reg->bin 1 1 14 6) 37449))
(is (= (util-bin/reg->bin 1 1 14 7) 299593))
(is (= (util-bin/reg->bin 1 32769 15 6) 4681))
(is (= (util-bin/reg->bin 1 2 0 2) 1)))
(is (= (util-bin/reg->bin 1 2 0 2) 1))
(is (= (util-bin/reg->bin 240877561 240877568 14 6) 52150)))
3 changes: 0 additions & 3 deletions test/cljam/test_common.clj
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,6 @@
{:id "large.vcf.gz.csi"
:url "https://test.chrov.is/data/cljam/example-500-10000.vcf.gz.csi"
:sha1 "568a47f463de8df846e021640d38b8cf8f257e66"}
{:id "large.vcf.gz.csi"
:url "https://test.chrov.is/data/cljam/example-500-10000.vcf.gz.csi"
:sha1 "568a47f463de8df846e021640d38b8cf8f257e66"}
{:id "large.bcf"
:url "https://test.chrov.is/data/cljam/example-500-10000.bcf"
:sha1 "f7f57ed9d21874c92331ef6d86d85b36959f4d16"}
Expand Down

0 comments on commit 3bf6899

Please sign in to comment.