Skip to content

Commit

Permalink
Add support for generating CSI that can be read by htslib
Browse files Browse the repository at this point in the history
  • Loading branch information
niyarin committed Mar 25, 2020
1 parent 3bf6899 commit e655e42
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 26 deletions.
13 changes: 11 additions & 2 deletions src/cljam/algo/vcf_indexer.clj
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
(ns cljam.algo.vcf-indexer
"Indexer of VCF."
(:require [cljam.io.csi :as csi]
[cljam.io.vcf :as vcf]))
[cljam.io.vcf :as vcf])
(:import cljam.io.vcf.reader.VCFReader))

(defn create-index
"Creates a CSI index file from the VCF file."
[in-bgzipped-vcf out-csi {:keys [shift depth] :or {shift 14 depth 5}}]
(with-open [r (vcf/reader in-bgzipped-vcf)]
(let [offsets (vcf/read-file-offsets r)
csi (csi/offsets->index offsets shift depth)]

names (->> (map :chr-name offsets)
(concat (->> (.meta-info ^VCFReader r)
:contig
(mapv :id)))
distinct)
csi (csi/offsets->index offsets shift depth
{:variant-file-type :VCF
:names names})]
(csi/write-index out-csi csi))))
58 changes: 46 additions & 12 deletions src/cljam/io/csi.clj
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
(ns cljam.io.csi
"Reader of a CSI format file."
(:require [cljam.io.util.bgzf :as bgzf]
(:require [clojure.string :as cstr]
[cljam.io.util.bgzf :as bgzf]
[cljam.io.util.lsb :as lsb]
[cljam.io.util.chunk :as chunk]
[cljam.io.util.bin :as util-bin])
(:import java.util.Arrays
[java.io DataInputStream DataOutputStream IOException]))

(def SMALL-CHUNK-SIZE 0x10000)
(def ^:private default-vcf-csi-aux
{:format 2 :col-seq 1 :col-beg 2 :col-end 0 :meta-char \# :skip 0})

(deftype CSI [n-ref min-shift depth bidx loffset]
(def small-chunk-size 0x10000)

(deftype CSI [n-ref min-shift depth bidx loffset aux]
util-bin/IBinningIndex
(get-chunks [_ ref-idx bins]
(vec (mapcat (get bidx ref-idx) bins)))
Expand Down Expand Up @@ -52,7 +56,7 @@
(let [min-shift (lsb/read-int rdr)
depth (lsb/read-int rdr)
l-aux (lsb/read-int rdr)
_ (lsb/read-bytes rdr l-aux)
aux (lsb/read-bytes rdr l-aux)
n-ref (lsb/read-int rdr)
bins (vec (repeatedly n-ref #(read-bin-index rdr)))
max-bin (util-bin/max-bin depth)
Expand All @@ -77,7 +81,7 @@
loffset])))
bin)]))
(into (sorted-map)))]
(->CSI n-ref min-shift depth bidx loffset)))
(->CSI n-ref min-shift depth bidx loffset aux)))

(defn read-index
[f]
Expand All @@ -104,7 +108,7 @@
16)
(bit-shift-right (:file-beg (first offsets))
16))
SMALL-CHUNK-SIZE)
small-chunk-size)
(get bidx parent-bin))
[parent-bin offsets]
[bin offsets]))))
Expand Down Expand Up @@ -138,12 +142,38 @@
first)]))
(into (sorted-map))))

(defn- create-vcf-csi-aux-data [contigs]
(let [contig-bytes (.getBytes (str (cstr/join (char 0) contigs)
(char 0)))]
(byte-array
(concat (->> (assoc default-vcf-csi-aux
:l-nm (count contig-bytes))
((juxt :format :col-seq :col-beg
:col-end :meta-char :skip :l-nm))
(map (fn [n]
(map #(bit-shift-right (int n) %)
(range 0 32 8))))
(apply concat))
contig-bytes))))

(->> default-vcf-csi-aux
((juxt :format :col-seq :col-beg
:col-end :meta-char :skip))
(map (fn [n]
(map #(bit-shift-right (int n) %)
(range 0 32 8))))

(apply concat))

(defn offsets->index
"Calculates loffsets and bidx
from offsets {:file-beg :file-end :beg :end :chr }"
[offsets shift depth]
(let [chr-offsets (->> (group-by :chr offsets)
(into (sorted-map)))
[offsets shift depth
{:keys [variant-file-type names] :or {variant-file-type :BCF}}]
(let [chr-offsets (merge (->> (range (count names))
(map #(vector % []))
(into {}))
(group-by :chr offsets))
bidx (->> chr-offsets
(map (fn [[chr offsets]]
[chr (calc-bidx offsets shift depth)]))
Expand All @@ -154,16 +184,20 @@
(set (map #(util-bin/bin-beg % shift depth)
(keys (get bidx chr))))
offsets)]))
(into (sorted-map)))]
(->CSI (count bidx) shift depth bidx loffsets)))
(into (sorted-map)))
aux (when (= variant-file-type :VCF)
(create-vcf-csi-aux-data names))]
(->CSI (count bidx) shift depth bidx loffsets aux)))

(defn write-index
[f ^CSI csi]
(with-open [w (DataOutputStream. (bgzf/bgzf-output-stream f))]
(lsb/write-bytes w (.getBytes ^String csi-magic))
(lsb/write-int w (.min-shift csi))
(lsb/write-int w (.depth csi))
(lsb/write-int w 0)
(let [aux (.aux csi)]
(lsb/write-int w (count aux))
(when aux (lsb/write-bytes w aux)))
(lsb/write-int w (count (.bidx csi)))
(doseq [[chr offsets] (.bidx csi)]
(lsb/write-int w (count offsets))
Expand Down
2 changes: 1 addition & 1 deletion src/cljam/io/protocols.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
(:refer-clojure :exclude [read indexed?]))

(defrecord SAMAlignment
[qname ^int flag rname ^int pos ^int end ^int mapq cigar rnext ^int pnext ^int tlen seq qual options])
[qname ^int flag rname ^int pos ^int end ^int mapq cigar rnext ^int pnext ^int tlen seq qual options])
(defrecord SAMRegionBlock [data ^int ref-id ^int pos ^int end])
(defrecord SAMCoordinateBlock [data ^int ref-id ^int pos ^int flag])
(defrecord SAMQuerynameBlock [data qname ^int flag])
Expand Down
2 changes: 1 addition & 1 deletion src/cljam/io/vcf/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@
contigs
(assoc contigs chr (count contigs)))]
(cons {:file-beg beg-pointer, :file-end end-pointer
:chr (contigs' chr), :beg pos,
:chr (contigs' chr), :beg pos, :chr-name chr,
:end (or (:END info) (dec (+ pos (count ref))))}
(lazy-seq (step contigs' end-pointer))))))))]
(step meta-info-contigs 0))))
Binary file added test-resources/vcf/test-chr-skipped.vcf.gz
Binary file not shown.
60 changes: 51 additions & 9 deletions test/cljam/algo/vcf_indexer_test.clj
Original file line number Diff line number Diff line change
@@ -1,22 +1,27 @@
(ns cljam.algo.vcf-indexer-test
"Tests for cljam.algo.bam-indexer."
(:require [clojure.test :refer :all]
[clojure.string :as cstr]
[cljam.test-common :refer :all]
[cljam.io.csi :as csi]
[cljam.io.vcf :as vcf]
[cljam.algo.vcf-indexer :as vcf-indexer])
(:import
[cljam.io.csi CSI]))
(:import cljam.io.csi.CSI
cljam.io.vcf.reader.VCFReader))

(def tmp-csi-file (str temp-dir "/tmp.csi"))
(def tmp-vcf-file (str temp-dir "/tmp.vcf.gz"))
(def tmp-vcf-csi-file (str temp-dir "/tmp.vcf.gz.csi"))

(defn- chunks->maps [bidx]
(apply merge-with into
(for [[chr bin-chunks] bidx
[bin chunks] bin-chunks]
{chr {bin (mapv #(into {} %) chunks)}})))
(->> bidx
(map (fn [[chr bin-chunks]]
[chr
(->> bin-chunks
(map (fn [[bin chunks]]
[bin (mapv #(into {} %) chunks)]))
(into {}))]))
(into {})))

(deftest about-vcf-indexer
(with-before-after {:before (prepare-cache!)
Expand Down Expand Up @@ -82,6 +87,10 @@
(do (vcf-indexer/create-index test-vcf-changed-chr-order-file
tmp-csi-file {:shift 18 :depth 5})
(let [csi ^CSI (csi/read-index tmp-csi-file)]
(is (= (cstr/split (String. (byte-array (drop 28 (.aux csi))))
#"\00")
["19" "20" "X"]))

(is (= (chunks->maps (.bidx csi))
{0 {4681 [{:beg 2096, :end 2205}]},
1 {4681 [{:beg 1647, :end 1811}],
Expand All @@ -99,17 +108,38 @@
(do (vcf-indexer/create-index test-vcf-changed-chr-order-field-less-file
tmp-csi-file {:shift 18 :depth 5})
(let [csi ^CSI (csi/read-index tmp-csi-file)]
(is (= (cstr/split (String. (byte-array (drop 28 (.aux csi))))
#"\00")
["20" "19" "X"]))
(is (= (chunks->maps (.bidx csi))
{0 {4681 [{:beg 1597, :end 1761}],
4685 [{:beg 1761, :end 2046}]},
1 {4681 [{:beg 2046, :end 2155}]},
2 {4681 [{:beg 2155, :end 70057984}]}}))

(is (= (.loffset csi)
{0 {1 1597, 1048577 1761},
1 {1 2046},
2 {1 2155}}))))))

(deftest about-vcf-skipped-chr-order
(with-before-after {:before (prepare-cache!)
:after (clean-cache!)}
(do (vcf-indexer/create-index test-vcf-chr-skipped-file
tmp-csi-file {:shift 14 :depth 6})
(let [csi ^CSI (csi/read-index tmp-csi-file)]

(is (= (.n-ref csi) 3))
(is (= (.min-shift csi) 14))
(is (= (.depth csi) 6))
(is (= (cstr/split (String. (byte-array (drop 28 (.aux csi))))
#"\00")
["chr1" "chr2" "chr3"]))
(is (= (chunks->maps (.bidx csi))
{0 {37449 [{:beg 117, :end 136}]},
1 {}
2 {37449 [{:beg 136, :end 10682368}]}}))
(is (= (.loffset csi) {0 {1 117}, 1 {} ,2 {1 136}}))))))

(defn- check-loffset [file-offsets file-offset-index beg file-offset]
;;If "beg" which is the key of the loffset does not exist
;;between the beg-end of the variant data,
Expand All @@ -124,7 +154,12 @@
(deftest csi-comparison-test
(with-open [r (vcf/reader test-vcf-various-bins-gz-file)]
(let [file-offsets (vcf/read-file-offsets r)
computed (csi/offsets->index file-offsets 14 6)
computed
(csi/offsets->index file-offsets 14 6
{:variant-file-type :VCF
:names (mapv :id
(:contig (.meta-info ^VCFReader
r)))})
csi ^CSI (csi/read-index test-vcf-various-bins-csi-file)
file-offset-index (->> file-offsets
(map-indexed #(vector (:file-beg %2)
Expand All @@ -133,6 +168,7 @@
(is (= (.n-ref csi) (.n-ref ^CSI computed)))
(is (= (.min-shift csi) (.min-shift ^CSI computed)))
(is (= (.depth csi) (.depth ^CSI computed)))
(is (= (vec (.aux csi)) (vec (.aux ^CSI computed))))
;;Exclude tail data
;;due to differences in implementation of HTSlib and bgzf4j.
(is (= (update (.bidx csi) 0 dissoc 75)
Expand All @@ -152,14 +188,20 @@
(with-open [r (vcf/reader test-large-vcf-file)]
(let [file-offsets (vcf/read-file-offsets r)
csi ^CSI (csi/read-index test-large-vcf-csi-file)
computed (csi/offsets->index file-offsets 14 6)
computed
(csi/offsets->index file-offsets 14 6
{:variant-file-type :VCF
:names (mapv :id
(:contig (.meta-info ^VCFReader
r)))})
file-offset-index (->> file-offsets
(map-indexed #(vector (:file-beg %2)
%1))
(into {}))]
(is (= (.n-ref csi) (.n-ref ^CSI computed)))
(is (= (.min-shift csi) (.min-shift ^CSI computed)))
(is (= (.depth csi) (.depth ^CSI computed)))
(is (= (vec (.aux csi)) (vec (.aux ^CSI computed))))
(is (= (update (.bidx csi) 24 dissoc 37450)
(update (.bidx ^CSI computed) 24 dissoc 37450)))
(doseq [chr (keys (.loffset csi))]
Expand Down
3 changes: 2 additions & 1 deletion test/cljam/io/csi_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,5 @@
(is (= (.min-shift r) (.min-shift w)))
(is (= (.depth r) (.depth w)))
(is (= (.bidx r) (.bidx w)))
(is (= (.loffset r) (.loffset w))))))
(is (= (.loffset r) (.loffset w)))
(is (= (vec (.aux r)) (vec (.aux w)))))))
1 change: 1 addition & 0 deletions test/cljam/test_common.clj
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@
(def test-vcf-various-bins-csi-file "test-resources/vcf/various-bins.vcf.gz.csi")
(def test-vcf-changed-chr-order-file "test-resources/vcf/test-changed-chr-order.vcf.gz")
(def test-vcf-changed-chr-order-field-less-file "test-resources/vcf/test-changed-chr-order-field-less.vcf.gz")
(def test-vcf-chr-skipped-file "test-resources/vcf/test-chr-skipped.vcf.gz")

(def test-large-vcf-file (cavia/resource mycavia "large.vcf.gz"))
(def test-large-vcf-tbi-file (cavia/resource mycavia "large.vcf.gz.tbi"))
Expand Down

0 comments on commit e655e42

Please sign in to comment.