Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for random reading of a bcf file. #187

Merged
merged 3 commits into from
Jan 23, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 74 additions & 20 deletions src/cljam/io/bcf/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,19 @@
[cljam.io.util.lsb :as lsb]
[cljam.io.vcf.reader :as vcf-reader]
[cljam.io.vcf.util :as vcf-util]
[cljam.util :as util])
[cljam.util :as util]
[cljam.io.util.bin :as util-bin]
[cljam.io.csi :as csi])
(:import [java.io Closeable IOException]
[java.net URL]
[java.nio Buffer ByteBuffer]
[bgzf4j BGZFInputStream]))

(declare read-variants meta-info)
(declare read-variants meta-info read-variants-randomly)

(deftype BCFReader [^URL url meta-info header ^BGZFInputStream reader ^long start-pos]
(deftype BCFReader
[^URL url meta-info header ^BGZFInputStream reader
^long start-pos index-delay]
Closeable
(close [this]
(.close ^Closeable (.reader this)))
Expand All @@ -42,7 +46,9 @@
(header [this] (.header this))
(read-variants
([this] (protocols/read-variants this {}))
([this option] (read-variants this option))))
([this option] (read-variants this option)))
(read-variants-randomly [this region-option deep-option]
(read-variants-randomly this region-option deep-option)))

(defn- parse-meta-and-header
"Parses meta-info and header of BCF files and returns them as a map.
Expand Down Expand Up @@ -72,10 +78,13 @@
(let [hlen (lsb/read-int rdr)
header-buf (lsb/read-bytes rdr hlen)]
(if (zero? (aget ^bytes header-buf (dec hlen))) ;; NULL-terminated
(let [{:keys [header meta]} (->> (String. ^bytes header-buf 0 (int (dec hlen)))
(let [{:keys [header meta]} (->> (String. ^bytes header-buf
0
(int (dec hlen)))
cstr/split-lines
parse-meta-and-header)]
(BCFReader. (util/as-url f) meta header rdr (.getFilePointer rdr)))
(BCFReader. (util/as-url f) meta header rdr (.getFilePointer rdr)
(delay (csi/read-index (str f ".csi")))))
(do
(.close rdr)
(throw (IOException. (str "Invalid file format. BCF header must be NULL-terminated."))))))
Expand Down Expand Up @@ -238,6 +247,27 @@
(update :info (fn [xs] (map (fn [m] (dissoc m :idx)) xs)))
(update :format (fn [xs] (map (fn [m] (dissoc m :idx)) xs)))))

(defn- make-parse-fn [^BCFReader rdr info depth]
(let [contigs (meta->map (:contig (.meta-info rdr)))
filters (assoc (meta->map (:filter (.meta-info rdr)))
0
{:id "PASS" :kw :PASS})
formats (meta->map (:format (.meta-info rdr)))
kws (mapv keyword (drop 8 (.header rdr)))]
(case depth
:deep (comp (partial bcf-map->parsed-variant
contigs filters formats info kws)
parse-data-line-deep)
:vcf (comp (vcf-util/variant-vals-stringifier
(.meta-info rdr)
(.header rdr))
(partial bcf-map->parsed-variant
contigs filters formats info kws)
parse-data-line-deep)
:bcf parse-data-line-deep
:shallow (partial parse-data-line-shallow contigs)
:raw identity)))

(defn read-variants
"Returns data lines of the BCF from rdr as a lazy sequence of maps.
rdr must implement cljam.bcf.BCFReader.
Expand All @@ -252,19 +282,43 @@
(read-variants rdr {}))
([^BCFReader rdr {:keys [depth] :or {depth :deep}}]
(.seek ^BGZFInputStream (.reader rdr) ^long (.start-pos rdr))
(let [contigs (meta->map (:contig (.meta-info rdr)))
filters (assoc (meta->map (:filter (.meta-info rdr))) 0 {:id "PASS" :kw :PASS})
formats (meta->map (:format (.meta-info rdr)))
info (meta->map (:info (.meta-info rdr)))
kws (mapv keyword (drop 8 (.header rdr)))
parse-fn (case depth
:deep (comp (partial bcf-map->parsed-variant contigs filters formats info kws)
parse-data-line-deep)
:vcf (comp (vcf-util/variant-vals-stringifier (.meta-info rdr) (.header rdr))
(partial bcf-map->parsed-variant contigs filters formats info kws)
parse-data-line-deep)
:bcf parse-data-line-deep
:shallow (partial parse-data-line-shallow contigs)
:raw identity)]
(let [info (meta->map (:info (.meta-info rdr)))
parse-fn (make-parse-fn rdr info depth)]
(read-data-lines (.reader rdr)
(fn [rdr] (parse-fn (read-data-line-buffer rdr)))))))

(defn- make-lazy-variants [f s]
(when-first [fs s]
(lazy-cat
(f fs)
(make-lazy-variants f (rest s)))))

(defn read-variants-randomly
"Reads variants of the bgzip compressed BCF file randomly using csi file.
Returning them as a lazy sequence."
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a matter of English grammar, but it sounds more natural to me if the second sentence starts with Returns them as ... instead of Returning them as ..., unless you join these two sentences into one:

Suggested change
Returning them as a lazy sequence."
Returns them as a lazy sequence."

[^BCFReader rdr
{:keys [chr start end] :or {start 1 end 4294967296}}
{:keys [depth] :or {depth :deep}}]
(let [info (meta->map (:info (.meta-info rdr)))
parse-fn (make-parse-fn rdr info depth)
input-stream ^BGZFInputStream (.reader rdr)
chr-names (->> (.meta-info rdr) :contig (mapv :id))
ref-idx (.indexOf ^clojure.lang.PersistentVector chr-names chr)
csi-data @(.index-delay rdr)
spans (when-not (neg? ref-idx)
(util-bin/get-spans csi-data ref-idx start end))]
(make-lazy-variants
(fn [[chunk-beg ^long chunk-end]]
(.seek input-stream chunk-beg)
(->> #(when (< (.getFilePointer input-stream) chunk-end)
(-> input-stream
read-data-line-buffer
parse-fn))
repeatedly
(take-while identity)
(filter
(fn [{chr' :chr :keys [pos ref info]}]
(and (= chr' chr)
(<= pos end)
(<= start (get info :END (dec (+ pos (count ref))))))))))
spans)))
4 changes: 3 additions & 1 deletion src/cljam/io/protocols.clj
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@
(header [this]
"Returns header of VCF/BCF file as a sequence of strings.")
(read-variants [this] [this option]
"Reads variants of the VCF/BCF file, returning them as a lazy sequence."))
"Reads variants of the VCF/BCF file, returning them as a lazy sequence.")
(read-variants-randomly [this region-option depth-option]
"Reads randomly variants of the VCF/BCF file, returning them as a lazy sequence."))

(defprotocol IVariantWriter
(write-variants [this variants]
Expand Down
4 changes: 2 additions & 2 deletions src/cljam/io/vcf.clj
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@
([rdr option] (protocols/read-variants rdr option)))

(defn read-variants-randomly
"Reads variants of the VCF file randomly, returning them as a lazy sequence."
"Reads variants of the VCF/BCF file randomly, returning them as a lazy sequence."
([rdr span-option depth-option]
(vcf-reader/read-variants-randomly
(protocols/read-variants-randomly
rdr
span-option
depth-option)))
Expand Down
4 changes: 3 additions & 1 deletion src/cljam/io/vcf/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@
(header [this] (.header this))
(read-variants
([this] (protocols/read-variants this {}))
([this option] (read-variants this option))))
([this option] (read-variants this option)))
(read-variants-randomly [this region-option deep-option]
(read-variants-randomly this region-option deep-option)))

;; Utilities
;; ---------
Expand Down
Binary file added test-resources/bcf/test-v4_3-complex.bcf.csi
Binary file not shown.
Binary file added test-resources/bcf/various-bins.bcf
Binary file not shown.
Binary file added test-resources/bcf/various-bins.bcf.csi
Binary file not shown.
143 changes: 81 additions & 62 deletions test/cljam/io/vcf_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -125,32 +125,28 @@

(deftest-remote bin-index-is-done-without-errors-with-a-large-file
(with-before-after {:before (prepare-cavia!)}
(with-open [v (vcf/vcf-reader
test-large-vcf-file)]
(is (not-throw?
(vcf/read-variants-randomly
v
{:chr "chr1"
:start 20
:end 1000000}
{}))))

(with-open [v (vcf/vcf-reader
test-large-vcf-file)]
(is (not-throw?
(vcf/read-variants-randomly
v
{:chr "chr1"
:start 2000}
{}))))

(with-open [v (vcf/vcf-reader
test-large-vcf-file)]
(is (not-throw?
(vcf/read-variants-randomly
v
{:chr "chr1"}
{}))))))
(testing "vcf"
(are [index]
(with-open [v (vcf/vcf-reader test-large-vcf-file)]
(is (not-throw?
(vcf/read-variants-randomly v index {}))))
{:chr "chr1"
:start 20
:end 1000000}
{:chr "chr1"
:start 2000}
{:chr "chr1"}))
(testing "bcf"
(are [index]
(with-open [v (vcf/vcf-reader test-large-bcf-file)]
(is (not-throw?
(vcf/read-variants-randomly v index {}))))
{:chr "chr1"
:start 20
:end 1000000}
{:chr "chr1"
:start 2000}
{:chr "chr1"}))))

(deftest read-randomly-variants-complex-test
(doseq [index [{:chr "1"}
Expand All @@ -159,21 +155,18 @@
{:chr "2" :end 40000}
{:chr "2" :start 30000 :end 40000}]]
(with-open [vcf-file (vcf/vcf-reader test-vcf-complex-file)
vcf-gz-file (vcf/vcf-reader test-vcf-complex-gz-file)]
(is
(=
(vcf/read-variants-randomly
vcf-gz-file
index
{})
(let [{:keys [chr start end] :or {start 1 end 4294967296}} index]
(filter
(fn [{chr' :chr :keys [pos ref info]}]
(and (= chr chr')
(<= start (get info :END (dec (+ pos (count ref)))))
(>= end pos)))
(vcf/read-variants
vcf-file))))))))
vcf-gz-file (vcf/vcf-reader test-vcf-complex-gz-file)
bcf-file (vcf/bcf-reader test-bcf-complex-file)]
(is (= (vcf/read-variants-randomly vcf-gz-file index {})
(vcf/read-variants-randomly bcf-file index {})
(let [{:keys [chr start end] :or {start 1 end 4294967296}} index]
(filter
(fn [{chr' :chr :keys [pos ref info]}]
(and (= chr chr')
(<= start (get info :END (dec (+ pos (count ref)))))
(>= end pos)))
(vcf/read-variants
vcf-file))))))))

(deftest writer-test
(testing "vcf"
Expand Down Expand Up @@ -353,24 +346,50 @@
(cio/as-url (cio/file temp-bcf-file)))))

(deftest read-variants-randomly-test
(are [chr start end]
(let [vcf1* (vcf/vcf-reader test-vcf-various-bins-gz-file)
vcf2* (vcf/vcf-reader test-vcf-various-bins-gz-file)]
(=
(vcf/read-variants-randomly vcf1* {:chr chr :start start :end end} {})
(filter
(fn [{chr' :chr :keys [pos ref info]}]
(and (= chr chr')
(<= start (get info :END (dec (+ pos (count ref)))))
(>= end pos)))
(vcf/read-variants vcf2*))))
"chr1" 1 16384
"chr1" 1 49153
"chr1" 1 30000
"chr1" 49153 147457
"chr1" 32769 147457
"chr1" 49153 1064952
"chr1" 1048577 414826496
"chr1" 1048577 414826497
"chr1" 32769 414859265
"chr1" 414859265 536608769))
(testing "vcf"
(are [chr start end]
(let [vcf1* (vcf/vcf-reader test-vcf-various-bins-gz-file)
vcf2* (vcf/vcf-reader test-vcf-various-bins-gz-file)]
(=
(vcf/read-variants-randomly vcf1* {:chr chr :start start :end end}
{})
(filter
(fn [{chr' :chr :keys [pos ref info]}]
(and (= chr chr')
(<= start (get info :END (dec (+ pos (count ref)))))
(>= end pos)))
(vcf/read-variants vcf2*))))
"chr1" 1 16384
"chr1" 1 49153
"chr1" 1 30000
"chr1" 49153 147457
"chr1" 32769 147457
"chr1" 49153 1064952
"chr1" 1048577 414826496
"chr1" 1048577 414826497
"chr1" 32769 414859265
"chr1" 414859265 536608769))
(testing "bcf"
(are [chr start end]
(let [bcf1* (vcf/bcf-reader test-bcf-various-bins-file)
bcf2* (vcf/bcf-reader test-bcf-various-bins-file)
vcf (vcf/vcf-reader test-vcf-various-bins-gz-file)]
(=
(vcf/read-variants-randomly bcf2* {:chr chr :start start :end end} {})
(vcf/read-variants-randomly vcf {:chr chr :start start :end end} {})
(filter
(fn [{chr' :chr :keys [pos ref info]}]
(and (= chr chr')
(<= start (get info :END (dec (+ pos (count ref)))))
(>= end pos)))
(vcf/read-variants bcf1*))))
"chr1" 1 16384
"chr1" 1 49153
"chr1" 1 30000
"chr1" 49153 147457
"chr1" 32769 147457
"chr1" 49153 1064952
"chr1" 1048577 414826496
"chr1" 1048577 414826497
"chr1" 32769 414859265
"chr1" 414859265 536608769)))
16 changes: 15 additions & 1 deletion test/cljam/test_common.clj
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,16 @@
:sha1 "f5e42e5af8666a39e1db1a477b25f183bf09fc9b"}
{:id "large.vcf.gz.csi"
:url "https://test.chrov.is/data/cljam/example-500-10000.vcf.gz.csi"
:sha1 "568a47f463de8df846e021640d38b8cf8f257e66"}]})
:sha1 "568a47f463de8df846e021640d38b8cf8f257e66"}
{:id "large.vcf.gz.csi"
:url "https://test.chrov.is/data/cljam/example-500-10000.vcf.gz.csi"
:sha1 "568a47f463de8df846e021640d38b8cf8f257e66"}
{:id "large.bcf"
:url "https://test.chrov.is/data/cljam/example-500-10000.bcf"
:sha1 "f7f57ed9d21874c92331ef6d86d85b36959f4d16"}
{:id "large.bcf.csi"
:url "https://test.chrov.is/data/cljam/example-500-10000.bcf.csi"
:sha1 "5f0c2deab6c33eda887139227c691f140f88ade9"}]})

(defn prepare-cavia! []
(with-profile mycavia
Expand Down Expand Up @@ -223,9 +232,14 @@
(def test-bcf-invalid-file "test-resources/bcf/invalid.bcf")
(def test-bcf-no-samples-file "test-resources/bcf/test-no-samples.bcf")
(def test-bcf-complex-file "test-resources/bcf/test-v4_3-complex.bcf")
(def test-bcf-various-bins-file "test-resources/bcf/various-bins.bcf")
(def test-bcf-various-bins-csi-file "test-resources/bcf/various-bins.bcf.csi")
(def test-large-bcf-file (cavia/resource mycavia "large.bcf"))
(def test-large-bcf-csi-file (cavia/resource mycavia "large.bcf.csi"))

;; ### GFF3 files


(def test-gff3-file "test-resources/gff3/example.gff3")

;; ### WIG files
Expand Down