diff --git a/src/cljam/io/bcf/reader.clj b/src/cljam/io/bcf/reader.clj index aaf843df..9ab8a5de 100644 --- a/src/cljam/io/bcf/reader.clj +++ b/src/cljam/io/bcf/reader.clj @@ -7,15 +7,19 @@ [cljam.io.util.lsb :as lsb] [cljam.io.vcf.reader :as vcf-reader] [cljam.io.vcf.util :as vcf-util] - [cljam.util :as util]) + [cljam.util :as util] + [cljam.io.util.bin :as util-bin] + [cljam.io.csi :as csi]) (:import [java.io Closeable IOException] [java.net URL] [java.nio Buffer ByteBuffer] [bgzf4j BGZFInputStream])) -(declare read-variants meta-info) +(declare read-variants meta-info read-variants-randomly) -(deftype BCFReader [^URL url meta-info header ^BGZFInputStream reader ^long start-pos] +(deftype BCFReader + [^URL url meta-info header ^BGZFInputStream reader + ^long start-pos index-delay] Closeable (close [this] (.close ^Closeable (.reader this))) @@ -42,7 +46,9 @@ (header [this] (.header this)) (read-variants ([this] (protocols/read-variants this {})) - ([this option] (read-variants this option)))) + ([this option] (read-variants this option))) + (read-variants-randomly [this region-option deep-option] + (read-variants-randomly this region-option deep-option))) (defn- parse-meta-and-header "Parses meta-info and header of BCF files and returns them as a map. @@ -72,10 +78,13 @@ (let [hlen (lsb/read-int rdr) header-buf (lsb/read-bytes rdr hlen)] (if (zero? (aget ^bytes header-buf (dec hlen))) ;; NULL-terminated - (let [{:keys [header meta]} (->> (String. ^bytes header-buf 0 (int (dec hlen))) + (let [{:keys [header meta]} (->> (String. ^bytes header-buf + 0 + (int (dec hlen))) cstr/split-lines parse-meta-and-header)] - (BCFReader. (util/as-url f) meta header rdr (.getFilePointer rdr))) + (BCFReader. (util/as-url f) meta header rdr (.getFilePointer rdr) + (delay (csi/read-index (str f ".csi"))))) (do (.close rdr) (throw (IOException. (str "Invalid file format. BCF header must be NULL-terminated.")))))) @@ -238,6 +247,27 @@ (update :info (fn [xs] (map (fn [m] (dissoc m :idx)) xs))) (update :format (fn [xs] (map (fn [m] (dissoc m :idx)) xs))))) +(defn- make-parse-fn [^BCFReader rdr info depth] + (let [contigs (meta->map (:contig (.meta-info rdr))) + filters (assoc (meta->map (:filter (.meta-info rdr))) + 0 + {:id "PASS" :kw :PASS}) + formats (meta->map (:format (.meta-info rdr))) + kws (mapv keyword (drop 8 (.header rdr)))] + (case depth + :deep (comp (partial bcf-map->parsed-variant + contigs filters formats info kws) + parse-data-line-deep) + :vcf (comp (vcf-util/variant-vals-stringifier + (.meta-info rdr) + (.header rdr)) + (partial bcf-map->parsed-variant + contigs filters formats info kws) + parse-data-line-deep) + :bcf parse-data-line-deep + :shallow (partial parse-data-line-shallow contigs) + :raw identity))) + (defn read-variants "Returns data lines of the BCF from rdr as a lazy sequence of maps. rdr must implement cljam.bcf.BCFReader. @@ -252,19 +282,43 @@ (read-variants rdr {})) ([^BCFReader rdr {:keys [depth] :or {depth :deep}}] (.seek ^BGZFInputStream (.reader rdr) ^long (.start-pos rdr)) - (let [contigs (meta->map (:contig (.meta-info rdr))) - filters (assoc (meta->map (:filter (.meta-info rdr))) 0 {:id "PASS" :kw :PASS}) - formats (meta->map (:format (.meta-info rdr))) - info (meta->map (:info (.meta-info rdr))) - kws (mapv keyword (drop 8 (.header rdr))) - parse-fn (case depth - :deep (comp (partial bcf-map->parsed-variant contigs filters formats info kws) - parse-data-line-deep) - :vcf (comp (vcf-util/variant-vals-stringifier (.meta-info rdr) (.header rdr)) - (partial bcf-map->parsed-variant contigs filters formats info kws) - parse-data-line-deep) - :bcf parse-data-line-deep - :shallow (partial parse-data-line-shallow contigs) - :raw identity)] + (let [info (meta->map (:info (.meta-info rdr))) + parse-fn (make-parse-fn rdr info depth)] (read-data-lines (.reader rdr) (fn [rdr] (parse-fn (read-data-line-buffer rdr))))))) + +(defn- make-lazy-variants [f s] + (when-first [fs s] + (lazy-cat + (f fs) + (make-lazy-variants f (rest s))))) + +(defn read-variants-randomly + "Reads variants of the BCF file randomly using csi file. + Returns them as a lazy sequence." + [^BCFReader rdr + {:keys [chr start end] :or {start 1 end 4294967296}} + {:keys [depth] :or {depth :deep}}] + (let [info (meta->map (:info (.meta-info rdr))) + parse-fn (make-parse-fn rdr info depth) + input-stream ^BGZFInputStream (.reader rdr) + chr-names (->> (.meta-info rdr) :contig (mapv :id)) + ref-idx (.indexOf ^clojure.lang.PersistentVector chr-names chr) + csi-data @(.index-delay rdr) + spans (when-not (neg? ref-idx) + (util-bin/get-spans csi-data ref-idx start end))] + (make-lazy-variants + (fn [[chunk-beg ^long chunk-end]] + (.seek input-stream chunk-beg) + (->> #(when (< (.getFilePointer input-stream) chunk-end) + (-> input-stream + read-data-line-buffer + parse-fn)) + repeatedly + (take-while identity) + (filter + (fn [{chr' :chr :keys [pos ref info]}] + (and (= chr' chr) + (<= pos end) + (<= start (get info :END (dec (+ pos (count ref)))))))))) + spans))) diff --git a/src/cljam/io/protocols.clj b/src/cljam/io/protocols.clj index 6686b057..7694b56c 100644 --- a/src/cljam/io/protocols.clj +++ b/src/cljam/io/protocols.clj @@ -50,7 +50,9 @@ (header [this] "Returns header of VCF/BCF file as a sequence of strings.") (read-variants [this] [this option] - "Reads variants of the VCF/BCF file, returning them as a lazy sequence.")) + "Reads variants of the VCF/BCF file, returning them as a lazy sequence.") + (read-variants-randomly [this region-option depth-option] + "Reads randomly variants of the VCF/BCF file, returning them as a lazy sequence.")) (defprotocol IVariantWriter (write-variants [this variants] diff --git a/src/cljam/io/vcf.clj b/src/cljam/io/vcf.clj index d1ca171d..fe87c284 100644 --- a/src/cljam/io/vcf.clj +++ b/src/cljam/io/vcf.clj @@ -81,9 +81,9 @@ ([rdr option] (protocols/read-variants rdr option))) (defn read-variants-randomly - "Reads variants of the VCF file randomly, returning them as a lazy sequence." + "Reads variants of the VCF/BCF file randomly, returning them as a lazy sequence." ([rdr span-option depth-option] - (vcf-reader/read-variants-randomly + (protocols/read-variants-randomly rdr span-option depth-option))) diff --git a/src/cljam/io/vcf/reader.clj b/src/cljam/io/vcf/reader.clj index ad82430e..f65f1fd6 100644 --- a/src/cljam/io/vcf/reader.clj +++ b/src/cljam/io/vcf/reader.clj @@ -44,7 +44,9 @@ (header [this] (.header this)) (read-variants ([this] (protocols/read-variants this {})) - ([this option] (read-variants this option)))) + ([this option] (read-variants this option))) + (read-variants-randomly [this region-option deep-option] + (read-variants-randomly this region-option deep-option))) ;; Utilities ;; --------- diff --git a/test-resources/bcf/test-v4_3-complex.bcf.csi b/test-resources/bcf/test-v4_3-complex.bcf.csi new file mode 100644 index 00000000..7fd09ac9 Binary files /dev/null and b/test-resources/bcf/test-v4_3-complex.bcf.csi differ diff --git a/test-resources/bcf/various-bins.bcf b/test-resources/bcf/various-bins.bcf new file mode 100644 index 00000000..9c23cee5 Binary files /dev/null and b/test-resources/bcf/various-bins.bcf differ diff --git a/test-resources/bcf/various-bins.bcf.csi b/test-resources/bcf/various-bins.bcf.csi new file mode 100644 index 00000000..a83dffe7 Binary files /dev/null and b/test-resources/bcf/various-bins.bcf.csi differ diff --git a/test/cljam/io/vcf_test.clj b/test/cljam/io/vcf_test.clj index 680ff983..77e19a32 100644 --- a/test/cljam/io/vcf_test.clj +++ b/test/cljam/io/vcf_test.clj @@ -125,32 +125,28 @@ (deftest-remote bin-index-is-done-without-errors-with-a-large-file (with-before-after {:before (prepare-cavia!)} - (with-open [v (vcf/vcf-reader - test-large-vcf-file)] - (is (not-throw? - (vcf/read-variants-randomly - v - {:chr "chr1" - :start 20 - :end 1000000} - {})))) - - (with-open [v (vcf/vcf-reader - test-large-vcf-file)] - (is (not-throw? - (vcf/read-variants-randomly - v - {:chr "chr1" - :start 2000} - {})))) - - (with-open [v (vcf/vcf-reader - test-large-vcf-file)] - (is (not-throw? - (vcf/read-variants-randomly - v - {:chr "chr1"} - {})))))) + (testing "vcf" + (are [index] + (with-open [v (vcf/vcf-reader test-large-vcf-file)] + (is (not-throw? + (vcf/read-variants-randomly v index {})))) + {:chr "chr1" + :start 20 + :end 1000000} + {:chr "chr1" + :start 2000} + {:chr "chr1"})) + (testing "bcf" + (are [index] + (with-open [v (vcf/vcf-reader test-large-bcf-file)] + (is (not-throw? + (vcf/read-variants-randomly v index {})))) + {:chr "chr1" + :start 20 + :end 1000000} + {:chr "chr1" + :start 2000} + {:chr "chr1"})))) (deftest read-randomly-variants-complex-test (doseq [index [{:chr "1"} @@ -159,21 +155,18 @@ {:chr "2" :end 40000} {:chr "2" :start 30000 :end 40000}]] (with-open [vcf-file (vcf/vcf-reader test-vcf-complex-file) - vcf-gz-file (vcf/vcf-reader test-vcf-complex-gz-file)] - (is - (= - (vcf/read-variants-randomly - vcf-gz-file - index - {}) - (let [{:keys [chr start end] :or {start 1 end 4294967296}} index] - (filter - (fn [{chr' :chr :keys [pos ref info]}] - (and (= chr chr') - (<= start (get info :END (dec (+ pos (count ref))))) - (>= end pos))) - (vcf/read-variants - vcf-file)))))))) + vcf-gz-file (vcf/vcf-reader test-vcf-complex-gz-file) + bcf-file (vcf/bcf-reader test-bcf-complex-file)] + (is (= (vcf/read-variants-randomly vcf-gz-file index {}) + (vcf/read-variants-randomly bcf-file index {}) + (let [{:keys [chr start end] :or {start 1 end 4294967296}} index] + (filter + (fn [{chr' :chr :keys [pos ref info]}] + (and (= chr chr') + (<= start (get info :END (dec (+ pos (count ref))))) + (>= end pos))) + (vcf/read-variants + vcf-file)))))))) (deftest writer-test (testing "vcf" @@ -353,24 +346,50 @@ (cio/as-url (cio/file temp-bcf-file))))) (deftest read-variants-randomly-test - (are [chr start end] - (let [vcf1* (vcf/vcf-reader test-vcf-various-bins-gz-file) - vcf2* (vcf/vcf-reader test-vcf-various-bins-gz-file)] - (= - (vcf/read-variants-randomly vcf1* {:chr chr :start start :end end} {}) - (filter - (fn [{chr' :chr :keys [pos ref info]}] - (and (= chr chr') - (<= start (get info :END (dec (+ pos (count ref))))) - (>= end pos))) - (vcf/read-variants vcf2*)))) - "chr1" 1 16384 - "chr1" 1 49153 - "chr1" 1 30000 - "chr1" 49153 147457 - "chr1" 32769 147457 - "chr1" 49153 1064952 - "chr1" 1048577 414826496 - "chr1" 1048577 414826497 - "chr1" 32769 414859265 - "chr1" 414859265 536608769)) + (testing "vcf" + (are [chr start end] + (let [vcf1* (vcf/vcf-reader test-vcf-various-bins-gz-file) + vcf2* (vcf/vcf-reader test-vcf-various-bins-gz-file)] + (= + (vcf/read-variants-randomly vcf1* {:chr chr :start start :end end} + {}) + (filter + (fn [{chr' :chr :keys [pos ref info]}] + (and (= chr chr') + (<= start (get info :END (dec (+ pos (count ref))))) + (>= end pos))) + (vcf/read-variants vcf2*)))) + "chr1" 1 16384 + "chr1" 1 49153 + "chr1" 1 30000 + "chr1" 49153 147457 + "chr1" 32769 147457 + "chr1" 49153 1064952 + "chr1" 1048577 414826496 + "chr1" 1048577 414826497 + "chr1" 32769 414859265 + "chr1" 414859265 536608769)) + (testing "bcf" + (are [chr start end] + (let [bcf1* (vcf/bcf-reader test-bcf-various-bins-file) + bcf2* (vcf/bcf-reader test-bcf-various-bins-file) + vcf (vcf/vcf-reader test-vcf-various-bins-gz-file)] + (= + (vcf/read-variants-randomly bcf2* {:chr chr :start start :end end} {}) + (vcf/read-variants-randomly vcf {:chr chr :start start :end end} {}) + (filter + (fn [{chr' :chr :keys [pos ref info]}] + (and (= chr chr') + (<= start (get info :END (dec (+ pos (count ref))))) + (>= end pos))) + (vcf/read-variants bcf1*)))) + "chr1" 1 16384 + "chr1" 1 49153 + "chr1" 1 30000 + "chr1" 49153 147457 + "chr1" 32769 147457 + "chr1" 49153 1064952 + "chr1" 1048577 414826496 + "chr1" 1048577 414826497 + "chr1" 32769 414859265 + "chr1" 414859265 536608769))) diff --git a/test/cljam/test_common.clj b/test/cljam/test_common.clj index 4aedefd7..f618bfd6 100644 --- a/test/cljam/test_common.clj +++ b/test/cljam/test_common.clj @@ -51,7 +51,16 @@ :sha1 "f5e42e5af8666a39e1db1a477b25f183bf09fc9b"} {:id "large.vcf.gz.csi" :url "https://test.chrov.is/data/cljam/example-500-10000.vcf.gz.csi" - :sha1 "568a47f463de8df846e021640d38b8cf8f257e66"}]}) + :sha1 "568a47f463de8df846e021640d38b8cf8f257e66"} + {:id "large.vcf.gz.csi" + :url "https://test.chrov.is/data/cljam/example-500-10000.vcf.gz.csi" + :sha1 "568a47f463de8df846e021640d38b8cf8f257e66"} + {:id "large.bcf" + :url "https://test.chrov.is/data/cljam/example-500-10000.bcf" + :sha1 "f7f57ed9d21874c92331ef6d86d85b36959f4d16"} + {:id "large.bcf.csi" + :url "https://test.chrov.is/data/cljam/example-500-10000.bcf.csi" + :sha1 "5f0c2deab6c33eda887139227c691f140f88ade9"}]}) (defn prepare-cavia! [] (with-profile mycavia @@ -223,9 +232,14 @@ (def test-bcf-invalid-file "test-resources/bcf/invalid.bcf") (def test-bcf-no-samples-file "test-resources/bcf/test-no-samples.bcf") (def test-bcf-complex-file "test-resources/bcf/test-v4_3-complex.bcf") +(def test-bcf-various-bins-file "test-resources/bcf/various-bins.bcf") +(def test-bcf-various-bins-csi-file "test-resources/bcf/various-bins.bcf.csi") +(def test-large-bcf-file (cavia/resource mycavia "large.bcf")) +(def test-large-bcf-csi-file (cavia/resource mycavia "large.bcf.csi")) ;; ### GFF3 files + (def test-gff3-file "test-resources/gff3/example.gff3") ;; ### WIG files