From abb6a71c0a53dc124cccea1568b7636f1e57691d Mon Sep 17 00:00:00 2001 From: niyarin Date: Wed, 9 Oct 2019 15:25:18 +0900 Subject: [PATCH 1/5] Add support for random reading of vcf file. --- src/cljam/io/bam_index.clj | 6 ++- src/cljam/io/bam_index/core.clj | 55 ++++++--------------- src/cljam/io/tabix.clj | 88 ++++++++++++++++++++------------- src/cljam/io/util/bin.clj | 45 +++++++++++++++++ src/cljam/io/vcf.clj | 17 +++++-- src/cljam/io/vcf/reader.clj | 53 ++++++++++++++++++-- test/cljam/io/tabix_test.clj | 46 +++++++++++------ test/cljam/io/util/bin_test.clj | 14 ++++++ test/cljam/io/vcf_test.clj | 21 ++++++++ test/cljam/test_common.clj | 17 ++++++- 10 files changed, 264 insertions(+), 98 deletions(-) create mode 100644 src/cljam/io/util/bin.clj create mode 100644 test/cljam/io/util/bin_test.clj diff --git a/src/cljam/io/bam_index.clj b/src/cljam/io/bam_index.clj index fc7ce71d..443eaa6b 100644 --- a/src/cljam/io/bam_index.clj +++ b/src/cljam/io/bam_index.clj @@ -1,6 +1,8 @@ (ns cljam.io.bam-index "Parser for a BAM index file." - (:require [cljam.io.bam-index.core :as bai-core])) + (:require + [cljam.io.bam-index.core :as bai-core] + [cljam.io.util.bin :as util-bin])) (defn bin-index "Returns binning index for the given reference index." @@ -15,7 +17,7 @@ (defn get-spans "Returns regions of a BAM file that may contain an alignment for the given range." [bai ref-idx beg end] - (bai-core/get-spans bai ref-idx beg end)) + (util-bin/get-spans bai ref-idx beg end)) (defn bam-index "Returns a cljam.bam-index.core.BAMIndex." diff --git a/src/cljam/io/bam_index/core.clj b/src/cljam/io/bam_index/core.clj index 1b983ca8..51d290be 100644 --- a/src/cljam/io/bam_index/core.clj +++ b/src/cljam/io/bam_index/core.clj @@ -2,16 +2,20 @@ "The core of BAM index features." (:require [clojure.java.io :as cio] [clojure.tools.logging :as logging] - [cljam.io.bam-index [common :refer :all] - [chunk :as chunk] - [reader :as reader] - [writer :as writer]] - [cljam.util :as util]) + [cljam.io.bam-index [reader :as reader] + [writer :as writer]] + [cljam.util :as util] + [cljam.io.util.bin :as util-bin]) (:import [java.io DataOutputStream FileOutputStream] cljam.io.bam_index.reader.BAIReader cljam.io.bam_index.writer.BAIWriter)) -(deftype BAMIndex [url bidx lidx]) +(deftype BAMIndex [url bidx lidx] + util-bin/IBinaryIndex + (bidx-ref [this] + (.bidx this)) + (lidx-ref [this] + (.lidx this))) (defn bam-index [f] (let [{:keys [bidx lidx]} (with-open [r ^BAIReader (reader/reader f)] (reader/read-all-index! r))] @@ -29,39 +33,6 @@ (with-open [r ^BAIReader (reader/reader f)] (reader/read-linear-index! r ref-idx))) -(defn- reg->bins* - "Returns candidate bins for the specified region as a vector." - [^long beg ^long end] - (let [max-pos 0x1FFFFFFF - beg (if (<= beg 0) 0 (bit-and (dec beg) max-pos)) - end (if (<= end 0) max-pos (bit-and (dec end) max-pos))] - (if (<= beg end) - (loop [bins (transient [0]) - xs [[1 26] [9 23] [73 20] [585 17] [4681 14]]] - (if-let [[^long ini shift] (first xs)] - (let [ini* (+ ini (bit-shift-right beg shift)) - end* (+ ini (bit-shift-right end shift))] - (recur - (loop [b bins k ini*] - (if (<= k end*) - (recur (conj! b k) (inc k)) - b)) - (next xs))) - (persistent! bins)))))) - -(def ^:private reg->bins (memoize reg->bins*)) - -(defn get-spans - [^BAMIndex bai ^long ref-idx ^long beg ^long end] - (let [bins (reg->bins beg end) - bidx (get (.bidx bai) ref-idx) - lidx (get (.lidx bai) ref-idx) - chunks (into [] (comp (map bidx) cat) bins) - lin-beg (writer/pos->lidx-offset beg) - min-offset (get lidx lin-beg 0)] - (->> (chunk/optimize-chunks chunks min-offset) - (map vals)))) - (defn get-unplaced-spans [^BAMIndex bai] (if-let [begin (some->> @@ -74,8 +45,14 @@ [[begin Long/MAX_VALUE]] [])) +(defn get-spans + [^BAMIndex bai ^long ref-idx ^long beg ^long end] + (util-bin/get-spans bai ref-idx beg end)) + + ;; ## Writing + (defn ^BAIWriter writer [f refs] (BAIWriter. (DataOutputStream. (FileOutputStream. (cio/file f))) diff --git a/src/cljam/io/tabix.clj b/src/cljam/io/tabix.clj index c48a6203..9ef30172 100644 --- a/src/cljam/io/tabix.clj +++ b/src/cljam/io/tabix.clj @@ -2,41 +2,64 @@ "Alpha - subject to change. Reader of a TABIX format file." (:require [cljam.io.util.bgzf :as bgzf] - [cljam.io.util.lsb :as lsb]) + [cljam.io.util.lsb :as lsb] + [cljam.io.util.bin :as util-bin]) (:import java.util.Arrays - [java.io DataInputStream IOException])) + [java.io DataInputStream IOException] + [cljam.io.bam_index.chunk Chunk])) + +(deftype Tabix [n-ref preset sc bc ec meta skip seq bidx lidx] + util-bin/IBinaryIndex + (bidx-ref [this] + (.bidx this)) + (lidx-ref [this] + (.lidx this))) (def tabix-magic "TBI\1") +(defn- read-chunks! + [rdr] + (let [n (lsb/read-int rdr)] + (loop [i 0, chunks []] + (if (< i n) + (recur (inc i) (conj chunks (Chunk. (lsb/read-long rdr) (lsb/read-long rdr)))) + chunks)))) + (defn- read-seq [buf len] (loop [i 0, j 0, seq* []] (if (< i len) - (if (zero? (nth buf i)) - (let [ba-len (- i j) - ba (byte-array ba-len)] - (System/arraycopy buf j ba 0 ba-len) - (recur (inc i) (inc i) (conj seq* (String. ba)))) - (recur (inc i) j seq*)) - seq*))) + (if (zero? (nth buf i)) + (let [ba-len (- i j) + ba (byte-array ba-len)] + (System/arraycopy buf j ba 0 ba-len) + (recur (inc i) (inc i) (conj seq* (String. ba)))) + (recur (inc i) j seq*)) + seq*))) -(defn- read-chunk - [^DataInputStream rdr] - {:beg (lsb/read-long rdr) - :end (lsb/read-long rdr)}) +(defn- read-bin-index**! + [rdr] + (let [n (lsb/read-int rdr)] + (loop [i 0, bidx []] + (if (< i n) + (let [bin (lsb/read-int rdr) + chunks (read-chunks! rdr)] + (recur (inc i) (conj bidx {:bin bin, :chunks chunks}))) + bidx)))) -(defn- read-bin - [^DataInputStream rdr] - (let [bin (lsb/read-int rdr) - n-chunk (lsb/read-int rdr)] - {:bin bin - :chunks (doall (map (fn [_] (read-chunk rdr)) (range n-chunk)))})) +(defn- read-linear-index**! + [rdr] + (let [n (lsb/read-int rdr)] + (loop [i 0, lidx []] + (if (< i n) + (recur (inc i) (conj lidx (lsb/read-long rdr))) + lidx)))) (defn- read-index* [^DataInputStream rdr] (when-not (Arrays/equals ^bytes (lsb/read-bytes rdr 4) (.getBytes ^String tabix-magic)) (throw (IOException. "Invalid TABIX file"))) - (let [n-seq (lsb/read-int rdr) + (let [n-ref (lsb/read-int rdr) preset (lsb/read-int rdr) sc (lsb/read-int rdr) bc (lsb/read-int rdr) @@ -46,20 +69,17 @@ len (lsb/read-int rdr) buf (lsb/read-bytes rdr len) seq (read-seq buf len) - index (loop [i 0 - bin-index [] - linear-index []] - (if (< i n-seq) - (let [n-bin (lsb/read-int rdr) - new-bin-index (doall (map (fn [_] (read-bin rdr)) (range n-bin))) - n-linear-index (lsb/read-int rdr) - new-linear-index (doall (map (fn [_] (lsb/read-long rdr)) (range n-linear-index)))] - (recur (inc i) - (conj bin-index new-bin-index) - (conj linear-index new-linear-index))) - [bin-index linear-index]))] - {:n-seq n-seq, :preset preset, :sc sc, :bc bc, :ec ec, :meta meta, - :skip skip, :seq seq ,:bin-index (first index), :linear-index (last index)})) + refs (range n-ref) + all-idx (map (fn [_] [(read-bin-index**! rdr) (read-linear-index**! rdr)]) refs) + bidx-seq (map first all-idx) + bidx (zipmap + refs + (map (fn [bins] + (zipmap (map :bin bins) (map :chunks bins))) + bidx-seq)) + lidx (zipmap refs (map second all-idx))] + (->Tabix n-ref preset sc bc ec meta + skip seq bidx lidx))) (defn read-index [f] diff --git a/src/cljam/io/util/bin.clj b/src/cljam/io/util/bin.clj new file mode 100644 index 00000000..b74d6771 --- /dev/null +++ b/src/cljam/io/util/bin.clj @@ -0,0 +1,45 @@ +(ns cljam.io.util.bin + (:refer-clojure :exclude [compare]) + (:require [cljam.io.bam-index [chunk :as bam-index-chunk] + [writer :as bam-index-writer]]) + (:import [java.io File] + [java.net MalformedURLException URI URL] + [cljam.io.bam_index.chunk Chunk] + [bgzf4j BGZFInputStream BGZFOutputStream])) + +(defn- reg->bins* + "Returns candidate bins for the specified region as a vector." + [^long beg ^long end] + (let [max-pos 0x1FFFFFFF + beg (if (<= beg 0) 0 (bit-and (dec beg) max-pos)) + end (if (<= end 0) max-pos (bit-and (dec end) max-pos))] + (if (<= beg end) + (loop [bins (transient [0]) + xs [[1 26] [9 23] [73 20] [585 17] [4681 14]]] + (if-let [[^long ini shift] (first xs)] + (let [ini* (+ ini (bit-shift-right beg shift)) + end* (+ ini (bit-shift-right end shift))] + (recur + (loop [b bins k ini*] + (if (<= k end*) + (recur (conj! b k) (inc k)) + b)) + (next xs))) + (persistent! bins)))))) + +(def ^:private reg->bins (memoize reg->bins*)) + +(defprotocol IBinaryIndex + (bidx-ref [this]) + (lidx-ref [this])) + +(defn get-spans + [^cljam.io.util.bin.IBinaryIndex index-data ^long ref-idx ^long beg ^long end] + (let [bins (reg->bins beg end) + bidx (get (bidx-ref index-data) ref-idx) + lidx (get (lidx-ref index-data) ref-idx) + chunks (into [] (comp (map bidx) cat) bins) + lin-beg (bam-index-writer/pos->lidx-offset beg) + min-offset (get lidx lin-beg 0)] + (->> (bam-index-chunk/optimize-chunks chunks min-offset) + (map vals)))) diff --git a/src/cljam/io/vcf.clj b/src/cljam/io/vcf.clj index 48b25d37..1c09abb2 100644 --- a/src/cljam/io/vcf.clj +++ b/src/cljam/io/vcf.clj @@ -9,12 +9,14 @@ [cljam.io.vcf.reader :as vcf-reader] [cljam.io.vcf.writer :as vcf-writer] [cljam.io.bcf.reader :as bcf-reader] - [cljam.io.bcf.writer :as bcf-writer]) + [cljam.io.bcf.writer :as bcf-writer] + [cljam.io.util.bgzf :as bgzf]) (:import java.io.Closeable cljam.io.vcf.reader.VCFReader cljam.io.vcf.writer.VCFWriter cljam.io.bcf.reader.BCFReader - cljam.io.bcf.writer.BCFWriter)) + cljam.io.bcf.writer.BCFWriter + bgzf4j.BGZFInputStream)) ;; Reading ;; ------- @@ -28,7 +30,9 @@ header (with-open [r (cio/reader (util/compressor-input-stream f))] (vcf-reader/load-header r))] (VCFReader. (util/as-url f) meta-info header - (cio/reader (util/compressor-input-stream f))))) + (if (bgzf/bgzip? f) + (bgzf/bgzf-input-stream f) + (cio/reader (util/compressor-input-stream f)))))) (defn ^BCFReader bcf-reader "Returns an open cljam.io.bcf.reader.BCFReader of f. Should be used inside @@ -70,6 +74,13 @@ ([rdr] (protocols/read-variants rdr)) ([rdr option] (protocols/read-variants rdr option))) +(defn read-variants-randomly + "Reads variants of the VCF file randomly, returning them as a lazy sequence." + ([rdr option] + (vcf-reader/read-variants-randomly + rdr + option))) + ;; Writing ;; ------- diff --git a/src/cljam/io/vcf/reader.clj b/src/cljam/io/vcf/reader.clj index 19a931d0..75c19692 100644 --- a/src/cljam/io/vcf/reader.clj +++ b/src/cljam/io/vcf/reader.clj @@ -6,14 +6,18 @@ [cljam.io.protocols :as protocols] [camel-snake-kebab.core :refer [->kebab-case-keyword]] [proton.core :refer [as-long]] - [cljam.io.vcf.util :as vcf-util]) + [cljam.io.util.bin :as util-bin] + [cljam.io.vcf.util :as vcf-util] + [cljam.io.tabix :as tabix]) (:import [java.io Closeable] - [clojure.lang LazilyPersistentVector])) + [cljam.io.tabix Tabix] + [clojure.lang LazilyPersistentVector] + bgzf4j.BGZFInputStream)) ;; VCFReader ;; --------- -(declare read-variants) +(declare read-variants read-variants-randomly) (deftype VCFReader [url meta-info header reader] Closeable @@ -178,3 +182,46 @@ :deep (vcf-util/variant-parser (.meta-info rdr) (.header rdr)) :vcf identity)] (map parse-fn (read-data-lines (.reader rdr) (.header rdr) kws))))) + +(defn read-variants-randomly + [^VCFReader rdr {:keys [chr start end depth] :or {depth :deep start 1 end 4294967296}}] + (let [kws (mapv keyword (drop 8 (.header rdr))) + tabix-path (str (.getPath ^java.net.URL (.url rdr)) ".tbi") + tabix-data (tabix/read-index tabix-path) + ref-idx (.indexOf + ^clojure.lang.PersistentVector (.seq ^Tabix tabix-data) + (if (and (string? chr) + (> (count chr) 3) + (= (subs chr 0 3) "chr")) + (subs chr 3) + (throw (ex-info "Invalid chr." {:chr chr})))) + spans + (if (= ref-idx -1) + '() + (util-bin/get-spans tabix-data ref-idx start end)) + input-stream ^bgzf4j.BGZFInputStream (.reader rdr) + parse-fn (case depth + :deep (vcf-util/variant-parser (.meta-info rdr) (.header rdr)) + :vcf identity)] + (reduce + (fn [res span] + (.seek input-stream (first span)) + (loop [res res] + + (if (< (.getFilePointer input-stream) (second span)) + (let [variant + (parse-fn + (parse-data-line + (.readLine input-stream) + kws)) + variant-pos (:pos variant)] + (if (or (< variant-pos (first span)) + (>= variant-pos (second span))) + res + (recur + (cons + variant + res)))) + res))) + '() + spans))) diff --git a/test/cljam/io/tabix_test.clj b/test/cljam/io/tabix_test.clj index bdd64887..214fc20c 100644 --- a/test/cljam/io/tabix_test.clj +++ b/test/cljam/io/tabix_test.clj @@ -2,26 +2,39 @@ (:require [clojure.java.io :as cio] [clojure.test :refer :all] [cljam.test-common :refer :all] - [cljam.io.tabix :as tbi])) + [cljam.io.tabix :as tbi]) + (:import + [cljam.io.tabix Tabix] + [cljam.io.bam_index.chunk Chunk])) (deftest about-read-index-with-error (is (thrown? java.io.IOException (tbi/read-index small-bam-file)))) -(deftest about-read-index-returns-a-map - (is (map? (tbi/read-index test-tabix-file)))) +(deftest about-read-index-returns-tabix-object + (is (instance? cljam.io.tabix.Tabix (tbi/read-index test-tabix-file)))) -(deftest about-read-index-check-the-returning-maps-structure - (is (just-map? {:n-seq number? - :preset number? - :sc number? - :bc number? - :ec number? - :meta number? - :skip number? - :seq vector? - :bin-index vector? - :linear-index vector?} - (tbi/read-index test-tabix-file)))) +(deftest about-read-index-check-the-returning-object + (let [tabix-data ^Tabix (tbi/read-index test-tabix-file)] + (is (number? (.n-ref tabix-data))) + (is (number? (.preset tabix-data))) + (is (number? (.sc tabix-data))) + (is (number? (.bc tabix-data))) + (is (number? (.ec tabix-data))) + (is (number? (.meta tabix-data))) + (is (number? (.skip tabix-data))) + (is (vector? (.seq tabix-data))) + (is (instance? + Chunk + (get + ^clojure.lang.IPersistentVector + (get + ^clojure.lang.IPersistentMap + (get + ^clojure.lang.IPersistentMap + (.bidx tabix-data) 0) 4687) 0))) + (is (vector? + ^clojure.lang.IPersistentVector + (get (.lidx tabix-data) 0))))) (deftest-remote large-file (with-before-after {:before (prepare-cavia!)} @@ -29,7 +42,8 @@ (deftest source-type-test (with-open [server (http-server)] - (are [x] (map? (tbi/read-index x)) + (are [x] ((partial instance? cljam.io.tabix.Tabix) + (tbi/read-index x)) test-tabix-file (cio/file test-tabix-file) (cio/as-url (cio/file test-tabix-file)) diff --git a/test/cljam/io/util/bin_test.clj b/test/cljam/io/util/bin_test.clj new file mode 100644 index 00000000..d4c07052 --- /dev/null +++ b/test/cljam/io/util/bin_test.clj @@ -0,0 +1,14 @@ +(ns cljam.io.util.bin-test + "Tests for cljam.io.util.bin." + (:require [clojure.test :refer :all] + [cljam.test-common :refer :all] + [cljam.io.tabix :as tabix] + [cljam.io.util.bin :as util-bin])) + +(deftest get-spans-returns-a-sequence-including-regions + (let [tabix* (tabix/read-index test-tabix-file)] + (is (seq? (util-bin/get-spans tabix* 0 0 100))) + (is (every? #(and (= 2 (count %)) + (number? (first %)) + (number? (second %))) + (util-bin/get-spans tabix* 0 0 100))))) diff --git a/test/cljam/io/vcf_test.clj b/test/cljam/io/vcf_test.clj index c8dd05b5..976b2931 100644 --- a/test/cljam/io/vcf_test.clj +++ b/test/cljam/io/vcf_test.clj @@ -121,6 +121,27 @@ (is (= (vcf/read-variants v) (vcf/read-variants b))))) +(deftest-remote bin-index-is-done-without-errors-with-a-large-file + (with-before-after {:before (prepare-cavia!)} + (is (not-throw? + (vcf/read-variants-randomly + (vcf/vcf-reader + test-large-vcf-file) + {:chr "chr1" + :start 20 + :end 1000000}))) + (is (not-throw? + (vcf/read-variants-randomly + (vcf/vcf-reader + test-large-vcf-file) + {:chr "chr1" + :start 2000}))) + (is (not-throw? + (vcf/read-variants-randomly + (vcf/vcf-reader + test-large-vcf-file) + {:chr "chr1"}))))) + (deftest writer-test (testing "vcf" (with-before-after {:before (prepare-cache!) diff --git a/test/cljam/test_common.clj b/test/cljam/test_common.clj index bed214cf..b686487c 100644 --- a/test/cljam/test_common.clj +++ b/test/cljam/test_common.clj @@ -42,7 +42,16 @@ :sha1 "dcc3ba10c8432be3094cbf5d6fb1b577317e3429"} {:id "large.tbi" :url "https://test.chrov.is/data/test3_summits.bed.gz.tbi" - :sha1 "1aff56f9961c0b93c6de3a190f02d3264c27a9c7"}]}) + :sha1 "1aff56f9961c0b93c6de3a190f02d3264c27a9c7"} + {:id "large.vcf.gz" + :url "https://test.chrov.is/data/cljam/example-500-10000.vcf.gz" + :sha1 "15eda0cb653e29ced47caafa5c2f58f014e36437"} + {:id "large.vcf.gz.tbi" + :url "https://test.chrov.is/data/cljam/example-500-10000.vcf.gz.tbi" + :sha1 "f5e42e5af8666a39e1db1a477b25f183bf09fc9b"} + {:id "large.vcf.gz.csi" + :url "https://test.chrov.is/data/cljam/example-500-10000.vcf.gz.csi" + :sha1 "568a47f463de8df846e021640d38b8cf8f257e66"}]}) (defn prepare-cavia! [] (with-profile mycavia @@ -189,8 +198,14 @@ (def test-vcf-no-samples-file "test-resources/vcf/test-no-samples.vcf") (def test-vcf-complex-file "test-resources/vcf/test-v4_3-complex.vcf") +(def test-large-vcf-file (cavia/resource mycavia "large.vcf.gz")) +(def test-large-vcf-tbi-file (cavia/resource mycavia "large.vcf.gz.tbi")) +(def test-large-vcf-csi-file (cavia/resource mycavia "large.vcf.gz.csi")) + + ;; ### pileup files + (def test-pileup-file "test-resources/pileup/test.pileup") (def test-pileup-dir "test-resources/pileup/") From a93898407933ae0d8c58e6a30398c9dc8d6a93a3 Mon Sep 17 00:00:00 2001 From: niyarin Date: Tue, 22 Oct 2019 12:55:09 +0900 Subject: [PATCH 2/5] Fix performance and bad expression. --- src/cljam/io/bam_index/reader.clj | 12 ++-- src/cljam/io/bam_index/writer.clj | 4 +- src/cljam/io/tabix.clj | 62 ++++++++---------- src/cljam/io/util/bin.clj | 18 ++--- src/cljam/io/{bam_index => util}/chunk.clj | 14 ++-- src/cljam/io/vcf.clj | 7 +- src/cljam/io/vcf/reader.clj | 61 ++++++++--------- test-resources/vcf/test-v4_3-complex.vcf.gz | Bin 0 -> 1620 bytes .../vcf/test-v4_3-complex.vcf.gz.tbi | Bin 0 -> 285 bytes test/cljam/io/tabix_test.clj | 21 ++---- test/cljam/io/vcf_test.clj | 28 +++++++- test/cljam/test_common.clj | 2 + 12 files changed, 114 insertions(+), 115 deletions(-) rename src/cljam/io/{bam_index => util}/chunk.clj (82%) create mode 100644 test-resources/vcf/test-v4_3-complex.vcf.gz create mode 100644 test-resources/vcf/test-v4_3-complex.vcf.gz.tbi diff --git a/src/cljam/io/bam_index/reader.clj b/src/cljam/io/bam_index/reader.clj index a13046ce..d273e349 100644 --- a/src/cljam/io/bam_index/reader.clj +++ b/src/cljam/io/bam_index/reader.clj @@ -2,11 +2,11 @@ (:require [clojure.java.io :as cio] [cljam.io.util.lsb :as lsb] [cljam.io.bam-index.common :refer [bai-magic]] - [cljam.io.bam-index.chunk :as chunk] + [cljam.io.util.chunk :as chunk] [cljam.util :as util]) (:import java.util.Arrays [java.io FileInputStream Closeable IOException] - [cljam.io.bam_index.chunk Chunk])) + [cljam.io.util.chunk Chunk])) (deftype BAIReader [url reader] Closeable @@ -49,10 +49,10 @@ (defn- read-chunks! [rdr] (let [n (lsb/read-int rdr)] - (loop [i 0, chunks []] - (if (< i n) - (recur (inc i) (conj chunks (Chunk. (lsb/read-long rdr) (lsb/read-long rdr)))) - chunks)))) + (loop [i 0, chunks []] + (if (< i n) + (recur (inc i) (conj chunks (Chunk. (lsb/read-long rdr) (lsb/read-long rdr)))) + chunks)))) (defn- read-bin-index**! [rdr] diff --git a/src/cljam/io/bam_index/writer.clj b/src/cljam/io/bam_index/writer.clj index fe8bcff7..250d76ed 100644 --- a/src/cljam/io/bam_index/writer.clj +++ b/src/cljam/io/bam_index/writer.clj @@ -5,12 +5,12 @@ [cljam.io.util.bgzf :as bgzf] [cljam.io.util.lsb :as lsb] [cljam.io.bam-index.common :refer :all] - [cljam.io.bam-index.chunk :as chunk] + [cljam.io.util.chunk :as chunk] [cljam.io.bam.decoder :as bam-decoder]) (:import [java.io DataOutputStream Closeable] [java.nio ByteBuffer ByteOrder] [cljam.io.bam.decoder BAMPointerBlock] - [cljam.io.bam_index.chunk Chunk])) + [cljam.io.util.chunk Chunk])) (declare make-index-from-blocks) diff --git a/src/cljam/io/tabix.clj b/src/cljam/io/tabix.clj index 9ef30172..dfd09bc9 100644 --- a/src/cljam/io/tabix.clj +++ b/src/cljam/io/tabix.clj @@ -3,57 +3,49 @@ Reader of a TABIX format file." (:require [cljam.io.util.bgzf :as bgzf] [cljam.io.util.lsb :as lsb] - [cljam.io.util.bin :as util-bin]) + [cljam.io.util.bin :as util-bin] + [clojure.string :as cstr]) (:import java.util.Arrays [java.io DataInputStream IOException] - [cljam.io.bam_index.chunk Chunk])) + [cljam.io.util.chunk Chunk])) (deftype Tabix [n-ref preset sc bc ec meta skip seq bidx lidx] util-bin/IBinaryIndex (bidx-ref [this] (.bidx this)) (lidx-ref [this] - (.lidx this))) + (.lidx this)) + (get-ref-index [this chr] + (.indexOf + ^clojure.lang.PersistentVector + (.seq this) + chr))) (def tabix-magic "TBI\1") (defn- read-chunks! [rdr] - (let [n (lsb/read-int rdr)] - (loop [i 0, chunks []] - (if (< i n) - (recur (inc i) (conj chunks (Chunk. (lsb/read-long rdr) (lsb/read-long rdr)))) - chunks)))) + (->> #(Chunk. (lsb/read-long rdr) (lsb/read-long rdr)) + (repeatedly (lsb/read-int rdr)) + vec)) (defn- read-seq - [buf len] - (loop [i 0, j 0, seq* []] - (if (< i len) - (if (zero? (nth buf i)) - (let [ba-len (- i j) - ba (byte-array ba-len)] - (System/arraycopy buf j ba 0 ba-len) - (recur (inc i) (inc i) (conj seq* (String. ba)))) - (recur (inc i) j seq*)) - seq*))) + [^bytes buf] + (cstr/split (String. buf) #"\00")) -(defn- read-bin-index**! +(defn- read-bin-index [rdr] - (let [n (lsb/read-int rdr)] - (loop [i 0, bidx []] - (if (< i n) - (let [bin (lsb/read-int rdr) - chunks (read-chunks! rdr)] - (recur (inc i) (conj bidx {:bin bin, :chunks chunks}))) - bidx)))) + (->> #(hash-map + :bin (lsb/read-int rdr) + :chunks (read-chunks! rdr)) + (repeatedly (lsb/read-int rdr)) + vec)) -(defn- read-linear-index**! +(defn- read-linear-index [rdr] - (let [n (lsb/read-int rdr)] - (loop [i 0, lidx []] - (if (< i n) - (recur (inc i) (conj lidx (lsb/read-long rdr))) - lidx)))) + (->> #(lsb/read-long rdr) + (repeatedly (lsb/read-int rdr)) + vec)) (defn- read-index* [^DataInputStream rdr] @@ -68,14 +60,14 @@ skip (lsb/read-int rdr) len (lsb/read-int rdr) buf (lsb/read-bytes rdr len) - seq (read-seq buf len) + seq (read-seq buf) refs (range n-ref) - all-idx (map (fn [_] [(read-bin-index**! rdr) (read-linear-index**! rdr)]) refs) + all-idx (map (fn [_] [(read-bin-index rdr) (read-linear-index rdr)]) refs) bidx-seq (map first all-idx) bidx (zipmap refs (map (fn [bins] - (zipmap (map :bin bins) (map :chunks bins))) + (into {} (map (juxt :bin :chunks)) bins)) bidx-seq)) lidx (zipmap refs (map second all-idx))] (->Tabix n-ref preset sc bc ec meta diff --git a/src/cljam/io/util/bin.clj b/src/cljam/io/util/bin.clj index b74d6771..ab63ff8f 100644 --- a/src/cljam/io/util/bin.clj +++ b/src/cljam/io/util/bin.clj @@ -1,11 +1,6 @@ (ns cljam.io.util.bin - (:refer-clojure :exclude [compare]) - (:require [cljam.io.bam-index [chunk :as bam-index-chunk] - [writer :as bam-index-writer]]) - (:import [java.io File] - [java.net MalformedURLException URI URL] - [cljam.io.bam_index.chunk Chunk] - [bgzf4j BGZFInputStream BGZFOutputStream])) + (:require [cljam.io.util.chunk :as util-chunk] + [cljam.io.bam-index.writer :as bam-index-writer])) (defn- reg->bins* "Returns candidate bins for the specified region as a vector." @@ -31,15 +26,16 @@ (defprotocol IBinaryIndex (bidx-ref [this]) - (lidx-ref [this])) + (lidx-ref [this]) + (get-ref-index [this chr])) (defn get-spans - [^cljam.io.util.bin.IBinaryIndex index-data ^long ref-idx ^long beg ^long end] + [index-data ^long ref-idx ^long beg ^long end] (let [bins (reg->bins beg end) bidx (get (bidx-ref index-data) ref-idx) lidx (get (lidx-ref index-data) ref-idx) - chunks (into [] (comp (map bidx) cat) bins) + chunks (into [] (mapcat bidx) bins) lin-beg (bam-index-writer/pos->lidx-offset beg) min-offset (get lidx lin-beg 0)] - (->> (bam-index-chunk/optimize-chunks chunks min-offset) + (->> (util-chunk/optimize-chunks chunks min-offset) (map vals)))) diff --git a/src/cljam/io/bam_index/chunk.clj b/src/cljam/io/util/chunk.clj similarity index 82% rename from src/cljam/io/bam_index/chunk.clj rename to src/cljam/io/util/chunk.clj index 65bbb5b4..7c523037 100644 --- a/src/cljam/io/bam_index/chunk.clj +++ b/src/cljam/io/util/chunk.clj @@ -1,4 +1,4 @@ -(ns cljam.io.bam-index.chunk +(ns cljam.io.util.chunk (:refer-clojure :exclude [compare]) (:require [cljam.io.util.bgzf :as bgzf])) @@ -52,10 +52,10 @@ ret (transient [])] (if f (cond - (<= (.end f) min-offset) (recur r last-chunk ret) - (nil? last-chunk) (recur r f (conj! ret f)) - (and (not (overlap? last-chunk f)) - (not (adjacent? last-chunk f))) (recur r f (conj! ret f)) - (> (.end f) (.end last-chunk)) (let [l (assoc last-chunk :end (.end f))] (recur r l (conj! (pop! ret) l))) - :else (recur r last-chunk ret)) + (<= (.end f) min-offset) (recur r last-chunk ret) + (nil? last-chunk) (recur r f (conj! ret f)) + (and (not (overlap? last-chunk f)) + (not (adjacent? last-chunk f))) (recur r f (conj! ret f)) + (> (.end f) (.end last-chunk)) (let [l (assoc last-chunk :end (.end f))] (recur r l (conj! (pop! ret) l))) + :else (recur r last-chunk ret)) (persistent! ret))))) diff --git a/src/cljam/io/vcf.clj b/src/cljam/io/vcf.clj index 1c09abb2..0ed95070 100644 --- a/src/cljam/io/vcf.clj +++ b/src/cljam/io/vcf.clj @@ -10,7 +10,8 @@ [cljam.io.vcf.writer :as vcf-writer] [cljam.io.bcf.reader :as bcf-reader] [cljam.io.bcf.writer :as bcf-writer] - [cljam.io.util.bgzf :as bgzf]) + [cljam.io.util.bgzf :as bgzf] + [cljam.io.tabix :as tabix]) (:import java.io.Closeable cljam.io.vcf.reader.VCFReader cljam.io.vcf.writer.VCFWriter @@ -30,9 +31,11 @@ header (with-open [r (cio/reader (util/compressor-input-stream f))] (vcf-reader/load-header r))] (VCFReader. (util/as-url f) meta-info header + (if (bgzf/bgzip? f) (bgzf/bgzf-input-stream f) - (cio/reader (util/compressor-input-stream f)))))) + (cio/reader (util/compressor-input-stream f))) + (delay (tabix/read-index (str f ".tbi")))))) (defn ^BCFReader bcf-reader "Returns an open cljam.io.bcf.reader.BCFReader of f. Should be used inside diff --git a/src/cljam/io/vcf/reader.clj b/src/cljam/io/vcf/reader.clj index 75c19692..b3afbcb6 100644 --- a/src/cljam/io/vcf/reader.clj +++ b/src/cljam/io/vcf/reader.clj @@ -7,8 +7,8 @@ [camel-snake-kebab.core :refer [->kebab-case-keyword]] [proton.core :refer [as-long]] [cljam.io.util.bin :as util-bin] - [cljam.io.vcf.util :as vcf-util] - [cljam.io.tabix :as tabix]) + [cljam.io.tabix :as tabix] + [cljam.io.vcf.util :as vcf-util]) (:import [java.io Closeable] [cljam.io.tabix Tabix] [clojure.lang LazilyPersistentVector] @@ -19,7 +19,7 @@ (declare read-variants read-variants-randomly) -(deftype VCFReader [url meta-info header reader] +(deftype VCFReader [url meta-info header reader index-delay] Closeable (close [this] (.close ^Closeable (.reader this))) @@ -183,18 +183,17 @@ :vcf identity)] (map parse-fn (read-data-lines (.reader rdr) (.header rdr) kws))))) +(defn- make-lazy-variants [f s] + (if (not-empty s) + (concat + (f (first s)) + (make-lazy-variants f (rest s))))) + (defn read-variants-randomly [^VCFReader rdr {:keys [chr start end depth] :or {depth :deep start 1 end 4294967296}}] (let [kws (mapv keyword (drop 8 (.header rdr))) - tabix-path (str (.getPath ^java.net.URL (.url rdr)) ".tbi") - tabix-data (tabix/read-index tabix-path) - ref-idx (.indexOf - ^clojure.lang.PersistentVector (.seq ^Tabix tabix-data) - (if (and (string? chr) - (> (count chr) 3) - (= (subs chr 0 3) "chr")) - (subs chr 3) - (throw (ex-info "Invalid chr." {:chr chr})))) + tabix-data @(.index-delay rdr) + ref-idx (util-bin/get-ref-index tabix-data chr) spans (if (= ref-idx -1) '() @@ -203,25 +202,21 @@ parse-fn (case depth :deep (vcf-util/variant-parser (.meta-info rdr) (.header rdr)) :vcf identity)] - (reduce - (fn [res span] - (.seek input-stream (first span)) - (loop [res res] - - (if (< (.getFilePointer input-stream) (second span)) - (let [variant - (parse-fn - (parse-data-line - (.readLine input-stream) - kws)) - variant-pos (:pos variant)] - (if (or (< variant-pos (first span)) - (>= variant-pos (second span))) - res - (recur - (cons - variant - res)))) - res))) - '() + + (make-lazy-variants + (fn [[chunk-beg ^long chunk-end]] + (.seek input-stream chunk-beg) + + (->> #(when (< (.getFilePointer input-stream) chunk-end) + (-> input-stream + .readLine + (parse-data-line kws) + parse-fn)) + repeatedly + (take-while identity) + (filter + (fn [{chr' :chr :keys [pos ref info]}] + (and (= chr' chr) + (<= pos end) + (<= start (get info :END (dec (+ pos (count ref)))))))))) spans))) diff --git a/test-resources/vcf/test-v4_3-complex.vcf.gz b/test-resources/vcf/test-v4_3-complex.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..9c3520d16633a5119576f80caef52991537419cf GIT binary patch literal 1620 zcmV-a2CMlWiwFb&00000{{{d;LjnLd2CZ0OZ=*O6{Y-y_)#{`>$>%i$x-A?lQ4(mR zY|}!P-QG8vI1M@iSs<-?x*vaILuld+S)@Bf24j!k43Ec*$;HJoN!ceJ;kcDtWg57!Pd~r!0G}R<@>JT^pCWrhPgz(B2I+ z^?G1v*OI|tXP{nRY1f8f$Oihm0o7C;EV#jD6_;>#_9iZ>Bu-((naqT}dl%s$E9XVB zsggXi&)t0UhVHiOCsv>+E|QoJpMTn0G8|0~mke3Tezf{>FIjCjX)^y2ZhRLW8jiD) z6-Ue>+3bZC<<86-PiCV)SbkUOq2}%RV*OJciyxnaXh*k;+iyghO@T~v&UJ*o>?K2n=O+C*Ahx#zIcgK z6c_n6TOdF{AW@X8HwBv~B||Tq{j4fwU-#4SKl%cMUcUD3N@yJ&X)!Iz6IXij=Ajb@tYG$E|1a2&5?uZDH%Yf z%w5GrJg;QZ8jN;Uy7N|o!3pALc?D0dVDSPi*ano+-g|xDQfPrEfgYh#gcTq$BQFo7 z<(LOPg;JPD$U338N)D3q3WGC__r5xEk;DJDo46XDGcML$6V1i;hhQxeO0 zbNbntjoV1*j)_>hkuZn7R5C~FZ6zu?`jx+M^BB@~A+p{&ng4bOtx`(;p8|6$1 z+^440Dh(o%&&S@E1#yRh}=(AH(UB|J!If=;>Pn zTAXfr`qhC3h;$04Pc3Lcjti$(J!mmHnCV*+TATxB+}mLKS%J;IG9D_hWmaGxs62de zCQAMBqww6U)N7T{`FuYaQFYhRcAdeQs=z3NjGm`?W|a)SB4MAuu=uvo%9qwQOJK)o z@sEuvStgLNl{4^c}^ASI@qybp9KX=#Tp)99UJ%(Mz|{#Q=&l| zK+`dIO$nb1ZAvj!u?hwnRyC{vn(sndyumIW;t}>N*s<6G5qb~`B|Ng`xI0lESQOe5 z;2GyjT#E#TTzi6Z>{{WC-9%+Ih1(KU78o>#_vY<`2{i_q2Q7r=nvhwc!F2%F0bKiI z)1SZzE`7L+U^;^72(BYnA!d)CqJGd!m6{NWDcCg|X3dsN6K>tC$yd!m!#h}+wfKG) z)Hi*x=)JF*9`^BVjf@8r>lDNH`XzYbJY7S(g7y*GB{Ty-0gWSKe%^(k*1cv(YllQn zs~vs85&nw5;Ya+ZscHk=;77pJ4UYWr3VfOTSxlGpllYocD<_0b%0?;UfBu+{0MXEf z_ga08_ejI-?{Lf{XS$;6Iw*anKst0C2h>jaYvbC4OPmOC6CrEDMNNd7iO?_+vL!;Y zM97o~K@uTAB4kH|jW42*Bj8TuV|5MaF!*&MLTBTsXsriX^c;*zpkA$Quje{jSc zaDR}sadUR!`UhgH-}JxzR{y&4@4M%p-}8A+QlT#T zzisBOY~@Q&Zn&-NU;gtp|Ia=1mfyH$Hv2UjANQ0~K6e-xS{9v}h$Z?t;j literal 0 HcmV?d00001 diff --git a/test/cljam/io/tabix_test.clj b/test/cljam/io/tabix_test.clj index 214fc20c..6805d81a 100644 --- a/test/cljam/io/tabix_test.clj +++ b/test/cljam/io/tabix_test.clj @@ -5,13 +5,13 @@ [cljam.io.tabix :as tbi]) (:import [cljam.io.tabix Tabix] - [cljam.io.bam_index.chunk Chunk])) + [cljam.io.util.chunk Chunk])) (deftest about-read-index-with-error (is (thrown? java.io.IOException (tbi/read-index small-bam-file)))) (deftest about-read-index-returns-tabix-object - (is (instance? cljam.io.tabix.Tabix (tbi/read-index test-tabix-file)))) + (is (instance? Tabix (tbi/read-index test-tabix-file)))) (deftest about-read-index-check-the-returning-object (let [tabix-data ^Tabix (tbi/read-index test-tabix-file)] @@ -23,18 +23,8 @@ (is (number? (.meta tabix-data))) (is (number? (.skip tabix-data))) (is (vector? (.seq tabix-data))) - (is (instance? - Chunk - (get - ^clojure.lang.IPersistentVector - (get - ^clojure.lang.IPersistentMap - (get - ^clojure.lang.IPersistentMap - (.bidx tabix-data) 0) 4687) 0))) - (is (vector? - ^clojure.lang.IPersistentVector - (get (.lidx tabix-data) 0))))) + (is (instance? Chunk (get (get (get (.bidx tabix-data) 0) 4687) 0))) + (is (vector? (get (.lidx tabix-data) 0))))) (deftest-remote large-file (with-before-after {:before (prepare-cavia!)} @@ -42,8 +32,7 @@ (deftest source-type-test (with-open [server (http-server)] - (are [x] ((partial instance? cljam.io.tabix.Tabix) - (tbi/read-index x)) + (are [x] (instance? Tabix (tbi/read-index x)) test-tabix-file (cio/file test-tabix-file) (cio/as-url (cio/file test-tabix-file)) diff --git a/test/cljam/io/vcf_test.clj b/test/cljam/io/vcf_test.clj index 976b2931..ecbf636f 100644 --- a/test/cljam/io/vcf_test.clj +++ b/test/cljam/io/vcf_test.clj @@ -127,20 +127,42 @@ (vcf/read-variants-randomly (vcf/vcf-reader test-large-vcf-file) - {:chr "chr1" + {:chr "1" :start 20 :end 1000000}))) (is (not-throw? (vcf/read-variants-randomly (vcf/vcf-reader test-large-vcf-file) - {:chr "chr1" + {:chr "1" :start 2000}))) (is (not-throw? (vcf/read-variants-randomly (vcf/vcf-reader test-large-vcf-file) - {:chr "chr1"}))))) + {:chr "1"}))))) + +(deftest read-randomly-variants-complex-test + (doseq [index [{:chr "1"} + {:chr "2" :start 30000} + {:chr "2" :start 40000} + {:chr "2" :end 40000} + {:chr "2" :start 30000 :end 40000}]] + (with-open [vcf-file (vcf/vcf-reader test-vcf-complex-file) + vcf-gz-file (vcf/vcf-reader test-vcf-complex-gz-file)] + (is + (= + (vcf/read-variants-randomly + vcf-gz-file + index) + (let [{:keys [chr start end] :or {start 1 end 4294967296}} index] + (filter + (fn [variant] + (and (= chr (:chr variant)) + (<= start (:pos variant)) + (> end (:pos variant)))) + (vcf/read-variants + vcf-file)))))))) (deftest writer-test (testing "vcf" diff --git a/test/cljam/test_common.clj b/test/cljam/test_common.clj index b686487c..88bee24d 100644 --- a/test/cljam/test_common.clj +++ b/test/cljam/test_common.clj @@ -197,6 +197,8 @@ (def test-vcf-v4_3-file "test-resources/vcf/test-v4_3.vcf") (def test-vcf-no-samples-file "test-resources/vcf/test-no-samples.vcf") (def test-vcf-complex-file "test-resources/vcf/test-v4_3-complex.vcf") +(def test-vcf-complex-gz-file "test-resources/vcf/test-v4_3-complex.vcf.gz") +(def test-vcf-complex-tbi-file "test-resources/vcf/test-v4_3-complex.vcf.gz.tbi") (def test-large-vcf-file (cavia/resource mycavia "large.vcf.gz")) (def test-large-vcf-tbi-file (cavia/resource mycavia "large.vcf.gz.tbi")) From 2489eddc758a1dde14c9b7e56c51fa9624d63fbf Mon Sep 17 00:00:00 2001 From: niyarin Date: Mon, 11 Nov 2019 10:38:37 +0900 Subject: [PATCH 3/5] Fix performance and bad expression. --- src/cljam/io/bam_index/core.clj | 10 ++++--- src/cljam/io/tabix.clj | 10 ++++--- src/cljam/io/util/bin.clj | 17 +++++++----- src/cljam/io/vcf.clj | 5 ++-- src/cljam/io/vcf/reader.clj | 18 ++++++++----- test/cljam/io/util/bin_test.clj | 2 +- test/cljam/io/vcf_test.clj | 47 ++++++++++++++++++++------------- 7 files changed, 65 insertions(+), 44 deletions(-) diff --git a/src/cljam/io/bam_index/core.clj b/src/cljam/io/bam_index/core.clj index 51d290be..a677deb8 100644 --- a/src/cljam/io/bam_index/core.clj +++ b/src/cljam/io/bam_index/core.clj @@ -12,10 +12,12 @@ (deftype BAMIndex [url bidx lidx] util-bin/IBinaryIndex - (bidx-ref [this] - (.bidx this)) - (lidx-ref [this] - (.lidx this))) + (get-chunks [this ref-idx bins] + (into [] (mapcat (get (.bidx this) ref-idx) bins))) + (get-min-offset [this ref-idx beg] + (util-bin/calculate-min-offset + (get (.lidx this) ref-idx) + beg))) (defn bam-index [f] (let [{:keys [bidx lidx]} (with-open [r ^BAIReader (reader/reader f)] (reader/read-all-index! r))] diff --git a/src/cljam/io/tabix.clj b/src/cljam/io/tabix.clj index dfd09bc9..3d9e72bf 100644 --- a/src/cljam/io/tabix.clj +++ b/src/cljam/io/tabix.clj @@ -11,10 +11,12 @@ (deftype Tabix [n-ref preset sc bc ec meta skip seq bidx lidx] util-bin/IBinaryIndex - (bidx-ref [this] - (.bidx this)) - (lidx-ref [this] - (.lidx this)) + (get-chunks [this ref-idx bins] + (into [] (mapcat (get (.bidx this) ref-idx) bins))) + (get-min-offset [this ref-idx beg] + (util-bin/calculate-min-offset + (get (.lidx this) ref-idx) + beg)) (get-ref-index [this chr] (.indexOf ^clojure.lang.PersistentVector diff --git a/src/cljam/io/util/bin.clj b/src/cljam/io/util/bin.clj index ab63ff8f..4e8a746c 100644 --- a/src/cljam/io/util/bin.clj +++ b/src/cljam/io/util/bin.clj @@ -25,17 +25,20 @@ (def ^:private reg->bins (memoize reg->bins*)) (defprotocol IBinaryIndex - (bidx-ref [this]) - (lidx-ref [this]) + (get-chunks [this ref-idx bins]) + (get-min-offset [this ref-idx beg]) (get-ref-index [this chr])) +(defn calculate-min-offset + "Calculate offset for get-spans." + [lidx beg] + (get lidx (bam-index-writer/pos->lidx-offset beg) 0)) + (defn get-spans + "Calculate span information for random access from ndex data such as tabix." [index-data ^long ref-idx ^long beg ^long end] (let [bins (reg->bins beg end) - bidx (get (bidx-ref index-data) ref-idx) - lidx (get (lidx-ref index-data) ref-idx) - chunks (into [] (mapcat bidx) bins) - lin-beg (bam-index-writer/pos->lidx-offset beg) - min-offset (get lidx lin-beg 0)] + chunks (get-chunks index-data ref-idx bins) + min-offset (get-min-offset index-data ref-idx beg)] (->> (util-chunk/optimize-chunks chunks min-offset) (map vals)))) diff --git a/src/cljam/io/vcf.clj b/src/cljam/io/vcf.clj index 0ed95070..cb36f3a2 100644 --- a/src/cljam/io/vcf.clj +++ b/src/cljam/io/vcf.clj @@ -79,10 +79,11 @@ (defn read-variants-randomly "Reads variants of the VCF file randomly, returning them as a lazy sequence." - ([rdr option] + ([rdr span-option depth-option] (vcf-reader/read-variants-randomly rdr - option))) + span-option + depth-option))) ;; Writing ;; ------- diff --git a/src/cljam/io/vcf/reader.clj b/src/cljam/io/vcf/reader.clj index b3afbcb6..856d4023 100644 --- a/src/cljam/io/vcf/reader.clj +++ b/src/cljam/io/vcf/reader.clj @@ -184,13 +184,18 @@ (map parse-fn (read-data-lines (.reader rdr) (.header rdr) kws))))) (defn- make-lazy-variants [f s] - (if (not-empty s) - (concat - (f (first s)) - (make-lazy-variants f (rest s))))) + (when-first [fs s] + (lazy-cat + (f fs) + (lazy-seq + (make-lazy-variants f (rest s)))))) (defn read-variants-randomly - [^VCFReader rdr {:keys [chr start end depth] :or {depth :deep start 1 end 4294967296}}] + "Read variants of the bgzip compressed VCF file randomly using tabix file. + Returning them as a lazy sequence." + [^VCFReader rdr + {:keys [chr start end] :or {start 1 end 4294967296}} + {:keys [depth] :or {depth :deep}}] (let [kws (mapv keyword (drop 8 (.header rdr))) tabix-data @(.index-delay rdr) ref-idx (util-bin/get-ref-index tabix-data chr) @@ -198,7 +203,7 @@ (if (= ref-idx -1) '() (util-bin/get-spans tabix-data ref-idx start end)) - input-stream ^bgzf4j.BGZFInputStream (.reader rdr) + input-stream ^BGZFInputStream (.reader rdr) parse-fn (case depth :deep (vcf-util/variant-parser (.meta-info rdr) (.header rdr)) :vcf identity)] @@ -206,7 +211,6 @@ (make-lazy-variants (fn [[chunk-beg ^long chunk-end]] (.seek input-stream chunk-beg) - (->> #(when (< (.getFilePointer input-stream) chunk-end) (-> input-stream .readLine diff --git a/test/cljam/io/util/bin_test.clj b/test/cljam/io/util/bin_test.clj index d4c07052..efebcbaa 100644 --- a/test/cljam/io/util/bin_test.clj +++ b/test/cljam/io/util/bin_test.clj @@ -7,7 +7,7 @@ (deftest get-spans-returns-a-sequence-including-regions (let [tabix* (tabix/read-index test-tabix-file)] - (is (seq? (util-bin/get-spans tabix* 0 0 100))) + (is (= [[0 50872]] (util-bin/get-spans tabix* 0 0 100))) (is (every? #(and (= 2 (count %)) (number? (first %)) (number? (second %))) diff --git a/test/cljam/io/vcf_test.clj b/test/cljam/io/vcf_test.clj index ecbf636f..3670556e 100644 --- a/test/cljam/io/vcf_test.clj +++ b/test/cljam/io/vcf_test.clj @@ -123,24 +123,32 @@ (deftest-remote bin-index-is-done-without-errors-with-a-large-file (with-before-after {:before (prepare-cavia!)} - (is (not-throw? - (vcf/read-variants-randomly - (vcf/vcf-reader - test-large-vcf-file) - {:chr "1" - :start 20 - :end 1000000}))) - (is (not-throw? - (vcf/read-variants-randomly - (vcf/vcf-reader - test-large-vcf-file) - {:chr "1" - :start 2000}))) - (is (not-throw? - (vcf/read-variants-randomly - (vcf/vcf-reader - test-large-vcf-file) - {:chr "1"}))))) + (with-open [v (vcf/vcf-reader + test-large-vcf-file)] + (is (not-throw? + (vcf/read-variants-randomly + v + {:chr "chr1" + :start 20 + :end 1000000} + {})))) + + (with-open [v (vcf/vcf-reader + test-large-vcf-file)] + (is (not-throw? + (vcf/read-variants-randomly + v + {:chr "chr1" + :start 2000} + {})))) + + (with-open [v (vcf/vcf-reader + test-large-vcf-file)] + (is (not-throw? + (vcf/read-variants-randomly + v + {:chr "chr1"} + {})))))) (deftest read-randomly-variants-complex-test (doseq [index [{:chr "1"} @@ -154,7 +162,8 @@ (= (vcf/read-variants-randomly vcf-gz-file - index) + index + {}) (let [{:keys [chr start end] :or {start 1 end 4294967296}} index] (filter (fn [variant] From cdba04a6f802351261a91eb474e76727c3bffd63 Mon Sep 17 00:00:00 2001 From: niyarin Date: Mon, 11 Nov 2019 18:07:04 +0900 Subject: [PATCH 4/5] Fix bad expressions. --- src/cljam/io/bam_index/core.clj | 8 ++++---- src/cljam/io/bam_index/writer.clj | 14 +++++--------- src/cljam/io/tabix.clj | 7 ++++--- src/cljam/io/util/bin.clj | 12 ++++++------ src/cljam/io/vcf/reader.clj | 12 +++--------- 5 files changed, 22 insertions(+), 31 deletions(-) diff --git a/src/cljam/io/bam_index/core.clj b/src/cljam/io/bam_index/core.clj index a677deb8..58f6b498 100644 --- a/src/cljam/io/bam_index/core.clj +++ b/src/cljam/io/bam_index/core.clj @@ -3,7 +3,8 @@ (:require [clojure.java.io :as cio] [clojure.tools.logging :as logging] [cljam.io.bam-index [reader :as reader] - [writer :as writer]] + [writer :as writer] + [common :as common]] [cljam.util :as util] [cljam.io.util.bin :as util-bin]) (:import [java.io DataOutputStream FileOutputStream] @@ -15,9 +16,8 @@ (get-chunks [this ref-idx bins] (into [] (mapcat (get (.bidx this) ref-idx) bins))) (get-min-offset [this ref-idx beg] - (util-bin/calculate-min-offset - (get (.lidx this) ref-idx) - beg))) + (get (get (.lidx this) ref-idx) + (util-bin/pos->lidx-offset beg common/linear-index-shift) 0))) (defn bam-index [f] (let [{:keys [bidx lidx]} (with-open [r ^BAIReader (reader/reader f)] (reader/read-all-index! r))] diff --git a/src/cljam/io/bam_index/writer.clj b/src/cljam/io/bam_index/writer.clj index 250d76ed..bc78f3a6 100644 --- a/src/cljam/io/bam_index/writer.clj +++ b/src/cljam/io/bam_index/writer.clj @@ -4,6 +4,7 @@ [cljam.io.sam.util :as sam-util] [cljam.io.util.bgzf :as bgzf] [cljam.io.util.lsb :as lsb] + [cljam.io.util.bin :as util-bin] [cljam.io.bam-index.common :refer :all] [cljam.io.util.chunk :as chunk] [cljam.io.bam.decoder :as bam-decoder]) @@ -22,18 +23,13 @@ (close [this] (.close writer))) -;; Indexing -;; -------- - -(defn pos->lidx-offset - [^long pos] - (bit-shift-right (if (<= pos 0) 0 (dec pos)) linear-index-shift)) ;; ### Intermediate data definitions ;; ;; Use record for performance. ;; Record is faster than map for retrieving elements. + (defrecord MetaData [^long first-offset ^long last-offset ^long aligned-alns ^long unaligned-alns]) (defrecord IndexStatus [^MetaData meta-data bin-index linear-index]) @@ -95,11 +91,11 @@ aln-beg (.pos aln) aln-end (.end aln) win-beg (if (zero? aln-end) - (pos->lidx-offset (dec aln-beg)) - (pos->lidx-offset aln-beg)) + (util-bin/pos->lidx-offset (dec aln-beg) linear-index-shift) + (util-bin/pos->lidx-offset aln-beg linear-index-shift)) win-end (if (zero? aln-end) win-beg - (pos->lidx-offset aln-end)) + (util-bin/pos->lidx-offset aln-end linear-index-shift)) min* (fn [x] (if x (min x beg) beg))] (loop [i win-beg, ret linear-index] diff --git a/src/cljam/io/tabix.clj b/src/cljam/io/tabix.clj index 3d9e72bf..ac41ec3b 100644 --- a/src/cljam/io/tabix.clj +++ b/src/cljam/io/tabix.clj @@ -9,14 +9,15 @@ [java.io DataInputStream IOException] [cljam.io.util.chunk Chunk])) +(def ^:const linear-index-shift 14) + (deftype Tabix [n-ref preset sc bc ec meta skip seq bidx lidx] util-bin/IBinaryIndex (get-chunks [this ref-idx bins] (into [] (mapcat (get (.bidx this) ref-idx) bins))) (get-min-offset [this ref-idx beg] - (util-bin/calculate-min-offset - (get (.lidx this) ref-idx) - beg)) + (get (get (.lidx this) ref-idx) + (util-bin/pos->lidx-offset beg linear-index-shift) 0)) (get-ref-index [this chr] (.indexOf ^clojure.lang.PersistentVector diff --git a/src/cljam/io/util/bin.clj b/src/cljam/io/util/bin.clj index 4e8a746c..6afc5b0f 100644 --- a/src/cljam/io/util/bin.clj +++ b/src/cljam/io/util/bin.clj @@ -1,6 +1,7 @@ (ns cljam.io.util.bin - (:require [cljam.io.util.chunk :as util-chunk] - [cljam.io.bam-index.writer :as bam-index-writer])) + (:require [cljam.io.util.chunk :as util-chunk])) + +(def ^:const linear-index-shift 14) (defn- reg->bins* "Returns candidate bins for the specified region as a vector." @@ -29,10 +30,9 @@ (get-min-offset [this ref-idx beg]) (get-ref-index [this chr])) -(defn calculate-min-offset - "Calculate offset for get-spans." - [lidx beg] - (get lidx (bam-index-writer/pos->lidx-offset beg) 0)) +(defn pos->lidx-offset + [^long pos ^long linear-index-shift] + (bit-shift-right (if (<= pos 0) 0 (dec pos)) linear-index-shift)) (defn get-spans "Calculate span information for random access from ndex data such as tabix." diff --git a/src/cljam/io/vcf/reader.clj b/src/cljam/io/vcf/reader.clj index 856d4023..cd614bf9 100644 --- a/src/cljam/io/vcf/reader.clj +++ b/src/cljam/io/vcf/reader.clj @@ -7,10 +7,8 @@ [camel-snake-kebab.core :refer [->kebab-case-keyword]] [proton.core :refer [as-long]] [cljam.io.util.bin :as util-bin] - [cljam.io.tabix :as tabix] [cljam.io.vcf.util :as vcf-util]) (:import [java.io Closeable] - [cljam.io.tabix Tabix] [clojure.lang LazilyPersistentVector] bgzf4j.BGZFInputStream)) @@ -187,8 +185,7 @@ (when-first [fs s] (lazy-cat (f fs) - (lazy-seq - (make-lazy-variants f (rest s)))))) + (make-lazy-variants f (rest s))))) (defn read-variants-randomly "Read variants of the bgzip compressed VCF file randomly using tabix file. @@ -199,15 +196,12 @@ (let [kws (mapv keyword (drop 8 (.header rdr))) tabix-data @(.index-delay rdr) ref-idx (util-bin/get-ref-index tabix-data chr) - spans - (if (= ref-idx -1) - '() - (util-bin/get-spans tabix-data ref-idx start end)) + spans (when-not (neg? ref-idx) + (util-bin/get-spans tabix-data ref-idx start end)) input-stream ^BGZFInputStream (.reader rdr) parse-fn (case depth :deep (vcf-util/variant-parser (.meta-info rdr) (.header rdr)) :vcf identity)] - (make-lazy-variants (fn [[chunk-beg ^long chunk-end]] (.seek input-stream chunk-beg) From a6d66925d87bea0ab73dd4db5270a2bd1e140112 Mon Sep 17 00:00:00 2001 From: niyarin Date: Tue, 12 Nov 2019 20:28:24 +0900 Subject: [PATCH 5/5] Fix bad expressions. --- src/cljam/io/util/bin.clj | 2 +- src/cljam/io/vcf/reader.clj | 10 ++-------- test/cljam/io/tabix_test.clj | 4 ++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/cljam/io/util/bin.clj b/src/cljam/io/util/bin.clj index 6afc5b0f..a63bc87c 100644 --- a/src/cljam/io/util/bin.clj +++ b/src/cljam/io/util/bin.clj @@ -35,7 +35,7 @@ (bit-shift-right (if (<= pos 0) 0 (dec pos)) linear-index-shift)) (defn get-spans - "Calculate span information for random access from ndex data such as tabix." + "Calculate span information for random access from index data such as tabix." [index-data ^long ref-idx ^long beg ^long end] (let [bins (reg->bins beg end) chunks (get-chunks index-data ref-idx bins) diff --git a/src/cljam/io/vcf/reader.clj b/src/cljam/io/vcf/reader.clj index cd614bf9..c8d32a7e 100644 --- a/src/cljam/io/vcf/reader.clj +++ b/src/cljam/io/vcf/reader.clj @@ -181,14 +181,8 @@ :vcf identity)] (map parse-fn (read-data-lines (.reader rdr) (.header rdr) kws))))) -(defn- make-lazy-variants [f s] - (when-first [fs s] - (lazy-cat - (f fs) - (make-lazy-variants f (rest s))))) - (defn read-variants-randomly - "Read variants of the bgzip compressed VCF file randomly using tabix file. + "Reads variants of the bgzip compressed VCF file randomly using tabix file. Returning them as a lazy sequence." [^VCFReader rdr {:keys [chr start end] :or {start 1 end 4294967296}} @@ -202,7 +196,7 @@ parse-fn (case depth :deep (vcf-util/variant-parser (.meta-info rdr) (.header rdr)) :vcf identity)] - (make-lazy-variants + (mapcat (fn [[chunk-beg ^long chunk-end]] (.seek input-stream chunk-beg) (->> #(when (< (.getFilePointer input-stream) chunk-end) diff --git a/test/cljam/io/tabix_test.clj b/test/cljam/io/tabix_test.clj index 6805d81a..aecb6301 100644 --- a/test/cljam/io/tabix_test.clj +++ b/test/cljam/io/tabix_test.clj @@ -23,8 +23,8 @@ (is (number? (.meta tabix-data))) (is (number? (.skip tabix-data))) (is (vector? (.seq tabix-data))) - (is (instance? Chunk (get (get (get (.bidx tabix-data) 0) 4687) 0))) - (is (vector? (get (.lidx tabix-data) 0))))) + (is (instance? Chunk (get (get (get (.bidx tabix-data) 0) 4687) 0))) + (is (vector? (get (.lidx tabix-data) 0))))) (deftest-remote large-file (with-before-after {:before (prepare-cavia!)}