Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of reading BAM files. #22

Merged
merged 2 commits into from
Jan 25, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,19 @@
:url "http://www.apache.org/licenses/LICENSE-2.0.html"}
:dependencies [[org.clojure/tools.logging "0.3.1"]
[org.clojure/tools.cli "0.3.5"]
[org.apache.commons/commons-compress "1.12"]
[org.apache.commons/commons-compress "1.13"]
[me.raynes/fs "1.4.6"]
[clj-sub-command "0.2.3"]
[digest "1.4.4"]
[clj-sub-command "0.3.0"]
[digest "1.4.5"]
[bgzf4j "0.1.0"]
[com.climate/claypoole "1.1.3"]
[com.climate/claypoole "1.1.4"]
[camel-snake-kebab "0.4.0"]]
:plugins [[lein-midje "3.2"]]
:plugins [[lein-midje "3.2.1"]]
:profiles {:dev {:dependencies [[org.clojure/clojure "1.8.0"]
[midje "1.8.3" :exclusions [slingshot]]
[cavia "0.2.3"]]
[cavia "0.3.0"]]
:plugins [[lein-bin "0.3.5"]
[lein-codox "0.9.5"]
[lein-codox "0.10.2"]
[lein-marginalia "0.9.0"]]
:main cljam.main
:aot [cljam.main]
Expand Down
7 changes: 3 additions & 4 deletions src/cljam/bam/decoder.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
"Decoder of BAM alignment blocks."
(:require [clojure.string :refer [join]]
[cljam.util :refer [ubyte hex-string->bytes]]
[cljam.util.sam-util :refer [phred->fastq compressed-bases->chars
ref-name]]
[cljam.util.sam-util :refer [phred->fastq compressed-bases->str ref-name]]
[cljam.lsb :as lsb]
[cljam.bam.common :refer [fixed-block-size]])
(:import java.util.Arrays
Expand Down Expand Up @@ -80,7 +79,7 @@
(phred->fastq b)))

(defn decode-seq [seq-bytes length]
(join (compressed-bases->chars length seq-bytes 0)))
(compressed-bases->str length seq-bytes 0))

(defn decode-next-ref-id [refs n rname]
(cond
Expand Down Expand Up @@ -133,7 +132,7 @@
qname (lsb/read-string buffer (dec l-read-name))
_ (lsb/skip buffer 1)
cigar (decode-cigar (lsb/read-bytes buffer (* n-cigar-op 4)))
seq (decode-seq (lsb/read-bytes buffer (/ (inc l-seq) 2)) l-seq)
seq (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq)
qual (decode-qual (lsb/read-bytes buffer l-seq))
rest (lsb/read-bytes buffer (options-size (:size block)
l-read-name
Expand Down
35 changes: 32 additions & 3 deletions src/cljam/util/sam_util.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
"Utilities related to SAM/BAM format."
(:require [clojure.string :as cstr]
[cljam.cigar :refer [count-ref]]
[cljam.util :refer [ubyte str->int str->float]]))
[cljam.util :refer [ubyte str->int str->float]])
(:import [java.nio CharBuffer ByteBuffer]
[java.nio.charset StandardCharsets]))

;;; parse

Expand Down Expand Up @@ -318,6 +320,26 @@
(aset-byte result i (byte (bit-or h l)))
(recur (inc i)))))))

(def ^:const ^:private seq-byte-table
(let [nibble-table "=ACMGRSVTWYHKDBN"]
(->> (for [i nibble-table j nibble-table] [i j])
(apply concat)
cstr/join)))

(defn compressed-bases->str
"Decode a sequence from byte array to String."
[^long length ^bytes compressed-bases ^long compressed-offset]
(let [cb (CharBuffer/allocate (inc length))
bb (ByteBuffer/wrap compressed-bases)]
(.position bb compressed-offset)
(dotimes [_ (quot (inc length) 2)]
(let [i (-> (.get bb) (bit-and 0xff) (* 2))]
(.put cb (.charAt seq-byte-table i))
(.put cb (.charAt seq-byte-table (inc i)))))
(.limit cb length)
(.flip cb)
(.toString cb)))

(defn compressed-bases->chars [length compressed-bases compressed-offset]
(let [bases (apply concat
(for [i (range 1 length) :when (odd? i)]
Expand Down Expand Up @@ -353,9 +375,16 @@
(defmulti phred->fastq class)

(defmethod phred->fastq (class (byte-array nil))
[b]
[^bytes b]
(when-not (nil? b)
(cstr/join (map #(phred->fastq (int (bit-and % 0xff))) b))))
(let [bb (ByteBuffer/wrap b)
cb (CharBuffer/allocate (alength b))]
(loop []
(when (.hasRemaining bb)
(.put cb (char (+ 33 (.get bb))))
(recur)))
(.flip cb)
(.toString cb))))

(def ^:const max-phred-score 93)

Expand Down
4 changes: 2 additions & 2 deletions test/cljam/t_common.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
(ns cljam.t-common
(:use [clojure.java.io :only [file]])
(:require [pandect.core :refer [md5-file]]
(:require [digest]
[cljam.sam :as sam]
[cljam.bam :as bam]
[cljam.io :as io]
Expand Down Expand Up @@ -284,7 +284,7 @@
(defn same-file?
"Returns true if the two files' MD5 hash are same, false if not."
[f1 f2]
(= (md5-file f1) (md5-file f2)))
(= (digest/sha1 (file f1)) (digest/sha1 (file f2))))

;;;; FASTA

Expand Down
26 changes: 25 additions & 1 deletion test/cljam/util/t_sam_util.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
(:require [midje.sweet :refer :all]
[cljam.t-common :refer :all]
[cljam.util :as util]
[cljam.util.sam-util :as sam-util]))
[cljam.util.sam-util :as sam-util]
[clojure.string :as cstr]))

(tabular
(fact "about char->compressed-base-high"
Expand Down Expand Up @@ -99,6 +100,29 @@
(util/ubyte 0xd0) \D
(util/ubyte 0xe0) \B)

(def nibble-table "=ACMGRSVTWYHKDBN")

(tabular
(fact
"about compressed-bases->str"
(let [ba (byte-array (mapv util/ubyte ?bases))]
(cstr/join (sam-util/compressed-bases->chars ?length ba ?offset)) => ?expected
(sam-util/compressed-bases->str ?length ba ?offset) => ?expected))
?length ?bases ?offset ?expected
1 [0x00] 0 "="
2 [0x00] 0 "=="
1 [0x10] 0 "A"
2 [0x12] 0 "AC"
4 [0x12 0x8F] 0 "ACTN"
1 [0x12 0x8F] 1 "T"
2 [0x12 0x8F] 1 "TN"
16 [0x01 0x23 0x45 0x67 0x89 0xAB 0xCD 0xEF] 0 nibble-table
14 [0x01 0x23 0x45 0x67 0x89 0xAB 0xCD 0xEF] 1 (subs nibble-table 2)
2 [0x01 0x23 0x45 0x67 0x89 0xAB 0xCD 0xEF] 7 "BN"
512 (range 256) 0 (->> (for [i nibble-table j nibble-table] [i j])
(apply concat)
cstr/join))

;; Reference functions
;; -------------------

Expand Down