Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support SAM normalization #86

Merged
merged 1 commit into from
Jun 23, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 19 additions & 18 deletions src/cljam/algo/normal.clj
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
(ns cljam.algo.normal
(:require [clojure.java.io :refer [file]]
[cljam.io.sam :as sam]
(:require [cljam.io.sam :as sam]
[cljam.io.util :as io-util]
[cljam.util :refer [swap]]
[cljam.util.chromosome :refer [normalize-chromosome-key]]))

;; TODO add test cases
;; TODO add functions to core.clj

(def ^:private chunk-size 1500000)

(defn- normalize-header
Expand All @@ -15,22 +12,26 @@
(assoc hdr
:SQ (vec (map #(swap % :SN normalize-chromosome-key) seq)))))

(defmulti normalize (fn [rdr wtr]
(case (str (type rdr))
"class cljam.io.sam.reader.SAMReader" :sam
"class cljam.io.bam.reader.BAMReader" :bam
nil)))

(defmethod normalize :sam
;; TODO: copy all rest of stream for performance. (do not read, parse and write)
(defn- transfer-blocks
[rdr wtr]
;; TODO implement
)
(doseq [blks (partition-all chunk-size (sam/read-blocks rdr))]
(sam/write-blocks wtr blks)))

(defn- transfer-alignments
[rdr wtr hdr]
(doseq [alns (->> (sam/read-alignments rdr)
(map #(update % :rname normalize-chromosome-key))
(partition-all chunk-size))]
(sam/write-alignments wtr alns hdr)))

(defmethod normalize :bam
(defn normalize
"Normalizes references of the SAM/BAM format. Be noted that performance may be
degraded if either or both of rdr and wtr is one about the SAM format."
[rdr wtr]
(let [hdr (normalize-header (sam/read-header rdr))]
(sam/write-header wtr hdr)
(sam/write-refs wtr hdr)
;; TODO copy all rest of stream for performance. (do not read, parse and write)
(doseq [blks (partition-all chunk-size (sam/read-blocks rdr))]
(sam/write-blocks wtr blks))))
(if (and (io-util/bam-reader? rdr) (io-util/bam-writer? wtr))
(transfer-blocks rdr wtr)
(transfer-alignments rdr wtr hdr))))
Binary file modified test-resources/bam/normalize_after.bam
Binary file not shown.
Binary file modified test-resources/bam/normalize_before.bam
Binary file not shown.
14 changes: 14 additions & 0 deletions test-resources/sam/normalize_after.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
@SQ SN:chr1 LN:45
@SQ SN:chr11 LN:40
r003 16 chr1 29 30 6H5M * 0 0 TAGGC *
r001 163 chr1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112
r002 0 chr1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA *
r003 0 chr1 9 30 5H6M * 0 0 AGCTAA *
x3 0 chr11 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ??????????????????????????
r004 0 chr1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC *
r001 83 chr1 37 30 9M = 7 -39 CAGCGCCAT *
x1 0 chr11 1 30 20M * 0 0 aggttttataaaacaaataa ????????????????????
x2 0 chr11 2 30 21M * 0 0 ggttttataaaacaaataatt ?????????????????????
x4 0 chr11 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ?????????????????????????
x6 0 chr11 14 30 23M * 0 0 Taattaagtctacagagcaacta ???????????????????????
x5 0 chr11 12 30 24M * 0 0 aaTaattaagtctacagagcaact ????????????????????????
14 changes: 14 additions & 0 deletions test-resources/sam/normalize_before.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
@SQ SN:chr01 LN:45
@SQ SN:11 LN:40
r003 16 chr01 29 30 6H5M * 0 0 TAGGC *
r001 163 chr01 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112
r002 0 chr01 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA *
r003 0 chr01 9 30 5H6M * 0 0 AGCTAA *
x3 0 11 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ??????????????????????????
r004 0 chr01 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC *
r001 83 chr01 37 30 9M = 7 -39 CAGCGCCAT *
x1 0 11 1 30 20M * 0 0 aggttttataaaacaaataa ????????????????????
x2 0 11 2 30 21M * 0 0 ggttttataaaacaaataatt ?????????????????????
x4 0 11 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ?????????????????????????
x6 0 11 14 30 23M * 0 0 Taattaagtctacagagcaacta ???????????????????????
x5 0 11 12 30 24M * 0 0 aaTaattaagtctacagagcaact ????????????????????????
2 changes: 1 addition & 1 deletion test/cljam/algo/t_dedupe.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
:after (clean-cache!)}
(let [out-file (str temp-dir "/deduped.bam")]
(is (not-throw? (dedupe/dedupe dedupe-before-bam-file out-file)))
(is (same-bam-file? out-file dedupe-after-bam-file)))))
(is (same-sam-contents? out-file dedupe-after-bam-file)))))

(deftest simple-pe-dedupe-xform
(is (= (into #{}
Expand Down
38 changes: 38 additions & 0 deletions test/cljam/algo/t_normal.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
(ns cljam.algo.t-normal
(:require [clojure.test :refer :all]
[cljam.t-common :refer :all]
[cljam.algo.normal :refer :all]
[cljam.io.sam :as sam]))

(def temp-sam (str temp-dir "/out.sam"))
(def temp-bam (str temp-dir "/out.bam"))

(deftest normalize-test
(testing "sam -> sam"
(with-before-after {:before (prepare-cache!)
:after (clean-cache!)}
(with-open [rdr (sam/reader normalize-before-sam-file)
wtr (sam/writer temp-sam)]
(is (not-throw? (normalize rdr wtr))))
(is (same-sam-contents? temp-sam normalize-after-sam-file))))
(testing "bam -> bam"
(with-before-after {:before (prepare-cache!)
:after (clean-cache!)}
(with-open [rdr (sam/reader normalize-before-bam-file)
wtr (sam/writer temp-bam)]
(is (not-throw? (normalize rdr wtr))))
(is (same-sam-contents? temp-bam normalize-after-bam-file))))
(testing "sam -> bam"
(with-before-after {:before (prepare-cache!)
:after (clean-cache!)}
(with-open [rdr (sam/reader normalize-before-sam-file)
wtr (sam/writer temp-bam)]
(is (not-throw? (normalize rdr wtr))))
(is (same-sam-contents? temp-bam normalize-after-bam-file))))
(testing "bam -> sam"
(with-before-after {:before (prepare-cache!)
:after (clean-cache!)}
(with-open [rdr (sam/reader normalize-before-bam-file)
wtr (sam/writer temp-sam)]
(is (not-throw? (normalize rdr wtr))))
(is (same-sam-contents? temp-sam normalize-after-sam-file)))))
10 changes: 7 additions & 3 deletions test/cljam/t_common.clj
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@

(def test-sam-file "test-resources/sam/test.sam")

(def normalize-before-sam-file "test-resources/sam/normalize_before.sam")
(def normalize-after-sam-file "test-resources/sam/normalize_after.sam")

;; ### BAM files

(def test-bam-file "test-resources/bam/test.bam")
Expand Down Expand Up @@ -460,10 +463,11 @@
[f1 f2]
(= (digest/sha1 (file f1)) (digest/sha1 (file f2))))

(defn same-bam-file?
(defn same-sam-contents?
"Returns true if the contents of two SAM/BAM files are equivalent, false if not."
[f1 f2]
(with-open [r1 (sam/bam-reader f1)
r2 (sam/bam-reader f2)]
(with-open [r1 (sam/reader f1)
r2 (sam/reader f2)]
(and (= (sam/read-header r1)
(sam/read-header r2))
(= (sam/read-alignments r1 {})
Expand Down
2 changes: 1 addition & 1 deletion test/cljam/tools/t_cli.clj
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
(with-before-after {:before (prepare-cache!)
:after (clean-cache!)}
(is (not-throw? (with-out-file temp-out (cli/normalize [normalize-before-bam-file temp-bam]))))
(is (same-bam-file? temp-bam normalize-after-bam-file))))
(is (same-sam-contents? temp-bam normalize-after-bam-file))))

(deftest about-sort-by-pos
(with-before-after {:before (prepare-cache!)
Expand Down