diff --git a/src/cljam/algo/normal.clj b/src/cljam/algo/normal.clj index 88242c90..8cb79b1b 100644 --- a/src/cljam/algo/normal.clj +++ b/src/cljam/algo/normal.clj @@ -1,12 +1,9 @@ (ns cljam.algo.normal - (:require [clojure.java.io :refer [file]] - [cljam.io.sam :as sam] + (:require [cljam.io.sam :as sam] + [cljam.io.util :as io-util] [cljam.util :refer [swap]] [cljam.util.chromosome :refer [normalize-chromosome-key]])) -;; TODO add test cases -;; TODO add functions to core.clj - (def ^:private chunk-size 1500000) (defn- normalize-header @@ -15,22 +12,26 @@ (assoc hdr :SQ (vec (map #(swap % :SN normalize-chromosome-key) seq))))) -(defmulti normalize (fn [rdr wtr] - (case (str (type rdr)) - "class cljam.io.sam.reader.SAMReader" :sam - "class cljam.io.bam.reader.BAMReader" :bam - nil))) - -(defmethod normalize :sam +;; TODO: copy all rest of stream for performance. (do not read, parse and write) +(defn- transfer-blocks [rdr wtr] - ;; TODO implement - ) + (doseq [blks (partition-all chunk-size (sam/read-blocks rdr))] + (sam/write-blocks wtr blks))) + +(defn- transfer-alignments + [rdr wtr hdr] + (doseq [alns (->> (sam/read-alignments rdr) + (map #(update % :rname normalize-chromosome-key)) + (partition-all chunk-size))] + (sam/write-alignments wtr alns hdr))) -(defmethod normalize :bam +(defn normalize + "Normalizes references of the SAM/BAM format. Be noted that performance may be + degraded if either or both of rdr and wtr is one about the SAM format." [rdr wtr] (let [hdr (normalize-header (sam/read-header rdr))] (sam/write-header wtr hdr) (sam/write-refs wtr hdr) - ;; TODO copy all rest of stream for performance. (do not read, parse and write) - (doseq [blks (partition-all chunk-size (sam/read-blocks rdr))] - (sam/write-blocks wtr blks)))) + (if (and (io-util/bam-reader? rdr) (io-util/bam-writer? wtr)) + (transfer-blocks rdr wtr) + (transfer-alignments rdr wtr hdr)))) diff --git a/test-resources/bam/normalize_after.bam b/test-resources/bam/normalize_after.bam index fdcdbf11..a42af695 100644 Binary files a/test-resources/bam/normalize_after.bam and b/test-resources/bam/normalize_after.bam differ diff --git a/test-resources/bam/normalize_before.bam b/test-resources/bam/normalize_before.bam index 9613963a..dfb5bc73 100644 Binary files a/test-resources/bam/normalize_before.bam and b/test-resources/bam/normalize_before.bam differ diff --git a/test-resources/sam/normalize_after.sam b/test-resources/sam/normalize_after.sam new file mode 100644 index 00000000..9243d9e9 --- /dev/null +++ b/test-resources/sam/normalize_after.sam @@ -0,0 +1,14 @@ +@SQ SN:chr1 LN:45 +@SQ SN:chr11 LN:40 +r003 16 chr1 29 30 6H5M * 0 0 TAGGC * +r001 163 chr1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 +r002 0 chr1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * +r003 0 chr1 9 30 5H6M * 0 0 AGCTAA * +x3 0 chr11 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ?????????????????????????? +r004 0 chr1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * +r001 83 chr1 37 30 9M = 7 -39 CAGCGCCAT * +x1 0 chr11 1 30 20M * 0 0 aggttttataaaacaaataa ???????????????????? +x2 0 chr11 2 30 21M * 0 0 ggttttataaaacaaataatt ????????????????????? +x4 0 chr11 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ????????????????????????? +x6 0 chr11 14 30 23M * 0 0 Taattaagtctacagagcaacta ??????????????????????? +x5 0 chr11 12 30 24M * 0 0 aaTaattaagtctacagagcaact ???????????????????????? diff --git a/test-resources/sam/normalize_before.sam b/test-resources/sam/normalize_before.sam new file mode 100644 index 00000000..a312a626 --- /dev/null +++ b/test-resources/sam/normalize_before.sam @@ -0,0 +1,14 @@ +@SQ SN:chr01 LN:45 +@SQ SN:11 LN:40 +r003 16 chr01 29 30 6H5M * 0 0 TAGGC * +r001 163 chr01 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 +r002 0 chr01 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * +r003 0 chr01 9 30 5H6M * 0 0 AGCTAA * +x3 0 11 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ?????????????????????????? +r004 0 chr01 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * +r001 83 chr01 37 30 9M = 7 -39 CAGCGCCAT * +x1 0 11 1 30 20M * 0 0 aggttttataaaacaaataa ???????????????????? +x2 0 11 2 30 21M * 0 0 ggttttataaaacaaataatt ????????????????????? +x4 0 11 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ????????????????????????? +x6 0 11 14 30 23M * 0 0 Taattaagtctacagagcaacta ??????????????????????? +x5 0 11 12 30 24M * 0 0 aaTaattaagtctacagagcaact ???????????????????????? diff --git a/test/cljam/algo/t_dedupe.clj b/test/cljam/algo/t_dedupe.clj index 00298ae2..1bed0c6d 100644 --- a/test/cljam/algo/t_dedupe.clj +++ b/test/cljam/algo/t_dedupe.clj @@ -8,7 +8,7 @@ :after (clean-cache!)} (let [out-file (str temp-dir "/deduped.bam")] (is (not-throw? (dedupe/dedupe dedupe-before-bam-file out-file))) - (is (same-bam-file? out-file dedupe-after-bam-file))))) + (is (same-sam-contents? out-file dedupe-after-bam-file))))) (deftest simple-pe-dedupe-xform (is (= (into #{} diff --git a/test/cljam/algo/t_normal.clj b/test/cljam/algo/t_normal.clj new file mode 100644 index 00000000..3e014924 --- /dev/null +++ b/test/cljam/algo/t_normal.clj @@ -0,0 +1,38 @@ +(ns cljam.algo.t-normal + (:require [clojure.test :refer :all] + [cljam.t-common :refer :all] + [cljam.algo.normal :refer :all] + [cljam.io.sam :as sam])) + +(def temp-sam (str temp-dir "/out.sam")) +(def temp-bam (str temp-dir "/out.bam")) + +(deftest normalize-test + (testing "sam -> sam" + (with-before-after {:before (prepare-cache!) + :after (clean-cache!)} + (with-open [rdr (sam/reader normalize-before-sam-file) + wtr (sam/writer temp-sam)] + (is (not-throw? (normalize rdr wtr)))) + (is (same-sam-contents? temp-sam normalize-after-sam-file)))) + (testing "bam -> bam" + (with-before-after {:before (prepare-cache!) + :after (clean-cache!)} + (with-open [rdr (sam/reader normalize-before-bam-file) + wtr (sam/writer temp-bam)] + (is (not-throw? (normalize rdr wtr)))) + (is (same-sam-contents? temp-bam normalize-after-bam-file)))) + (testing "sam -> bam" + (with-before-after {:before (prepare-cache!) + :after (clean-cache!)} + (with-open [rdr (sam/reader normalize-before-sam-file) + wtr (sam/writer temp-bam)] + (is (not-throw? (normalize rdr wtr)))) + (is (same-sam-contents? temp-bam normalize-after-bam-file)))) + (testing "bam -> sam" + (with-before-after {:before (prepare-cache!) + :after (clean-cache!)} + (with-open [rdr (sam/reader normalize-before-bam-file) + wtr (sam/writer temp-sam)] + (is (not-throw? (normalize rdr wtr)))) + (is (same-sam-contents? temp-sam normalize-after-sam-file))))) diff --git a/test/cljam/t_common.clj b/test/cljam/t_common.clj index 88a87d8f..3640ff1c 100644 --- a/test/cljam/t_common.clj +++ b/test/cljam/t_common.clj @@ -95,6 +95,9 @@ (def test-sam-file "test-resources/sam/test.sam") +(def normalize-before-sam-file "test-resources/sam/normalize_before.sam") +(def normalize-after-sam-file "test-resources/sam/normalize_after.sam") + ;; ### BAM files (def test-bam-file "test-resources/bam/test.bam") @@ -460,10 +463,11 @@ [f1 f2] (= (digest/sha1 (file f1)) (digest/sha1 (file f2)))) -(defn same-bam-file? +(defn same-sam-contents? + "Returns true if the contents of two SAM/BAM files are equivalent, false if not." [f1 f2] - (with-open [r1 (sam/bam-reader f1) - r2 (sam/bam-reader f2)] + (with-open [r1 (sam/reader f1) + r2 (sam/reader f2)] (and (= (sam/read-header r1) (sam/read-header r2)) (= (sam/read-alignments r1 {}) diff --git a/test/cljam/tools/t_cli.clj b/test/cljam/tools/t_cli.clj index 264854e2..a5245cee 100644 --- a/test/cljam/tools/t_cli.clj +++ b/test/cljam/tools/t_cli.clj @@ -51,7 +51,7 @@ (with-before-after {:before (prepare-cache!) :after (clean-cache!)} (is (not-throw? (with-out-file temp-out (cli/normalize [normalize-before-bam-file temp-bam])))) - (is (same-bam-file? temp-bam normalize-after-bam-file)))) + (is (same-sam-contents? temp-bam normalize-after-bam-file)))) (deftest about-sort-by-pos (with-before-after {:before (prepare-cache!)