Skip to content

Commit

Permalink
Merge pull request #20 from chrovis/feature/bed-reader
Browse files Browse the repository at this point in the history
BED file reader/writer
  • Loading branch information
totakke committed Jan 7, 2017
2 parents 9eafd5c + d4f2ba0 commit 09456cd
Show file tree
Hide file tree
Showing 11 changed files with 402 additions and 30 deletions.
165 changes: 165 additions & 0 deletions src/cljam/bed.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
(ns cljam.bed
(:require [clojure.string :as cstr]
[cljam.util :as util]
[cljam.util.chromosome :as chr-util])
(:import [java.io BufferedReader BufferedWriter]))

(def ^:const bed-columns
[:chr :start :end :name :score :strand :thick-start :thick-end :item-rgb :block-count :block-sizes :block-starts])

(defn- str->long-list
"Convert string of comma-separated long values into list of longs.
Comma at the end of input string will be ignored.
Returns nil if input is nil."
[^String s]
(when-not (nil? s)
(map util/str->long (cstr/split s #","))))

(defn- long-list->str
"Inverse function of str->long-list."
[xs]
(when-not (nil? xs)
(apply str (interpose "," xs))))

(defn- update-some
"Same as update if map 'm' contains key 'k'. Otherwise returns the original map 'm'."
[m k f & args]
(if (get m k)
(apply update m k f args)
m))

(defn- deserialize-bed
"Parse BED fields string and returns a map.
Based on information at https://genome.ucsc.edu/FAQ/FAQformat#format1."
[^String s]
{:post [(and (:chr %) (:start %) (:end %))
;; First 3 fields are required.
(< (:start %) (:end %))
;; The chromEnd base is not included in the display of the feature.
(every? true? (drop-while false? (map nil? ((apply juxt bed-columns) %))))
;; Lower-numbered fields must be populated if higher-numbered fields are used.
(if-let [s (:score %)] (<= 0 s 1000) true)
;; A score between 0 and 1000.
(if-let [xs (:block-sizes %)] (= (count xs) (:block-count %)) true)
;; The number of items in this list should correspond to blockCount.
(if-let [xs (:block-starts %)] (= (count xs) (:block-count %)) true)
;; The number of items in this list should correspond to blockCount.
(if-let [[f] (:block-starts %)] (= 0 f) true)
;; The first blockStart value must be 0.
(if-let [xs (:block-starts %)] (= (+ (last xs) (last (:block-sizes %))) (- (:end %) (:start %))) true)
;; The final blockStart position plus the final blockSize value must equal chromEnd.
(if-let [xs (:block-starts %)] (apply <= (mapcat (fn [a b] [a (+ a b)]) xs (:block-sizes %))) true)
;; Blocks may not overlap.
]}
(reduce
(fn deserialize-bed-reduce-fn [m [k f]] (update-some m k f))
(zipmap bed-columns (cstr/split s #"\s+"))
{:start util/str->long
:end util/str->long
:score util/str->long
:strand #(case % "." :no-strand "+" :plus "-" :minus)
:thick-start util/str->long
:thick-end util/str->long
:block-count util/str->long
:block-sizes str->long-list
:block-starts str->long-list}))

(defn- serialize-bed
"Serialize bed fields into string."
[m]
(->> (-> m
(update-some :strand #(case % :plus "+" :minus "-" :no-strand "."))
(update-some :block-sizes long-list->str)
(update-some :block-starts long-list->str))
((apply juxt bed-columns))
(take-while identity)
(interpose " ")
(apply str)))

(defn- header-or-comment?
"Checks if given string is neither a header nor a comment line."
[^String s]
(or (empty? s)
(.startsWith s "browser")
(.startsWith s "track")
(.startsWith s "#")))

(defn- normalize
"Normalize BED fields.
BED fields are stored in format: 0-origin and inclusive-start / exclusive-end.
This function converts the coordinate into cljam style: 1-origin and inclusice-start / inclusive-end."
[m]
(-> m
(update :chr chr-util/normalize-chromosome-key)
(update :start inc)
(update-some :thick-start inc)))

(defn- denormalize
"De-normalize BED fields.
This is an inverse function of normalize."
[m]
(-> m
(update :start dec)
(update-some :thick-start dec)))

(defn read-raw-fields
"Returns a lazy sequence of unnormalized BED fields."
[^BufferedReader rdr]
(sequence
(comp (remove header-or-comment?)
(map deserialize-bed))
(line-seq rdr)))

(defn read-fields
"Returns a lazy sequence of normalized BED fields."
[^BufferedReader rdr]
(sequence
(comp (remove header-or-comment?)
(map deserialize-bed)
(map normalize))
(line-seq rdr)))

(defn sort-fields
"Sort BED fields based on :chr, :start and :end.
:chr with common names come first, in order of (chr1, chr2, ..., chrX, chrY, chrM).
Other chromosomes follow after in lexicographic order."
[xs]
(sort-by
(fn [m]
[(or (util/str->int (last (re-find #"(chr)?(\d+)" (:chr m)))) Integer/MAX_VALUE)
(or ({"X" 23 "Y" 24 "M" 25} (last (re-find #"(chr)?([X|Y|M])" (:chr m)))) Integer/MAX_VALUE)
(:chr m)
(:start m)
(:end m)])
xs))

(defn merge-fields
"Sort and merge overlapped regions.
Currently, this function affects only :end and :name fields."
[xs]
(reduce
(fn [r m]
(let [l (last r)]
(if (and l (= (:chr l) (:chr m)) (<= (:start m) (:end l)) (<= (:start l) (:end m)))
(update r (dec (count r)) (fn [n m] (-> n (assoc :end (:end m)) (update-some :name str "+" (:name m)))) m)
(conj r m))))
[]
(sort-fields xs)))

(defn write-raw-fields
"Write sequence of BED fields to writer without converting :start and :thick-start values."
[^BufferedWriter wtr xs]
(->> xs
(map serialize-bed)
(interpose "\n")
^String (apply str)
(.write wtr)))

(defn write-fields
"Write sequence of BED fields to writer."
[^BufferedWriter wtr xs]
(->> xs
(map (comp serialize-bed denormalize))
(interpose "\n")
^String (apply str)
(.write wtr)))
9 changes: 5 additions & 4 deletions src/cljam/fasta.clj
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,11 @@
(fa-core/read-sequences rdr))

(defn read-sequence
([rdr name]
(fa-core/read-sequence rdr name))
([rdr name start end]
(fa-core/read-sequence rdr name start end)))
[rdr {:keys [chr start end]}]
(cond
(and chr start end) (fa-core/read-sequence rdr chr start end)
chr (fa-core/read-sequence rdr chr)
:else (fa-core/read-sequences rdr)))

(defn reset
[rdr]
Expand Down
8 changes: 6 additions & 2 deletions src/cljam/fasta/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,12 @@
(defn read-sequence
[^FASTAReader rdr name start end]
(when-let [fai (.index rdr)]
(let [[offset-start offset-end] (fasta-index/get-span fai name start end)]
(read-sequence-with-offset rdr offset-start offset-end))))
(let [header (fasta-index/get-header fai name)]
(when-let [[offset-start offset-end] (fasta-index/get-span fai name (dec start) end)]
(->> (concat (repeat (max 0 (- 1 start)) \N)
(read-sequence-with-offset rdr offset-start offset-end)
(repeat (max 0 (- end (:len header))) \N))
(apply str))))))

(defn read
"Reads FASTA sequence data, returning its information as a lazy sequence."
Expand Down
6 changes: 2 additions & 4 deletions src/cljam/pileup/mpileup.clj
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,10 @@
([fa-reader bam-reader rname start end]
(try
(if-let [r (sam-util/ref-by-name (io/read-refs bam-reader) rname)]
(let [s (if (neg? start) 0 start)
(let [s (if (neg? start) 1 start)
e (if (neg? end) (:len r) end)
refseq (if fa-reader
(if (zero? s)
(concat [\N] (fa/read-sequence fa-reader rname s e))
(fa/read-sequence fa-reader rname (dec s) e))
(fa/read-sequence fa-reader {:chr rname :start s :end e})
(repeat \N))
alns (io/read-alignments bam-reader {:chr rname :start s :end e :depth :deep})]
(pileup* refseq alns rname s e)))
Expand Down
2 changes: 1 addition & 1 deletion src/cljam/pileup/pileup.clj
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@
(if-let [r (sam-util/ref-by-name (io/read-refs bam-reader) rname)]
(pileup* bam-reader
rname (:len r)
(if (neg? start) 0 start)
(if (neg? start) 1 start)
(if (neg? end) (:len r) end)))
(catch bgzf4j.BGZFException _
(throw (RuntimeException. "Invalid file format"))))))
3 changes: 3 additions & 0 deletions test-resources/test1.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
track name=pairedReads description="Clone Paired Reads" useScore=1
chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512
chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601
13 changes: 13 additions & 0 deletions test-resources/test2.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
browser position chr7:127471196-127495720
browser hide all
track name="ItemRGBDemo" description="Item RGB demonstration" visibility=2 itemRgb="On"
chr7 127471196 127472363 Pos1 0 + 127471196 127472363 255,0,0
chr7 127472363 127473530 Pos2 0 + 127472363 127473530 255,0,0
chr7 127473530 127474697 Pos3 0 + 127473530 127474697 255,0,0
chr7 127474697 127475864 Pos4 0 + 127474697 127475864 255,0,0
chr7 127475864 127477031 Neg1 0 - 127475864 127477031 0,0,255
chr7 127477031 127478198 Neg2 0 - 127477031 127478198 0,0,255
chr7 127478198 127479365 Neg3 0 - 127478198 127479365 0,0,255
chr7 127479365 127480532 Pos5 0 + 127479365 127480532 255,0,0
chr7 127480532 127481699 Neg4 0 - 127480532 127481699 0,0,255

12 changes: 12 additions & 0 deletions test-resources/test3.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
browser position chr7:127471196-127495720
browser hide all
track name="ColorByStrandDemo" description="Color by strand demonstration" visibility=2 colorByStrand="255,0,0 0,0,255"
chr7 127471196 127472363 Pos1 0 +
chr7 127472363 127473530 Pos2 0 +
chr7 127473530 127474697 Pos3 0 +
chr7 127474697 127475864 Pos4 0 +
chr7 127475864 127477031 Neg1 0 -
chr7 127477031 127478198 Neg2 0 -
chr7 127478198 127479365 Neg3 0 -
chr7 127479365 127480532 Pos5 0 +
chr7 127480532 127481699 Neg4 0 -
Loading

0 comments on commit 09456cd

Please sign in to comment.