Skip to content

Commit

Permalink
Implement pileup(contig=None) for unindexed/SAM files (PR #916)
Browse files Browse the repository at this point in the history
Reorganise pileup() so that it chooses between IteratorColumnRegion/
IteratorColumnAllRefs/IteratorColumnAll primarily by has_coord (i.e.,
whether contig+etc/region was specified) and secondarily by has_index.

Add IteratorColumnAll() which iterates as per HTS_IDX_REST, piling up the
entire file, which works without an index and for SAM files.  Fixes #915.

Unfortunately sam_itr_next() does not work for SAM files (even for
HTS_IDX_REST) prior to HTSlib 1.10. Instead we follow samtools mpileup's
lead and special case this to use sam_read1() at the bottom level
instead of an HTS_IDX_REST iterator. We duplicate __advance_all() and
__advance_nofilter() functions, but __advance_samtools() is itself
complicated enough that we just make it handle both cases instead.
  • Loading branch information
jmarshall committed May 4, 2020
1 parent ecc133f commit 9e7bc31
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 13 deletions.
5 changes: 5 additions & 0 deletions pysam/libcalignmentfile.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ cdef class IteratorColumn:
int start,
int stop,
int multiple_iterators=?)
cdef _setup_raw_rest_iterator(self)

cdef reset(self, tid, start, stop)
cdef _free_pileup_iter(self)
Expand All @@ -157,6 +158,10 @@ cdef class IteratorColumnAllRefs(IteratorColumn):
pass


cdef class IteratorColumnAll(IteratorColumn):
pass


cdef class IndexedReads:
cdef AlignmentFile samfile
cdef htsFile * htsfile
Expand Down
135 changes: 122 additions & 13 deletions pysam/libcalignmentfile.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
# class IteratorRowSelection
# class IteratorColumn
# class IteratorColumnRegion
# class IteratorColumnAll
# class IteratorColumnAllRefs
#
########################################################
Expand Down Expand Up @@ -1325,22 +1326,20 @@ cdef class AlignmentFile(HTSFile):
has_coord, rtid, rstart, rstop = self.parse_region(
contig, start, stop, region, reference=reference, end=end)

if self.is_bam or self.is_cram:
if has_coord:
if not self.has_index():
raise ValueError("no index available for pileup")

if has_coord:
return IteratorColumnRegion(self,
tid=rtid,
start=rstart,
stop=rstop,
**kwargs)
else:
return IteratorColumnAllRefs(self, **kwargs)

return IteratorColumnRegion(self,
tid=rtid,
start=rstart,
stop=rstop,
**kwargs)
else:
raise NotImplementedError(
"pileup of samfiles not implemented yet")
if self.has_index():
return IteratorColumnAllRefs(self, **kwargs)
else:
return IteratorColumnAll(self, **kwargs)

def count(self,
contig=None,
Expand Down Expand Up @@ -2318,6 +2317,16 @@ cdef int __advance_nofilter(void *data, bam1_t *b):
return ret


cdef int __advance_raw_nofilter(void *data, bam1_t *b):
'''advance (without iterator) without any read filtering.
'''
cdef __iterdata * d = <__iterdata*>data
cdef int ret
with nogil:
ret = sam_read1(d.htsfile, d.header, b)
return ret


cdef int __advance_all(void *data, bam1_t *b):
'''only use reads for pileup passing basic filters such as
Expand All @@ -2338,6 +2347,25 @@ cdef int __advance_all(void *data, bam1_t *b):
return ret


cdef int __advance_raw_all(void *data, bam1_t *b):
'''only use reads for pileup passing basic filters such as
BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP
'''

cdef __iterdata * d = <__iterdata*>data
cdef int ret
while 1:
with nogil:
ret = sam_read1(d.htsfile, d.header, b)
if ret < 0:
break
if b.core.flag & d.flag_filter:
continue
break
return ret


cdef int __advance_samtools(void * data, bam1_t * b):
'''advance using same filter and read processing as in
the samtools pileup.
Expand All @@ -2348,7 +2376,7 @@ cdef int __advance_samtools(void * data, bam1_t * b):

while 1:
with nogil:
ret = sam_itr_next(d.htsfile, d.iter, b)
ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
if ret < 0:
break
if b.core.flag & d.flag_filter:
Expand Down Expand Up @@ -2544,12 +2572,64 @@ cdef class IteratorColumn:
with nogil:
bam_mplp_init_overlaps(self.pileup_iter)

cdef _setup_raw_rest_iterator(self):
'''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST'''

self.iter = None
self.iterdata.iter = NULL
self.iterdata.htsfile = self.samfile.htsfile
self.iterdata.seq = NULL
self.iterdata.tid = -1
self.iterdata.header = self.samfile.header.ptr

if self.fastafile is not None:
self.iterdata.fastafile = self.fastafile.fastafile
else:
self.iterdata.fastafile = NULL

# Free any previously allocated memory before reassigning
# pileup_iter
self._free_pileup_iter()

cdef void * data[1]
data[0] = <void*>&self.iterdata

if self.stepper is None or self.stepper == "all":
with nogil:
self.pileup_iter = bam_mplp_init(1,
<bam_plp_auto_f>&__advance_raw_all,
data)
elif self.stepper == "nofilter":
with nogil:
self.pileup_iter = bam_mplp_init(1,
<bam_plp_auto_f>&__advance_raw_nofilter,
data)
elif self.stepper == "samtools":
with nogil:
self.pileup_iter = bam_mplp_init(1,
<bam_plp_auto_f>&__advance_samtools,
data)
else:
raise ValueError(
"unknown stepper option `%s` in IteratorColumn" % self.stepper)

if self.max_depth:
with nogil:
bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

if self.ignore_overlaps:
with nogil:
bam_mplp_init_overlaps(self.pileup_iter)

cdef reset(self, tid, start, stop):
'''reset iterator position.
This permits using the iterator multiple times without
having to incur the full set-up costs.
'''
if self.iter is None:
raise TypeError("Raw iterator set up without region cannot be reset")

self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
self.iterdata.iter = self.iter.iter

Expand Down Expand Up @@ -2689,6 +2769,35 @@ cdef class IteratorColumnAllRefs(IteratorColumn):
self.samfile.header)


cdef class IteratorColumnAll(IteratorColumn):
"""iterates over all columns, without using an index
"""

def __cinit__(self,
AlignmentFile samfile,
**kwargs):

self._setup_raw_rest_iterator()

def __next__(self):

cdef int n
n = self.cnext()
if n < 0:
raise ValueError("error during iteration")

if n == 0:
raise StopIteration

return makePileupColumn(&self.plp,
self.tid,
self.pos,
self.n_plp,
self.min_base_quality,
self.iterdata.seq,
self.samfile.header)


cdef class SNPCall:
'''the results of a SNP call.'''
cdef int _tid
Expand Down

0 comments on commit 9e7bc31

Please sign in to comment.