Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add include_terminal_gaps parameter #563

Merged
merged 1 commit into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 60 additions & 15 deletions src/biotite/sequence/align/cigar.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,14 @@ def read_alignment_from_cigar(cigar, position,
AAAAGGTTTCCGACCGTAGGTAG
CCCCGGTTT--GACCGTATGTAG

Explicit terminal deletions are also possible.
Note that in this case the deleted positions count as aligned bases
with respect to the `position` parameter.

>>> print(read_alignment_from_cigar("3D9M2D12M4D", 0, ref, seg))
TATAAAAGGTTTCCGACCGTAGGTAGCTGA
---CCCCGGTTT--GACCGTATGTAG----

If bases in the segment sequence are soft-clipped, they do not
appear in the alignment.
Furthermore, the start of the reference sequence must be adapted.
Expand All @@ -122,7 +130,7 @@ def read_alignment_from_cigar(cigar, position,
GGTTTCCGACCGTAGGTAG
GGTTT--GACCGTATGTAG

Reading from BAM codes is also possible:
Reading from BAM codes is also possible.

>>> seg = NucleotideSequence("CCCCGGTTTGACCGTATGTAG")
>>> op_tuples = [
Expand Down Expand Up @@ -190,7 +198,8 @@ def read_alignment_from_cigar(cigar, position,

def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
introns=(), distinguish_matches=False,
hard_clip=False, as_string=True):
hard_clip=False, include_terminal_gaps=False,
as_string=True):
"""
Convert an :class:`Alignment` into a CIGAR string.

Expand Down Expand Up @@ -220,6 +229,13 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
hard_clip : bool, optional
If true, clipped bases are hard-clipped.
Otherwise, clipped bases are soft-clipped.
include_terminal_gaps : bool, optional
If true, terminal gaps in the segment sequence are included in
the CIGAR string.
These are represented by ``D`` operations at the start and/or
end of the string.
By default, those terminal gaps are omitted in the CIGAR, which
is the way SAM/BAM expects a CIGAR to be.
as_string : bool, optional
If true, the CIGAR string is returned.
Otherwise, a list of tuples is returned, where the first element
Expand All @@ -238,6 +254,12 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
--------
read_alignment_from_cigar

Notes
-----
If `include_terminal_gaps` is set to true, you usually want to set
``position=0`` in :func:`read_alignment_from_cigar` to get the
correct alignment.

Examples
--------

Expand All @@ -256,6 +278,8 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
9M2N12M
>>> print(write_alignment_to_cigar(semiglobal_alignment, distinguish_matches=True))
4X5=2D7=1X4=
>>> print(write_alignment_to_cigar(semiglobal_alignment, include_terminal_gaps=True))
3D9M2D12M4D
>>> local_alignment = align_optimal(ref, seg, matrix, local=True)[0]
>>> print(local_alignment)
GGTTTCCGACCGTAGGTAG
Expand All @@ -274,9 +298,8 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
CigarOp.DELETION 2
CigarOp.MATCH 12
"""
# Ignore terminal gaps in segment sequence
no_gap_pos = np.where(alignment.trace[:, segment_index] != -1)[0]
alignment = alignment[no_gap_pos[0] : no_gap_pos[-1] + 1]
if not include_terminal_gaps:
alignment = _remove_terminal_segment_gaps(alignment, segment_index)

ref_trace = alignment.trace[:, reference_index]
seg_trace = alignment.trace[:, segment_index]
Expand Down Expand Up @@ -321,19 +344,13 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
op_tuples = _aggregate_consecutive(operations)

clip_op = CigarOp.HARD_CLIP if hard_clip else CigarOp.SOFT_CLIP
# Missing bases at the beginning and end of the segment are
# interpreted as clipped
# As first element in the segment trace is the first aligned base,
# all previous bases are clipped...
start_clip_length = seg_trace[0]
start_clip_length, end_clip_length = _find_clipped_bases(
alignment, segment_index
)
if start_clip_length != 0:
start_clip = [(clip_op, seg_trace[0])]
start_clip = [(clip_op, start_clip_length)]
else:
start_clip = np.zeros((0, 2), dtype=int)
# ...and the same applies for the last base
end_clip_length = (
len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
)
if end_clip_length != 0:
end_clip = [(clip_op, end_clip_length)]
else:
Expand All @@ -347,6 +364,34 @@ def write_alignment_to_cigar(alignment, reference_index=0, segment_index=1,
return op_tuples


def _remove_terminal_segment_gaps(alignment, segment_index):
"""
Remove terminal gaps in the segment sequence.
"""
no_gap_pos = np.where(alignment.trace[:, segment_index] != -1)[0]
return alignment[no_gap_pos[0] : no_gap_pos[-1] + 1]


def _find_clipped_bases(alignment, segment_index):
"""
Find the number of clipped bases at the start and end of the segment.
"""
# Finding the clipped part is easier, when the terminal segment gaps
# are removed (if not already done)
alignment = _remove_terminal_segment_gaps(alignment, segment_index)
seg_trace = alignment.trace[:, segment_index]
# Missing bases at the beginning and end of the segment are
# interpreted as clipped
# As first element in the segment trace is the first aligned base,
# all previous bases are clipped...
start_clip_length = seg_trace[0]
# ...and the same applies for the last base
end_clip_length = (
len(alignment.sequences[segment_index]) - seg_trace[-1] - 1
)
return start_clip_length, end_clip_length


def _aggregate_consecutive(operations):
"""
Aggregate consecutive operations of the same type.
Expand Down
20 changes: 10 additions & 10 deletions tests/sequence/align/test_cigar.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,16 @@ def test_cigar_conversion(cigar):


@pytest.mark.parametrize(
"seed, local, distinguish_matches",
"seed, local, distinguish_matches, include_terminal_gaps",
itertools.product(
range(20),
[False, True],
[False, True],
[False, True],
)
)
def test_alignment_conversion(seed, local, distinguish_matches):
def test_alignment_conversion(seed, local, distinguish_matches,
include_terminal_gaps):
"""
Check whether an :class:`Alignment` converted into a CIGAR string
and back again into an :class:`Alignment` gives the same result.
Expand All @@ -130,15 +132,18 @@ def test_alignment_conversion(seed, local, distinguish_matches):
ref_ali = align.align_optimal(
ref, seg, matrix, terminal_penalty=False, max_number=1
)[0]
# Turn into a semi-global alignment
ref_ali = align.remove_terminal_gaps(ref_ali)
if not include_terminal_gaps:
# Turn into a semi-global alignment
ref_ali = align.remove_terminal_gaps(ref_ali)
# Remove score as the compared reconstructed alignment does not
# contain it either
ref_ali.score = None
start_position = ref_ali.trace[0,0]

cigar = align.write_alignment_to_cigar(
ref_ali, distinguish_matches=distinguish_matches
ref_ali,
distinguish_matches=distinguish_matches,
include_terminal_gaps=include_terminal_gaps
)

test_ali = align.read_alignment_from_cigar(
Expand All @@ -147,13 +152,8 @@ def test_alignment_conversion(seed, local, distinguish_matches):

print(cigar)
print("\n\n")
print(seg)
print("\n\n")
print(ref_ali)
print("\n\n")
print(test_ali)
print("\n\n")
print(start_position)
print(ref_ali.trace[:5])
print(test_ali.trace[:5])
assert test_ali == ref_ali
Loading