Skip to content

Commit

Permalink
Merge branch 'nm-tag' into 'dev'
Browse files Browse the repository at this point in the history
Output a correct MN tag, and remove potentially confusing minimap2 tags

See merge request epi2melabs/pore-c-py!50
  • Loading branch information
cjw85 committed Jul 18, 2024
2 parents ebbc3f5 + 38f65ce commit d57a802
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 14 deletions.
10 changes: 8 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v2.1.4]
### Fixed
- MN tag is now amended when monomer records are created.
### Added
- All potential minimap2 SAM tags are removed from input records before processing.

## [v2.1.3]
### Changed
- ML tag is now correctly a byte array tag.
### Fixed
- Output ML tag is now correctly a byte array tag.

## [v2.1.2]
### Added
Expand Down
2 changes: 1 addition & 1 deletion pore_c_py/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Init."""

__version__ = "2.1.3"
__version__ = "2.1.4"
34 changes: 23 additions & 11 deletions pore_c_py/digest.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,31 @@ def digest_sequence(align, enzyme, remove_tags=None, max_monomers=None):
"""
if max_monomers is None:
max_monomers = float('inf')
# the move tag massively bloats files, and we don't care for
# it or handle it in trimming, so force its removal by default.
if remove_tags is None:
remove_tags = set()
else:
remove_tags = set(remove_tags)
remove_tags.add('mv')

# build a set of tags to always remove regardless of the input.
# This is everything produce by minimap2 plus the move 'mv'
# from basecaller which is massive and not handled in trimming
hardlist = {
'tp', 'cm', 's1', 's2',
'NM', 'MD', 'AS', 'SA',
'ms', 'nn', 'ts', 'cg',
'cs', 'dv', 'de', 'rl',
'mv'}
remove_tags.update(hardlist)
# delete all mod tags if any were subject to deletion
keep_mods = True
mod_tags = {'Mm', 'Ml', 'MM', 'ML', 'Mn', 'MN'}
if remove_tags & mod_tags:
keep_mods = False
remove_tags.update(mod_tags)

# now actually remove stuff!
for tag in remove_tags:
if align.has_tag(tag):
if align.has_tag(tag): # set tag doesn't like invalid tag codes
align.set_tag(tag, None)

concatemer_id = align.query_name
Expand All @@ -133,21 +149,17 @@ def digest_sequence(align, enzyme, remove_tags=None, max_monomers=None):
read.query_sequence = seq
read.query_qualities = qual
# deal with mods, upgrading tag from interim to approved spec
if ('Mm' in remove_tags) or ('MM' in remove_tags):
# Setting to None effectively deletes,
# or does nothing if not present
for tag in ('Mm', 'Ml', 'MM', 'ML'):
read.set_tag(tag, None)
else:
if keep_mods:
mm, ml = get_subread_modified_bases(align, start, end)
for tag in ('Mm', 'Ml'):
for tag in ('Mm', 'Ml', 'Mn'):
read.set_tag(tag, None)
if len(ml) == 0:
logger.info(
f"Read {concatemer_id} has no modified bases.")
else:
read.set_tag("MM", mm)
read.set_tag("ML", ml)
read.set_tag("MN", len(seq))
# lexograpically sortable monomer ID
read.query_name = \
f"{concatemer_id}:{start:0{num_digits}d}:{end:0{num_digits}d}"
Expand Down

0 comments on commit d57a802

Please sign in to comment.