Merge branch 'nm-tag' into 'dev'

Output a correct MN tag, and remove potentially confusing minimap2 tags See merge request epi2melabs/pore-c-py!50
epi2me-labs · Jul 18, 2024 · d57a802 · d57a802
2 parents ebbc3f5 + 38f65ce
commit d57a802
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,9 +4,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v2.1.4]
+### Fixed
+- MN tag is now amended when monomer records are created.
+### Added
+- All potential minimap2 SAM tags are removed from input records before processing.
+
 ## [v2.1.3]
-### Changed
-- ML tag is now correctly a byte array tag.
+### Fixed
+- Output ML tag is now correctly a byte array tag.
 
 ## [v2.1.2]
 ### Added

diff --git a/pore_c_py/__init__.py b/pore_c_py/__init__.py
@@ -1,3 +1,3 @@
 """Init."""
 
-__version__ = "2.1.3"
+__version__ = "2.1.4"
diff --git a/pore_c_py/digest.py b/pore_c_py/digest.py
@@ -98,15 +98,31 @@ def digest_sequence(align, enzyme, remove_tags=None, max_monomers=None):
     """
     if max_monomers is None:
         max_monomers = float('inf')
-    # the move tag massively bloats files, and we don't care for
-    # it or handle it in trimming, so force its removal by default.
     if remove_tags is None:
         remove_tags = set()
     else:
         remove_tags = set(remove_tags)
-    remove_tags.add('mv')
+
+    # build a set of tags to always remove regardless of the input.
+    # This is everything produce by minimap2 plus the move 'mv'
+    # from basecaller which is massive and not handled in trimming
+    hardlist = {
+        'tp', 'cm', 's1', 's2',
+        'NM', 'MD', 'AS', 'SA',
+        'ms', 'nn', 'ts', 'cg',
+        'cs', 'dv', 'de', 'rl',
+        'mv'}
+    remove_tags.update(hardlist)
+    # delete all mod tags if any were subject to deletion
+    keep_mods = True
+    mod_tags = {'Mm', 'Ml', 'MM', 'ML', 'Mn', 'MN'}
+    if remove_tags & mod_tags:
+        keep_mods = False
+        remove_tags.update(mod_tags)
+
+    # now actually remove stuff!
     for tag in remove_tags:
-        if align.has_tag(tag):
+        if align.has_tag(tag):  # set tag doesn't like invalid tag codes
             align.set_tag(tag, None)
 
     concatemer_id = align.query_name
@@ -133,21 +149,17 @@ def digest_sequence(align, enzyme, remove_tags=None, max_monomers=None):
         read.query_sequence = seq
         read.query_qualities = qual
         # deal with mods, upgrading tag from interim to approved spec
-        if ('Mm' in remove_tags) or ('MM' in remove_tags):
-            # Setting to None effectively deletes,
-            # or does nothing if not present
-            for tag in ('Mm', 'Ml', 'MM', 'ML'):
-                read.set_tag(tag, None)
-        else:
+        if keep_mods:
             mm, ml = get_subread_modified_bases(align, start, end)
-            for tag in ('Mm', 'Ml'):
+            for tag in ('Mm', 'Ml', 'Mn'):
                 read.set_tag(tag, None)
             if len(ml) == 0:
                 logger.info(
                     f"Read {concatemer_id} has no modified bases.")
             else:
                 read.set_tag("MM", mm)
                 read.set_tag("ML", ml)
+                read.set_tag("MN", len(seq))
         # lexograpically sortable monomer ID
         read.query_name = \
             f"{concatemer_id}:{start:0{num_digits}d}:{end:0{num_digits}d}"