From ecb112de90327054ddafd12345d2aa60051dd650 Mon Sep 17 00:00:00 2001
From: Hillel Arnold <helrond@hotmail.com>
Date: Sun, 22 May 2022 16:26:20 -0400
Subject: [PATCH 1/3] use only first letter of alphabetical indicators

---
 fixtures/merger/instance_parse/subcontainer.json | 12 ++++++++++++
 merger/helpers.py                                |  6 ++++--
 2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/fixtures/merger/instance_parse/subcontainer.json b/fixtures/merger/instance_parse/subcontainer.json
index ab076e46..8c48ec6a 100644
--- a/fixtures/merger/instance_parse/subcontainer.json
+++ b/fixtures/merger/instance_parse/subcontainer.json
@@ -112,5 +112,17 @@
       "extent_type": "folder",
       "number": 5
     }]
+  },
+  {
+    "source": [{
+      "sub_container": {
+        "indicator_2": "Ar - Be",
+        "type_2": "folder"
+      }
+    }],
+    "parsed": [{
+      "extent_type": "folder",
+      "number": 2
+    }]
   }
 ]
diff --git a/merger/helpers.py b/merger/helpers.py
index c0024412..4413cf62 100644
--- a/merger/helpers.py
+++ b/merger/helpers.py
@@ -12,7 +12,9 @@ def indicator_to_integer(indicator):
     """Converts an instance indicator to an integer.
 
     An indicator can be an integer (23) a combination of integers and letters (23b)
-    or just a letter (B).
+    or just letters (B, Be). In cases where indicator data only consists of letters,
+    the function will return an integer based on the ordinal value of the lowercased
+    first letter in the indicator.
     """
     try:
         integer = int(indicator)
@@ -20,7 +22,7 @@ def indicator_to_integer(indicator):
         parsed = re.sub("[^0-9]", "", indicator)
         if len(parsed):
             return indicator_to_integer(parsed)
-        integer = ord(indicator.lower()) - 97
+        integer = ord(indicator[0].lower()) - 97
     return integer
 
 

From e23ecba069219794e3c3ecdf3f251f04b43b59f2 Mon Sep 17 00:00:00 2001
From: Hillel Arnold <helrond@hotmail.com>
Date: Fri, 3 Jun 2022 13:55:52 -0400
Subject: [PATCH 2/3] strip all tags

---
 transformer/mappings.py | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/transformer/mappings.py b/transformer/mappings.py
index 873f659a..b5c2762e 100644
--- a/transformer/mappings.py
+++ b/transformer/mappings.py
@@ -1,4 +1,6 @@
 import json
+import re
+import xml.etree.ElementTree as ET
 
 import odin
 import requests
@@ -51,9 +53,15 @@ def has_online_instance(instances, uri):
     return False
 
 
-def replace_xml(content_list):
-    """Replaces XML entities in notes with HTML tags."""
-    return [c.replace("extref", "a") for c in content_list]
+def strip_tags(user_string):
+    """Strips XML and HTML tags from a string and returns the remaining text.
+
+    Input that is not well-formed XML (e.g. a bare ampersand) is handled by a regex.
+    """
+    try:
+        return ''.join(ET.fromstring(f'<xml>{user_string}</xml>').itertext())
+    except ET.ParseError:
+        return re.sub(r'<[/\w][^>]*>', '', user_string)  # `*` so one-letter tags (<b>, <p>) go too
 
 
 def transform_language(value, lang_materials):
@@ -123,7 +131,7 @@ class SourceAncestorToRecordReference(odin.Mapping):
 
     @odin.map_field(from_field="title", to_field="title")
     def title(self, value):
-        return value.strip()
+        return strip_tags(value.strip())
 
     @odin.map_field(from_field="order", to_field="order")
     def order(self, value):
@@ -159,7 +167,7 @@ def type(self, value):
 
     @odin.map_field(from_field="title", to_field="title")
     def title(self, value):
-        return value.strip()
+        return strip_tags(value.strip())
 
     @odin.map_list_field(from_field="ref", to_field="external_identifiers", to_list=True)
     def external_identifiers(self, value):
@@ -297,8 +305,8 @@ def map_subnotes(self, value):
             subnote = self.chronology_subnotes(value.items)
         else:
             subnote = Subnote(
-                type="text", content=replace_xml(value.content)
-                if isinstance(value.content, list) else replace_xml([value.content]))
+                type="text", content=[strip_tags(c) for c in value.content]
+                if isinstance(value.content, list) else [strip_tags(value.content)])
         return subnote
 
     @odin.map_list_field(from_field="subnotes", to_field="subnotes", to_list=True)
@@ -309,7 +317,7 @@ def subnotes(self, value):
         elif self.source.jsonmodel_type in ["note_singlepart"]:
             # Here content is a list passed as a string, so we have to reconvert.
             content = [self.source.content.strip("][\"\'")]
-            subnotes = [Subnote(type="text", content=replace_xml(content))]
+            subnotes = [Subnote(type="text", content=[strip_tags(c) for c in content])]
         elif self.source.jsonmodel_type == "note_index":
             subnotes = self.index_subnotes(self.source.content, self.source.items)
         elif self.source.jsonmodel_type == "note_bibliography":
@@ -322,7 +330,7 @@ def bibliograpy_subnotes(self, raw_content, items):
         data = []
         # Here content is a list passed as a string, so we have to reconvert.
         content = [raw_content.strip("][\'")]
-        data.append(Subnote(type="text", content=replace_xml(content)))
+        data.append(Subnote(type="text", content=[strip_tags(c) for c in content]))
         data.append(Subnote(type="orderedlist", content=items))
         return data
 
@@ -343,6 +351,10 @@ class SourceResourceToCollection(odin.Mapping):
     from_obj = SourceResource
     to_obj = Collection
 
+    @odin.map_field(from_field="title", to_field="title")
+    def title(self, value):
+        return strip_tags(value)
+
     @odin.map_list_field(from_field="notes", to_field="notes", to_list=True)
     def notes(self, value):
         return SourceNoteToNote.apply([v for v in value if (v.publish and v.type in NOTE_TYPE_CHOICES_TRANSFORM)])
@@ -410,7 +422,7 @@ def title(self, value):
         title = value.strip() if value else self.source.display_string.strip()
         if getattr(self.source, "component_id", None):
             title = "{}, {} {}".format(title, self.source.level.capitalize(), self.source.component_id)
-        return title
+        return strip_tags(title)
 
     @odin.map_field(from_field="language", to_field="languages", to_list=True)
     def languages(self, value):
@@ -476,7 +488,8 @@ def dates(self, value):
 
     @odin.map_field
     def title(self, value):
-        return value.strip() if value else self.source.display_string.strip()
+        title = value.strip() if value else self.source.display_string.strip()
+        return strip_tags(title)
 
     @odin.map_field(from_field="language", to_field="languages", to_list=True)
     def languages(self, value):

From a96a465e0ac56b1e33f578a9f4fd614b58245cfc Mon Sep 17 00:00:00 2001
From: Hillel Arnold <helrond@hotmail.com>
Date: Fri, 3 Jun 2022 13:56:46 -0400
Subject: [PATCH 3/3] add tests for tag stripping

---
 transformer/tests.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/transformer/tests.py b/transformer/tests.py
index 75632c39..c4054500 100644
--- a/transformer/tests.py
+++ b/transformer/tests.py
@@ -10,7 +10,7 @@
 from fetcher.helpers import identifier_from_uri
 
 from .cron import CheckMissingOnlineAssets
-from .mappings import has_online_instance
+from .mappings import has_online_instance, strip_tags
 from .models import DataObject
 from .resources.configs import NOTE_TYPE_CHOICES_TRANSFORM
 from .transformers import Transformer
@@ -255,3 +255,7 @@ def test_transformer(self):
     def test_ping(self):
         response = self.client.get(reverse('ping'))
         self.assertEqual(response.status_code, 200)
+
+    def test_strip_tags(self):  # wrapped, inline-tagged and plain input all reduce to the same text
+        for markup in ["<title>a collection</title>", "a <a href='https://example.com'>collection</a>", "a collection"]:
+            self.assertEqual('a collection', strip_tags(markup))