From ecb112de90327054ddafd12345d2aa60051dd650 Mon Sep 17 00:00:00 2001 From: Hillel Arnold Date: Sun, 22 May 2022 16:26:20 -0400 Subject: [PATCH 1/3] use only first letter of alphabetical indicators --- fixtures/merger/instance_parse/subcontainer.json | 12 ++++++++++++ merger/helpers.py | 6 ++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/fixtures/merger/instance_parse/subcontainer.json b/fixtures/merger/instance_parse/subcontainer.json index ab076e46..8c48ec6a 100644 --- a/fixtures/merger/instance_parse/subcontainer.json +++ b/fixtures/merger/instance_parse/subcontainer.json @@ -112,5 +112,17 @@ "extent_type": "folder", "number": 5 }] + }, + { + "source": [{ + "sub_container": { + "indicator_2": "Ar - Be", + "type_2": "folder" + } + }], + "parsed": [{ + "extent_type": "folder", + "number": 2 + }] } ] diff --git a/merger/helpers.py b/merger/helpers.py index c0024412..4413cf62 100644 --- a/merger/helpers.py +++ b/merger/helpers.py @@ -12,7 +12,9 @@ def indicator_to_integer(indicator): """Converts an instance indicator to an integer. An indicator can be an integer (23) a combination of integers and letters (23b) - or just a letter (B). + or just letters (B, Be). In cases where indicator data only consists of letters, + the function will return an integer based on the ordinal value of the lowercased + first letter in the indicator. """ try: integer = int(indicator) @@ -20,7 +22,7 @@ def indicator_to_integer(indicator): parsed = re.sub("[^0-9]", "", indicator) if len(parsed): return indicator_to_integer(parsed) - integer = ord(indicator.lower()) - 97 + integer = ord(indicator[0].lower()) - 97 return integer From e23ecba069219794e3c3ecdf3f251f04b43b59f2 Mon Sep 17 00:00:00 2001 From: Hillel Arnold Date: Fri, 3 Jun 2022 13:55:52 -0400 Subject: [PATCH 2/3] strip all tags --- transformer/mappings.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/transformer/mappings.py b/transformer/mappings.py index 873f659a..b5c2762e 100644 --- a/transformer/mappings.py +++ b/transformer/mappings.py @@ -1,4 +1,6 @@ import json +import re +import xml.etree.ElementTree as ET import odin import requests @@ -51,9 +53,15 @@ def has_online_instance(instances, uri): return False -def replace_xml(content_list): - """Replaces XML entities in notes with HTML tags.""" - return [c.replace("extref", "a") for c in content_list] +def strip_tags(user_string): + """Strips XML and HTML tags from a string.""" + try: + xmldoc = ET.fromstring(f'{user_string}') + textcontent = ''.join(xmldoc.itertext()) + except ET.ParseError: + tagregxp = re.compile(r'<[/\w][^>]+>') + textcontent = tagregxp.sub('', user_string) + return textcontent def transform_language(value, lang_materials): @@ -123,7 +131,7 @@ class SourceAncestorToRecordReference(odin.Mapping): @odin.map_field(from_field="title", to_field="title") def title(self, value): - return value.strip() + return strip_tags(value.strip()) @odin.map_field(from_field="order", to_field="order") def order(self, value): @@ -159,7 +167,7 @@ def type(self, value): @odin.map_field(from_field="title", to_field="title") def title(self, value): - return value.strip() + return strip_tags(value.strip()) @odin.map_list_field(from_field="ref", to_field="external_identifiers", to_list=True) def external_identifiers(self, value): @@ -297,8 +305,8 @@ def map_subnotes(self, value): subnote = self.chronology_subnotes(value.items) else: subnote = Subnote( - type="text", content=replace_xml(value.content) - if isinstance(value.content, list) else replace_xml([value.content])) + type="text", content=[strip_tags(c) for c in value.content] + if isinstance(value.content, list) else [strip_tags(value.content)]) return subnote @odin.map_list_field(from_field="subnotes", to_field="subnotes", to_list=True) @@ -309,7 +317,7 @@ def subnotes(self, value): elif self.source.jsonmodel_type in ["note_singlepart"]: # Here content is a list passed as a string, so we have to reconvert. content = [self.source.content.strip("][\"\'")] - subnotes = [Subnote(type="text", content=replace_xml(content))] + subnotes = [Subnote(type="text", content=[strip_tags(c) for c in content])] elif self.source.jsonmodel_type == "note_index": subnotes = self.index_subnotes(self.source.content, self.source.items) elif self.source.jsonmodel_type == "note_bibliography": @@ -322,7 +330,7 @@ def bibliograpy_subnotes(self, raw_content, items): data = [] # Here content is a list passed as a string, so we have to reconvert. content = [raw_content.strip("][\'")] - data.append(Subnote(type="text", content=replace_xml(content))) + data.append(Subnote(type="text", content=[strip_tags(c) for c in content])) data.append(Subnote(type="orderedlist", content=items)) return data @@ -343,6 +351,10 @@ class SourceResourceToCollection(odin.Mapping): from_obj = SourceResource to_obj = Collection + @odin.map_field(from_field="title", to_field="title") + def title(self, value): + return strip_tags(value) + @odin.map_list_field(from_field="notes", to_field="notes", to_list=True) def notes(self, value): return SourceNoteToNote.apply([v for v in value if (v.publish and v.type in NOTE_TYPE_CHOICES_TRANSFORM)]) @@ -410,7 +422,7 @@ def title(self, value): title = value.strip() if value else self.source.display_string.strip() if getattr(self.source, "component_id", None): title = "{}, {} {}".format(title, self.source.level.capitalize(), self.source.component_id) - return title + return strip_tags(title) @odin.map_field(from_field="language", to_field="languages", to_list=True) def languages(self, value): @@ -476,7 +488,8 @@ def dates(self, value): @odin.map_field def title(self, value): - return value.strip() if value else self.source.display_string.strip() + title = value.strip() if value else self.source.display_string.strip() + return strip_tags(title) @odin.map_field(from_field="language", to_field="languages", to_list=True) def languages(self, value): From a96a465e0ac56b1e33f578a9f4fd614b58245cfc Mon Sep 17 00:00:00 2001 From: Hillel Arnold Date: Fri, 3 Jun 2022 13:56:46 -0400 Subject: [PATCH 3/3] add tests for tag stripping --- transformer/tests.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/transformer/tests.py b/transformer/tests.py index 75632c39..c4054500 100644 --- a/transformer/tests.py +++ b/transformer/tests.py @@ -10,7 +10,7 @@ from fetcher.helpers import identifier_from_uri from .cron import CheckMissingOnlineAssets -from .mappings import has_online_instance +from .mappings import has_online_instance, strip_tags from .models import DataObject from .resources.configs import NOTE_TYPE_CHOICES_TRANSFORM from .transformers import Transformer @@ -255,3 +255,7 @@ def test_transformer(self): def test_ping(self): response = self.client.get(reverse('ping')) self.assertEqual(response.status_code, 200) + + def test_strip_tags(self): + for input in ["a collection", "a collection", "a collection"]: + self.assertEqual('a collection', strip_tags(input))