Skip to content

Commit

Permalink
Merge pull request #511 from RockefellerArchiveCenter/development
Browse files Browse the repository at this point in the history
Changes from development
  • Loading branch information
helrond authored Jun 6, 2022
2 parents ac7dc71 + 168d0aa commit 6f75fd4
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 14 deletions.
12 changes: 12 additions & 0 deletions fixtures/merger/instance_parse/subcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -112,5 +112,17 @@
"extent_type": "folder",
"number": 5
}]
},
{
"source": [{
"sub_container": {
"indicator_2": "Ar - Be",
"type_2": "folder"
}
}],
"parsed": [{
"extent_type": "folder",
"number": 2
}]
}
]
6 changes: 4 additions & 2 deletions merger/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,17 @@ def indicator_to_integer(indicator):
"""Converts an instance indicator to an integer.
An indicator can be an integer (23), a combination of integers and letters (23b)
or just a letter (B).
or just letters (B, Be). In cases where indicator data only consists of letters,
the function will return an integer based on the ordinal value of the lowercased
first letter in the indicator.
"""
try:
integer = int(indicator)
except ValueError:
parsed = re.sub("[^0-9]", "", indicator)
if len(parsed):
return indicator_to_integer(parsed)
integer = ord(indicator.lower()) - 97
integer = ord(indicator[0].lower()) - 97
return integer


Expand Down
35 changes: 24 additions & 11 deletions transformer/mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import json
import re
import xml.etree.ElementTree as ET

import odin
import requests
Expand Down Expand Up @@ -51,9 +53,15 @@ def has_online_instance(instances, uri):
return False


def replace_xml(content_list):
"""Replaces XML entities in notes with HTML tags."""
return [c.replace("extref", "a") for c in content_list]
def strip_tags(user_string):
    """Strips XML and HTML tags from a string.

    The string is wrapped in a root element and parsed as XML; when parsing
    succeeds, the concatenated text content of the document is returned. When
    the string is not well-formed XML (for example an unescaped ampersand or
    an unclosed tag), tags are removed with a regular expression instead.

    Args:
        user_string: text which may contain XML or HTML markup.

    Returns:
        The text with tags removed.
    """
    try:
        xmldoc = ET.fromstring(f'<xml>{user_string}</xml>')
    except ET.ParseError:
        # `[^>]*` rather than `[^>]+`: the first tag character is already
        # consumed by `[/\w]`, so requiring one more character would leave
        # single-character tags such as <b>, <i> or <p> in the output.
        return re.sub(r'<[/\w][^>]*>', '', user_string)
    return ''.join(xmldoc.itertext())


def transform_language(value, lang_materials):
Expand Down Expand Up @@ -123,7 +131,7 @@ class SourceAncestorToRecordReference(odin.Mapping):

@odin.map_field(from_field="title", to_field="title")
def title(self, value):
return value.strip()
return strip_tags(value.strip())

@odin.map_field(from_field="order", to_field="order")
def order(self, value):
Expand Down Expand Up @@ -159,7 +167,7 @@ def type(self, value):

@odin.map_field(from_field="title", to_field="title")
def title(self, value):
return value.strip()
return strip_tags(value.strip())

@odin.map_list_field(from_field="ref", to_field="external_identifiers", to_list=True)
def external_identifiers(self, value):
Expand Down Expand Up @@ -297,8 +305,8 @@ def map_subnotes(self, value):
subnote = self.chronology_subnotes(value.items)
else:
subnote = Subnote(
type="text", content=replace_xml(value.content)
if isinstance(value.content, list) else replace_xml([value.content]))
type="text", content=[strip_tags(c) for c in value.content]
if isinstance(value.content, list) else [strip_tags(value.content)])
return subnote

@odin.map_list_field(from_field="subnotes", to_field="subnotes", to_list=True)
Expand All @@ -309,7 +317,7 @@ def subnotes(self, value):
elif self.source.jsonmodel_type in ["note_singlepart"]:
# Here content is a list passed as a string, so we have to reconvert.
content = [self.source.content.strip("][\"\'")]
subnotes = [Subnote(type="text", content=replace_xml(content))]
subnotes = [Subnote(type="text", content=[strip_tags(c) for c in content])]
elif self.source.jsonmodel_type == "note_index":
subnotes = self.index_subnotes(self.source.content, self.source.items)
elif self.source.jsonmodel_type == "note_bibliography":
Expand All @@ -322,7 +330,7 @@ def bibliograpy_subnotes(self, raw_content, items):
data = []
# Here content is a list passed as a string, so we have to reconvert.
content = [raw_content.strip("][\'")]
data.append(Subnote(type="text", content=replace_xml(content)))
data.append(Subnote(type="text", content=[strip_tags(c) for c in content]))
data.append(Subnote(type="orderedlist", content=items))
return data

Expand All @@ -343,6 +351,10 @@ class SourceResourceToCollection(odin.Mapping):
from_obj = SourceResource
to_obj = Collection

@odin.map_field(from_field="title", to_field="title")
def title(self, value):
    """Maps the source title to the target title with markup tags removed."""
    cleaned_title = strip_tags(value)
    return cleaned_title

@odin.map_list_field(from_field="notes", to_field="notes", to_list=True)
def notes(self, value):
return SourceNoteToNote.apply([v for v in value if (v.publish and v.type in NOTE_TYPE_CHOICES_TRANSFORM)])
Expand Down Expand Up @@ -410,7 +422,7 @@ def title(self, value):
title = value.strip() if value else self.source.display_string.strip()
if getattr(self.source, "component_id", None):
title = "{}, {} {}".format(title, self.source.level.capitalize(), self.source.component_id)
return title
return strip_tags(title)

@odin.map_field(from_field="language", to_field="languages", to_list=True)
def languages(self, value):
Expand Down Expand Up @@ -476,7 +488,8 @@ def dates(self, value):

@odin.map_field
def title(self, value):
return value.strip() if value else self.source.display_string.strip()
title = value.strip() if value else self.source.display_string.strip()
return strip_tags(title)

@odin.map_field(from_field="language", to_field="languages", to_list=True)
def languages(self, value):
Expand Down
6 changes: 5 additions & 1 deletion transformer/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from fetcher.helpers import identifier_from_uri

from .cron import CheckMissingOnlineAssets
from .mappings import has_online_instance
from .mappings import has_online_instance, strip_tags
from .models import DataObject
from .resources.configs import NOTE_TYPE_CHOICES_TRANSFORM
from .transformers import Transformer
Expand Down Expand Up @@ -255,3 +255,7 @@ def test_transformer(self):
def test_ping(self):
    """The ping endpoint responds with HTTP 200."""
    ping_url = reverse('ping')
    self.assertEqual(self.client.get(ping_url).status_code, 200)

def test_strip_tags(self):
    """strip_tags removes markup and leaves plain text unchanged."""
    samples = (
        "<title>a collection</title>",
        "a <a href='https://example.com'>collection</a>",
        "a collection",
    )
    # `sample` (not `input`) so the builtin is not shadowed; subTest reports
    # which sample failed instead of stopping at the first failure.
    for sample in samples:
        with self.subTest(sample=sample):
            self.assertEqual('a collection', strip_tags(sample))

0 comments on commit 6f75fd4

Please sign in to comment.