Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strip XML and HTML tags from note text and title fields #510

Merged
merged 2 commits into from
Jun 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 24 additions & 11 deletions transformer/mappings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import json
import re
import xml.etree.ElementTree as ET

import odin
import requests
Expand Down Expand Up @@ -51,9 +53,15 @@ def has_online_instance(instances, uri):
return False


def replace_xml(content_list):
    """Returns a new list in which every "extref" substring of each item is replaced by "a".

    This turns `<extref ...>...</extref>` markup into `<a ...>...</a>`; the
    replacement is a plain substring substitution, not an XML-aware rewrite.
    """
    converted = []
    for item in content_list:
        converted.append(item.replace("extref", "a"))
    return converted
# Fallback tag matcher used when the input is not well-formed XML. The `*`
# quantifier (rather than `+`) lets single-letter tags such as <p>, <b>, <i>
# and <a> match as well as longer ones.
_TAG_PATTERN = re.compile(r'<[/\w][^>]*>')


def strip_tags(user_string):
    """Strips XML and HTML tags from a string.

    The string is first wrapped in a root element and parsed as XML, and the
    concatenated text content is returned (the XML parser also decodes
    entity references such as `&amp;`). If the string is not well-formed XML,
    anything that looks like an opening or closing tag is removed with a
    regular expression instead; entity references are left untouched on that
    path.

    Args:
        user_string: text which may contain XML or HTML markup.

    Returns:
        The text with tags removed.
    """
    try:
        xmldoc = ET.fromstring(f'<xml>{user_string}</xml>')
    except ET.ParseError:
        # Malformed markup (unclosed tags, bare ampersands, unknown entities):
        # fall back to a best-effort textual strip.
        return _TAG_PATTERN.sub('', user_string)
    return ''.join(xmldoc.itertext())


def transform_language(value, lang_materials):
Expand Down Expand Up @@ -123,7 +131,7 @@ class SourceAncestorToRecordReference(odin.Mapping):

@odin.map_field(from_field="title", to_field="title")
def title(self, value):
return value.strip()
return strip_tags(value.strip())

@odin.map_field(from_field="order", to_field="order")
def order(self, value):
Expand Down Expand Up @@ -159,7 +167,7 @@ def type(self, value):

@odin.map_field(from_field="title", to_field="title")
def title(self, value):
return value.strip()
return strip_tags(value.strip())

@odin.map_list_field(from_field="ref", to_field="external_identifiers", to_list=True)
def external_identifiers(self, value):
Expand Down Expand Up @@ -297,8 +305,8 @@ def map_subnotes(self, value):
subnote = self.chronology_subnotes(value.items)
else:
subnote = Subnote(
type="text", content=replace_xml(value.content)
if isinstance(value.content, list) else replace_xml([value.content]))
type="text", content=[strip_tags(c) for c in value.content]
if isinstance(value.content, list) else [strip_tags(value.content)])
return subnote

@odin.map_list_field(from_field="subnotes", to_field="subnotes", to_list=True)
Expand All @@ -309,7 +317,7 @@ def subnotes(self, value):
elif self.source.jsonmodel_type in ["note_singlepart"]:
# Here content is a list passed as a string, so we have to reconvert.
content = [self.source.content.strip("][\"\'")]
subnotes = [Subnote(type="text", content=replace_xml(content))]
subnotes = [Subnote(type="text", content=[strip_tags(c) for c in content])]
elif self.source.jsonmodel_type == "note_index":
subnotes = self.index_subnotes(self.source.content, self.source.items)
elif self.source.jsonmodel_type == "note_bibliography":
Expand All @@ -322,7 +330,7 @@ def bibliograpy_subnotes(self, raw_content, items):
data = []
# Here content is a list passed as a string, so we have to reconvert.
content = [raw_content.strip("][\'")]
data.append(Subnote(type="text", content=replace_xml(content)))
data.append(Subnote(type="text", content=[strip_tags(c) for c in content]))
data.append(Subnote(type="orderedlist", content=items))
return data

Expand All @@ -343,6 +351,10 @@ class SourceResourceToCollection(odin.Mapping):
from_obj = SourceResource
to_obj = Collection

@odin.map_field(from_field="title", to_field="title")
def title(self, value):
    """Maps the source title to the target title with markup tags removed."""
    cleaned_title = strip_tags(value)
    return cleaned_title

@odin.map_list_field(from_field="notes", to_field="notes", to_list=True)
def notes(self, value):
    """Maps the source notes that are published and of a transformable type."""
    eligible = [
        note for note in value
        if note.publish and note.type in NOTE_TYPE_CHOICES_TRANSFORM
    ]
    return SourceNoteToNote.apply(eligible)
Expand Down Expand Up @@ -410,7 +422,7 @@ def title(self, value):
title = value.strip() if value else self.source.display_string.strip()
if getattr(self.source, "component_id", None):
title = "{}, {} {}".format(title, self.source.level.capitalize(), self.source.component_id)
return title
return strip_tags(title)

@odin.map_field(from_field="language", to_field="languages", to_list=True)
def languages(self, value):
Expand Down Expand Up @@ -476,7 +488,8 @@ def dates(self, value):

@odin.map_field
def title(self, value):
return value.strip() if value else self.source.display_string.strip()
title = value.strip() if value else self.source.display_string.strip()
return strip_tags(title)

@odin.map_field(from_field="language", to_field="languages", to_list=True)
def languages(self, value):
Expand Down
6 changes: 5 additions & 1 deletion transformer/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from fetcher.helpers import identifier_from_uri

from .cron import CheckMissingOnlineAssets
from .mappings import has_online_instance
from .mappings import has_online_instance, strip_tags
from .models import DataObject
from .resources.configs import NOTE_TYPE_CHOICES_TRANSFORM
from .transformers import Transformer
Expand Down Expand Up @@ -255,3 +255,7 @@ def test_transformer(self):
def test_ping(self):
    """Checks that a GET request to the 'ping' route returns HTTP 200."""
    ping_url = reverse('ping')
    status = self.client.get(ping_url).status_code
    self.assertEqual(status, 200)

def test_strip_tags(self):
    """Checks that strip_tags reduces tagged and untagged input to plain text."""
    samples = [
        "<title>a collection</title>",
        "a <a href='https://example.com'>collection</a>",
        "a collection",
        # Unclosed tag: not well-formed XML, so this exercises the regex
        # fallback rather than the XML parser.
        "<div>a collection",
    ]
    for sample in samples:
        # subTest reports which sample failed instead of stopping at the
        # first failure without identifying it.
        with self.subTest(sample=sample):
            self.assertEqual('a collection', strip_tags(sample))