From e751c4d1c0c811f668018adc979189ab83ec178e Mon Sep 17 00:00:00 2001 From: w3stling Date: Sun, 13 Aug 2023 22:35:39 +0200 Subject: [PATCH] Content or summary element is not mapped if it contains XML --- .../rssreader/AbstractRssReader.java | 107 +++++++++++++----- .../RssReaderIntegrationTest.java | 25 +++- src/test/resources/atom-feed.xml | 25 +++- 3 files changed, 124 insertions(+), 33 deletions(-) diff --git a/src/main/java/com/apptasticsoftware/rssreader/AbstractRssReader.java b/src/main/java/com/apptasticsoftware/rssreader/AbstractRssReader.java index 565ce0c..f84d923 100644 --- a/src/main/java/com/apptasticsoftware/rssreader/AbstractRssReader.java +++ b/src/main/java/com/apptasticsoftware/rssreader/AbstractRssReader.java @@ -65,6 +65,7 @@ public abstract class AbstractRssReader { private final HashMap>> channelAttributes = new HashMap<>(); private final HashMap> itemTags = new HashMap<>(); private final HashMap>> itemAttributes = new HashMap<>(); + private final Set collectChildNodesForTag = Set.of("content", "summary"); private boolean isInitialized; @@ -458,6 +459,7 @@ else if (firstChar == 13 || Character.isWhitespace(firstChar)) { class RssItemIterator implements Iterator { private final StringBuilder textBuilder; + private final Map childNodeTextBuilder; private final InputStream is; private final Deque elementStack; private XMLStreamReader reader; @@ -466,12 +468,12 @@ class RssItemIterator implements Iterator { private I nextItem; private boolean isChannelPart = false; private boolean isItemPart = false; - private String elementName = null; public RssItemIterator(InputStream is) { this.is = is; nextItem = null; textBuilder = new StringBuilder(); + childNodeTextBuilder = new HashMap<>(); elementStack = new ArrayDeque<>(); try { @@ -533,7 +535,8 @@ public I next() { try { while (reader.hasNext()) { - var type = reader.next(); // do something here + var type = reader.next(); + collectChildNodes(type); if (type == CHARACTERS || type == CDATA) { parseCharacters(); @@ -560,21 +563,66 @@ else if (type == END_ELEMENT) { throw new NoSuchElementException(); } + void collectChildNodes(int type) { + if (type == START_ELEMENT) { + var nsTagName = toNsName(reader.getPrefix(), reader.getLocalName()); + + if (!childNodeTextBuilder.isEmpty()) { + StringBuilder startTagBuilder = new StringBuilder("<").append(nsTagName); + // Add namespaces to start tag + for (int i = 0; i < reader.getNamespaceCount(); ++i) { + startTagBuilder.append(" ") + .append(toNamespacePrefix(reader.getNamespacePrefix(i))) + .append("=") + .append(reader.getNamespaceURI(i)); + } + // Add attributes to start tag + for (int i = 0; i < reader.getAttributeCount(); ++i) { + startTagBuilder.append(" ") + .append(toNsName(reader.getAttributePrefix(i), reader.getAttributeLocalName(i))) + .append("=") + .append(reader.getAttributeValue(i)); + } + startTagBuilder.append(">"); + var startTag = startTagBuilder.toString(); + + childNodeTextBuilder.entrySet() + .stream() + .filter(e -> !e.getKey().equals(nsTagName)) + .forEach(e -> e.getValue().append(startTag)); + } + + // Collect child notes for tag names in this set + if (collectChildNodesForTag.contains(nsTagName)) { + childNodeTextBuilder.put(nsTagName, new StringBuilder()); + } + } + else if (type == CHARACTERS || type == CDATA) { + childNodeTextBuilder.forEach((k, builder) -> builder.append(reader.getText())); + } + else if (type == END_ELEMENT) { + var nsTagName = toNsName(reader.getPrefix(), reader.getLocalName()); + var endTag = ""; + childNodeTextBuilder.entrySet() + .stream() + .filter(e -> !e.getKey().equals(nsTagName)) + .forEach(e -> e.getValue().append(endTag)); + } + } + void parseStartElement() { textBuilder.setLength(0); - elementName = reader.getLocalName(); - var prefix = reader.getPrefix(); - var nsLocalName = toNsName(prefix, elementName); - elementStack.addLast(nsLocalName); + var nsTagName = toNsName(reader.getPrefix(), reader.getLocalName()); + elementStack.addLast(nsTagName); - if ("channel".equals(nsLocalName) || "feed".equals(nsLocalName)) { + if ("channel".equals(nsTagName) || "feed".equals(nsTagName)) { channel = createChannel(); channel.setTitle(""); channel.setDescription(""); channel.setLink(""); isChannelPart = true; } - else if ("item".equals(nsLocalName) || "entry".equals(nsLocalName)) { + else if ("item".equals(nsTagName) || "entry".equals(nsTagName)) { item = createItem(); item.setChannel(channel); isChannelPart = false; @@ -583,20 +631,18 @@ else if ("item".equals(nsLocalName) || "entry".equals(nsLocalName)) { } void parseAttributes() { - var localName = reader.getLocalName(); - var prefix = reader.getPrefix(); - var nsLocalName = toNsName(prefix, localName); - var attributeFullPath = getElementFullPath(); + var nsTagName = toNsName(reader.getPrefix(), reader.getLocalName()); + var elementFullPath = getElementFullPath(); if (isChannelPart) { // Map channel attributes - mapChannelAttributes(nsLocalName); - mapChannelAttributes(attributeFullPath); + mapChannelAttributes(nsTagName); + mapChannelAttributes(elementFullPath); } else if (isItemPart) { // Map item attributes - mapItemAttributes(nsLocalName); - mapItemAttributes(attributeFullPath); + mapItemAttributes(nsTagName); + mapItemAttributes(elementFullPath); } } @@ -621,22 +667,19 @@ void mapItemAttributes(String key) { } boolean parseEndElement() { - var localName = reader.getLocalName(); - var prefix = reader.getPrefix(); - var nsLocalName = toNsName(prefix, localName); + var nsTagName = toNsName(reader.getPrefix(), reader.getLocalName()); var text = textBuilder.toString().trim(); var elementFullPath = getElementFullPath(); elementStack.removeLast(); if (isChannelPart) - parseChannelCharacters(channel, prefix, elementName, elementFullPath, text); + parseChannelCharacters(channel, nsTagName, elementFullPath, text); else - parseItemCharacters(item, prefix, elementName, elementFullPath, text); + parseItemCharacters(item, nsTagName, elementFullPath, text); textBuilder.setLength(0); - elementName = ""; - return "item".equals(nsLocalName) || "entry".equals(nsLocalName); + return "item".equals(nsTagName) || "entry".equals(nsTagName); } void parseCharacters() { @@ -648,21 +691,21 @@ void parseCharacters() { textBuilder.append(text); } - void parseChannelCharacters(C channel, String prefix, String elementName, String elementFullPath, String text) { + void parseChannelCharacters(C channel, String nsTagName, String elementFullPath, String text) { if (channel == null || text.isEmpty()) return; - var nsElementName = toNsName(prefix, elementName); - channelTags.computeIfPresent(nsElementName, (k, f) -> { f.accept(channel, text); return f; }); + channelTags.computeIfPresent(nsTagName, (k, f) -> { f.accept(channel, text); return f; }); channelTags.computeIfPresent(elementFullPath, (k, f) -> { f.accept(channel, text); return f; }); } - void parseItemCharacters(final I item, String prefix, String elementName, String elementFullPath, final String text) { - if (item == null || text.isEmpty()) + void parseItemCharacters(final I item, String nsTagName, String elementFullPath, final String text) { + var builder = childNodeTextBuilder.remove(nsTagName); + if (item == null || (text.isEmpty() && builder == null)) return; - var nsElementName = toNsName(prefix, elementName); - itemTags.computeIfPresent(nsElementName, (k, f) -> { f.accept(item, text); return f; }); + var textValue = (builder != null) ? builder.toString().trim() : text; + itemTags.computeIfPresent(nsTagName, (k, f) -> { f.accept(item, textValue); return f; }); itemTags.computeIfPresent(elementFullPath, (k, f) -> { f.accept(item, text); return f; }); } @@ -670,6 +713,10 @@ String toNsName(String prefix, String name) { return prefix.isEmpty() ? name : prefix + ":" + name; } + String toNamespacePrefix(String prefix) { + return prefix == null || prefix.isEmpty() ? "xmlns" : "xmlns" + ":" + prefix; + } + String getElementFullPath() { return "/" + String.join("/", elementStack); } diff --git a/src/test/java/com/apptasticsoftware/integrationtest/RssReaderIntegrationTest.java b/src/test/java/com/apptasticsoftware/integrationtest/RssReaderIntegrationTest.java index 203c5e9..4c68c3f 100644 --- a/src/test/java/com/apptasticsoftware/integrationtest/RssReaderIntegrationTest.java +++ b/src/test/java/com/apptasticsoftware/integrationtest/RssReaderIntegrationTest.java @@ -599,8 +599,29 @@ void testCloseTwice() throws IOException { void testAtomFeed() { var items = new RssReader().read(fromFile("atom-feed.xml")) .collect(Collectors.toList()); - assertEquals(1, items.size()); - assertEquals("Mark Pilgrim", items.get(0).getAuthor().orElse(null)); + + assertEquals(3, items.size()); + + assertEquals("dive into mark", items.get(0).getChannel().getTitle()); + assertEquals(65, items.get(0).getChannel().getDescription().length()); + assertEquals("http://example.org/feed.atom", items.get(0).getChannel().getLink()); + assertEquals("Copyright (c) 2003, Mark Pilgrim", items.get(0).getChannel().getCopyright().orElse(null)); + assertEquals("Example Toolkit", items.get(0).getChannel().getGenerator().orElse(null)); + assertEquals("2005-07-31T12:29:29Z", items.get(0).getChannel().getLastBuildDate().orElse(null)); + + assertEquals("Atom-Powered Robots Run Amok", items.get(1).getTitle().orElse(null)); + assertNull(items.get(1).getAuthor().orElse(null)); + assertEquals("http://example.org/2003/12/13/atom03", items.get(1).getLink().orElse(null)); + assertEquals("urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a", items.get(1).getGuid().orElse(null)); + assertEquals("2003-12-13T18:30:02Z", items.get(1).getPubDate().orElse(null)); + assertEquals(211, items.get(1).getDescription().orElse("").length()); + + assertEquals("Atom-Powered Robots Run Amok 2", items.get(2).getTitle().orElse(null)); + assertNull(items.get(2).getAuthor().orElse(null)); + assertEquals("http://example.org/2003/12/13/atom04", items.get(2).getLink().orElse(null)); + assertEquals("urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6b", items.get(2).getGuid().orElse(null)); + assertEquals("2003-12-13T18:30:01Z", items.get(2).getPubDate().orElse(null)); + assertEquals(47, items.get(2).getDescription().orElse("").length()); } @Test diff --git a/src/test/resources/atom-feed.xml b/src/test/resources/atom-feed.xml index c566ae5..5c8f9de 100644 --- a/src/test/resources/atom-feed.xml +++ b/src/test/resources/atom-feed.xml @@ -37,9 +37,32 @@ -
+

[Update: The Atom draft is finished.]

+ + + + 10 + John + Doe + + + Atom-Powered Robots Run Amok + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + + + + {"firstName"="John","lastName"="Doe","id"="10"} + + Atom-Powered Robots Run Amok 2 + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6b + 2003-12-13T18:30:01Z + \ No newline at end of file