Skip to content

Commit

Permalink
Merge pull request #109 from w3stling/map-child-nodes
Browse files Browse the repository at this point in the history
Content or summary element is not mapped if it contains XML
  • Loading branch information
w3stling authored Aug 13, 2023
2 parents 18128c3 + e751c4d commit ee77de8
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 33 deletions.
107 changes: 77 additions & 30 deletions src/main/java/com/apptasticsoftware/rssreader/AbstractRssReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ public abstract class AbstractRssReader<C extends Channel, I extends Item> {
private final HashMap<String, Map<String, BiConsumer<C, String>>> channelAttributes = new HashMap<>();
private final HashMap<String, BiConsumer<I, String>> itemTags = new HashMap<>();
private final HashMap<String, Map<String, BiConsumer<I, String>>> itemAttributes = new HashMap<>();
private final Set<String> collectChildNodesForTag = Set.of("content", "summary");
private boolean isInitialized;


Expand Down Expand Up @@ -458,6 +459,7 @@ else if (firstChar == 13 || Character.isWhitespace(firstChar)) {

class RssItemIterator implements Iterator<I> {
private final StringBuilder textBuilder;
private final Map<String, StringBuilder> childNodeTextBuilder;
private final InputStream is;
private final Deque<String> elementStack;
private XMLStreamReader reader;
Expand All @@ -466,12 +468,12 @@ class RssItemIterator implements Iterator<I> {
private I nextItem;
private boolean isChannelPart = false;
private boolean isItemPart = false;
private String elementName = null;

public RssItemIterator(InputStream is) {
this.is = is;
nextItem = null;
textBuilder = new StringBuilder();
childNodeTextBuilder = new HashMap<>();
elementStack = new ArrayDeque<>();

try {
Expand Down Expand Up @@ -533,7 +535,8 @@ public I next() {

try {
while (reader.hasNext()) {
var type = reader.next(); // do something here
var type = reader.next();
collectChildNodes(type);

if (type == CHARACTERS || type == CDATA) {
parseCharacters();
Expand All @@ -560,21 +563,66 @@ else if (type == END_ELEMENT) {
throw new NoSuchElementException();
}

/**
 * Mirrors the current parser event into every active child-node buffer so that
 * the raw markup nested inside a collected tag can later be handed out as text.
 *
 * <p>Start tags (including their namespace declarations and attributes), text,
 * and end tags are appended to each buffer in {@code childNodeTextBuilder}.
 * A start or end tag is never written into the buffer registered under the same
 * tag name, so the collected element's own tags are left out. A new buffer is
 * registered whenever the started element is listed in
 * {@code collectChildNodesForTag}.
 *
 * @param type the event type just returned by {@code reader.next()}
 */
void collectChildNodes(int type) {
    if (type == CHARACTERS || type == CDATA) {
        // Text belongs to every buffer that is currently collecting.
        for (StringBuilder buffer : childNodeTextBuilder.values()) {
            buffer.append(reader.getText());
        }
        return;
    }

    if (type == END_ELEMENT) {
        var closedTag = toNsName(reader.getPrefix(), reader.getLocalName());
        appendMarkupToOtherBuffers(closedTag, "</" + closedTag + ">");
        return;
    }

    if (type != START_ELEMENT) {
        return;
    }

    var openedTag = toNsName(reader.getPrefix(), reader.getLocalName());

    if (!childNodeTextBuilder.isEmpty()) {
        var markup = new StringBuilder("<");
        markup.append(openedTag);

        // Namespace declarations made on this element
        int namespaceCount = reader.getNamespaceCount();
        for (int n = 0; n < namespaceCount; n++) {
            markup.append(" ");
            markup.append(toNamespacePrefix(reader.getNamespacePrefix(n)));
            markup.append("=");
            markup.append(reader.getNamespaceURI(n));
        }

        // Attributes of this element
        int attributeCount = reader.getAttributeCount();
        for (int a = 0; a < attributeCount; a++) {
            markup.append(" ");
            markup.append(toNsName(reader.getAttributePrefix(a), reader.getAttributeLocalName(a)));
            markup.append("=");
            markup.append(reader.getAttributeValue(a));
        }

        markup.append(">");
        appendMarkupToOtherBuffers(openedTag, markup.toString());
    }

    // Start collecting child nodes for tag names in the configured set
    if (collectChildNodesForTag.contains(openedTag)) {
        childNodeTextBuilder.put(openedTag, new StringBuilder());
    }
}

/** Appends {@code markup} to every buffer except the one registered under {@code tagName}. */
private void appendMarkupToOtherBuffers(String tagName, String markup) {
    for (var entry : childNodeTextBuilder.entrySet()) {
        if (!entry.getKey().equals(tagName)) {
            entry.getValue().append(markup);
        }
    }
}

void parseStartElement() {
textBuilder.setLength(0);
elementName = reader.getLocalName();
var prefix = reader.getPrefix();
var nsLocalName = toNsName(prefix, elementName);
elementStack.addLast(nsLocalName);
var nsTagName = toNsName(reader.getPrefix(), reader.getLocalName());
elementStack.addLast(nsTagName);

if ("channel".equals(nsLocalName) || "feed".equals(nsLocalName)) {
if ("channel".equals(nsTagName) || "feed".equals(nsTagName)) {
channel = createChannel();
channel.setTitle("");
channel.setDescription("");
channel.setLink("");
isChannelPart = true;
}
else if ("item".equals(nsLocalName) || "entry".equals(nsLocalName)) {
else if ("item".equals(nsTagName) || "entry".equals(nsTagName)) {
item = createItem();
item.setChannel(channel);
isChannelPart = false;
Expand All @@ -583,20 +631,18 @@ else if ("item".equals(nsLocalName) || "entry".equals(nsLocalName)) {
}

void parseAttributes() {
var localName = reader.getLocalName();
var prefix = reader.getPrefix();
var nsLocalName = toNsName(prefix, localName);
var attributeFullPath = getElementFullPath();
var nsTagName = toNsName(reader.getPrefix(), reader.getLocalName());
var elementFullPath = getElementFullPath();

if (isChannelPart) {
// Map channel attributes
mapChannelAttributes(nsLocalName);
mapChannelAttributes(attributeFullPath);
mapChannelAttributes(nsTagName);
mapChannelAttributes(elementFullPath);
}
else if (isItemPart) {
// Map item attributes
mapItemAttributes(nsLocalName);
mapItemAttributes(attributeFullPath);
mapItemAttributes(nsTagName);
mapItemAttributes(elementFullPath);
}
}

Expand All @@ -621,22 +667,19 @@ void mapItemAttributes(String key) {
}

boolean parseEndElement() {
var localName = reader.getLocalName();
var prefix = reader.getPrefix();
var nsLocalName = toNsName(prefix, localName);
var nsTagName = toNsName(reader.getPrefix(), reader.getLocalName());
var text = textBuilder.toString().trim();
var elementFullPath = getElementFullPath();
elementStack.removeLast();

if (isChannelPart)
parseChannelCharacters(channel, prefix, elementName, elementFullPath, text);
parseChannelCharacters(channel, nsTagName, elementFullPath, text);
else
parseItemCharacters(item, prefix, elementName, elementFullPath, text);
parseItemCharacters(item, nsTagName, elementFullPath, text);

textBuilder.setLength(0);
elementName = "";

return "item".equals(nsLocalName) || "entry".equals(nsLocalName);
return "item".equals(nsTagName) || "entry".equals(nsTagName);
}

void parseCharacters() {
Expand All @@ -648,28 +691,32 @@ void parseCharacters() {
textBuilder.append(text);
}

void parseChannelCharacters(C channel, String prefix, String elementName, String elementFullPath, String text) {
void parseChannelCharacters(C channel, String nsTagName, String elementFullPath, String text) {
if (channel == null || text.isEmpty())
return;

var nsElementName = toNsName(prefix, elementName);
channelTags.computeIfPresent(nsElementName, (k, f) -> { f.accept(channel, text); return f; });
channelTags.computeIfPresent(nsTagName, (k, f) -> { f.accept(channel, text); return f; });
channelTags.computeIfPresent(elementFullPath, (k, f) -> { f.accept(channel, text); return f; });
}

void parseItemCharacters(final I item, String prefix, String elementName, String elementFullPath, final String text) {
if (item == null || text.isEmpty())
void parseItemCharacters(final I item, String nsTagName, String elementFullPath, final String text) {
var builder = childNodeTextBuilder.remove(nsTagName);
if (item == null || (text.isEmpty() && builder == null))
return;

var nsElementName = toNsName(prefix, elementName);
itemTags.computeIfPresent(nsElementName, (k, f) -> { f.accept(item, text); return f; });
var textValue = (builder != null) ? builder.toString().trim() : text;
itemTags.computeIfPresent(nsTagName, (k, f) -> { f.accept(item, textValue); return f; });
itemTags.computeIfPresent(elementFullPath, (k, f) -> { f.accept(item, text); return f; });
}

/**
 * Builds the namespace-qualified name used as lookup key for tags and attributes.
 *
 * @param prefix namespace prefix of the element or attribute; may be {@code null}
 *               or empty when there is no prefix (the StAX API allows either)
 * @param name   local name of the element or attribute
 * @return {@code name} when there is no prefix, otherwise {@code prefix:name}
 */
String toNsName(String prefix, String name) {
    // A missing prefix may be reported as null, so guard before calling isEmpty().
    return prefix == null || prefix.isEmpty() ? name : prefix + ":" + name;
}

/**
 * Produces the attribute name of a namespace declaration.
 *
 * @param prefix the declared namespace prefix; {@code null} or empty for the
 *               default namespace
 * @return {@code xmlns} for the default namespace, otherwise {@code xmlns:prefix}
 */
String toNamespacePrefix(String prefix) {
    if (prefix != null && !prefix.isEmpty()) {
        return "xmlns:" + prefix;
    }
    return "xmlns";
}

/**
 * Renders the current element stack as a slash-separated path.
 *
 * @return a path that starts with {@code /} followed by the entries of
 *         {@code elementStack} in iteration order, joined by {@code /};
 *         just {@code /} when the stack is empty
 */
String getElementFullPath() {
    var path = new StringBuilder("/");
    var entries = elementStack.iterator();
    while (entries.hasNext()) {
        path.append(entries.next());
        if (entries.hasNext()) {
            path.append("/");
        }
    }
    return path.toString();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -599,8 +599,29 @@ void testCloseTwice() throws IOException {
void testAtomFeed() {
var items = new RssReader().read(fromFile("atom-feed.xml"))
.collect(Collectors.toList());
assertEquals(1, items.size());
assertEquals("Mark Pilgrim", items.get(0).getAuthor().orElse(null));

assertEquals(3, items.size());

assertEquals("dive into mark", items.get(0).getChannel().getTitle());
assertEquals(65, items.get(0).getChannel().getDescription().length());
assertEquals("http://example.org/feed.atom", items.get(0).getChannel().getLink());
assertEquals("Copyright (c) 2003, Mark Pilgrim", items.get(0).getChannel().getCopyright().orElse(null));
assertEquals("Example Toolkit", items.get(0).getChannel().getGenerator().orElse(null));
assertEquals("2005-07-31T12:29:29Z", items.get(0).getChannel().getLastBuildDate().orElse(null));

assertEquals("Atom-Powered Robots Run Amok", items.get(1).getTitle().orElse(null));
assertNull(items.get(1).getAuthor().orElse(null));
assertEquals("http://example.org/2003/12/13/atom03", items.get(1).getLink().orElse(null));
assertEquals("urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a", items.get(1).getGuid().orElse(null));
assertEquals("2003-12-13T18:30:02Z", items.get(1).getPubDate().orElse(null));
assertEquals(211, items.get(1).getDescription().orElse("").length());

assertEquals("Atom-Powered Robots Run Amok 2", items.get(2).getTitle().orElse(null));
assertNull(items.get(2).getAuthor().orElse(null));
assertEquals("http://example.org/2003/12/13/atom04", items.get(2).getLink().orElse(null));
assertEquals("urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6b", items.get(2).getGuid().orElse(null));
assertEquals("2003-12-13T18:30:01Z", items.get(2).getPubDate().orElse(null));
assertEquals(47, items.get(2).getDescription().orElse("").length());
}

@Test
Expand Down
25 changes: 24 additions & 1 deletion src/test/resources/atom-feed.xml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,32 @@
</contributor>
<content type="xhtml" xml:lang="en"
xml:base="http://diveintomark.org/">
<div xmlns="http://www.w3.org/1999/xhtml">
<div xmlns="http://www.w3.org/1999/xhtml" xmlns:t="https://www.apptasticsoftware.com/testing" t:text="some value">
<p><i>[Update: The Atom draft is finished.]</i></p>
</div>
</content>
</entry>
<entry>
<summary type="text/xml">
<p:Customer xmlns:p="http://www.ibm.com/crm"
xmlns="http://www.ibm.com/crm">
<id>10</id>
<firstName>John</firstName>
<lastName>Doe</lastName>
</p:Customer>
</summary>
<title>Atom-Powered Robots Run Amok</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
</entry>
<entry>
<content type="text/json">
{"firstName"="John","lastName"="Doe","id"="10"}
</content>
<title>Atom-Powered Robots Run Amok 2</title>
<link href="http://example.org/2003/12/13/atom04"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6b</id>
<updated>2003-12-13T18:30:01Z</updated>
</entry>
</feed>

0 comments on commit ee77de8

Please sign in to comment.