From d40223216d18857b9983c2f8f9d003611b910c86 Mon Sep 17 00:00:00 2001 From: Bert Frees Date: Tue, 24 Mar 2020 12:06:51 +0100 Subject: [PATCH] Refactor px:html-chunker Give the px:chunker a "mapping" output instead of the "link-attribute-name" option and apply the mapping afterwards with px:html-update-links. See https://github.com/daisy/pipeline-modules/issues/16 --- .../resources/xml/xslt/fileset-compose.xsl | 190 ++++++++++++------ .../test/xprocspec/fileset-compose.xprocspec | 35 +++- .../pipeline/html/calabash/impl/Chunker.java | 125 ++++++------ .../html/calabash/impl/ChunkerStep.java | 26 ++- .../src/main/resources/xml/xproc/chunker.xpl | 20 +- .../main/resources/xml/xproc/html-chunker.xpl | 37 +++- .../resources/xml/xproc/html-update-links.xpl | 14 +- .../resources/xml/xslt/html-update-links.xsl | 32 ++- .../src/test/xprocspec/html-chunker.xprocspec | 30 ++- .../xprocspec/html-update-links.xprocspec | 51 ++++- .../resources/xml/dtbook-update-links.xsl | 25 ++- .../xml/pub/package-doc-update-links.xsl | 25 ++- .../main/resources/xml/smil-update-links.xsl | 25 ++- 13 files changed, 445 insertions(+), 190 deletions(-) diff --git a/common/fileset-utils/src/main/resources/xml/xslt/fileset-compose.xsl b/common/fileset-utils/src/main/resources/xml/xslt/fileset-compose.xsl index 1928e0e27d..d5198e6328 100644 --- a/common/fileset-utils/src/main/resources/xml/xslt/fileset-compose.xsl +++ b/common/fileset-utils/src/main/resources/xml/xslt/fileset-compose.xsl @@ -25,67 +25,97 @@ - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -93,6 +123,38 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + + + + + + + + + + diff --git a/common/fileset-utils/src/test/xprocspec/fileset-compose.xprocspec 
b/common/fileset-utils/src/test/xprocspec/fileset-compose.xprocspec index 769d5da6c1..e138d5accf 100644 --- a/common/fileset-utils/src/test/xprocspec/fileset-compose.xprocspec +++ b/common/fileset-utils/src/test/xprocspec/fileset-compose.xprocspec @@ -18,7 +18,10 @@ + + + @@ -26,12 +29,23 @@ - + - - + + + + + + + + + + + + + @@ -46,13 +60,24 @@ - + - + + + + + + + + + + + + diff --git a/common/html-utils/src/main/java/org/daisy/pipeline/html/calabash/impl/Chunker.java b/common/html-utils/src/main/java/org/daisy/pipeline/html/calabash/impl/Chunker.java index 68eedd179a..c8e2988b9b 100644 --- a/common/html-utils/src/main/java/org/daisy/pipeline/html/calabash/impl/Chunker.java +++ b/common/html-utils/src/main/java/org/daisy/pipeline/html/calabash/impl/Chunker.java @@ -1,7 +1,6 @@ package org.daisy.pipeline.html.calabash.impl; import java.net.URI; -import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -31,7 +30,9 @@ import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamWriter; +import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.Iterators; +import com.google.common.collect.Multimap; import com.xmlcalabash.model.RuntimeValue; @@ -50,6 +51,7 @@ import org.daisy.common.stax.BaseURIAwareXMLStreamWriter; import static org.daisy.common.stax.XMLStreamWriterHelper.getAttributes; import static org.daisy.common.stax.XMLStreamWriterHelper.writeAttribute; +import static org.daisy.common.stax.XMLStreamWriterHelper.writeAttributes; import static org.daisy.common.stax.XMLStreamWriterHelper.writeCharacters; import static org.daisy.common.stax.XMLStreamWriterHelper.writeDocument; import static org.daisy.common.stax.XMLStreamWriterHelper.writeEvent; @@ -70,16 +72,20 @@ class Chunker implements NodeToXMLStreamTransformer { final RuntimeValue alwaysBreakAfterOption; final int maxChunkSize; - final QName linkAttributeName; final Configuration config; private 
static final QName _ID = new QName("id"); private static final QName XML_ID = new QName("http://www.w3.org/XML/1998/namespace", "id", "xml"); + private static final QName D_FILESET = new QName("http://www.daisy.org/ns/pipeline/data", "fileset"); + private static final QName D_FILE = new QName("http://www.daisy.org/ns/pipeline/data", "file"); + private static final QName D_ANCHOR = new QName("http://www.daisy.org/ns/pipeline/data", "anchor"); + private static final QName _HREF = new QName("href"); + private static final QName _ORIGINAL_HREF = new QName("original-href"); Chunker(RuntimeValue allowBreakBeforeOption, RuntimeValue allowBreakAfterOption, RuntimeValue preferBreakBeforeOption, RuntimeValue preferBreakAfterOption, RuntimeValue alwaysBreakBeforeOption, RuntimeValue alwaysBreakAfterOption, - int maxChunkSize, QName linkAttributeName, Configuration config) { + int maxChunkSize, Configuration config) { this.allowBreakBeforeOption = allowBreakBeforeOption; this.allowBreakAfterOption = allowBreakAfterOption; this.preferBreakBeforeOption = preferBreakBeforeOption; @@ -87,7 +93,6 @@ class Chunker implements NodeToXMLStreamTransformer { this.alwaysBreakBeforeOption = alwaysBreakBeforeOption; this.alwaysBreakAfterOption = alwaysBreakAfterOption; this.maxChunkSize = maxChunkSize; - this.linkAttributeName = linkAttributeName; this.config = config; } @@ -95,49 +100,88 @@ class Chunker implements NodeToXMLStreamTransformer { private Stack> parentAttrs; private Path.Builder currentPath; private Iterator splitPoints; - private Map idToChunk; + private Multimap idToChunk; private BreakPosition nextSplitPoint; public void transform(Iterator input, Supplier output) throws TransformerException { XdmNode doc = Iterators.getOnlyElement(input); BaseURIAwareXMLStreamReader reader; + int chunkCount; try { reader = SaxonHelper.nodeReader(doc, config); SortedSet collectSplitPoints = new TreeSet<>(); Map collectIds = new HashMap<>(); getSplitPoints(doc, collectSplitPoints, 
collectIds); - idToChunk = new HashMap(); + chunkCount = collectSplitPoints.size() + 1; + idToChunk = ArrayListMultimap.create(); int n = 1; for (BreakPosition sp : collectSplitPoints) { Iterator> i = collectIds.entrySet().iterator(); while (i.hasNext()) { Map.Entry e = i.next(); if (sp.compareTo(e.getValue()) > 0) { - idToChunk.put(e.getKey(), n); + idToChunk.put(n, e.getKey()); i.remove(); } } n++; } for (String id : collectIds.keySet()) - idToChunk.put(id, n); + idToChunk.put(n, id); splitPoints = collectSplitPoints.iterator(); } catch (XPathException | SaxonApiException e) { throw new TransformerException(e); } + URI inputBase = doc.getBaseURI(); + if (inputBase == null) + throw new TransformerException(new RuntimeException("source document must have a base URI")); if (splitPoints.hasNext()) { - if (doc.getBaseURI() != null) - output = setBaseURI(output, doc.getBaseURI()); + // first document is the mapping + try { + BaseURIAwareXMLStreamWriter writer = output.get(); + writer.setBaseURI(inputBase); + writer.writeStartDocument(); + writeStartElement(writer, D_FILESET); + for (Integer chunk = 1; chunk <= chunkCount; chunk++) { + writeStartElement(writer, D_FILE); + writeAttribute(writer, _HREF, getChunkBaseURI(inputBase, chunk).toASCIIString()); + writeAttribute(writer, _ORIGINAL_HREF, inputBase.toASCIIString()); + for (String id : idToChunk.get(chunk)) { + writeStartElement(writer, D_ANCHOR); + writeAttribute(writer, _ID, id); + writer.writeEndElement(); + } + writer.writeEndElement(); + } + writer.writeEndElement(); + writer.writeEndDocument(); + } catch (XMLStreamException e) { + throw new TransformerException(e); + } + // then output the chunks + output = setBaseURI(output, inputBase); transform(reader, output); - } else + } else { + // first document is the mapping: leave empty try { BaseURIAwareXMLStreamWriter writer = output.get(); - if (doc.getBaseURI() != null) - writer.setBaseURI(doc.getBaseURI()); + writer.setBaseURI(inputBase); + 
writer.writeStartDocument(); + writeStartElement(writer, D_FILESET); + writer.writeEndElement(); + writer.writeEndDocument(); + } catch (XMLStreamException e) { + throw new TransformerException(e); + } + // pass on the input to the output + try { + BaseURIAwareXMLStreamWriter writer = output.get(); + writer.setBaseURI(inputBase); writeDocument(writer, reader); } catch (XMLStreamException e) { throw new TransformerException(e); } + } } void transform(BaseURIAwareXMLStreamReader reader, Supplier writers) throws TransformerException { @@ -151,7 +195,6 @@ void transform(BaseURIAwareXMLStreamReader reader, Supplier 0) { - if (bpSegments[j].equals(cpSegments[j])) { - i--; - j++; } - else - break; } - relativizedPath = ""; - while (i > 0) { - relativizedPath += "../"; - i--; } - while (j < cpSegments.length) { - relativizedPath += cpSegments[j] + "/"; - j++; } - relativizedPath = relativizedPath.substring(0, relativizedPath.length() - 1); } - else - relativizedPath = cp; - if (relativizedPath.isEmpty()) - relativizedPath = "./"; - return new URI(null, null, relativizedPath, child.getQuery(), child.getFragment()); } - } catch (URISyntaxException e) { - throw new RuntimeException(e); - } - } - /* * The path of a node in the tree is encoded as a list of integers where each integer * is an index of a child node, starting with the ancestor of the node that is a child diff --git a/common/html-utils/src/main/java/org/daisy/pipeline/html/calabash/impl/ChunkerStep.java b/common/html-utils/src/main/java/org/daisy/pipeline/html/calabash/impl/ChunkerStep.java index 112fd54df9..66a5dcbbf1 100644 --- a/common/html-utils/src/main/java/org/daisy/pipeline/html/calabash/impl/ChunkerStep.java +++ b/common/html-utils/src/main/java/org/daisy/pipeline/html/calabash/impl/ChunkerStep.java @@ -6,14 +6,15 @@ import com.xmlcalabash.io.ReadablePipe; import com.xmlcalabash.io.WritablePipe; import com.xmlcalabash.library.DefaultStep; +import com.xmlcalabash.model.Step; import 
com.xmlcalabash.runtime.XAtomicStep; import net.sf.saxon.Configuration; import net.sf.saxon.s9api.QName; import net.sf.saxon.s9api.SaxonApiException; +import net.sf.saxon.s9api.XdmNode; import org.daisy.common.calabash.XMLCalabashHelper; -import org.daisy.common.saxon.SaxonHelper; import org.daisy.common.xproc.calabash.XProcStepProvider; import org.osgi.service.component.annotations.Component; @@ -38,6 +39,7 @@ public XProcStep newStep(XProcRuntime runtime, XAtomicStep step) { private ReadablePipe sourcePipe = null; private WritablePipe resultPipe = null; + private WritablePipe mappingPipe = null; private static final QName ALLOW_BREAK_BEFORE = new QName("allow-break-before"); private static final QName ALLOW_BREAK_AFTER = new QName("allow-break-after"); @@ -47,8 +49,6 @@ public XProcStep newStep(XProcRuntime runtime, XAtomicStep step) { private static final QName ALWAYS_BREAK_AFTER = new QName("always-break-after"); private static final QName MAX_CHUNK_SIZE = new QName("max-chunk-size"); - private static final QName LINK_ATTRIBUTE_NAME = new QName("link-attribute-name"); - private static final QName DEFAULT_LINK_ATTRIBUTE_NAME = new QName("href"); private ChunkerStep(XProcRuntime runtime, XAtomicStep step) { super(runtime, step); @@ -61,13 +61,18 @@ public void setInput(String port, ReadablePipe pipe) { @Override public void setOutput(String port, WritablePipe pipe) { - resultPipe = pipe; + if ("result".equals(port)) { + resultPipe = pipe; + } else { // "mapping" + mappingPipe = pipe; + } } @Override public void reset() { sourcePipe.resetReader(); resultPipe.resetWriter(); + mappingPipe.resetWriter(); } @Override @@ -83,10 +88,19 @@ public void run() throws SaxonApiException { getOption(ALWAYS_BREAK_BEFORE), getOption(ALWAYS_BREAK_AFTER), getOption(MAX_CHUNK_SIZE, -1), - SaxonHelper.jaxpQName(getOption(LINK_ATTRIBUTE_NAME, DEFAULT_LINK_ATTRIBUTE_NAME)), configuration), sourcePipe, - resultPipe, + new WritablePipe() { + private int count = 0; + public void 
write(XdmNode doc) { + (count++ == 0 ? mappingPipe : resultPipe).write(doc); + } + public void canWriteSequence(boolean sequence) { throw new UnsupportedOperationException(); } + public boolean writeSequence() { throw new UnsupportedOperationException(); } + public void setWriter(Step step) { throw new UnsupportedOperationException(); } + public void resetWriter() { throw new UnsupportedOperationException(); } + public void close() { throw new UnsupportedOperationException(); } + }, runtime); } catch (Exception e) { logger.error("px:chunker failed", e); diff --git a/common/html-utils/src/main/resources/xml/xproc/chunker.xpl b/common/html-utils/src/main/resources/xml/xproc/chunker.xpl index dd081510e0..f5c1623e3d 100644 --- a/common/html-utils/src/main/resources/xml/xproc/chunker.xpl +++ b/common/html-utils/src/main/resources/xml/xproc/chunker.xpl @@ -2,7 +2,6 @@ @@ -34,22 +33,21 @@ - - -

The name of the attribute used for links. Every - attribute with this name that points to an element within the same document (URI with - only a fragment part) is translated in such a way that in the output the links point to - the right chunks.

-
-
- - +

Every output document gets a different base URI derived from the input base URI.

+ + + A

d:fileset document that maps the input file (@original-href) to the output files (@href) and lists, for each output file, the id attributes it contains (d:anchor).

+
+
+ diff --git a/common/html-utils/src/main/resources/xml/xproc/html-chunker.xpl b/common/html-utils/src/main/resources/xml/xproc/html-chunker.xpl index 72c53487ac..3c843aef48 100644 --- a/common/html-utils/src/main/resources/xml/xproc/html-chunker.xpl +++ b/common/html-utils/src/main/resources/xml/xproc/html-chunker.xpl @@ -8,20 +8,37 @@ version="1.0" name="main"> - -

Break a HTML document into smaller parts based on - its structure.

+ +

Break an HTML document into smaller parts based on its structure.

- + + + +

A d:fileset document that maps the input file (@original-href) to the output files (@href) and lists, for each output file, the id attributes it contains (d:anchor).

+
+ +
- + + + px:chunker + + + + + px:html-update-links + + - + /html:html/html:body/html:section[tokenize(@epub:type,'\s+')='bodymatter']/html:section"> @@ -48,6 +64,11 @@
+ + + + +
diff --git a/common/html-utils/src/main/resources/xml/xproc/html-update-links.xpl b/common/html-utils/src/main/resources/xml/xproc/html-update-links.xpl index 98bd9fb5f3..4171910ed5 100644 --- a/common/html-utils/src/main/resources/xml/xproc/html-update-links.xpl +++ b/common/html-utils/src/main/resources/xml/xproc/html-update-links.xpl @@ -24,6 +24,16 @@ + + +

Whether the source document itself has previously been renamed according to "mapping" + or not. In other words, whether the URI of the source document is to be compared with + the href rather than the original-href attributes of the + "mapping" document. By default it is assumed that the renaming is done after this + step.

+
+
+

The output HTML document with updated links

@@ -38,9 +48,7 @@ - - - + diff --git a/common/html-utils/src/main/resources/xml/xslt/html-update-links.xsl b/common/html-utils/src/main/resources/xml/xslt/html-update-links.xsl index c4f3d4e899..41494aef9d 100644 --- a/common/html-utils/src/main/resources/xml/xslt/html-update-links.xsl +++ b/common/html-utils/src/main/resources/xml/xslt/html-update-links.xsl @@ -13,6 +13,8 @@ + + @@ -20,13 +22,19 @@ - - + + - + + diff --git a/common/html-utils/src/test/xprocspec/html-chunker.xprocspec b/common/html-utils/src/test/xprocspec/html-chunker.xprocspec index 07196e9936..e1bd1d4a76 100644 --- a/common/html-utils/src/test/xprocspec/html-chunker.xprocspec +++ b/common/html-utils/src/test/xprocspec/html-chunker.xprocspec @@ -2,6 +2,7 @@ @@ -90,6 +91,14 @@
+ + + + + + + + @@ -151,7 +160,7 @@ - +
@@ -171,14 +180,14 @@ + type="xpath" test="collection()/base-uri(.)" equals="('file:/Users/me/file-1.html', + 'file:/Users/me/file-2.html')"/>

a

- +
@@ -191,6 +200,19 @@
+ + + + + + + + + + + + + diff --git a/common/html-utils/src/test/xprocspec/html-update-links.xprocspec b/common/html-utils/src/test/xprocspec/html-update-links.xprocspec index 50aca27bac..5ae988a6c5 100644 --- a/common/html-utils/src/test/xprocspec/html-update-links.xprocspec +++ b/common/html-utils/src/test/xprocspec/html-update-links.xprocspec @@ -4,15 +4,17 @@ xmlns:d="http://www.daisy.org/ns/pipeline/data" script="../../main/resources/xml/xproc/html-update-links.xpl"> - +
+

+

@@ -40,8 +42,55 @@
+

+ +

+ + + + + + + + + + + + + +
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
+ +
diff --git a/scripts-utils/dtbook-utils/src/main/resources/xml/dtbook-update-links.xsl b/scripts-utils/dtbook-utils/src/main/resources/xml/dtbook-update-links.xsl index 495c4441b0..96c6a389e3 100644 --- a/scripts-utils/dtbook-utils/src/main/resources/xml/dtbook-update-links.xsl +++ b/scripts-utils/dtbook-utils/src/main/resources/xml/dtbook-update-links.xsl @@ -10,6 +10,8 @@ + +