From 36704fb4cb8d3042b262ca992f5df3ef7356ac64 Mon Sep 17 00:00:00 2001 From: Louis-Dominique Dubeau Date: Tue, 25 Jun 2019 19:37:21 -0400 Subject: [PATCH] feat: support for XML 1.1 BREAKING CHANGE: previous versions of saxes would parse files with an XML declaration set to 1.1 as 1.0 documents. The support for 1.1 entails that if a document has an XML declaration that specifies version 1.1 it is parsed as a 1.1 document. --- README.md | 67 ++++++++++++------- lib/saxes.d.ts | 1 + lib/saxes.js | 142 ++++++++++++++++++++++++++++++++++++---- test/conformance.js | 50 +++++++++++++- test/xml-declaration.js | 110 +++++++++++++++++++++++++++++++ 5 files changed, 331 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 4e6fa1ac..590516d4 100644 --- a/README.md +++ b/README.md @@ -16,11 +16,10 @@ Saxes does not support Node versions older than 8. well-formedness. Sax, even in its so-called "strict mode", is not strict. It silently accepts structures that are not well-formed XML. Projects that need better compliance with well-formedness constraints cannot use sax as-is. - Saxes aims for conformance with [XML 1.0 fifth - edition](https://www.w3.org/TR/2008/REC-xml-20081126/) and [XML Namespaces 1.0 - third edition](http://www.w3.org/TR/2009/REC-xml-names-20091208/). - Consequently, saxes does not support HTML, or pseudo-XML, or bad XML. + Consequently, saxes does not support HTML, or pseudo-XML, or bad XML. Saxes + will report well-formedness errors in all these cases but it won't try to + extract data from malformed documents like sax does. * Saxes is much much faster than sax, mostly because of a substantial redesign of the internal parsing logic. The speed improvement is not merely due to @@ -45,28 +44,23 @@ Saxes does not support Node versions older than 8. * Saxes does not have facilities for limiting the size the data chunks passed to event handlers. See the FAQ entry for more details. 
-## Limitations - -This is a non-validating parser so it only verifies whether the document is -well-formed. We do aim to raise errors for all malformed constructs encountered. +## Conformance -However, this parser does not parse the contents of DTDs. So malformedness -errors caused by errors in DTDs cannot be reported. +Saxes supports: -Also, the parser continues to parse even upon encountering errors, and does its -best to continue reporting errors. You should heed all errors -reported. +* [XML 1.0 fifth edition](https://www.w3.org/TR/2008/REC-xml-20081126/) +* [XML 1.1 second edition](https://www.w3.org/TR/2006/REC-xml11-20060816/) +* [Namespaces in XML 1.0 (Third Edition)](https://www.w3.org/TR/2009/REC-xml-names-20091208/). +* [Namespaces in XML 1.1 (Second Edition)](https://www.w3.org/TR/2006/REC-xml-names11-20060816/). -**HOWEVER, ONCE AN ERROR HAS BEEN ENCOUNTERED YOU CANNOT RELY ON THE DATA -PROVIDED THROUGH THE OTHER EVENT HANDLERS.** +## Limitations -After an error, saxes tries to make sense of your document, but it may interpret -it incorrectly. For instance ```` is invalid XML. Did you mean to -have ```` or ```` or some other variation? -Saxes takes an honest stab at figuring out your mangled XML. That's as good as -it gets. +This is a non-validating parser so it only verifies whether the document is +well-formed. We do aim to raise errors for all malformed constructs +encountered. However, this parser does not thoroughly parse the contents of +DTDs. So most malformedness errors caused by errors in DTDs cannot be reported. -## Regarding ``` is invalid XML. Did you mean to have ```` or +```` or some other variation? For the sake of continuing to +provide errors, saxes will continue parsing the document, but the structure it +reports may be incorrect. It is only after the errors are fixed in the document +that saxes can provide a reliable interpretation of the document. 
+ +That leaves you with two rules of thumb when using saxes: + +* Pay attention to the errors that saxes reports. The default `onerror` handler + throws, so by default, you cannot miss errors. + +* **ONCE AN ERROR HAS BEEN ENCOUNTERED, STOP RELYING ON THE EVENT HANDLERS OTHER + THAN `onerror`.** As explained above, when saxes runs into a well-formedness + problem, it makes a guess in order to continue reporting more errors. The guess + may be wrong. + ### Events To listen to an event, override `on`. The list of supported events diff --git a/lib/saxes.d.ts b/lib/saxes.d.ts index 2a6e98f1..48e81105 100644 --- a/lib/saxes.d.ts +++ b/lib/saxes.d.ts @@ -7,6 +7,7 @@ declare namespace saxes { fragment?: boolean; fileName?: string; additionalNamespaces?: Record; + defaultXMLVersion?: "1.0" | "1.1"; } export interface XMLDecl { diff --git a/lib/saxes.js b/lib/saxes.js index 6ed45327..f70c8e80 100644 --- a/lib/saxes.js +++ b/lib/saxes.js @@ -1,8 +1,11 @@ "use strict"; -const { isS, isChar, isNameStartChar, isNameChar, S_LIST, NAME_RE } = - require("xmlchars/xml/1.0/ed5"); -const { isNCNameStartChar, isNCNameChar, NC_NAME_RE } = require("xmlchars/xmlns/1.0/ed3"); +const { + isS, isChar: isChar10, isNameStartChar, isNameChar, S_LIST, NAME_RE, +} = require("xmlchars/xml/1.0/ed5"); +const { isChar: isChar11, isRestrictedChar } = require("xmlchars/xml/1.1/ed2"); +const { isNCNameStartChar, isNCNameChar, NC_NAME_RE } = + require("xmlchars/xmlns/1.0/ed3"); const XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"; const XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"; @@ -101,6 +104,8 @@ const GREATER = 0x3E; const QUESTION = 0x3F; const OPEN_BRACKET = 0x5B; const CLOSE_BRACKET = 0x5D; +const NEL = 0x85; +const LS = 0x2028; // Line Separator function isQuote(c) { return c === DQUOTE || c === SQUOTE; @@ -259,6 +264,10 @@ const FORBIDDEN_BRACKET_BRACKET = 2; * @property {string} [fileName] A file name to use for error reporting. "File name" is a loose * concept. 
You could use a URL to some resource, or any descriptive name you * like. + * + * @property {"1.0" | "1.1"} [defaultXMLVersion] The default XML version to + * use. If unspecified, and there is no XML declaration, the default + * version is "1.0". */ class SaxesParser { @@ -360,7 +369,6 @@ class SaxesParser { this.nameCheck = isNCNameChar; this.isName = isNCName; this.processAttribs = this.processAttribsNS; - this.pushAttrib = this.pushAttribNS; this.ns = Object.assign({ __proto__: null }, rootNS); const additional = this.opt.additionalNamespaces; @@ -374,9 +382,14 @@ class SaxesParser { this.nameCheck = isNameChar; this.isName = isName; this.processAttribs = this.processAttribsPlain; - this.pushAttrib = this.pushAttribPlain; } + let { defaultXMLVersion } = this.opt; + if (defaultXMLVersion === undefined) { + defaultXMLVersion = "1.0"; + } + this.setXMLVersion(defaultXMLVersion); + this.trackPosition = this.opt.position !== false; /** The line number the parser is currently looking at. */ this.line = 1; @@ -575,11 +588,13 @@ class SaxesParser { * Get a single code point out of the current chunk. This updates the current * position if we do position tracking. * + * This is the algorithm to use for XML 1.0. + * * @private * * @returns {number} The character read. */ - getCode() { + getCode10() { const { chunk, i } = this; // Using charCodeAt and handling the surrogates ourselves is faster // than using codePointAt. @@ -614,7 +629,68 @@ class SaxesParser { skip++; } - if (!isChar(code)) { + if (!isChar10(code)) { this.fail("disallowed character."); } } + + this.i += skip; + + return code; + } + + + /** + * Get a single code point out of the current chunk. This updates the current + * position if we do position tracking. + * + * This is the algorithm to use for XML 1.1. + * + * @private + * + * @returns {number} The character read. 
+ */ + getCode11() { + const { chunk, i } = this; + // Using charCodeAt and handling the surrogates ourselves is faster + // than using codePointAt. + let code = chunk.charCodeAt(i); + + let skip = 1; + switch (code) { + case CR: { // 0xD + // We may get NaN if we read past the end of the chunk, which is + // fine. + const next = chunk.charCodeAt(i + 1); + if (next === NL || next === NEL) { + // A CR NL or CR NEL sequence is converted to NL so we have to skip over + // the next character. We already know it has a size of 1 so ++ is fine + // here. + skip++; + } + // Otherwise, a CR is just converted to NL, no skip. + } + /* yes, fall through */ + case NEL: // 0x85 + case LS: // 0x2028 + case NL: // 0xA + code = NL; + this.line++; + this.column = 0; + break; + + default: + this.column++; + if (code >= 0xD800 && code <= 0xDBFF) { + code = 0x10000 + ((code - 0xD800) * 0x400) + + (chunk.charCodeAt(i + 1) - 0xDC00); + this.column++; + skip++; + } + + // In XML 1.1 the character we read must satisfy the Char production but + // not the RestrictedChar production. + if (!isChar11(code) || isRestrictedChar(code)) { this.fail("disallowed character."); } } @@ -769,6 +845,22 @@ class SaxesParser { return undefined; } + /** @private */ + setXMLVersion(version) { + if (version === "1.0") { + this.isChar = isChar10; + this.getCode = this.getCode10; + this.pushAttrib = + this.xmlnsOpt ? this.pushAttribNS10 : this.pushAttribPlain; + } + else { + this.isChar = isChar11; + this.getCode = this.getCode11; + this.pushAttrib = + this.xmlnsOpt ? this.pushAttribNS11 : this.pushAttribPlain; + } + } + // STATE HANDLERS /** @private */ @@ -1380,13 +1472,19 @@ class SaxesParser { if (c) { switch (this.xmlDeclName) { - case "version": - if (!/^1\.[0-9]+$/.test(this.xmlDeclValue)) { + case "version": { + this.xmlDeclExpects = ["encoding", "standalone"]; + const version = this.xmlDeclValue; + this.xmlDecl.version = version; + // This is the test specified by XML 1.0 but it is fine for XML 1.1. 
+ if (!/^1\.[0-9]+$/.test(version)) { this.fail("version number must match /^1\\.[0-9]+$/."); } - this.xmlDeclExpects = ["encoding", "standalone"]; - this.xmlDecl.version = this.xmlDeclValue; + else { + this.setXMLVersion(version); + } break; + } case "encoding": if (!/^[A-Za-z][A-Za-z0-9._-]*$/.test(this.xmlDeclValue)) { this.fail("encoding value must match \ @@ -1561,7 +1659,25 @@ class SaxesParser { } /** @private */ - pushAttribNS(name, value) { + pushAttribNS10(name, value) { + const { prefix, local } = this.qname(name); + this.attribList.push({ name, prefix, local, value, uri: undefined }); + if (prefix === "xmlns") { + const trimmed = value.trim(); + if (trimmed === "") { + this.fail("invalid attempt to undefine prefix in XML 1.0"); + } + this.tag.ns[local] = trimmed; + nsPairCheck(this, local, trimmed); + } + else if (name === "xmlns") { + const trimmed = value.trim(); + this.tag.ns[""] = trimmed; + nsPairCheck(this, "", trimmed); + } + } + + pushAttribNS11(name, value) { const { prefix, local } = this.qname(name); this.attribList.push({ name, prefix, local, value, uri: undefined }); if (prefix === "xmlns") { @@ -2060,7 +2176,7 @@ class SaxesParser { } // The character reference is required to match the CHAR production. 
- if (!isChar(num)) { + if (!this.isChar(num)) { this.fail("malformed character entity."); return `&${entity};`; } diff --git a/test/conformance.js b/test/conformance.js index 07a2ee32..327b059f 100644 --- a/test/conformance.js +++ b/test/conformance.js @@ -4,7 +4,7 @@ const { build } = require("xml-conformance-suite/js/frameworks/mocha/builders/ba const { ResourceLoader } = require("xml-conformance-suite/js/lib/resource-loader"); const { loadTests } = require("xml-conformance-suite/js/lib/test-parser"); const { BaseDriver } = require("xml-conformance-suite/js/drivers/base"); -const { Selection } = require("xml-conformance-suite/js/selections/whatwg"); +const { BaseSelection } = require("xml-conformance-suite/js/selections/base"); const saxes = require("../lib/saxes"); @@ -105,6 +105,11 @@ const SKIP = { "ibm-valid-P29-ibm29v01.xml": "ENTITIES", "ibm-valid-P43-ibm43v01.xml": "ENTITIES", "ibm-valid-P67-ibm67v01.xml": "ENTITIES", + "ibm-1-1-not-wf-P77-ibm77n14.xml": "DTD", + "ibm-1-1-valid-P02-ibm02v04.xml": "ENTITIES", + "ibm-1-1-valid-P03-ibm03v05.xml": "ENTITIES", + "ibm-1-1-valid-P03-ibm03v06.xml": "ENTITIES", + "ibm-1-1-valid-P03-ibm03v07.xml": "ENTITIES", "rmt-e2e-15e": "ENTITIES", "rmt-e2e-15f": "ENTITIES", "rmt-ns10-043": "DTD", @@ -150,7 +155,15 @@ const PLATFORM_ISSUES = { "ibm-not-wf-P02-ibm02n30.xml": "surrogate encoding", "ibm-not-wf-P02-ibm02n31.xml": "surrogate encoding", "rmt-e2e-27": "surrogate encoding", + "rmt-e2e-50": "xml declaration encoding", "rmt-e2e-61": "xml declaration encoding", + "rmt-011": "xml declarations encoding", + "rmt-034": "xml declarations encoding", + "rmt-035": "xml declarations encoding", + "rmt-041": "xml declarations encoding", + "rmt-050": "xml declarations encoding", + "rmt-051": "xml declarations encoding", + "rmt-054": "xml declarations encoding", "x-ibm-1-0.5-not-wf-P04-ibm04n21.xml": "surrogate encoding", "x-ibm-1-0.5-not-wf-P04-ibm04n22.xml": "surrogate encoding", "x-ibm-1-0.5-not-wf-P04-ibm04n23.xml": "surrogate 
encoding", @@ -159,11 +172,37 @@ const PLATFORM_ISSUES = { "x-ibm-1-0.5-not-wf-P04a-ibm04an22.xml": "surrogate encoding", "x-ibm-1-0.5-not-wf-P04a-ibm04an23.xml": "surrogate encoding", "x-ibm-1-0.5-not-wf-P04a-ibm04an24.xml": "surrogate encoding", + "ibm-1-1-not-wf-P02-ibm02n67.xml": "surrogate encoding", + "ibm-1-1-not-wf-P04-ibm04n21.xml": "surrogate encoding", + "ibm-1-1-not-wf-P04-ibm04n22.xml": "surrogate encoding", + "ibm-1-1-not-wf-P04-ibm04n23.xml": "surrogate encoding", + "ibm-1-1-not-wf-P04-ibm04n24.xml": "surrogate encoding", + "ibm-1-1-not-wf-P04a-ibm04an21.xml": "surrogate encoding", + "ibm-1-1-not-wf-P04a-ibm04an22.xml": "surrogate encoding", + "ibm-1-1-not-wf-P04a-ibm04an23.xml": "surrogate encoding", + "ibm-1-1-not-wf-P04a-ibm04an24.xml": "surrogate encoding", "hst-lhs-007": "xml declaration encoding", }; -class SaxesSelection extends Selection { +class SaxesSelection extends BaseSelection { + // eslint-disable-next-line class-methods-use-this + getHandlingByType(test) { + const { testType } = test; + switch (testType) { + case "not-wf": + return "fails"; + case "valid": + return "succeeds"; + case "invalid": + case "error": + return "skip"; + default: + throw new Error(`unexpected test type: ${testType}`); + } + } + + // eslint-disable-next-line class-methods-use-this shouldSkipTest(test) { return Promise.resolve() .then(() => SKIP[test.id] || PLATFORM_ISSUES[test.id] || @@ -172,7 +211,12 @@ class SaxesSelection extends Selection { test.includesSections( ["[12]", "[13]", "[69]", "3.2", "3.2.1", "3.2.2", "3.3", "3.3.1", "3.3.2", "4.2", "4.2.2", "4.5", "4.7"]) || - super.shouldSkipTest(test)); + !((test.includesVersion("1.0") && test.includesEdition("5")) || + test.includesEdition("1.1")) || + // The tests that use BOM rely on the parser being able to look at + // the *raw* data, without decoding. There does not seem to be a way + // to do this. 
+ test.getHasBOM()); } } diff --git a/test/xml-declaration.js b/test/xml-declaration.js index 30e2af36..3e01c6d6 100644 --- a/test/xml-declaration.js +++ b/test/xml-declaration.js @@ -4,6 +4,12 @@ const { expect } = require("chai"); const saxes = require("../lib/saxes"); const { test } = require("."); +const XML_1_0_DECLARATION = ``; +const XML_1_1_DECLARATION = ``; + +const WELL_FORMED_1_0_NOT_1_1 = `\u007F`; +const WELL_FORMED_1_1_NOT_1_0 = ``; + describe("xml declaration", () => { test({ name: "empty declaration", @@ -238,4 +244,108 @@ describe("xml declaration", () => { parser.close(); expect(seen).to.be.true; }); + + function makeTests(groupName, xmlDeclaration, document, expectedResults) { + describe(groupName, () => { + for (const { version, expectError } of expectedResults) { + const errorLabel = expectError ? "errors" : "no errors"; + const title = version === undefined ? + `and without defaultXMLVersion: ${errorLabel}` : + `and with defaultXMLVersion === ${version}: ${errorLabel}`; + + it(title, () => { + const parser = + new saxes.SaxesParser(version === undefined ? undefined : + { defaultXMLVersion: version }); + let error = false; + parser.onerror = () => { + error = true; + }; + parser.write(xmlDeclaration + document); + parser.close(); + expect(error).to.equal(expectError); + }); + } + }); + } + + describe("well-formed for 1.0, not 1.1", () => { + makeTests("without XML declaration", "", WELL_FORMED_1_0_NOT_1_1, [{ + version: undefined, + expectError: false, + }, { + version: "1.0", + expectError: false, + }, { + version: "1.1", + expectError: true, + }]); + + makeTests("with XML 1.0 declaration", XML_1_0_DECLARATION, + WELL_FORMED_1_0_NOT_1_1, [{ + version: undefined, + expectError: false, + }, { + version: "1.0", + expectError: false, + }, { + version: "1.1", + // The XML declaration overrides defaultXMLVersion. 
+ expectError: false, + }]); + + makeTests("with XML 1.1 declaration", XML_1_1_DECLARATION, + WELL_FORMED_1_0_NOT_1_1, [{ + version: undefined, + // The XML declaration overrides defaultXMLVersion. + expectError: true, + }, { + version: "1.0", + // The XML declaration overrides defaultXMLVersion. + expectError: true, + }, { + version: "1.1", + expectError: true, + }]); + }); + + describe("well-formed for 1.1, not 1.0", () => { + makeTests("without XML declaration", "", WELL_FORMED_1_1_NOT_1_0, [{ + version: undefined, + expectError: true, + }, { + version: "1.0", + expectError: true, + }, { + version: "1.1", + expectError: false, + }]); + + makeTests("with XML 1.0 declaration", XML_1_0_DECLARATION, + WELL_FORMED_1_1_NOT_1_0, [{ + version: undefined, + expectError: true, + }, { + version: "1.0", + expectError: true, + }, { + version: "1.1", + // The XML declaration overrides defaultXMLVersion. + expectError: true, + }]); + + makeTests("with XML 1.1 declaration", XML_1_1_DECLARATION, + WELL_FORMED_1_1_NOT_1_0, [{ + version: undefined, + // The XML declaration overrides defaultXMLVersion. + expectError: false, + }, { + version: "1.0", + // The XML declaration overrides defaultXMLVersion. + expectError: false, + }, { + version: "1.1", + expectError: false, + }]); + }); });