From 36704fb4cb8d3042b262ca992f5df3ef7356ac64 Mon Sep 17 00:00:00 2001
From: Louis-Dominique Dubeau <ldd@lddubeau.com>
Date: Tue, 25 Jun 2019 19:37:21 -0400
Subject: [PATCH] feat: support for XML 1.1

BREAKING CHANGE: previous versions of saxes would parse files with an XML
declaration set to 1.1 as 1.0 documents. The support for 1.1 entails that if a
document has an XML declaration that specifies version 1.1 it is parsed as a 1.1
document.
---
 README.md               |  67 ++++++++++++-------
 lib/saxes.d.ts          |   1 +
 lib/saxes.js            | 142 ++++++++++++++++++++++++++++++++++++----
 test/conformance.js     |  50 +++++++++++++-
 test/xml-declaration.js | 110 +++++++++++++++++++++++++++++++
 5 files changed, 331 insertions(+), 39 deletions(-)
diff --git a/README.md b/README.md
index 4e6fa1ac..590516d4 100644
--- a/README.md
+++ b/README.md
@@ -16,11 +16,10 @@ Saxes does not support Node versions older than 8.
   well-formedness. Sax, even in its so-called "strict mode", is not strict. It
   silently accepts structures that are not well-formed XML. Projects that need
   better compliance with well-formedness constraints cannot use sax as-is.
-  Saxes aims for conformance with [XML 1.0 fifth
-  edition](https://www.w3.org/TR/2008/REC-xml-20081126/) and [XML Namespaces 1.0
-  third edition](http://www.w3.org/TR/2009/REC-xml-names-20091208/).
 
-  Consequently, saxes does not support HTML, or pseudo-XML, or bad XML.
+  Consequently, saxes does not support HTML, or pseudo-XML, or bad XML. Saxes
+  will report well-formedness errors in all these cases but it won't try to
+  extract data from malformed documents like sax does.
 
 * Saxes is much much faster than sax, mostly because of a substantial redesign
   of the internal parsing logic. The speed improvement is not merely due to
@@ -45,28 +44,23 @@ Saxes does not support Node versions older than 8.
 * Saxes does not have facilities for limiting the size the data chunks passed to
   event handlers. See the FAQ entry for more details.
 
-## Limitations
-
-This is a non-validating parser so it only verifies whether the document is
-well-formed. We do aim to raise errors for all malformed constructs encountered.
+## Conformance
 
-However, this parser does not parse the contents of DTDs. So malformedness
-errors caused by errors in DTDs cannot be reported.
+Saxes supports:
 
-Also, the parser continues to parse even upon encountering errors, and does its
-best to continue reporting errors. You should heed all errors
-reported.
+* [XML 1.0 fifth edition](https://www.w3.org/TR/2008/REC-xml-20081126/)
+* [XML 1.1 second edition](https://www.w3.org/TR/2006/REC-xml11-20060816/)
+* [Namespaces in XML 1.0 (Third Edition)](https://www.w3.org/TR/2009/REC-xml-names-20091208/).
+* [Namespaces in XML 1.1 (Second Edition)](https://www.w3.org/TR/2006/REC-xml-names11-20060816/).
 
-**HOWEVER, ONCE AN ERROR HAS BEEN ENCOUNTERED YOU CANNOT RELY ON THE DATA
-PROVIDED THROUGH THE OTHER EVENT HANDLERS.**
+## Limitations
 
-After an error, saxes tries to make sense of your document, but it may interpret
-it incorrectly. For instance ``<foo a=bc="d"/>`` is invalid XML. Did you mean to
-have ``<foo a="bc=d"/>`` or ``<foo a="b" c="d"/>`` or some other variation?
-Saxes takes an honest stab at figuring out your mangled XML. That's as good as
-it gets.
+This is a non-validating parser so it only verifies whether the document is
+well-formed. We do aim to raise errors for all malformed constructs
+encountered. However, this parser does not thoroughly parse the contents of
+DTDs. So most malformedness errors caused by errors in DTDs cannot be reported.
 
-## Regarding `<!DOCTYPE`s and `<!ENTITY`s
+## Regarding `<!DOCTYPE` and `<!ENTITY`
 
 The parser will handle the basic XML entities in text nodes and attribute
 values: `&amp; &lt; &gt; &apos; &quot;`. It's possible to define additional
@@ -138,10 +132,16 @@ Settings supported:
    namespaces known before parsing the XML file. It is not legal to pass
    bindings for the namespaces `"xml"` or `"xmlns"`.
 
+* `defaultXMLVersion` - The default version of the XML specification to use if
+  the document contains no XML declaration. If the document does contain an XML
+  declaration, then this setting is ignored. Must be `"1.0"` or `"1.1"`. The
+  default is `"1.0"`.
+
 ### Methods
 
-`write` - Write bytes onto the stream. You don't have to do this all at
-once. You can keep writing as much as you want.
+`write` - Write bytes onto the stream. You don't have to pass the whole document
+in one `write` call. You can read your source chunk by chunk and call `write`
+with each chunk.
 
 `close` - Close the stream. Once closed, no more data may be written until it is
 done processing the buffer, which is signaled by the `end` event.
@@ -168,6 +168,27 @@ generated by the parser happens, the declaration has been processed if present
 at all. Otherwise, you have a malformed document, and as stated above, you
 cannot rely on the parser data!
 
+### Error Handling
+
+The parser continues to parse even upon encountering errors, and does its best
+to continue reporting errors. You should heed all errors reported. After an
+error, however, saxes may interpret your document incorrectly. For instance
+``<foo a=bc="d"/>`` is invalid XML. Did you mean to have ``<foo a="bc=d"/>`` or
+``<foo a="b" c="d"/>`` or some other variation?  For the sake of continuing to
+provide errors, saxes will continue parsing the document, but the structure it
+reports may be incorrect. It is only after the errors are fixed in the document
+that saxes can provide a reliable interpretation of the document.
+
+That leaves you with two rules of thumb when using saxes:
+
+* Pay attention to the errors that saxes reports. The default `onerror` handler
+  throws, so by default, you cannot miss errors.
+
+* **ONCE AN ERROR HAS BEEN ENCOUNTERED, STOP RELYING ON THE EVENT HANDLERS OTHER
+  THAN `onerror`.** As explained above, when saxes runs into a well-formedness
+  problem, it makes a guess in order to continue reporting more errors. The guess
+  may be wrong.
+
 ### Events
 
 To listen to an event, override `on<eventname>`. The list of supported events
diff --git a/lib/saxes.d.ts b/lib/saxes.d.ts
index 2a6e98f1..48e81105 100644
--- a/lib/saxes.d.ts
+++ b/lib/saxes.d.ts
@@ -7,6 +7,7 @@ declare namespace saxes {
     fragment?: boolean;
     fileName?: string;
     additionalNamespaces?: Record<string, string>;
+    defaultXMLVersion?: "1.0" | "1.1";
   }
 
   export interface XMLDecl {
diff --git a/lib/saxes.js b/lib/saxes.js
index 6ed45327..f70c8e80 100644
--- a/lib/saxes.js
+++ b/lib/saxes.js
@@ -1,8 +1,11 @@
 "use strict";
 
-const { isS, isChar, isNameStartChar, isNameChar, S_LIST, NAME_RE } =
-      require("xmlchars/xml/1.0/ed5");
-const { isNCNameStartChar, isNCNameChar, NC_NAME_RE } = require("xmlchars/xmlns/1.0/ed3");
+const {
+  isS, isChar: isChar10, isNameStartChar, isNameChar, S_LIST, NAME_RE,
+} = require("xmlchars/xml/1.0/ed5");
+const { isChar: isChar11, isRestrictedChar } = require("xmlchars/xml/1.1/ed2");
+const { isNCNameStartChar, isNCNameChar, NC_NAME_RE } =
+      require("xmlchars/xmlns/1.0/ed3");
 
 const XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace";
 const XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/";
@@ -101,6 +104,8 @@ const GREATER = 0x3E;
 const QUESTION = 0x3F;
 const OPEN_BRACKET = 0x5B;
 const CLOSE_BRACKET = 0x5D;
+const NEL = 0x85;
+const LS = 0x2028; // Line Separator
 
 function isQuote(c) {
   return c === DQUOTE || c === SQUOTE;
@@ -259,6 +264,10 @@ const FORBIDDEN_BRACKET_BRACKET = 2;
  * @property {string} [fileName] A file name to use for error reporting. "File name" is a loose
  * concept. You could use a URL to some resource, or any descriptive name you
  * like.
+ *
+ * @property {"1.0" | "1.1"} [defaultXMLVersion] The default XML version to
+ * use. If unspecified, and there is no XML declaration, the default
+ * version is "1.0".
  */
 
 class SaxesParser {
@@ -360,7 +369,6 @@ class SaxesParser {
       this.nameCheck = isNCNameChar;
       this.isName = isNCName;
       this.processAttribs = this.processAttribsNS;
-      this.pushAttrib = this.pushAttribNS;
 
       this.ns = Object.assign({ __proto__: null }, rootNS);
       const additional = this.opt.additionalNamespaces;
@@ -374,9 +382,14 @@ class SaxesParser {
       this.nameCheck = isNameChar;
       this.isName = isName;
       this.processAttribs = this.processAttribsPlain;
-      this.pushAttrib = this.pushAttribPlain;
     }
 
+    let { defaultXMLVersion } = this.opt;
+    if (defaultXMLVersion === undefined) {
+      defaultXMLVersion = "1.0";
+    }
+    this.setXMLVersion(defaultXMLVersion);
+
     this.trackPosition = this.opt.position !== false;
     /** The line number the parser is  currently looking at. */
     this.line = 1;
@@ -575,11 +588,13 @@ class SaxesParser {
    * Get a single code point out of the current chunk. This updates the current
    * position if we do position tracking.
    *
+   * This is the algorithm to use for XML 1.0.
+   *
    * @private
    *
    * @returns {number} The character read.
    */
-  getCode() {
+  getCode10() {
     const { chunk, i } = this;
     // Using charCodeAt and handling the surrogates ourselves is faster
     // than using codePointAt.
@@ -614,7 +629,68 @@ class SaxesParser {
         skip++;
       }
 
-      if (!isChar(code)) {
+      if (!isChar10(code)) {
+        this.fail("disallowed character.");
+      }
+    }
+
+    this.i += skip;
+
+    return code;
+  }
+
+
+  /**
+   * Get a single code point out of the current chunk. This updates the current
+   * position if we do position tracking.
+   *
+   * This is the algorithm to use for XML 1.1.
+   *
+   * @private
+   *
+   * @returns {number} The character read.
+   */
+  getCode11() {
+    const { chunk, i } = this;
+    // Using charCodeAt and handling the surrogates ourselves is faster
+    // than using codePointAt.
+    let code = chunk.charCodeAt(i);
+
+    let skip = 1;
+    switch (code) {
+    case CR: { // 0xD
+      // We may get NaN if we read past the end of the chunk, which is
+      // fine.
+      const next = chunk.charCodeAt(i + 1);
+      if (next === NL || next === NEL) {
+        // A CR NL or CR NEL sequence is converted to NL so we have to skip over
+        // the next character. We already know it has a size of 1 so ++ is fine
+        // here.
+        skip++;
+      }
+      // Otherwise, a CR is just converted to NL, no skip.
+    }
+      /* yes, fall through */
+    case NEL: // 0x85
+    case LS: // 0x2028
+    case NL: // 0xA
+      code = NL;
+      this.line++;
+      this.column = 0;
+      break;
+
+    default:
+      this.column++;
+      if (code >= 0xD800 && code <= 0xDBFF) {
+        code = 0x10000 + ((code - 0xD800) * 0x400) +
+          (chunk.charCodeAt(i + 1) - 0xDC00);
+        this.column++;
+        skip++;
+      }
+
+      // In XML 1.1 the character we read must satisfy the Char production but
+      // not the RestrictedChar production.
+      if (!isChar11(code) || isRestrictedChar(code)) {
         this.fail("disallowed character.");
       }
     }
@@ -769,6 +845,22 @@ class SaxesParser {
     return undefined;
   }
 
+  /** @private */
+  setXMLVersion(version) {
+    if (version === "1.0") { // XML 1.0: 1.0 char check, reader and NS rules.
+      this.isChar = isChar10;
+      this.getCode = this.getCode10;
+      this.pushAttrib =
+        this.xmlnsOpt ? this.pushAttribNS10 : this.pushAttribPlain;
+    }
+    else { // Every other version value is handled with the XML 1.1 rules.
+      this.isChar = isChar11;
+      this.getCode = this.getCode11;
+      this.pushAttrib =
+        this.xmlnsOpt ? this.pushAttribNS11 : this.pushAttribPlain;
+    }
+  }
+
   // STATE HANDLERS
 
   /** @private */
@@ -1380,13 +1472,19 @@ class SaxesParser {
 
         if (c) {
           switch (this.xmlDeclName) {
-          case "version":
-            if (!/^1\.[0-9]+$/.test(this.xmlDeclValue)) {
+          case "version": {
+            this.xmlDeclExpects = ["encoding", "standalone"];
+            const version = this.xmlDeclValue;
+            this.xmlDecl.version = version;
+            // This is the test specified by XML 1.0 but it is fine for XML 1.1.
+            if (!/^1\.[0-9]+$/.test(version)) {
               this.fail("version number must match /^1\\.[0-9]+$/.");
             }
-            this.xmlDeclExpects = ["encoding", "standalone"];
-            this.xmlDecl.version = this.xmlDeclValue;
+            else {
+              this.setXMLVersion(version);
+            }
             break;
+          }
           case "encoding":
             if (!/^[A-Za-z][A-Za-z0-9._-]*$/.test(this.xmlDeclValue)) {
               this.fail("encoding value must match \
@@ -1561,7 +1659,25 @@ class SaxesParser {
   }
 
   /** @private */
-  pushAttribNS(name, value) {
+  pushAttribNS10(name, value) {
+    const { prefix, local } = this.qname(name);
+    this.attribList.push({ name, prefix, local, value, uri: undefined });
+    if (prefix === "xmlns") { // Prefixed namespace declaration: xmlns:local.
+      const trimmed = value.trim();
+      if (trimmed === "") { // An empty URI would undeclare the prefix.
+        this.fail("invalid attempt to undefine prefix in XML 1.0");
+      }
+      this.tag.ns[local] = trimmed; // Still recorded if fail() returns.
+      nsPairCheck(this, local, trimmed);
+    }
+    else if (name === "xmlns") { // Default namespace declaration.
+      const trimmed = value.trim();
+      this.tag.ns[""] = trimmed;
+      nsPairCheck(this, "", trimmed);
+    }
+  }
+
+  pushAttribNS11(name, value) {
     const { prefix, local } = this.qname(name);
     this.attribList.push({ name, prefix, local, value, uri: undefined });
     if (prefix === "xmlns") {
@@ -2060,7 +2176,7 @@ class SaxesParser {
     }
 
     // The character reference is required to match the CHAR production.
-    if (!isChar(num)) {
+    if (!this.isChar(num)) {
       this.fail("malformed character entity.");
       return `&${entity};`;
     }
diff --git a/test/conformance.js b/test/conformance.js
index 07a2ee32..327b059f 100644
--- a/test/conformance.js
+++ b/test/conformance.js
@@ -4,7 +4,7 @@ const { build } = require("xml-conformance-suite/js/frameworks/mocha/builders/ba
 const { ResourceLoader } = require("xml-conformance-suite/js/lib/resource-loader");
 const { loadTests } = require("xml-conformance-suite/js/lib/test-parser");
 const { BaseDriver } = require("xml-conformance-suite/js/drivers/base");
-const { Selection } = require("xml-conformance-suite/js/selections/whatwg");
+const { BaseSelection } = require("xml-conformance-suite/js/selections/base");
 
 const saxes = require("../lib/saxes");
 
@@ -105,6 +105,11 @@ const SKIP = {
   "ibm-valid-P29-ibm29v01.xml": "ENTITIES",
   "ibm-valid-P43-ibm43v01.xml": "ENTITIES",
   "ibm-valid-P67-ibm67v01.xml": "ENTITIES",
+  "ibm-1-1-not-wf-P77-ibm77n14.xml": "DTD",
+  "ibm-1-1-valid-P02-ibm02v04.xml": "ENTITIES",
+  "ibm-1-1-valid-P03-ibm03v05.xml": "ENTITIES",
+  "ibm-1-1-valid-P03-ibm03v06.xml": "ENTITIES",
+  "ibm-1-1-valid-P03-ibm03v07.xml": "ENTITIES",
   "rmt-e2e-15e": "ENTITIES",
   "rmt-e2e-15f": "ENTITIES",
   "rmt-ns10-043": "DTD",
@@ -150,7 +155,15 @@ const PLATFORM_ISSUES = {
   "ibm-not-wf-P02-ibm02n30.xml": "surrogate encoding",
   "ibm-not-wf-P02-ibm02n31.xml": "surrogate encoding",
   "rmt-e2e-27": "surrogate encoding",
+  "rmt-e2e-50": "xml declaration encoding",
   "rmt-e2e-61": "xml declaration encoding",
+  "rmt-011": "xml declarations encoding",
+  "rmt-034": "xml declarations encoding",
+  "rmt-035": "xml declarations encoding",
+  "rmt-041": "xml declarations encoding",
+  "rmt-050": "xml declarations encoding",
+  "rmt-051": "xml declarations encoding",
+  "rmt-054": "xml declarations encoding",
   "x-ibm-1-0.5-not-wf-P04-ibm04n21.xml": "surrogate encoding",
   "x-ibm-1-0.5-not-wf-P04-ibm04n22.xml": "surrogate encoding",
   "x-ibm-1-0.5-not-wf-P04-ibm04n23.xml": "surrogate encoding",
@@ -159,11 +172,37 @@ const PLATFORM_ISSUES = {
   "x-ibm-1-0.5-not-wf-P04a-ibm04an22.xml": "surrogate encoding",
   "x-ibm-1-0.5-not-wf-P04a-ibm04an23.xml": "surrogate encoding",
   "x-ibm-1-0.5-not-wf-P04a-ibm04an24.xml": "surrogate encoding",
+  "ibm-1-1-not-wf-P02-ibm02n67.xml": "surrogate encoding",
+  "ibm-1-1-not-wf-P04-ibm04n21.xml": "surrogate encoding",
+  "ibm-1-1-not-wf-P04-ibm04n22.xml": "surrogate encoding",
+  "ibm-1-1-not-wf-P04-ibm04n23.xml": "surrogate encoding",
+  "ibm-1-1-not-wf-P04-ibm04n24.xml": "surrogate encoding",
+  "ibm-1-1-not-wf-P04a-ibm04an21.xml": "surrogate encoding",
+  "ibm-1-1-not-wf-P04a-ibm04an22.xml": "surrogate encoding",
+  "ibm-1-1-not-wf-P04a-ibm04an23.xml": "surrogate encoding",
+  "ibm-1-1-not-wf-P04a-ibm04an24.xml": "surrogate encoding",
   "hst-lhs-007": "xml declaration encoding",
 };
 
 
-class SaxesSelection extends Selection {
+class SaxesSelection extends BaseSelection {
+  // eslint-disable-next-line class-methods-use-this
+  getHandlingByType(test) {
+    const { testType } = test;
+    switch (testType) {
+    case "not-wf": // Malformed input: the parser is expected to report errors.
+      return "fails";
+    case "valid": // Well-formed input: parsing is expected to succeed.
+      return "succeeds";
+    case "invalid": // Presumably skipped since saxes does not validate.
+    case "error":
+      return "skip";
+    default:
+      throw new Error(`unexpected test type: ${testType}`);
+    }
+  }
+
+  // eslint-disable-next-line class-methods-use-this
   shouldSkipTest(test) {
     return Promise.resolve()
       .then(() => SKIP[test.id] || PLATFORM_ISSUES[test.id] ||
@@ -172,7 +211,12 @@ class SaxesSelection extends Selection {
             test.includesSections(
               ["[12]", "[13]", "[69]", "3.2", "3.2.1", "3.2.2", "3.3",
                "3.3.1", "3.3.2", "4.2", "4.2.2", "4.5", "4.7"]) ||
-            super.shouldSkipTest(test));
+            !((test.includesVersion("1.0") && test.includesEdition("5")) ||
+              test.includesEdition("1.1")) ||
+            // The tests that use BOM rely on the parser being able to look at
+            // the *raw* data, without decoding. There does not seem to be a way
+            // to do this.
+            test.getHasBOM());
   }
 }
 
diff --git a/test/xml-declaration.js b/test/xml-declaration.js
index 30e2af36..3e01c6d6 100644
--- a/test/xml-declaration.js
+++ b/test/xml-declaration.js
@@ -4,6 +4,12 @@ const { expect } = require("chai");
 const saxes = require("../lib/saxes");
 const { test } = require(".");
 
+const XML_1_0_DECLARATION = `<?xml version="1.0"?>`;
+const XML_1_1_DECLARATION = `<?xml version="1.1"?>`;
+
+const WELL_FORMED_1_0_NOT_1_1 = `<root>\u007F</root>`;
+const WELL_FORMED_1_1_NOT_1_0 = `<root></root>`;
+
 describe("xml declaration", () => {
   test({
     name: "empty declaration",
@@ -238,4 +244,108 @@ describe("xml declaration", () => {
     parser.close();
     expect(seen).to.be.true;
   });
+
+  function makeTests(groupName, xmlDeclaration, document, expectedResults) {
+    describe(groupName, () => {
+      for (const { version, expectError } of expectedResults) {
+        const errorLabel = expectError ? "errors" : "no errors";
+        const title = version === undefined ?
+              `and without defaultXMLVersion: ${errorLabel}` :
+              `and with defaultXMLVersion === ${version}: ${errorLabel}`;
+        // Parse declaration + document; an error must fire iff expectError.
+        it(title, () => {
+          const parser =
+                new saxes.SaxesParser(version === undefined ? undefined :
+                                      { defaultXMLVersion: version });
+          let error = false;
+          parser.onerror = () => {
+            error = true;
+          };
+          parser.write(xmlDeclaration + document);
+          parser.close();
+          expect(error).to.equal(expectError);
+        });
+      }
+    });
+  }
+
+  describe("well-formed for 1.0, not 1.1", () => {
+    makeTests("without XML declaration", "", WELL_FORMED_1_0_NOT_1_1, [{
+      version: undefined,
+      expectError: false,
+    }, {
+      version: "1.0",
+      expectError: false,
+    }, {
+      version: "1.1",
+      expectError: true,
+    }]);
+
+    makeTests("with XML 1.0 declaration", XML_1_0_DECLARATION,
+              WELL_FORMED_1_0_NOT_1_1, [{
+                version: undefined,
+                expectError: false,
+              }, {
+                version: "1.0",
+                expectError: false,
+              }, {
+                version: "1.1",
+                // The XML declaration overrides defaultXMLVersion.
+                expectError: false,
+              }]);
+
+    makeTests("with XML 1.1 declaration", XML_1_1_DECLARATION,
+              WELL_FORMED_1_0_NOT_1_1, [{
+                version: undefined,
+                // The XML declaration overrides defaultXMLVersion.
+                expectError: true,
+              }, {
+                version: "1.0",
+                // The XML declaration overrides defaultXMLVersion.
+                expectError: true,
+              }, {
+                version: "1.1",
+                expectError: true,
+              }]);
+  });
+
+  describe("well-formed for 1.1, not 1.0", () => {
+    makeTests("without XML declaration", "", WELL_FORMED_1_1_NOT_1_0, [{
+      version: undefined,
+      expectError: true,
+    }, {
+      version: "1.0",
+      expectError: true,
+    }, {
+      version: "1.1",
+      expectError: false,
+    }]);
+
+    makeTests("with XML 1.0 declaration", XML_1_0_DECLARATION,
+              WELL_FORMED_1_1_NOT_1_0, [{
+                version: undefined,
+                expectError: true,
+              }, {
+                version: "1.0",
+                expectError: true,
+              }, {
+                version: "1.1",
+                // The XML declaration overrides defaultXMLVersion.
+                expectError: true,
+              }]);
+
+    makeTests("with XML 1.1 declaration", XML_1_1_DECLARATION,
+              WELL_FORMED_1_1_NOT_1_0, [{
+                version: undefined,
+                // The XML declaration overrides defaultXMLVersion.
+                expectError: false,
+              }, {
+                version: "1.0",
+                // The XML declaration overrides defaultXMLVersion.
+                expectError: false,
+              }, {
+                version: "1.1",
+                expectError: false,
+              }]);
+  });
 });