diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f0bdc720..d64fdfc2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -79,3 +79,38 @@ You should run it before you make your change and after. Make sure you run it when your CPU is relatively idle, and run it multiple times to get a valid result. If the code takes 10 times longer with your change, you know you have a problem. Address it before submitting your PR. + +In general, we want both elegance and performance. However, in those cases where +one must be sacrificed for the other, **we are willing to sacrifice elegance in +favor of performance.** A good example of this is how saxes handles the +normalization of end-of-line (EOL) characters to newlines (NL). In September +2019 I (lddubeau) discovered that saxes was not doing the normalization +correctly. I came up with two fixes: + +1. One fix modified the ``write`` method to split chunks on all EOL characters + except NL, and would normalize those characters to NL early on. It would also + adjust positioning data to handle skipping ``\r`` in the ``\r\n`` sequence, + etc. It performed excellently with files containing only ``\n`` but it + performed terribly with files that needed normalization. **However**, a file + with ``\n`` as EOL would process half as fast if its EOL characters were + replaced with ``\r\n`` prior to parsing. The performance for files using + anything else than ``\n`` for EOL was atrocious. + + This approach was the more elegant of the two approaches mentioned here + because it made all the logic normalizing EOL characters localized to one + spot in the code. It was also the least error-prone approach, for the same + reason. Adding a new ``captureSomething`` method would not run the risk of + forgetting to handle EOL characters properly. + +2. Another fix took the approach of recording state while running ``getCode`` + and using this state in all places where substrings are extracted from chunks + to handle the presence of EOL characters. This fix performs about as fast as + the other fix described above for files that contain ``\n`` as EOL, and maybe + 10-20% slower when a file contains ``\r\n``. + + This approach is definitely not as elegant as the first. It spreads the logic + for handling EOL into multiple locations, and there's a risk of forgetting to + add the proper logic when adding a new ``captureSomething`` method. + +Of the two approaches, the 2nd one was the one selected for posterity. Though +the first method was more elegant, its performance was unacceptable. diff --git a/README.md b/README.md index a55dfc1e..845d04fc 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,15 @@ namespaces is onerous. Note that you can use `additionalNamespaces` and `resolvePrefix` together if you want. `additionalNamespaces` applies before `resolvePrefix`. +### Performance Tips + +* saxes works faster on files that use newlines (``\u000A``) as end of line + markers than files that use other end of line markers (like ``\r`` or + ``\r\n``). The XML specification requires that conformant applications behave + as if all characters that are to be treated as end of line characters are + converted to ``\u000A`` prior to parsing. The optimal code path for saxes is a + file in which all end of line characters are already ``\u000A``. + ## FAQ Q. Why has saxes dropped support for limiting the size of data chunks passed to diff --git a/lib/saxes.js b/lib/saxes.js index 81ca3f5c..76ea7479 100644 --- a/lib/saxes.js +++ b/lib/saxes.js @@ -335,6 +335,7 @@ class SaxesParser { this.chunkPosition = 0; this.i = 0; this.trailingCR = false; + this.originalNL = true; this.forbiddenState = FORBIDDEN_START; /** * A map of entity name to expansion. @@ -562,20 +563,22 @@ class SaxesParser { // isn't. (There may be Node-specific code that would perform faster than // ``Array.from`` but don't want to be dependent on Node.) - let limit = chunk.length; - if (this.trailingCR) { // The previous chunk had a trailing cr. We need to handle it now. chunk = `\r${chunk}`; + this.trailingCR = false; } - if (!end && chunk[limit - 1] === CR) { + let limit = chunk.length; + if (!end && chunk[limit - 1] === "\r") { // The chunk ends with a trailing CR. We cannot know how to handle it // until we get the next chunk or the end of the stream. So save it for // later. limit--; + chunk = chunk.substring(0, limit); this.trailingCR = true; } + this.chunk = chunk; this.i = 0; while (this.i < limit) { @@ -596,6 +599,13 @@ class SaxesParser { return this.write(null); } + /** @private */ + newline(originalNL) { + this.originalNL = originalNL; + this.line++; + this.positionAtNewLine = this.position; + } + /** * Get a single code point out of the current chunk. This updates the current * position if we do position tracking. @@ -621,21 +631,25 @@ class SaxesParser { } switch (code) { + case NL: + this.newline(true); + return NL; case CR: // We may get NaN if we read past the end of the chunk, which is fine. if (chunk.charCodeAt(i + 1) === NL) { // A \r\n sequence is converted to \n so we have to skip over the next // character. We already know it has a size of 1 so ++ is fine here. this.i = i + 2; + this.skipTwo = true; + } + else { + this.skipTwo = false; } // Otherwise, a \r is just converted to \n, so we don't have to skip // ahead. // In either case, \r becomes \n. - /* yes, fall through */ - case NL: - this.line++; - this.positionAtNewLine = this.position; + this.newline(false); return NL; default: // If we get here, then code < SPACE and it is not NL CR or TAB. @@ -694,11 +708,15 @@ class SaxesParser { // read this.i again, which is a bit faster. this.i = i + 1; if (code < 0xD800) { - if ((code > 0x1F && code < 0x7F) || code > 0x9F || code === TAB) { + if ((code > 0x1F && code < 0x7F) || (code > 0x9F && code !== LS) || + code === TAB) { return code; } switch (code) { + case NL: // 0xA + this.newline(true); + return NL; case CR: { // 0xD // We may get NaN if we read past the end of the chunk, which is // fine. @@ -707,15 +725,17 @@ class SaxesParser { // A CR NL or CR NEL sequence is converted to NL so we have to skip over // the next character. We already know it has a size of 1. this.i = i + 2; + this.skipTwo = true; + } + else { + this.skipTwo = false; } // Otherwise, a CR is just converted to NL, no skip. } /* yes, fall through */ - case NL: // 0xA case NEL: // 0x85 case LS: // Ox2028 - this.line++; - this.positionAtNewLine = this.position; + this.newline(false); return NL; default: this.fail("disallowed character."); @@ -763,6 +783,19 @@ class SaxesParser { * ``false`` otherwise. */ + /** + * @private + */ + handleEOL(buffer, chunk, start) { + if (this.originalNL) { + return start; + } + + const { i } = this; + this[buffer] += `${chunk.substring(start, i - (this.skipTwo ? 2 : 1))}\n`; + return i; + } + /** * Capture characters into a buffer until encountering one of a set of * characters. @@ -778,18 +811,22 @@ class SaxesParser { * ``undefined`` if we hit the end of the chunk. */ captureTo(chars, buffer) { - const { chunk, i: start } = this; + let { i: start } = this; + const { chunk } = this; while (true) { const c = this.getCode(); - if (chars.includes(c)) { - this[buffer] += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2)); - return c; + if (c === NL) { + start = this.handleEOL(buffer, chunk, start); } - - if (c === undefined) { + else if (c === undefined) { this[buffer] += chunk.substring(start); return undefined; } + + if (chars.includes(c)) { + this[buffer] += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2)); + return c; + } } } @@ -806,18 +843,22 @@ class SaxesParser { * into the end of the current chunk. */ captureToChar(char, buffer) { - const { chunk, i: start } = this; + let { i: start } = this; + const { chunk } = this; while (true) { const c = this.getCode(); - if (c === char) { - this[buffer] += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2)); - return true; + if (c === NL) { + start = this.handleEOL(buffer, chunk, start); } - - if (c === undefined) { + else if (c === undefined) { this[buffer] += chunk.substring(start); return false; } + + if (c === char) { + this[buffer] += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2)); + return true; + } } } @@ -839,6 +880,7 @@ class SaxesParser { return undefined; } + // NL is not a name char so we don't have to test specifically for it. if (!isNameChar(c)) { this.name += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2)); return c; @@ -866,6 +908,8 @@ class SaxesParser { return undefined; } + // NL cannot satisfy this.nameCheck so we don't have to test + // specifically for it. if (!this.nameCheck(c)) { this[buffer] += chunk.substring(start, this.i - (c <= 0xFFFF ? 1 : 2)); return c; @@ -980,17 +1024,12 @@ class SaxesParser { // Since we are using a specialized loop, we also keep track of the presence // of ]]> in text data. The sequence ]]> is forbidden to appear as-is. // - const { chunk, i: start } = this; - let { forbiddenState } = this; + let { i: start, forbiddenState } = this; + const { chunk } = this; // eslint-disable-next-line no-labels, no-restricted-syntax scanLoop: while (true) { - const code = this.getCode(); - if (code === undefined) { - this.text += chunk.substring(start); - break; - } - switch (code) { + switch (this.getCode()) { case LESS: this.state = S_OPEN_WAKA; this.text += chunk.substring(start, this.i - 1); @@ -1024,6 +1063,14 @@ class SaxesParser { } forbiddenState = FORBIDDEN_START; break; + case NL: + start = this.handleEOL("text", chunk, start); + forbiddenState = FORBIDDEN_START; + break; + case undefined: + this.text += chunk.substring(start); + // eslint-disable-next-line no-labels + break scanLoop; default: forbiddenState = FORBIDDEN_START; } @@ -1037,16 +1084,13 @@ class SaxesParser { // for a specialized task. We keep track of the presence of non-space // characters in the text since these are errors when appearing outside the // document root element. - const { chunk, i: start } = this; + let { i: start } = this; + const { chunk } = this; let nonSpace = false; // eslint-disable-next-line no-labels, no-restricted-syntax outRootLoop: while (true) { const code = this.getCode(); - if (code === undefined) { - this.text += chunk.substring(start); - break; - } switch (code) { case LESS: this.state = S_OPEN_WAKA; @@ -1060,6 +1104,14 @@ class SaxesParser { nonSpace = true; // eslint-disable-next-line no-labels break outRootLoop; + case NL: + start = this.handleEOL("text", chunk, start); + // eslint-disable-next-line no-labels + break; + case undefined: + this.text += chunk.substring(start); + // eslint-disable-next-line no-labels + break outRootLoop; default: if (!isS(code)) { nonSpace = true; @@ -1109,7 +1161,7 @@ class SaxesParser { this.state = S_PI_FIRST_CHAR; break; default: - this.fail("disallowed character in tag name."); + this.fail("disallowed character in tag name"); this.state = S_TEXT; this.xmlDeclPossible = false; } @@ -1800,14 +1852,19 @@ class SaxesParser { // We deliberately do not use captureTo here. The specialized code we use // here is faster than using captureTo. const { q } = this; - const { chunk, i: start } = this; + let { i: start } = this; + const { chunk } = this; while (true) { const code = this.getCode(); if (code === undefined) { this.text += chunk.substring(start); return; } - if (code === q || code === AMP || code === LESS) { + + if (code === NL) { + start = this.handleEOL("text", chunk, start); + } + else if (code === q || code === AMP || code === LESS) { const slice = chunk.substring(start, this.i - 1); switch (code) { case q: