diff --git a/packages/node/test/offset.ts b/packages/node/test/offset.ts
index 7de5194..2cd5986 100644
--- a/packages/node/test/offset.ts
+++ b/packages/node/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "@streamparser/json/utils/types/tokenType.js";
 
 const input1 = '{\n "string": "value",\n "number": 3,\n "object"';
 const input2 = ': {\n "key": "vд"\n },\n "array": [\n -1,\n 12\n ]\n ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,7 +47,11 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
@@ -54,7 +59,7 @@ test("offset", async () => {
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);
diff --git a/packages/plainjs/dist/deno/tokenizer.ts b/packages/plainjs/dist/deno/tokenizer.ts
index b071519..2843085 100644
--- a/packages/plainjs/dist/deno/tokenizer.ts
+++ b/packages/plainjs/dist/deno/tokenizer.ts
@@ -110,6 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
+  private escapedCharsByteLength = 0;
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
 
@@ -300,6 +301,7 @@ export default class Tokenizer {
 
           if (n === charset.QUOTATION_MARK) {
             this.bufferedString.reset();
+            this.escapedCharsByteLength = 0;
             this.state = TokenizerStates.STRING_DEFAULT;
             continue;
           }
@@ -336,7 +338,10 @@ export default class Tokenizer {
               value: string,
               offset: this.offset,
             });
-            this.offset += this.bufferedString.byteLength + 1;
+            this.offset +=
+              this.escapedCharsByteLength +
+              this.bufferedString.byteLength +
+              1;
             continue;
           }
 
@@ -398,6 +403,7 @@ export default class Tokenizer {
           const controlChar = escapedSequences[n];
           if (controlChar) {
             this.bufferedString.appendChar(controlChar);
+            this.escapedCharsByteLength += 1; // the escape is 2 input bytes (e.g. \") but appends a single char
             this.state = TokenizerStates.STRING_DEFAULT;
             continue;
           }
@@ -436,32 +442,34 @@ export default class Tokenizer {
                 this.unicode + String.fromCharCode(n),
                 16,
               );
+              let unicodeString: string;
               if (this.highSurrogate === undefined) {
                 if (intVal >= 0xd800 && intVal <= 0xdbff) {
                   //<55296,56319> - highSurrogate
                   this.highSurrogate = intVal;
+                  // the pair's first \uXXXX (6 input bytes) appends nothing yet
+                  this.escapedCharsByteLength += 6;
+                  this.state = TokenizerStates.STRING_DEFAULT;
+                  continue;
                 } else {
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(String.fromCharCode(intVal)),
-                  );
+                  unicodeString = String.fromCharCode(intVal);
                 }
               } else {
                 if (intVal >= 0xdc00 && intVal <= 0xdfff) {
                   //<56320,57343> - lowSurrogate
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(
-                      String.fromCharCode(this.highSurrogate, intVal),
-                    ),
+                  unicodeString = String.fromCharCode(
+                    this.highSurrogate,
+                    intVal,
                   );
                 } else {
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(
-                      String.fromCharCode(this.highSurrogate),
-                    ),
-                  );
+                  unicodeString = String.fromCharCode(this.highSurrogate);
                 }
                 this.highSurrogate = undefined;
               }
+              const unicodeBuffer = this.encoder.encode(unicodeString);
+              this.bufferedString.appendBuf(unicodeBuffer);
+              // the escape is 6 input bytes (\uXXXX) but appends only len(unicodeBuffer)
+              this.escapedCharsByteLength += 6 - unicodeBuffer.byteLength;
               this.state = TokenizerStates.STRING_DEFAULT;
               continue;
             }
diff --git a/packages/plainjs/src/tokenizer.ts b/packages/plainjs/src/tokenizer.ts
index 7cda24c..08a9d5d 100644
--- a/packages/plainjs/src/tokenizer.ts
+++ b/packages/plainjs/src/tokenizer.ts
@@ -110,6 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
+  private escapedCharsByteLength = 0;
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
 
@@ -300,6 +301,7 @@ export default class Tokenizer {
 
           if (n === charset.QUOTATION_MARK) {
             this.bufferedString.reset();
+            this.escapedCharsByteLength = 0;
             this.state = TokenizerStates.STRING_DEFAULT;
             continue;
           }
@@ -336,7 +338,10 @@ export default class Tokenizer {
               value: string,
               offset: this.offset,
             });
-            this.offset += this.bufferedString.byteLength + 1;
+            this.offset +=
+              this.escapedCharsByteLength +
+              this.bufferedString.byteLength +
+              1;
             continue;
           }
 
@@ -398,6 +403,7 @@ export default class Tokenizer {
           const controlChar = escapedSequences[n];
           if (controlChar) {
             this.bufferedString.appendChar(controlChar);
+            this.escapedCharsByteLength += 1; // the escape is 2 input bytes (e.g. \") but appends a single char
             this.state = TokenizerStates.STRING_DEFAULT;
             continue;
           }
@@ -436,32 +442,34 @@ export default class Tokenizer {
                 this.unicode + String.fromCharCode(n),
                 16,
               );
+              let unicodeString: string;
               if (this.highSurrogate === undefined) {
                 if (intVal >= 0xd800 && intVal <= 0xdbff) {
                   //<55296,56319> - highSurrogate
                   this.highSurrogate = intVal;
+                  // the pair's first \uXXXX (6 input bytes) appends nothing yet
+                  this.escapedCharsByteLength += 6;
+                  this.state = TokenizerStates.STRING_DEFAULT;
+                  continue;
                 } else {
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(String.fromCharCode(intVal)),
-                  );
+                  unicodeString = String.fromCharCode(intVal);
                 }
               } else {
                 if (intVal >= 0xdc00 && intVal <= 0xdfff) {
                   //<56320,57343> - lowSurrogate
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(
-                      String.fromCharCode(this.highSurrogate, intVal),
-                    ),
+                  unicodeString = String.fromCharCode(
+                    this.highSurrogate,
+                    intVal,
                   );
                 } else {
-                  this.bufferedString.appendBuf(
-                    this.encoder.encode(
-                      String.fromCharCode(this.highSurrogate),
-                    ),
-                  );
+                  unicodeString = String.fromCharCode(this.highSurrogate);
                 }
                 this.highSurrogate = undefined;
               }
+              const unicodeBuffer = this.encoder.encode(unicodeString);
+              this.bufferedString.appendBuf(unicodeBuffer);
+              // the escape is 6 input bytes (\uXXXX) but appends only len(unicodeBuffer)
+              this.escapedCharsByteLength += 6 - unicodeBuffer.byteLength;
               this.state = TokenizerStates.STRING_DEFAULT;
               continue;
             }
diff --git a/packages/plainjs/test/offset.ts b/packages/plainjs/test/offset.ts
index 39fc243..2d0a236 100644
--- a/packages/plainjs/test/offset.ts
+++ b/packages/plainjs/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "../src/utils/types/tokenType.js";
 
 const input1 = '{\n "string": "value",\n "number": 3,\n "object"';
 const input2 = ': {\n "key": "vд"\n },\n "array": [\n -1,\n 12\n ]\n ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,7 +47,11 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
@@ -54,7 +59,7 @@ test("offset", async () => {
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);
diff --git a/packages/whatwg/test/offset.ts b/packages/whatwg/test/offset.ts
index 7de5194..2cd5986 100644
--- a/packages/whatwg/test/offset.ts
+++ b/packages/whatwg/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "@streamparser/json/utils/types/tokenType.js";
 
 const input1 = '{\n "string": "value",\n "number": 3,\n "object"';
 const input2 = ': {\n "key": "vд"\n },\n "array": [\n -1,\n 12\n ]\n ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,7 +47,11 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
@@ -54,7 +59,7 @@ test("offset", async () => {
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);
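
Why the bookkeeping above is needed: this.offset tracks positions in the raw input while bufferedString holds the decoded value, and every escape sequence occupies more bytes in the input than it contributes to the decoded string. escapedCharsByteLength accumulates exactly that difference so the offset can be advanced correctly past the closing quote. A minimal standalone sketch of the arithmetic, illustrative only and not part of the patch (the escapeOverhead helper is hypothetical):

    // Sketch: extra raw-input bytes an escape consumes compared to the
    // UTF-8 bytes actually appended to the decoded string.
    const encoder = new TextEncoder();

    // rawByteLength: bytes the escape occupies in the raw input.
    // decoded: the text the tokenizer appends to bufferedString.
    function escapeOverhead(rawByteLength: number, decoded: string): number {
      return rawByteLength - encoder.encode(decoded).byteLength;
    }

    console.log(escapeOverhead(2, '"'));             // \"      -> 2 - 1 = 1
    console.log(escapeOverhead(6, "\u00e1"));        // \u00e1  -> 6 - 2 = 4
    console.log(escapeOverhead(12, "\ud83d\ude00")); // \uD83D\uDE00 surrogate pair -> 12 - 4 = 8

For input4 this yields 1 + 4 = 5 extra bytes, so the string token spans 1 (opening quote) + 2 (\") + 6 (\u00e1) + 1 (closing quote) = 10 raw bytes from offset 169, which is why the expected offsets place the closing brace at 180. The surrogate-pair case (8 extra bytes spread over two escapes) is also why the high-surrogate branch counts its 6 input bytes when it defers appending until the pair completes.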