From 48dfcf507ffe210ee53aa945e237ac9b1a9c7eaf Mon Sep 17 00:00:00 2001
From: Chad Norvell
Date: Wed, 18 Dec 2024 16:56:32 -0800
Subject: [PATCH] pw_tokenizer: Fix CSV database parsing in TS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This replaces the custom CSV parser with a more robust library that
should handle variations in file format better. This change also adds
support for 4 column token databases and removal dates.

Change-Id: Ia71da392a6eec4c3bb5a97b8fac0efbdbecf1734
Bug: 379172909
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/251892
Reviewed-by: Wyatt Hepler
Commit-Queue: Chad Norvell
Lint: Lint 🤖
Docs-Not-Needed: Chad Norvell
---
 package-lock.json                   |  18 ++++
 package.json                        |   2 +
 pw_tokenizer/ts/detokenizer_test.ts |  94 +++++++++++--------
 pw_tokenizer/ts/parser_test.ts      |  32 +++++++
 pw_tokenizer/ts/token_database.ts   | 139 ++++++++++++++++++++++------
 5 files changed, 216 insertions(+), 69 deletions(-)
 create mode 100644 pw_tokenizer/ts/parser_test.ts

diff --git a/package-lock.json b/package-lock.json
index a8a3a622da..c7a85a7fd0 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -13,6 +13,7 @@
         "buffer": "^6.0.3",
         "google-protobuf": "^3.17.3",
         "long": "^5.2.1",
+        "papaparse": "^5.4.1",
         "ts-protoc-gen": "^0.15.0"
       },
       "bin": {
@@ -39,6 +40,7 @@
         "@types/jest": "^28.1.4",
         "@types/mocha": "^10.0.6",
         "@types/node": "^22.8.4",
+        "@types/papaparse": "^5.3.15",
         "@types/react": "^17.0.14",
         "@types/react-dom": "^17.0.9",
         "@typescript-eslint/eslint-plugin": "^5.59.7",
@@ -3137,6 +3139,16 @@
         "undici-types": "~6.19.8"
       }
     },
+    "node_modules/@types/papaparse": {
+      "version": "5.3.15",
+      "resolved": "https://registry.npmjs.org/@types/papaparse/-/papaparse-5.3.15.tgz",
+      "integrity": "sha512-JHe6vF6x/8Z85nCX4yFdDslN11d+1pr12E526X8WAfhadOeaOTx5AuIkvDKIBopfvlzpzkdMx4YyvSKCM9oqtw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@types/node": "*"
+      }
+    },
     "node_modules/@types/parse5": {
       "version": "6.0.3",
       "resolved": "https://registry.npmjs.org/@types/parse5/-/parse5-6.0.3.tgz",
@@ -11614,6 +11626,12 @@
         "node": ">= 14"
       }
     },
+    "node_modules/papaparse": {
+      "version": "5.4.1",
+      "resolved": "https://registry.npmjs.org/papaparse/-/papaparse-5.4.1.tgz",
+      "integrity": "sha512-HipMsgJkZu8br23pW15uvo6sib6wne/4woLZPlFf3rpDyMe9ywEXUsuD7+6K9PRkJlVT51j/sCOYDKGGS3ZJrw==",
+      "license": "MIT"
+    },
     "node_modules/parent-module": {
       "version": "1.0.1",
       "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz",
diff --git a/package.json b/package.json
index 837e088652..3da293655e 100644
--- a/package.json
+++ b/package.json
@@ -38,6 +38,7 @@
     "@types/google-protobuf": "^3.15.5",
     "@types/jest": "^28.1.4",
     "@types/node": "^22.8.4",
+    "@types/papaparse": "^5.3.15",
     "@types/react": "^17.0.14",
     "@types/react-dom": "^17.0.9",
     "@typescript-eslint/eslint-plugin": "^5.59.7",
@@ -100,6 +101,7 @@
     "buffer": "^6.0.3",
     "google-protobuf": "^3.17.3",
     "long": "^5.2.1",
+    "papaparse": "^5.4.1",
     "ts-protoc-gen": "^0.15.0"
   },
   "config": {
diff --git a/pw_tokenizer/ts/detokenizer_test.ts b/pw_tokenizer/ts/detokenizer_test.ts
index 0340f260ef..52d59d112c 100644
--- a/pw_tokenizer/ts/detokenizer_test.ts
+++ b/pw_tokenizer/ts/detokenizer_test.ts
@@ -17,7 +17,7 @@
 import { Frame, Encoder, Decoder } from 'pigweedjs/pw_hdlc';
 import { Detokenizer } from './detokenizer';
 
-const CSV = `
+const CSV3Col = `
 64636261, ,"regular token"
 86fc33f3, ,"base64 token"
 0d6bd33c, ,"Regular Token: %s and Nested Token: %s"
@@ -24,7 +24,15 @@
 97185e6f, ,"(token: %s, string: %s, int: %d, float: %f)"
451d86ed, ,"Cat" `; +const CSV4Col = ` +64636261, ,"foo","regular token" +86fc33f3, ,"bar","base64 token" +0d6bd33c, ,"baz","Regular Token: %s and Nested Token: %s" +97185e6f, ,"","(token: %s, string: %s, int: %d, float: %f)" +451d86ed, ,"","Cat" +`; + function generateFrame(text: string): Frame { const uintArray = new TextEncoder().encode(text); const encodedFrame = new Encoder().uiFrame(1, uintArray); @@ -32,49 +40,53 @@ function generateFrame(text: string): Frame { return decodedFrames[0]; } -describe('Detokenizer', () => { - let detokenizer: Detokenizer; +const generateTests = (description: string, csv: string) => + describe(description, () => { + let detokenizer: Detokenizer; - beforeEach(() => { - detokenizer = new Detokenizer(CSV); - }); + beforeEach(() => { + detokenizer = new Detokenizer(csv); + }); - it('parses a base64 correct frame properly', () => { - const frame = generateFrame('$8zP8hg=='); - expect(detokenizer.detokenizeBase64(frame)).toEqual('base64 token'); - }); - it('parses a correct frame properly', () => { - const frame = generateFrame('abcde'); - expect(detokenizer.detokenize(frame)).toEqual('regular token'); - }); - it('failure to detokenize returns original string', () => { - expect(detokenizer.detokenize(generateFrame('aabbcc'))).toEqual('aabbcc'); - expect(detokenizer.detokenizeBase64(generateFrame('$8zP7hg=='))).toEqual( - '$8zP7hg==', - ); - }); - it('recursive detokenize all nested base64 tokens', () => { - expect( - detokenizer.detokenizeBase64( - generateFrame( - '$PNNrDQkkN1lZZFJRPT0lJGIxNFlsd2trTjFsWlpGSlJQVDBGUTJGdFpXeFlwSENkUHc9PQ==', + it('parses a base64 correct frame properly', () => { + const frame = generateFrame('$8zP8hg=='); + expect(detokenizer.detokenizeBase64(frame)).toEqual('base64 token'); + }); + it('parses a correct frame properly', () => { + const frame = generateFrame('abcde'); + expect(detokenizer.detokenize(frame)).toEqual('regular token'); + }); + it('failure to detokenize returns original string', () => { + expect(detokenizer.detokenize(generateFrame('aabbcc'))).toEqual('aabbcc'); + expect(detokenizer.detokenizeBase64(generateFrame('$8zP7hg=='))).toEqual( + '$8zP7hg==', + ); + }); + it('recursive detokenize all nested base64 tokens', () => { + expect( + detokenizer.detokenizeBase64( + generateFrame( + '$PNNrDQkkN1lZZFJRPT0lJGIxNFlsd2trTjFsWlpGSlJQVDBGUTJGdFpXeFlwSENkUHc9PQ==', + ), ), - ), - ).toEqual( - 'Regular Token: Cat and Nested Token: (token: Cat, string: Camel, int: 44, float: 1.2300000190734863)', - ); - }); + ).toEqual( + 'Regular Token: Cat and Nested Token: (token: Cat, string: Camel, int: 44, float: 1.2300000190734863)', + ); + }); - it('recursion detokenize with limits on max recursion', () => { - expect( - detokenizer.detokenizeBase64( - generateFrame( - '$PNNrDQkkN1lZZFJRPT0lJGIxNFlsd2trTjFsWlpGSlJQVDBGUTJGdFpXeFlwSENkUHc9PQ==', + it('recursion detokenize with limits on max recursion', () => { + expect( + detokenizer.detokenizeBase64( + generateFrame( + '$PNNrDQkkN1lZZFJRPT0lJGIxNFlsd2trTjFsWlpGSlJQVDBGUTJGdFpXeFlwSENkUHc9PQ==', + ), + 1, ), - 1, - ), - ).toEqual( - 'Regular Token: Cat and Nested Token: (token: $7YYdRQ==, string: Camel, int: 44, float: 1.2300000190734863)', - ); + ).toEqual( + 'Regular Token: Cat and Nested Token: (token: $7YYdRQ==, string: Camel, int: 44, float: 1.2300000190734863)', + ); + }); }); -}); + +generateTests('Detokenize with 3 column database', CSV3Col); +generateTests('Detokenize with 4 column database', CSV4Col); diff --git a/pw_tokenizer/ts/parser_test.ts 
new file mode 100644
index 0000000000..19fa8241d7
--- /dev/null
+++ b/pw_tokenizer/ts/parser_test.ts
@@ -0,0 +1,32 @@
+// Copyright 2022 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+/* eslint-env browser */
+
+import Papa from 'papaparse';
+import { parseCsvEntry } from './token_database';
+
+const CSV = `
+86fc33f3, ,"normal and ""quoted"" text"
+`;
+
+describe('parseCsvEntry', () => {
+  it('correctly parses double quotes', () => {
+    const parsedCsv = Papa.parse(CSV, { header: false });
+    const data = parsedCsv.data[1] as string[];
+    const results = parseCsvEntry(data);
+    expect(results).toBeDefined();
+    expect(results.template).toEqual('normal and "quoted" text');
+  });
+});
diff --git a/pw_tokenizer/ts/token_database.ts b/pw_tokenizer/ts/token_database.ts
index ab2b2f8a60..6fc047f992 100644
--- a/pw_tokenizer/ts/token_database.ts
+++ b/pw_tokenizer/ts/token_database.ts
@@ -14,11 +14,101 @@
 
 /** Parses CSV Database for easier lookups */
 
+import Papa from 'papaparse';
+
+interface TokenData {
+  token: number;
+  removalDate: Date | null;
+  domain: string;
+  template: string;
+}
+
+function parseTokenNumber(num: string, lineNumber?: number): number {
+  if (!/^[a-fA-F0-9]+$/.test(num)) {
+    // Malformed number
+    console.error(
+      new Error(
+        `TokenDatabase number ${num} ` +
+          (lineNumber !== undefined ? `at line ${lineNumber} ` : '') +
+          `is not a valid hex number`,
+      ),
+    );
+  }
+
+  try {
+    return parseInt(num, 16);
+  } catch {
+    console.error(
+      new Error(
+        `TokenDatabase number ${num} ` +
+          (lineNumber !== undefined ? `at line ${lineNumber} ` : '') +
+          `could not be parsed`,
+      ),
+    );
+  }
+}
+
+function parseRemovalDate(
+  dateString: string,
+  lineNumber?: number,
+): Date | null {
+  const dateContent = dateString.trim();
+  if (dateContent === '') return null;
+
+  try {
+    return new Date(dateContent);
+  } catch {
+    console.error(
+      new Error(
+        `TokenDatabase removal date ${dateString} ` +
+          (lineNumber !== undefined ? `at line ${lineNumber} ` : '') +
+          `could not be parsed`,
+      ),
+    );
+  }
+}
+
+export function parseCsvEntry(
+  data: string[],
+  lineNumber?: number,
+): TokenData | undefined {
+  if (data.length < 3) {
+    console.error(
+      new Error(
+        `TokenDatabase entry ${data} ` +
+          (lineNumber !== undefined ? `at line ${lineNumber} ` : '') +
+          `could not be parsed`,
+      ),
+    );
+
+    return undefined;
+  }
+
+  // Column 0: Token
+  const token = parseTokenNumber(data.shift(), lineNumber);
+
+  // Column 1: Removal date
+  const removalDate = parseRemovalDate(data.shift(), lineNumber);
+
+  // Modern 4-column databases will have the domain in this position.
+  const domain = data.length > 1 ? data.shift() : '';
+
+  // Last column: Template strings
+  const template = data.shift();
+
+  return {
+    token,
+    removalDate,
+    domain,
+    template,
+  };
+}
+
 export class TokenDatabase {
-  private tokens: Map<number, string> = new Map();
+  private tokens: Map<number, TokenData> = new Map();
 
   constructor(readonly csv: string) {
-    this.parseTokensToTokensMap(csv.split(/\r?\n/));
+    this.parseTokensToTokensMap(csv);
   }
 
   has(token: number): boolean {
@@ -26,32 +116,25 @@ export class TokenDatabase {
   }
 
   get(token: number): string | undefined {
-    return this.tokens.get(token);
-  }
-
-  private parseTokensToTokensMap(csv: string[]) {
-    for (const [lineNumber, line] of Object.entries(
-      csv.map((line) => line.split(/,/)),
-    )) {
-      if (!line[0] || !line[2]) {
-        continue;
-      }
-      if (!/^[a-fA-F0-9]+$/.test(line[0])) {
-        // Malformed number
-        console.error(
-          new Error(
-            `TokenDatabase number ${line[0]} at line ` +
-              `${lineNumber} is not a valid hex number`,
-          ),
-        );
-        continue;
-      }
-      const tokenNumber = parseInt(line[0], 16);
-      // To extract actual string value of a token number, we:
-      // - Slice token number and whitespace that are in [0] and [1] of line.
-      // - Join the rest as a string and trim the trailing quotes.
-      const data = line.slice(2).join(',').slice(1, -1);
-      this.tokens.set(tokenNumber, data);
+    return this.tokens.get(token)?.template;
+  }
+
+  private parseTokensToTokensMap(csv: string) {
+    const parsedCsv = Papa.parse(csv, { header: false });
+
+    if (parsedCsv.errors.length > 0) {
+      console.error(
+        new Error(
+          `TokenDatabase could not be parsed: ${parsedCsv.errors.join(', ')}`,
+        ),
+      );
+    }
+
+    const csvData = parsedCsv.data as string[][];
+
+    for (const [lineNumber, line] of csvData.entries()) {
+      const entry = parseCsvEntry(line, lineNumber);
+      entry && this.tokens.set(entry.token, entry);
    }
  }
}
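
For reference, a minimal sketch of the behavior this change enables. It is not
part of the patch: it assumes the updated pw_tokenizer/ts/token_database.ts is
importable from the same directory, and the CSV row is the 4-column fixture
borrowed from the tests above.

  import { TokenDatabase } from './token_database';

  // A modern 4-column entry: token, removal date, domain, template.
  // The blank second field means the token has no removal date.
  const db = new TokenDatabase('64636261, ,"foo","regular token"');

  // Lookups are keyed by the numeric token value parsed from the hex column.
  const token = parseInt('64636261', 16);
  console.log(db.has(token)); // true
  // get() returns the template column of the stored entry.
  console.log(db.get(token)); // regular token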