pw_tokenizer: Fix CSV database parsing in TS
This replaces the custom CSV parser with a more robust library that
should better handle variations in file format. The change also adds
support for 4-column token databases and removal dates.

Change-Id: Ia71da392a6eec4c3bb5a97b8fac0efbdbecf1734
Bug: 379172909
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/251892
Reviewed-by: Wyatt Hepler <hepler@google.com>
Commit-Queue: Chad Norvell <chadnorvell@google.com>
Lint: Lint 🤖 <android-build-ayeaye@system.gserviceaccount.com>
Docs-Not-Needed: Chad Norvell <chadnorvell@google.com>
chadnorvell authored and CQ Bot Account committed Dec 19, 2024
1 parent 935559e commit 48dfcf5
Showing 5 changed files with 216 additions and 69 deletions.
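
For context, the two database layouts the new parser accepts look like this; a minimal sketch, assuming the pigweedjs import path (the token, domain, and date values are illustrative):

import { Detokenizer } from 'pigweedjs/pw_tokenizer';

// 3-column layout: token, removal date, template
const csv3 = '451d86ed, ,"Cat"';

// 4-column layout: token, removal date, domain, template
const csv4 = '451d86ed,2024-12-19,"my_domain","Cat"';

// Either format now constructs a working Detokenizer:
const detokenizer = new Detokenizer(csv4);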
18 changes: 18 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions package.json
@@ -38,6 +38,7 @@
"@types/google-protobuf": "^3.15.5",
"@types/jest": "^28.1.4",
"@types/node": "^22.8.4",
"@types/papaparse": "^5.3.15",
"@types/react": "^17.0.14",
"@types/react-dom": "^17.0.9",
"@typescript-eslint/eslint-plugin": "^5.59.7",
@@ -100,6 +101,7 @@
"buffer": "^6.0.3",
"google-protobuf": "^3.17.3",
"long": "^5.2.1",
"papaparse": "^5.4.1",
"ts-protoc-gen": "^0.15.0"
},
"config": {
94 changes: 53 additions & 41 deletions pw_tokenizer/ts/detokenizer_test.ts
@@ -17,64 +17,76 @@
import { Frame, Encoder, Decoder } from 'pigweedjs/pw_hdlc';
import { Detokenizer } from './detokenizer';

const CSV = `
const CSV3Col = `
64636261, ,"regular token"
86fc33f3, ,"base64 token"
0d6bd33c, ,"Regular Token: %s and Nested Token: %s"
97185e6f, ,"(token: %s, string: %s, int: %d, float: %f)"
451d86ed, ,"Cat"
`;

const CSV4Col = `
64636261, ,"foo","regular token"
86fc33f3, ,"bar","base64 token"
0d6bd33c, ,"baz","Regular Token: %s and Nested Token: %s"
97185e6f, ,"","(token: %s, string: %s, int: %d, float: %f)"
451d86ed, ,"","Cat"
`;

function generateFrame(text: string): Frame {
const uintArray = new TextEncoder().encode(text);
const encodedFrame = new Encoder().uiFrame(1, uintArray);
const decodedFrames = Array.from(new Decoder().process(encodedFrame));
return decodedFrames[0];
}

describe('Detokenizer', () => {
let detokenizer: Detokenizer;
const generateTests = (description: string, csv: string) =>
describe(description, () => {
let detokenizer: Detokenizer;

beforeEach(() => {
detokenizer = new Detokenizer(CSV);
});
beforeEach(() => {
detokenizer = new Detokenizer(csv);
});

it('parses a base64 correct frame properly', () => {
const frame = generateFrame('$8zP8hg==');
expect(detokenizer.detokenizeBase64(frame)).toEqual('base64 token');
});
it('parses a correct frame properly', () => {
const frame = generateFrame('abcde');
expect(detokenizer.detokenize(frame)).toEqual('regular token');
});
it('failure to detokenize returns original string', () => {
expect(detokenizer.detokenize(generateFrame('aabbcc'))).toEqual('aabbcc');
expect(detokenizer.detokenizeBase64(generateFrame('$8zP7hg=='))).toEqual(
'$8zP7hg==',
);
});
it('recursive detokenize all nested base64 tokens', () => {
expect(
detokenizer.detokenizeBase64(
generateFrame(
'$PNNrDQkkN1lZZFJRPT0lJGIxNFlsd2trTjFsWlpGSlJQVDBGUTJGdFpXeFlwSENkUHc9PQ==',
it('parses a base64 correct frame properly', () => {
const frame = generateFrame('$8zP8hg==');
expect(detokenizer.detokenizeBase64(frame)).toEqual('base64 token');
});
it('parses a correct frame properly', () => {
const frame = generateFrame('abcde');
expect(detokenizer.detokenize(frame)).toEqual('regular token');
});
it('failure to detokenize returns original string', () => {
expect(detokenizer.detokenize(generateFrame('aabbcc'))).toEqual('aabbcc');
expect(detokenizer.detokenizeBase64(generateFrame('$8zP7hg=='))).toEqual(
'$8zP7hg==',
);
});
it('recursive detokenize all nested base64 tokens', () => {
expect(
detokenizer.detokenizeBase64(
generateFrame(
'$PNNrDQkkN1lZZFJRPT0lJGIxNFlsd2trTjFsWlpGSlJQVDBGUTJGdFpXeFlwSENkUHc9PQ==',
),
),
),
).toEqual(
'Regular Token: Cat and Nested Token: (token: Cat, string: Camel, int: 44, float: 1.2300000190734863)',
);
});
).toEqual(
'Regular Token: Cat and Nested Token: (token: Cat, string: Camel, int: 44, float: 1.2300000190734863)',
);
});

it('recursion detokenize with limits on max recursion', () => {
expect(
detokenizer.detokenizeBase64(
generateFrame(
'$PNNrDQkkN1lZZFJRPT0lJGIxNFlsd2trTjFsWlpGSlJQVDBGUTJGdFpXeFlwSENkUHc9PQ==',
it('recursion detokenize with limits on max recursion', () => {
expect(
detokenizer.detokenizeBase64(
generateFrame(
'$PNNrDQkkN1lZZFJRPT0lJGIxNFlsd2trTjFsWlpGSlJQVDBGUTJGdFpXeFlwSENkUHc9PQ==',
),
1,
),
1,
),
).toEqual(
'Regular Token: Cat and Nested Token: (token: $7YYdRQ==, string: Camel, int: 44, float: 1.2300000190734863)',
);
).toEqual(
'Regular Token: Cat and Nested Token: (token: $7YYdRQ==, string: Camel, int: 44, float: 1.2300000190734863)',
);
});
});
});

generateTests('Detokenize with 3 column database', CSV3Col);
generateTests('Detokenize with 4 column database', CSV4Col);
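
The old parser split each line on commas, which broke templates containing commas or escaped quotes; the parser test below exercises exactly that. A quick sketch of the difference, using an illustrative line and Papa Parse's default config:

import Papa from 'papaparse';

const line = '64636261, ,"value: %d, name: %s"';

// Naive splitting tears the quoted template apart:
line.split(/,/);
// -> ['64636261', ' ', '"value: %d', ' name: %s"']

// Papa Parse respects the quoting and yields a single template field:
Papa.parse(line).data[0];
// -> ['64636261', ' ', 'value: %d, name: %s']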
32 changes: 32 additions & 0 deletions pw_tokenizer/ts/parser_test.ts
@@ -0,0 +1,32 @@
// Copyright 2022 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

/* eslint-env browser */

import Papa from 'papaparse';
import { parseCsvEntry } from './token_database';

const CSV = `
86fc33f3, ,"normal and ""quoted"" text"
`;

describe('parseCsvEntry', () => {
it('correctly parses double quotes', () => {
const parsedCsv = Papa.parse(CSV, { header: false });
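// The fixture begins with a newline, so row 0 is empty; the entry is row 1.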
const data = parsedCsv.data[1] as string[];
const results = parseCsvEntry(data);
expect(results).toBeDefined();
expect(results.template).toEqual('normal and "quoted" text');
});
});
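
For reference, a 4-column entry flows through the same helper; this sketch (the domain and date values are illustrative) shows the resulting TokenData fields:

import Papa from 'papaparse';
import { parseCsvEntry } from './token_database';

const row = Papa.parse('451d86ed,2024-12-19,"my_domain","Cat"')
  .data[0] as string[];
const entry = parseCsvEntry(row);
// entry?.token === 0x451d86ed
// entry?.removalDate?.getFullYear() === 2024
// entry?.domain === 'my_domain'
// entry?.template === 'Cat'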
139 changes: 111 additions & 28 deletions pw_tokenizer/ts/token_database.ts
@@ -14,44 +14,127 @@

/** Parses a CSV token database for easier lookups. */

import Papa from 'papaparse';

interface TokenData {
token: number;
removalDate: Date | null;
domain: string;
template: string;
}

function parseTokenNumber(num: string, lineNumber?: number): number {
if (!/^[a-fA-F0-9]+$/.test(num)) {
// Malformed number
console.error(
new Error(
`TokenDatabase number ${num} ` +
(lineNumber !== undefined ? `at line ${lineNumber} ` : '') +
`is not a valid hex number`,
),
);
}

try {
return parseInt(num, 16);
} catch {
console.error(
new Error(
`TokenDatabase number ${num} ` +
(lineNumber !== undefined ? `at line ${lineNumber} ` : '') +
`could not be parsed`,
),
);
// Ensure all code paths return a value.
return NaN;
}
}

function parseRemovalDate(
dateString: string,
lineNumber?: number,
): Date | null {
const dateContent = dateString.trim();
if (dateContent === '') return null;

try {
return new Date(dateContent);
} catch {
console.error(
new Error(
`TokenDatabase removal date ${dateString} ` +
(lineNumber !== undefined ? `at line ${lineNumber} ` : '') +
`could not be parsed`,
),
);
return null;
}
}

export function parseCsvEntry(
data: string[],
lineNumber?: number,
): TokenData | undefined {
if (data.length < 3) {
console.error(
new Error(
`TokenDatabase entry ${data} ` +
(lineNumber !== undefined ? `at line ${lineNumber} ` : '') +
`could not be parsed`,
),
);

return undefined;
}

// Column 0: Token. The length check above guarantees shift() returns a value.
const token = parseTokenNumber(data.shift()!, lineNumber);

// Column 1: Removal date
const removalDate = parseRemovalDate(data.shift()!, lineNumber);

// Modern 4-column databases will have the domain in this position.
const domain = data.length > 1 ? data.shift()! : '';

// Last column: Template string
const template = data.shift()!;

return {
token,
removalDate,
domain,
template,
};
}

export class TokenDatabase {
private tokens: Map<number, string> = new Map();
private tokens: Map<number, TokenData> = new Map();

constructor(readonly csv: string) {
this.parseTokensToTokensMap(csv.split(/\r?\n/));
this.parseTokensToTokensMap(csv);
}

has(token: number): boolean {
return this.tokens.has(token);
}

get(token: number): string | undefined {
return this.tokens.get(token);
}

private parseTokensToTokensMap(csv: string[]) {
for (const [lineNumber, line] of Object.entries(
csv.map((line) => line.split(/,/)),
)) {
if (!line[0] || !line[2]) {
continue;
}
if (!/^[a-fA-F0-9]+$/.test(line[0])) {
// Malformed number
console.error(
new Error(
`TokenDatabase number ${line[0]} at line ` +
`${lineNumber} is not a valid hex number`,
),
);
continue;
}
const tokenNumber = parseInt(line[0], 16);
// To extract actual string value of a token number, we:
// - Slice token number and whitespace that are in [0] and [1] of line.
// - Join the rest as a string and trim the trailing quotes.
const data = line.slice(2).join(',').slice(1, -1);
this.tokens.set(tokenNumber, data);
return this.tokens.get(token)?.template;
}

private parseTokensToTokensMap(csv: string) {
const parsedCsv = Papa.parse(csv, { header: false });

if (parsedCsv.errors.length > 0) {
console.error(
new Error(
`TokenDatabase could not be parsed: ${parsedCsv.errors
.map((e) => e.message)
.join(', ')}`,
),
);
}

const csvData = parsedCsv.data as string[][];

for (const [lineNumber, line] of csvData.entries()) {
const entry = parseCsvEntry(line, lineNumber);
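// Entries that failed to parse are logged above and skipped here.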
entry && this.tokens.set(entry.token, entry);
}
}
}
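
Putting the pieces together, a minimal usage sketch of the new class (the token value mirrors the test fixtures; lookups still return the template string):

const db = new TokenDatabase('451d86ed, ,"Cat"');
db.has(0x451d86ed); // true
db.get(0x451d86ed); // 'Cat'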
