From f9409e3fcf4452fe2a5005d25eaec4dbc0b8aeec Mon Sep 17 00:00:00 2001 From: Christian Fehmer Date: Wed, 25 Sep 2024 13:49:23 +0200 Subject: [PATCH] fix: handling of characters outside the BMP (@fehmer) (#5911) Handle multi-byte characters outside the [basic multilingual plane](https://en.wikipedia.org/wiki/Plane_(Unicode)) correctly. Fixes #5906 --- frontend/__tests__/utils/strings.spec.ts | 12 +++++++ frontend/src/ts/test/caret.ts | 5 +-- frontend/src/ts/test/test-ui.ts | 43 +++++++++++++----------- frontend/src/ts/utils/strings.ts | 15 +++++++++ 4 files changed, 54 insertions(+), 21 deletions(-) create mode 100644 frontend/__tests__/utils/strings.spec.ts diff --git a/frontend/__tests__/utils/strings.spec.ts b/frontend/__tests__/utils/strings.spec.ts new file mode 100644 index 000000000000..c44e07c6145e --- /dev/null +++ b/frontend/__tests__/utils/strings.spec.ts @@ -0,0 +1,12 @@ +import * as Strings from "../../src/ts/utils/strings"; + +describe("string utils", () => { + describe("splitIntoCharacters", () => { + it("splits regular characters", () => { + expect(Strings.splitIntoCharacters("abc")).toEqual(["a", "b", "c"]); + }); + it("splits characters outside of the bmp", () => { + expect(Strings.splitIntoCharacters("t𐑩e")).toEqual(["t", "𐑩", "e"]); + }); + }); +}); diff --git a/frontend/src/ts/test/caret.ts b/frontend/src/ts/test/caret.ts index 2c73cbfe5bde..40108f7d39aa 100644 --- a/frontend/src/ts/test/caret.ts +++ b/frontend/src/ts/test/caret.ts @@ -6,6 +6,7 @@ import * as TestState from "../test/test-state"; import * as TestWords from "./test-words"; import { prefersReducedMotion } from "../utils/misc"; import { convertRemToPixels } from "../utils/numbers"; +import { splitIntoCharacters } from "../utils/strings"; export let caretAnimating = true; const caret = document.querySelector("#caret") as HTMLElement; @@ -133,8 +134,8 @@ export async function updatePosition(noAnim = false): Promise { Config.caretStyle ); - let wordLen = 
TestWords.words.getCurrent().length; - const inputLen = TestInput.input.current.length; + let wordLen = splitIntoCharacters(TestWords.words.getCurrent()).length; + const inputLen = splitIntoCharacters(TestInput.input.current).length; if (Config.mode === "zen") wordLen = inputLen; const activeWordEl = document?.querySelector("#words .active") as HTMLElement; //insert temporary character so the caret will work in zen mode diff --git a/frontend/src/ts/test/test-ui.ts b/frontend/src/ts/test/test-ui.ts index 5a0b825ff453..ce717aa68f5e 100644 --- a/frontend/src/ts/test/test-ui.ts +++ b/frontend/src/ts/test/test-ui.ts @@ -41,13 +41,14 @@ function createHintsHtml( activeWordLetters: NodeListOf, inputWord: string ): string { + const inputChars = Strings.splitIntoCharacters(inputWord); let hintsHtml = ""; for (const adjacentLetters of incorrectLtrIndices) { for (const indx of adjacentLetters) { const blockLeft = (activeWordLetters[indx] as HTMLElement).offsetLeft; const blockWidth = (activeWordLetters[indx] as HTMLElement).offsetWidth; const blockIndices = `[${indx}]`; - const blockChars = inputWord[indx]; + const blockChars = inputChars[indx]; hintsHtml += ` f.functions?.getWordHtml ); - for (let c = 0; c < word.length; c++) { + const chars = Strings.splitIntoCharacters(word); + for (const char of chars) { if (funbox?.functions?.getWordHtml) { - retval += funbox.functions.getWordHtml(word.charAt(c), true); - } else if (word.charAt(c) === "\t") { + retval += funbox.functions.getWordHtml(char, true); + } else if (char === "\t") { retval += ``; - } else if (word.charAt(c) === "\n") { + } else if (char === "\n") { newlineafter = true; retval += ``; } else { - retval += "" + word.charAt(c) + ""; + retval += "" + char + ""; } } retval += ""; @@ -833,10 +835,12 @@ export async function updateActiveWordLetters( (f) => f.functions?.getWordHtml ); - for (let i = 0; i < input.length; i++) { - const charCorrect = currentWord[i] === input[i]; + const inputChars = 
Strings.splitIntoCharacters(input); + const currentWordChars = Strings.splitIntoCharacters(currentWord); + for (let i = 0; i < inputChars.length; i++) { + const charCorrect = currentWordChars[i] === inputChars[i]; - let currentLetter = currentWord[i] as string; + let currentLetter = currentWordChars[i] as string; let tabChar = ""; let nlChar = ""; if (funbox?.functions?.getWordHtml) { @@ -862,13 +866,13 @@ export async function updateActiveWordLetters( ) { ret += `${ Config.indicateTypos === "replace" - ? input[i] === " " + ? inputChars[i] === " " ? "_" - : input[i] + : inputChars[i] : currentLetter }`; } else if (currentLetter === undefined) { - let letter = input[i]; + let letter = inputChars[i]; if (letter === " " || letter === "\t" || letter === "\n") { letter = "_"; } @@ -877,9 +881,9 @@ export async function updateActiveWordLetters( ret += `` + (Config.indicateTypos === "replace" - ? input[i] === " " + ? inputChars[i] === " " ? "_" - : input[i] + : inputChars[i] : currentLetter) + ""; if (Config.indicateTypos === "below") { @@ -893,15 +897,16 @@ export async function updateActiveWordLetters( } } - for (let i = input.length; i < currentWord.length; i++) { + for (let i = inputChars.length; i < currentWordChars.length; i++) { + const currentLetter = currentWordChars[i]; if (funbox?.functions?.getWordHtml) { - ret += funbox.functions.getWordHtml(currentWord[i] as string, true); - } else if (currentWord[i] === "\t") { + ret += funbox.functions.getWordHtml(currentLetter as string, true); + } else if (currentLetter === "\t") { ret += ``; - } else if (currentWord[i] === "\n") { + } else if (currentLetter === "\n") { ret += ``; } else { - ret += `` + currentWord[i] + ""; + ret += `` + currentLetter + ""; } } } diff --git a/frontend/src/ts/utils/strings.ts b/frontend/src/ts/utils/strings.ts index 55e1328d0c00..d879a6d3ad58 100644 --- a/frontend/src/ts/utils/strings.ts +++ b/frontend/src/ts/utils/strings.ts @@ -149,3 +149,18 @@ export function 
cleanTypographySymbols(textToClean: string): string { (char) => specials[char as keyof typeof specials] || "" ); } + +/** + * Split a string into characters. This supports multi-byte characters outside of the [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)). + * Using `string.length` and `string[index]` does not work for these characters because both operate on UTF-16 code units. + * @param s string to be tokenized into characters + * @returns array of characters + */ +export function splitIntoCharacters(s: string): string[] { + const result: string[] = []; + for (const t of s) { + result.push(t); + } + + return result; +}