From c2f01473fd7b062005e748af7c7cb419aa627428 Mon Sep 17 00:00:00 2001 From: FoxxMD Date: Tue, 30 Jan 2024 14:47:19 -0500 Subject: [PATCH] fix(scrobbler): Fix erasing non-english characters #121 --- src/backend/tests/utils/strings.test.ts | 22 ++++++++++++++++++++-- src/backend/utils/StringUtils.ts | 9 +++++---- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/backend/tests/utils/strings.test.ts b/src/backend/tests/utils/strings.test.ts index 09739cac..1ba15949 100644 --- a/src/backend/tests/utils/strings.test.ts +++ b/src/backend/tests/utils/strings.test.ts @@ -1,6 +1,11 @@ import {describe, it} from 'mocha'; import {assert} from 'chai'; -import { compareNormalizedStrings, parseTrackCredits, uniqueNormalizedStrArr } from "../../utils/StringUtils.js"; +import { + compareNormalizedStrings, + normalizeStr, + parseTrackCredits, + uniqueNormalizedStrArr +} from "../../utils/StringUtils.js"; import testData from './playTestData.json'; import { ExpectedResults } from "./interfaces.js"; import { intersect } from "../../utils.js"; @@ -17,7 +22,7 @@ interface PlayTestFixture { describe('String Comparisons', function () { - it('should ignore punctuation', async function () { + it('should ignore symbols', async function () { const result = compareNormalizedStrings('this string! is the. 
same', 'this string is the same'); assert.isAtLeast(result.highScore, 100); }); @@ -46,6 +51,19 @@ describe('String Comparisons', function () { } }); + it('should not erase non-english characters', async function () { + const tests = [ + ['VAPERROR / t e l e p a t h テレパシー能力者 - 切っても切れない', 'vaperror t e l e p a t h テレパシー能力者 切っても切れない'], + ['Мой мармеладный (Speed Up)', 'мои мармеладныи speed up'], + ['Мой мармеладный (Я не права) [Из сериала "Ольга", 2 Сезон]', 'мои мармеладныи я не права из сериала ольга 2 сезон'] + ] + + for(const test of tests) { + const result = normalizeStr(test[0], {keepSingleWhitespace: true}); + assert.equal(result, test[1]); + } + }); + it('should score small changes correctly', async function () { const tests = [ ['there is change', 'therr is change'], diff --git a/src/backend/utils/StringUtils.ts b/src/backend/utils/StringUtils.ts index c334ca9f..be672f78 100644 --- a/src/backend/utils/StringUtils.ts +++ b/src/backend/utils/StringUtils.ts @@ -6,8 +6,9 @@ import {strategies} from '@foxxmd/string-sameness'; const {levenStrategy, diceStrategy} = strategies; -export const PUNCTUATION_WHITESPACE_REGEX = new RegExp(/[^\w\d]/g); -export const PUNCTUATION_REGEX = new RegExp(/[^\w\s]/g); +// can't use [^\w\s] because \w is ASCII-only, so that class also strips non-English letters +export const SYMBOLS_WHITESPACE_REGEX = new RegExp(/[`=(){}<>;',.~!@#$%^&*_+|:"?\-\\\[\]\/\s]/g); +export const SYMBOLS_REGEX = new RegExp(/[`=(){}<>;',.~!@#$%^&*_+|:"?\-\\\[\]\/]/g); export const MULTI_WHITESPACE_REGEX = new RegExp(/\s{2,}/g); export const uniqueNormalizedStrArr = (arr: string[]): string[] => { @@ -24,9 +25,9 @@ export const normalizeStr = (str: string, options?: {keepSingleWhitespace?: bool const {keepSingleWhitespace = false} = options || {}; const normal = str.normalize('NFD').replace(/[\u0300-\u036f]/g, ""); if(!keepSingleWhitespace) { - return normal.replace(PUNCTUATION_WHITESPACE_REGEX, '').toLocaleLowerCase(); + return normal.replace(SYMBOLS_WHITESPACE_REGEX, 
'').toLocaleLowerCase(); } - return normal.replace(PUNCTUATION_REGEX, '').replace(MULTI_WHITESPACE_REGEX, ' ').toLocaleLowerCase().trim(); + return normal.replace(SYMBOLS_REGEX, '').replace(MULTI_WHITESPACE_REGEX, ' ').toLocaleLowerCase().trim(); } export interface PlayCredits {