Skip to content

Commit

Permalink
fix(scrobbler): Fix erasing non-english characters #121
Browse files Browse the repository at this point in the history
  • Loading branch information
FoxxMD committed Jan 30, 2024
1 parent 41e607c commit c2f0147
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 6 deletions.
22 changes: 20 additions & 2 deletions src/backend/tests/utils/strings.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import {describe, it} from 'mocha';
import {assert} from 'chai';
import { compareNormalizedStrings, parseTrackCredits, uniqueNormalizedStrArr } from "../../utils/StringUtils.js";
import {
compareNormalizedStrings,
normalizeStr,
parseTrackCredits,
uniqueNormalizedStrArr
} from "../../utils/StringUtils.js";
import testData from './playTestData.json';
import { ExpectedResults } from "./interfaces.js";
import { intersect } from "../../utils.js";
Expand All @@ -17,7 +22,7 @@ interface PlayTestFixture {

describe('String Comparisons', function () {

it('should ignore punctuation', async function () {
it('should ignore symbols', async function () {
const result = compareNormalizedStrings('this string! is the. same', 'this string is the same');
assert.isAtLeast(result.highScore, 100);
});
Expand Down Expand Up @@ -46,6 +51,19 @@ describe('String Comparisons', function () {
}
});

// Regression check: normalizing with keepSingleWhitespace must keep
// non-Latin letters (Japanese, Cyrillic) while dropping symbols and case.
it('should not erase non-english characters', async function () {
    // Each entry is [raw input, expected normalized output].
    const cases: string[][] = [
        ['VAPERROR / t e l e p a t h テレパシー能力者 - 切っても切れない', 'vaperror t e l e p a t h テレパシー能力者 切っても切れない'],
        ['Мой мармеладный (Speed Up)', 'мои мармеладныи speed up'],
        ['Мой мармеладный (Я не права) [Из сериала "Ольга", 2 Сезон]', 'мои мармеладныи я не права из сериала ольга 2 сезон']
    ];

    cases.forEach(([raw, expected]) => {
        const normalized = normalizeStr(raw, {keepSingleWhitespace: true});
        assert.equal(normalized, expected);
    });
});

it('should score small changes correctly', async function () {
const tests = [
['there is change', 'therr is change'],
Expand Down
9 changes: 5 additions & 4 deletions src/backend/utils/StringUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ import {strategies} from '@foxxmd/string-sameness';

const {levenStrategy, diceStrategy} = strategies;

/** Matches every character that is not an ASCII word character (`\w`) or digit. */
export const PUNCTUATION_WHITESPACE_REGEX = /[^\w\d]/g;
/** Matches every character that is neither an ASCII word character nor whitespace. */
export const PUNCTUATION_REGEX = /[^\w\s]/g;

// A negated class such as [^\w\s] cannot be used for symbol stripping because
// it also matches non-English characters, so the symbols to remove are listed
// explicitly instead.
/** Matches the listed ASCII symbols as well as any whitespace character. */
export const SYMBOLS_WHITESPACE_REGEX = /[`=(){}<>;',.~!@#$%^&*_+|:"?\-\\\[\]\/\s]/g;
/** Matches the listed ASCII symbols only; whitespace is left untouched. */
export const SYMBOLS_REGEX = /[`=(){}<>;',.~!@#$%^&*_+|:"?\-\\\[\]\/]/g;

/** Matches runs of two or more consecutive whitespace characters. */
export const MULTI_WHITESPACE_REGEX = /\s{2,}/g;
export const uniqueNormalizedStrArr = (arr: string[]): string[] => {
Expand All @@ -24,9 +25,9 @@ export const normalizeStr = (str: string, options?: {keepSingleWhitespace?: bool
const {keepSingleWhitespace = false} = options || {};
const normal = str.normalize('NFD').replace(/[\u0300-\u036f]/g, "");
if(!keepSingleWhitespace) {
return normal.replace(PUNCTUATION_WHITESPACE_REGEX, '').toLocaleLowerCase();
return normal.replace(SYMBOLS_WHITESPACE_REGEX, '').toLocaleLowerCase();
}
return normal.replace(PUNCTUATION_REGEX, '').replace(MULTI_WHITESPACE_REGEX, ' ').toLocaleLowerCase().trim();
return normal.replace(SYMBOLS_REGEX, '').replace(MULTI_WHITESPACE_REGEX, ' ').toLocaleLowerCase().trim();
}

export interface PlayCredits {
Expand Down

0 comments on commit c2f0147

Please sign in to comment.