From 5fad2b8257ec829032ef678c345ac23b56193ef4 Mon Sep 17 00:00:00 2001 From: Ryota Kameoka Date: Wed, 2 Oct 2024 03:15:09 +0900 Subject: [PATCH] `diffWords` now takes an optional `intlSegmenter` option https://github.com/kpdecker/jsdiff/pull/539 --- types/diff/diff-tests.ts | 3 +++ types/diff/index.d.ts | 9 +++++++++ types/diff/tsconfig.json | 3 ++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/types/diff/diff-tests.ts b/types/diff/diff-tests.ts index 0173ba32592c0ae..f65fd6f59b24b8f 100644 --- a/types/diff/diff-tests.ts +++ b/types/diff/diff-tests.ts @@ -16,6 +16,9 @@ Diff.diffChars(one, other, { Diff.diffChars(one, other, (value) => { value; // $ExpectType Change[] }); +Diff.diffWords('吾輩は猫である。名前はまだ無い。', '吾輩は猫である。名前はたぬき。', { + intlSegmenter: new Intl.Segmenter('ja-JP', { granularity: 'word' }), +}); // $ExpectType Change[] Diff.diffLines( "line\nold value\nline", diff --git a/types/diff/index.d.ts b/types/diff/index.d.ts index 2ac6744908d00c4..3973978d4c61b60 100644 --- a/types/diff/index.d.ts +++ b/types/diff/index.d.ts @@ -32,6 +32,15 @@ export interface WordsOptions extends BaseOptions { * `true` to ignore leading and trailing whitespace. This is the same as `diffWords()`. */ ignoreWhitespace?: boolean | undefined; + + /** + * An optional [`Intl.Segmenter`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter) object (which must have a `granularity` of `'word'`) for `diffWords` to use to split the text into words. + * + * By default, `diffWords` does not use an `Intl.Segmenter`, just some regexes for splitting text into words. This will tend to give worse results than `Intl.Segmenter` would, but ensures the results are consistent across environments; `Intl.Segmenter` behaviour is only loosely specced and the implementations in browsers could in principle change dramatically in future. 
If you want to use `diffWords` with an `Intl.Segmenter` but ensure it behaves the same whatever environment you run it in, use an `Intl.Segmenter` polyfill instead of the JavaScript engine's native `Intl.Segmenter` implementation. + * + * Using an `Intl.Segmenter` should allow better word-level diffing of non-English text than the default behaviour. For instance, `Intl.Segmenter`s can generally identify via built-in dictionaries which sequences of adjacent Chinese characters form words, allowing word-level diffing of Chinese. By specifying a language when instantiating the segmenter (e.g. `new Intl.Segmenter('sv', {granularity: 'word'})`) you can also support language-specific rules, like treating Swedish's colon-separated contractions (like *k:a* for *kyrka*) as single words; by default this would be seen as two words separated by a colon. + */ + intlSegmenter?: Intl.Segmenter | undefined; } export interface LinesOptions extends BaseOptions { diff --git a/types/diff/tsconfig.json b/types/diff/tsconfig.json index 6462f458f325f10..10038a73d344d2c 100644 --- a/types/diff/tsconfig.json +++ b/types/diff/tsconfig.json @@ -2,7 +2,8 @@ "compilerOptions": { "module": "node16", "lib": [ - "es6" + "es6", + "es2022.intl" ], "noImplicitAny": true, "noImplicitThis": true,