-
-
Notifications
You must be signed in to change notification settings - Fork 375
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: introduce experimental JavaScript RegExp Engine (#761)
- Loading branch information
Showing
37 changed files
with
4,880 additions
and
985 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import fs from 'node:fs/promises' | ||
import { bench, describe } from 'vitest' | ||
import type { BundledLanguage } from 'shiki' | ||
import { createHighlighter, createJavaScriptRegexEngine, createWasmOnigEngine } from 'shiki' | ||
import type { ReportItem } from '../scripts/report-engine-js-compat' | ||
|
||
/**
 * Benchmark comparing the JavaScript RegExp engine with the WASM Oniguruma engine.
 *
 * Both highlighters load the same set of languages (those the compatibility report
 * marks with `highlightMatch === true`) and tokenize the same sample files.
 */
describe('engines', async () => {
  const js = createJavaScriptRegexEngine()
  const wasm = await createWasmOnigEngine(() => import('shiki/wasm'))

  // Run `npx jiti scripts/report-engine-js-compat.ts` to generate the report first.
  // NOTE: relative string paths given to `fs.readFile` resolve against the process working directory.
  const report = JSON.parse(await fs.readFile('../scripts/report-engine-js-compat.json', 'utf-8')) as ReportItem[]
  const langs = report.filter(i => i.highlightMatch === true).map(i => i.lang) as BundledLanguage[]

  // Pair every language with its sample once, up front, so the timed callbacks below
  // do not pay for a linear `indexOf` lookup per language on every iteration.
  const cases = await Promise.all(langs.map(async lang => ({
    lang,
    code: await fs.readFile(`../tm-grammars-themes/samples/${lang}.sample`, 'utf-8'),
  })))

  const shikiJs = await createHighlighter({
    langs,
    themes: ['vitesse-dark'],
    engine: js,
  })

  const shikiWasm = await createHighlighter({
    langs,
    themes: ['vitesse-dark'],
    engine: wasm,
  })

  bench('js', () => {
    for (const { lang, code } of cases) {
      shikiJs.codeToTokensBase(code, { lang, theme: 'vitesse-dark' })
    }
  }, { warmupIterations: 10, iterations: 30 })

  bench('wasm', () => {
    for (const { lang, code } of cases) {
      shikiWasm.codeToTokensBase(code, { lang, theme: 'vitesse-dark' })
    }
  }, { warmupIterations: 10, iterations: 30 })
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
import { onigurumaToRegexp } from 'oniguruma-to-js' | ||
import type { PatternScanner, RegexEngine, RegexEngineString } from '../textmate' | ||
import type { JavaScriptRegexEngineOptions } from '../types/engines' | ||
|
||
/**
 * Sentinel offset (2^32 - 1) reported as `start`/`end` for capture groups
 * that did not participate in a match.
 */
const MAX = 4294967295

/**
 * Convert a successful `RegExp#exec` result into the scanner match shape.
 *
 * Relies on `match.indices`, which is only populated for regexes created with the `d` flag.
 */
function toScanResult(index: number, match: RegExpExecArray) {
  return {
    index,
    captureIndices: match.indices!.map((range) => {
      // Groups that did not participate in the match have no index pair.
      if (range == null) {
        return {
          end: MAX,
          start: MAX,
          length: 0,
        }
      }
      return {
        start: range[0],
        length: range[1] - range[0],
        end: range[1],
      }
    }),
  }
}

/**
 * Pattern scanner backed by native JavaScript `RegExp` objects.
 *
 * Each Oniguruma pattern is converted with `onigurumaToRegexp` (flags `dg`) at construction
 * time. Conversion results, including failures, are stored in `cache` keyed by the original
 * pattern string so that later scanners sharing the cache skip the conversion.
 */
export class JavaScriptScanner implements PatternScanner {
  /** Compiled regex per pattern, in pattern order; `null` marks a pattern skipped in forgiving mode. */
  regexps: (RegExp | null)[]

  /**
   * @param patterns Oniguruma pattern sources to scan with.
   * @param cache Map from pattern source to its compiled `RegExp` or the `Error` raised while converting it.
   * @param forgiving When `true`, patterns that fail to convert (or that throw while executing) are skipped instead of throwing.
   * @throws The conversion error (fresh or cached) when a pattern cannot be converted and `forgiving` is `false`.
   */
  constructor(
    public patterns: string[],
    public cache: Map<string, RegExp | Error>,
    public forgiving: boolean,
  ) {
    this.regexps = patterns.map((p) => {
      const cached = cache?.get(p)
      if (cached) {
        if (cached instanceof RegExp) {
          return cached
        }
        // A cached Error means this pattern already failed to convert once.
        if (forgiving)
          return null
        throw cached
      }
      try {
        const regex = onigurumaToRegexp(
          p
            // YAML specific handling; TODO: move to tm-grammars
            .replaceAll('[^\\s[-?:,\\[\\]{}#&*!|>\'"%@`]]', '[^\\s\\-?:,\\[\\]{}#&*!|>\'"%@`]'),
          { flags: 'dg' },
        )
        cache?.set(p, regex)
        return regex
      }
      catch (e) {
        // Remember the failure so the same pattern is not converted again.
        cache?.set(p, e as Error)
        if (forgiving)
          return null
        throw e
      }
    })
  }

  /**
   * Find the next match of any pattern in `string`, searching from `startPosition`.
   *
   * The first pattern (in pattern order) that matches exactly at `startPosition` wins
   * immediately. Otherwise the match with the smallest index wins, with ties going to
   * the earliest pattern.
   *
   * @param string Text to scan, either a plain string or an engine string whose `content` is scanned.
   * @param startPosition Offset assigned to each regex's `lastIndex` before executing it.
   * @returns The winning pattern index with its capture ranges, or `null` when nothing matches.
   */
  findNextMatchSync(string: string | RegexEngineString, startPosition: number) {
    const str = typeof string === 'string'
      ? string
      : string.content

    // Closest non-anchored match seen so far, tracked in a single pass.
    let closestPattern = -1
    let closestMatch: RegExpExecArray | null = null

    for (let i = 0; i < this.regexps.length; i++) {
      const regexp = this.regexps[i]
      if (!regexp)
        continue
      try {
        // Regexes are shared through the cache, so reset the cursor before every use.
        regexp.lastIndex = startPosition
        const match = regexp.exec(str)
        if (!match)
          continue
        // If the match is at the start position, return it immediately
        if (match.index === startPosition) {
          return toScanResult(i, match)
        }
        // Strict `<` keeps the earliest pattern when several match at the same index.
        if (closestMatch === null || match.index < closestMatch.index) {
          closestPattern = i
          closestMatch = match
        }
      }
      catch (e) {
        if (this.forgiving)
          continue
        throw e
      }
    }

    return closestMatch === null
      ? null
      : toScanResult(closestPattern, closestMatch)
  }
}
|
||
/**
 * Create a regex engine that runs TextMate grammar patterns on the native JavaScript RegExp engine.
 *
 * Oniguruma regex is more powerful than JavaScript regex, so some patterns may not be supported.
 * By default an error is thrown when a grammar contains such a pattern; pass `forgiving: true`
 * to ignore these errors and skip the unsupported patterns instead.
 *
 * Every scanner created by the returned engine shares the same `cache`
 * (the one supplied in `options`, or a new `Map` when none is given).
 *
 * @experimental
 */
export function createJavaScriptRegexEngine(options: JavaScriptRegexEngineOptions = {}): RegexEngine {
  // Defaults apply only when the option is `undefined`.
  const forgiving = options.forgiving === undefined ? false : options.forgiving
  const cache = options.cache === undefined ? new Map() : options.cache

  const engine: RegexEngine = {
    createScanner: (patterns: string[]) => new JavaScriptScanner(patterns, cache, forgiving),
    createString: (s: string) => ({ content: s }),
  }
  return engine
}
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.