Skip to content

Commit

Permalink
feat: introduce experimental JavaScript RegExp Engine (#761)
Browse files Browse the repository at this point in the history
  • Loading branch information
antfu authored Aug 30, 2024
1 parent 523f5fd commit 2be5b2d
Show file tree
Hide file tree
Showing 37 changed files with 4,880 additions and 985 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ packages/shiki/src/assets/themes
packages/shiki/src/assets/*.json
cache
.eslintcache
report-engine-js-compat.json
39 changes: 39 additions & 0 deletions bench/engines.bench.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import fs from 'node:fs/promises'
import { bench, describe } from 'vitest'
import type { BundledLanguage } from 'shiki'
import { createHighlighter, createJavaScriptRegexEngine, createWasmOnigEngine } from 'shiki'
import type { ReportItem } from '../scripts/report-engine-js-compat'

/**
 * Benchmark comparing the JavaScript RegExp engine against the WASM Oniguruma engine.
 *
 * Only languages whose report entry has `highlightMatch === true` are included,
 * and both highlighters are created with the same languages and theme and are
 * fed the same samples.
 */
describe('engines', async () => {
  const js = createJavaScriptRegexEngine()
  const wasm = await createWasmOnigEngine(() => import('shiki/wasm'))

  // Run `npx jiti scripts/report-engine-js-compat.ts` to generate the report first
  const report = JSON.parse(await fs.readFile('../scripts/report-engine-js-compat.json', 'utf-8')) as ReportItem[]
  const langs = report.filter(i => i.highlightMatch === true).map(i => i.lang) as BundledLanguage[]
  const samples = await Promise.all(langs.map(lang => fs.readFile(`../tm-grammars-themes/samples/${lang}.sample`, 'utf-8')))

  // Pair every language with its sample once, up front. Looking the sample up
  // with `langs.indexOf(lang)` inside the benchmarked callbacks would add a
  // linear scan per language to the time attributed to each engine.
  const cases = langs.map((lang, index) => ({ lang, code: samples[index] }))

  const shikiJs = await createHighlighter({
    langs,
    themes: ['vitesse-dark'],
    engine: js,
  })

  const shikiWasm = await createHighlighter({
    langs,
    themes: ['vitesse-dark'],
    engine: wasm,
  })

  bench('js', () => {
    for (const { lang, code } of cases) {
      shikiJs.codeToTokensBase(code, { lang, theme: 'vitesse-dark' })
    }
  }, { warmupIterations: 10, iterations: 30 })

  bench('wasm', () => {
    for (const { lang, code } of cases) {
      shikiWasm.codeToTokensBase(code, { lang, theme: 'vitesse-dark' })
    }
  }, { warmupIterations: 10, iterations: 30 })
})
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"type": "module",
"version": "1.14.1",
"private": true,
"packageManager": "pnpm@9.8.0",
"packageManager": "pnpm@9.9.0",
"scripts": {
"lint": "eslint . --cache",
"release": "bumpp && pnpm -r publish",
Expand Down Expand Up @@ -46,6 +46,7 @@
"mdast-util-gfm": "catalog:",
"mdast-util-to-hast": "catalog:",
"ofetch": "catalog:",
"picocolors": "catalog:",
"pnpm": "catalog:",
"prettier": "catalog:",
"rimraf": "catalog:",
Expand Down
1 change: 1 addition & 0 deletions packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
},
"devDependencies": {
"hast-util-to-html": "catalog:",
"oniguruma-to-js": "catalog:",
"vscode-oniguruma": "catalog:"
}
}
133 changes: 133 additions & 0 deletions packages/core/src/engines/javascript.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import { onigurumaToRegexp } from 'oniguruma-to-js'
import type { PatternScanner, RegexEngine, RegexEngineString } from '../textmate'
import type { JavaScriptRegexEngineOptions } from '../types/engines'

/**
 * 2^32 - 1 (4294967295). Reported as both `start` and `end` for capture
 * groups that have no indices in a match.
 */
const MAX = 2 ** 32 - 1

/**
 * Pattern scanner that runs a list of Oniguruma-syntax patterns with native
 * JavaScript `RegExp` objects.
 *
 * Each pattern is converted once through `onigurumaToRegexp`; both successful
 * conversions and conversion errors are remembered in `cache`, keyed by the
 * original pattern source.
 */
export class JavaScriptScanner implements PatternScanner {
  /** One entry per pattern; `null` marks a pattern that was skipped in forgiving mode. */
  regexps: (RegExp | null)[]

  /**
   * @param patterns - Oniguruma pattern sources, in priority order.
   * @param cache - Previously converted patterns (or the errors they produced).
   * @param forgiving - When `true`, patterns that fail to convert become `null`
   *   instead of throwing.
   * @throws The conversion error (fresh or cached) when `forgiving` is `false`.
   */
  constructor(
    public patterns: string[],
    public cache: Map<string, RegExp | Error>,
    public forgiving: boolean,
  ) {
    const compile = (source: string): RegExp | null => {
      const known = cache?.get(source)
      if (known instanceof RegExp)
        return known
      if (known) {
        // This exact pattern already failed to convert earlier
        if (!forgiving)
          throw known
        return null
      }

      try {
        // YAML specific handling; TODO: move to tm-grammars
        const patched = source.replaceAll('[^\\s[-?:,\\[\\]{}#&*!|>\'"%@`]]', '[^\\s\\-?:,\\[\\]{}#&*!|>\'"%@`]')
        const compiled = onigurumaToRegexp(patched, { flags: 'dg' })
        cache?.set(source, compiled)
        return compiled
      }
      catch (error) {
        cache?.set(source, error as Error)
        if (!forgiving)
          throw error
        return null
      }
    }

    this.regexps = patterns.map(source => compile(source))
  }

  /**
   * Find the match closest to `startPosition` among all usable patterns.
   *
   * A pattern matching exactly at `startPosition` wins immediately; otherwise
   * the match with the smallest index is chosen, and ties go to the pattern
   * that appears first in the list.
   *
   * @param string - Text to search, either raw or wrapped in a `RegexEngineString`.
   * @param startPosition - Index the search starts from.
   * @returns The winning pattern index with its capture ranges, or `null` when nothing matches.
   */
  findNextMatchSync(string: string | RegexEngineString, startPosition: number) {
    const text = typeof string === 'string' ? string : string.content

    // Turn a RegExp match into the result shape, one entry per capture group
    const describeMatch = (index: number, match: RegExpExecArray) => ({
      index,
      captureIndices: match.indices!.map(span => span == null
        ? { end: MAX, start: MAX, length: 0 }
        : { start: span[0], length: span[1] - span[0], end: span[1] }),
    })

    // Earliest match seen so far that does not begin at startPosition
    let earliest: [index: number, match: RegExpExecArray] | undefined

    for (const [i, regexp] of this.regexps.entries()) {
      if (!regexp)
        continue
      try {
        // The regexps are global, so lastIndex decides where exec() begins
        regexp.lastIndex = startPosition
        const match = regexp.exec(text)
        if (!match)
          continue
        // A match right at the start position cannot be beaten
        if (match.index === startPosition)
          return describeMatch(i, match)
        // Strict comparison keeps the first pattern when several share an index
        if (!earliest || match.index < earliest[1].index)
          earliest = [i, match]
      }
      catch (error) {
        if (!this.forgiving)
          throw error
      }
    }

    return earliest ? describeMatch(earliest[0], earliest[1]) : null
  }
}

/**
 * Create a `RegexEngine` backed by native JavaScript `RegExp` objects.
 *
 * Oniguruma regexes are more expressive than JavaScript ones, so some patterns
 * cannot be converted. By default such a pattern throws when a scanner is
 * created for a grammar that contains it; set `forgiving` to `true` to skip
 * the unsupported patterns instead.
 *
 * Every scanner created by the returned engine shares the same pattern cache:
 * the one passed in `options.cache`, or a new `Map` when none is given.
 *
 * @param options - Engine options; `forgiving` defaults to `false`.
 * @returns An engine exposing `createScanner` and `createString`.
 * @experimental
 */
export function createJavaScriptRegexEngine(options: JavaScriptRegexEngineOptions = {}): RegexEngine {
  const requestedForgiving = options.forgiving
  const requestedCache = options.cache

  // Defaults apply only when the option is `undefined`
  const isForgiving = requestedForgiving === undefined ? false : requestedForgiving
  const sharedCache = requestedCache === undefined ? new Map() : requestedCache

  const engine: RegexEngine = {
    createScanner: (patterns: string[]) => new JavaScriptScanner(patterns, sharedCache, isForgiving),
    createString: (s: string) => ({ content: s }),
  }
  return engine
}
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@
* Copyright (C) Microsoft Corporation. All rights reserved.
*-------------------------------------------------------- */

import { ShikiError } from '../error'
import type { IOnigBinding, IOnigCaptureIndex, IOnigMatch, OnigScanner as IOnigScanner, OnigString as IOnigString, Pointer } from './types'
import { ShikiError } from '../../error'
import type { LoadWasmOptions, WebAssemblyInstance, WebAssemblyInstantiator } from '../../types'
import type { IOnigCaptureIndex, IOnigMatch, OnigScanner as IOnigScanner, OnigString as IOnigString } from '../../../vendor/vscode-textmate/src/main'
import createOnigasm from './onig'

/**
 * Callback that receives the import object for a WebAssembly module and
 * resolves to that module's exports.
 */
export type Instantiator = (importObject: Record<string, Record<string, WebAssembly.ImportValue>>) => Promise<WebAssembly.Exports>

/**
 * Numeric handle used for the pointer-style arguments and results of the
 * `IOnigBinding` functions.
 * NOTE(review): presumably an offset into the WebAssembly heap — confirm.
 */
export type Pointer = number

export const enum FindOption {
None = 0,
/**
Expand All @@ -20,14 +25,25 @@ export const enum FindOption {
* equivalent of ONIG_OPTION_NOT_BEGIN_POSITION: (start) isn't considered as start position of search (* fail \G)
*/
NotBeginPosition = 4,
/**
* used for debugging purposes.
*/
DebugCall = 8,
}

/**
 * Low-level binding object through which this module calls the Oniguruma
 * scanner functions. Strings and scanners are referred to by `Pointer` values.
 */
export interface IOnigBinding {
// Typed-array views used to read and write the binding's memory.
// NOTE(review): presumably both views cover the same WebAssembly buffer — confirm.
HEAPU8: Uint8Array
HEAPU32: Uint32Array

// Reads the string located at `ptr`; used in this file to turn the pointer
// returned by `getLastOnigError` into an error message.
UTF8ToString: (ptr: Pointer) => string

// NOTE(review): presumably allocate `count` bytes / release an earlier allocation — confirm.
omalloc: (count: number) => Pointer
ofree: (ptr: Pointer) => void
// Pointer to the most recent error message (decoded with `UTF8ToString`).
getLastOnigError: () => Pointer
// NOTE(review): presumably takes arrays of pattern pointers and pattern lengths,
// each with `count` entries, and returns a scanner handle — confirm.
createOnigScanner: (strPtrsPtr: Pointer, strLenPtr: Pointer, count: number) => Pointer
freeOnigScanner: (ptr: Pointer) => void
// Runs the scanner over a string. The caller in this file passes the string's
// UTF-8 length and a UTF-8 offset for `position`, and treats a result of 0 as
// "no match".
findNextOnigScannerMatch: (scanner: Pointer, strCacheId: number, strData: Pointer, strLength: number, position: number, options: number) => number
// findNextOnigScannerMatchDbg: (scanner: Pointer, strCacheId: number, strData: Pointer, strLength: number, position: number, options: number) => number
}

let onigBinding: IOnigBinding | null = null
let defaultDebugCall = false
// let defaultDebugCall = false

function throwLastOnigError(onigBinding: IOnigBinding): void {
throw new ShikiError(onigBinding.UTF8ToString(onigBinding.getLastOnigError()))
Expand Down Expand Up @@ -294,34 +310,33 @@ export class OnigScanner implements IOnigScanner {
public findNextMatchSync(string: string | OnigString, startPosition: number, debugCall: boolean): IOnigMatch | null
public findNextMatchSync(string: string | OnigString, startPosition: number): IOnigMatch | null
public findNextMatchSync(string: string | OnigString, startPosition: number, arg?: number | boolean): IOnigMatch | null {
let debugCall = defaultDebugCall
// let debugCall = defaultDebugCall
let options = FindOption.None
if (typeof arg === 'number') {
if (arg & FindOption.DebugCall)
debugCall = true

// if (arg & FindOption.DebugCall)
// debugCall = true
options = arg
}
else if (typeof arg === 'boolean') {
debugCall = arg
// debugCall = arg
}
if (typeof string === 'string') {
string = new OnigString(string)
const result = this._findNextMatchSync(string, startPosition, debugCall, options)
const result = this._findNextMatchSync(string, startPosition, false, options)
string.dispose()
return result
}
return this._findNextMatchSync(string, startPosition, debugCall, options)
return this._findNextMatchSync(string, startPosition, false, options)
}

private _findNextMatchSync(string: OnigString, startPosition: number, debugCall: boolean, options: number): IOnigMatch | null {
const onigBinding = this._onigBinding
let resultPtr: Pointer
if (debugCall)
resultPtr = onigBinding.findNextOnigScannerMatchDbg(this._ptr, string.id, string.ptr, string.utf8Length, string.convertUtf16OffsetToUtf8(startPosition), options)
// let resultPtr: Pointer
// if (debugCall)
// resultPtr = onigBinding.findNextOnigScannerMatchDbg(this._ptr, string.id, string.ptr, string.utf8Length, string.convertUtf16OffsetToUtf8(startPosition), options)

else
resultPtr = onigBinding.findNextOnigScannerMatch(this._ptr, string.id, string.ptr, string.utf8Length, string.convertUtf16OffsetToUtf8(startPosition), options)
// else
const resultPtr = onigBinding.findNextOnigScannerMatch(this._ptr, string.id, string.ptr, string.utf8Length, string.convertUtf16OffsetToUtf8(startPosition), options)

if (resultPtr === 0) {
// no match
Expand All @@ -348,17 +363,6 @@ export class OnigScanner implements IOnigScanner {
}
}

export interface WebAssemblyInstantiator {
(importObject: Record<string, Record<string, WebAssembly.ImportValue>> | undefined): Promise<WebAssemblyInstance>
}

export type WebAssemblyInstance = WebAssembly.WebAssemblyInstantiatedSource | WebAssembly.Instance | WebAssembly.Instance['exports']

export type OnigurumaLoadOptions =
| { instantiator: WebAssemblyInstantiator }
| { default: WebAssemblyInstantiator }
| { data: ArrayBufferView | ArrayBuffer | Response }

/**
 * Type guard: reports whether `dataOrOptions` has an `instantiator` property
 * that is a function.
 *
 * @param dataOrOptions - Value to inspect; must not be `null` or `undefined`,
 *   because its `instantiator` property is read directly.
 * @returns `true` when `dataOrOptions.instantiator` is a function.
 */
function isInstantiatorOptionsObject(dataOrOptions: any): dataOrOptions is { instantiator: WebAssemblyInstantiator } {
  const candidate = dataOrOptions.instantiator
  return typeof candidate === 'function'
}
Expand All @@ -385,15 +389,6 @@ function isArrayBuffer(data: any): data is ArrayBuffer | ArrayBufferView {

let initPromise: Promise<void>

type Awaitable<T> = T | Promise<T>

export type LoadWasmOptionsPlain =
| OnigurumaLoadOptions
| WebAssemblyInstantiator
| ArrayBufferView | ArrayBuffer | Response

export type LoadWasmOptions = Awaitable<LoadWasmOptionsPlain> | (() => Awaitable<LoadWasmOptionsPlain>)

export function loadWasm(options: LoadWasmOptions): Promise<void> {
if (initPromise)
return initPromise
Expand Down Expand Up @@ -461,14 +456,14 @@ function _makeResponseNonStreamingLoader(data: Response): WebAssemblyInstantiato
}
}

export function createOnigString(str: string) {
return new OnigString(str)
}
// export function createOnigString(str: string) {
// return new OnigString(str)
// }

export function createOnigScanner(patterns: string[]) {
return new OnigScanner(patterns)
}
// export function createOnigScanner(patterns: string[]) {
// return new OnigScanner(patterns)
// }

export function setDefaultDebugCall(_defaultDebugCall: boolean): void {
defaultDebugCall = _defaultDebugCall
}
// export function setDefaultDebugCall(_defaultDebugCall: boolean): void {
// defaultDebugCall = _defaultDebugCall
// }
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
import type { IOnigBinding, Instantiator } from './types'
import type { IOnigBinding, Instantiator } from '.'

/**
 * Upper bound used when deciding whether the heap may grow.
 *
 * @returns 2^31 (2147483648).
 */
function getHeapMax() {
  return 2 ** 31
}

/**
 * Current time in milliseconds.
 *
 * @returns `performance.now()` when a global `performance` exists, otherwise `Date.now()`.
 */
function _emscripten_get_now() {
  if (typeof performance === 'undefined')
    return Date.now()
  return performance.now()
}

/**
 * Round `x` up to the next multiple of `multiple`; values that are already a
 * multiple are returned unchanged.
 */
const alignUp = (x: number, multiple: number) => {
  const remainder = x % multiple
  // The outer modulo turns a padding of `multiple` into 0 for aligned values
  const padding = (multiple - remainder) % multiple
  return x + padding
}

export default async function main(init: Instantiator): Promise<IOnigBinding> {
let wasmMemory: any
Expand All @@ -12,15 +22,10 @@ export default async function main(init: Instantiator): Promise<IOnigBinding> {
binding.HEAPU32 = new Uint32Array(buf)
}

function _emscripten_get_now() {
return typeof performance !== 'undefined' ? performance.now() : Date.now()
}
function _emscripten_memcpy_big(dest: number, src: number, num: number) {
binding.HEAPU8.copyWithin(dest, src, src + num)
}
function getHeapMax() {
return 2147483648
}

function emscripten_realloc_buffer(size: number) {
try {
wasmMemory.grow((size - buffer.byteLength + 65535) >>> 16)
Expand All @@ -36,7 +41,6 @@ export default async function main(init: Instantiator): Promise<IOnigBinding> {
if (requestedSize > maxHeapSize)
return false

const alignUp = (x: number, multiple: number) => x + ((multiple - (x % multiple)) % multiple)
for (let cutDown = 1; cutDown <= 4; cutDown *= 2) {
let overGrownHeapSize = oldSize * (1 + 0.2 / cutDown)
overGrownHeapSize = Math.min(overGrownHeapSize, requestedSize + 100663296)
Expand Down
Loading

0 comments on commit 2be5b2d

Please sign in to comment.