Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for backreferences #132

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 56 additions & 2 deletions moo.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@
}
}

/**
 * Left-pads `string` with "0" characters so the result is `length`
 * characters long. A string that is already `length` characters or longer
 * is returned unchanged.
 *
 * @param {string} string - text to pad
 * @param {number} length - minimum length of the result
 * @returns {string} the padded (or original) string
 */
function zeropad(string, length) {
  // Nothing to do when the input already fills the requested width.
  if (!(string.length < length)) {
    return string
  }
  // Joining an array of n + 1 empty slots with "0" yields exactly n zeroes.
  var padding = new Array(length - string.length + 1).join("0")
  return padding + string
}

function objectToRules(object) {
var keys = Object.getOwnPropertyNames(object)
var result = []
Expand Down Expand Up @@ -227,13 +234,60 @@
// convert to RegExp
var pat = reUnion(match.map(regexpOrLiteral))

// Add backreference support
var groupCount = reGroups(pat)
let numberOfGroupsPreviousToBackreferences = groups.length
for (let g = 0; g < groupCount; g++) {
/*
* Stub group for this capture group, this should never be referenced
* later in the code since the capture group will only be non-null if
* the parent capture group (with lower index) is non-null, in which
* case the parent will win.
*/
groups.push(null)
}
/*
* Replace backreferences like \1 with backreferences to the correct
* placeholder in the built regexp, being careful to avoid
* false-positives due to escaped backslashes.
*/
var hasBackreference = false
if (groupCount > 0) {
/*
* WARNING: we require your regexp to contain a capture group to opt
* into this because you cannot use this with certain regexps, e.g.
* `/()[\1]/` should match SOH (U+0001) but we see the \1 as a
* backreference and rewrite it.
*
* To solve this, avoid octal escapes, use `\u0001` instead.
*/

pat = pat.replace(/((?:^|[^\\])(?:\\\\)*\\)([1-9][0-9]*)(?=[^0-9])/g, (match, front, backreferenceGroupNumber) => {
const number = parseInt(backreferenceGroupNumber, 10)
const couldBeOctal = !!backreferenceGroupNumber.match(/^[0-7]+$/)
const octalNumber = couldBeOctal && parseInt(backreferenceGroupNumber, 8)
if (number < 1 || number > groupCount) {
throw new Error(
"Backreference \\" + backreferenceGroupNumber + " out of range in regexp " + pat
+ (
couldBeOctal
? " (if you meant to use an octal escape, instead use \\u" + zeropad(octalNumber.toString(16), 4) + ")"
: ""
)
)
}
hasBackreference = true
// Account for all the previous capture groups
return front + String(numberOfGroupsPreviousToBackreferences + number)
})
}

// validate
var regexp = new RegExp(pat)
if (regexp.test("")) {
throw new Error("RegExp matches empty string: " + regexp)
}
var groupCount = reGroups(pat)
if (groupCount > 0) {
if (groupCount > 0 && !hasBackreference) {
throw new Error("RegExp has capture groups: " + regexp + "\nUse (?: … ) instead")
}

Expand Down
50 changes: 50 additions & 0 deletions test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,56 @@ describe('value transforms', () => {

})

describe('backreferences', () => {

  // Pops the next token off `tokens` and checks its type, and that both
  // its text and its value equal `text`.
  const expectNext = (tokens, type, text) => {
    expect(tokens.shift()).toMatchObject({ type: type, text: text, value: text })
  }

  test('does not get processed if no capture groups', () => {
    // Neither rule contains a capture group, so compiling must not throw.
    const compileWithoutGroups = () => moo.compile({
      tok: /foo\1/,
      tok2: /[\1]/
    })
    expect(compileWithoutGroups).not.toThrow()
  })

  test('throws error on invalid backreference when capture groups present', () => {
    // Only two groups exist, so \13 is out of range; the expected message
    // suggests the \u escape for octal 13.
    const compileOutOfRange = () => moo.compile({
      tok: /(f)(o)\13/
    })
    expect(compileOutOfRange).toThrow('use \\u000b')
  })

  test('enable back-references', () => {
    const lexer = moo.compile({
      // PostgreSQL dollar-quoted strings, see
      // https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
      // Per those docs: the tag, if any, follows the same rules as an
      // unquoted identifier, except that it cannot contain a dollar sign.
      // SQL identifiers and key words must begin with a letter (a-z, but
      // also letters with diacritical marks and non-Latin letters) or an
      // underscore (_); subsequent characters can be letters, underscores,
      // digits (0-9), or dollar signs ($).
      dollarStringConstant: {
        match: /\$([\w_][\w\d_]*)?\$[^]*?\$\1\$/,
        lineBreaks: true,
      },

      fubar: 'fubar',
    })
    // Nested tags: the token must run up to the closing $outer$ tag.
    const quoted = '$outer$ outer $middle$ middle $inner$\n!inner!\n$inner$ /middle $middle$ /outer $outer$'
    lexer.reset('fubar' + quoted + 'fubar')
    const tokens = lexAll(lexer).filter(t => t.type !== 'space')
    expectNext(tokens, 'fubar', 'fubar')
    expectNext(tokens, 'dollarStringConstant', quoted)
    expectNext(tokens, 'fubar', 'fubar')
  })

  test('works with multi-digit backreferences', () => {
    const lexer = moo.compile({
      // \11 refers to the eleventh group, (k).
      test: /(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\11/,
      fubar: 'fubar',
    })
    const letters = 'abcdefghijklk'
    lexer.reset('fubar' + letters + 'fubar')
    const tokens = lexAll(lexer).filter(t => t.type !== 'space')
    expectNext(tokens, 'fubar', 'fubar')
    expectNext(tokens, 'test', letters)
    expectNext(tokens, 'fubar', 'fubar')
  })
});

describe('lexer', () => {

var simpleLexer = compile({
Expand Down