Skip to content

Commit

Permalink
Add fixed issues to regex fuzzer (#6013)
Browse files Browse the repository at this point in the history
* Add choice to fuzzer and fix the failures

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Enable remaining known issues

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Address feedback

Signed-off-by: Anthony Chang <antchang@nvidia.com>
  • Loading branch information
anthony-chang authored Aug 2, 2022
1 parent 19a6957 commit ac95bf6
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,6 @@ class RegexParser(pattern: String) {
case codePoint => RegexChar(codePoint.toChar)
}
case Some('0') =>
consumeExpected('0')
val octalChar = parseOctalDigit
octalChar.codePoint match {
case 0 => RegexHexDigit("00")
Expand Down Expand Up @@ -1030,14 +1029,19 @@ class CudfRegexTranspiler(mode: RegexMode) {
} else {
digits
}
if (r.codePoint >= 128) {

if (regexMetaChars.map(_.toInt).contains(r.codePoint)) {
RegexEscaped(r.codePoint.toChar)
} else if(r.codePoint >= 128) {
RegexChar(r.codePoint.toChar)
} else {
RegexOctalChar(octal)
}

case r @ RegexHexDigit(digits) =>
if (r.codePoint >= 128) {
if (regexMetaChars.map(_.toInt).contains(r.codePoint)) {
RegexEscaped(r.codePoint.toChar)
} else if (r.codePoint >= 128) {
// cuDF only supports 0x00 to 0x7f hexadecimal chars
RegexChar(r.codePoint.toChar)
} else {
Expand Down Expand Up @@ -1447,11 +1451,27 @@ class CudfRegexTranspiler(mode: RegexMode) {
case RegexGroup(capture, term) =>
term match {
case RegexSequence(parts) =>
parts.foreach { part => isBeginOrEndLineAnchor(part) match {
case true => throw new RegexUnsupportedException(
"Line and string anchors are not supported in capture groups", part.position)
case false =>
}}
parts.foreach { part =>
if (isBeginOrEndLineAnchor(part)) {
throw new RegexUnsupportedException(
"Line and string anchors are not supported in capture groups", part.position)
}
part match {
case RegexRepetition(base, quantifier) => (base, quantifier) match {
case (_, QuantifierVariableLength(0, Some(0))) =>
throw new RegexUnsupportedException(
"Repetition with {0,0} not supported in capture groups",
quantifier.position)

case (_, QuantifierFixedLength(0)) =>
throw new RegexUnsupportedException(
"Repetition with {0} not supported in capture groups",
quantifier.position)
case _ =>
}
case _ =>
}
}
RegexGroup(capture, rewrite(term, replacement, None))
case _ =>
RegexGroup(capture, rewrite(term, replacement, None))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -934,7 +934,7 @@ class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true,
// when we reach maximum depth we generate a non-nested type
nonNestedTerm
} else {
val baseGenerators: Seq[() => RegexAST] = Seq(
val generators: Seq[() => RegexAST] = Seq(
() => lineTerminator,
() => escapedChar,
() => char,
Expand All @@ -945,13 +945,8 @@ class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true,
() => group(depth),
() => boundaryMatch,
() => sequence(depth),
() => repetition(depth))
val generators = if (skipKnownIssues) {
baseGenerators
} else {
baseGenerators ++ Seq(
() => choice(depth)) // https://github.com/NVIDIA/spark-rapids/issues/4603
}
() => repetition(depth),
() => choice(depth))
generators(rr.nextInt(generators.length))()
}
}
Expand All @@ -971,36 +966,25 @@ class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true,
}

private def characterClassComponent = {
val baseGenerators = Seq[() => RegexCharacterClassComponent](
val generators = Seq[() => RegexCharacterClassComponent](
() => char,
() => charRange,
() => hexDigit,
() => octalDigit,
() => escapedChar)
val generators = if (skipKnownIssues) {
baseGenerators
} else {
baseGenerators ++ Seq(
() => octalDigit) // https://github.com/NVIDIA/spark-rapids/issues/4862
}
generators(rr.nextInt(generators.length))()
}

private def charRange: RegexCharacterClassComponent = {
val baseGenerators = Seq[() => RegexCharacterClassComponent](
val generators = Seq[() => RegexCharacterClassComponent](
() => RegexCharacterRange(RegexChar('a'), RegexChar('z')),
() => RegexCharacterRange(RegexChar('A'), RegexChar('Z')),
() => RegexCharacterRange(RegexChar('z'), RegexChar('a')),
() => RegexCharacterRange(RegexChar('Z'), RegexChar('A')),
() => RegexCharacterRange(RegexChar('0'), RegexChar('9')),
() => RegexCharacterRange(RegexChar('9'), RegexChar('0'))
() => RegexCharacterRange(RegexChar('9'), RegexChar('0')),
() => RegexCharacterRange(char, char)
)
val generators = if (skipKnownIssues) {
baseGenerators
} else {
// we do not support escaped characters in character ranges yet
// see https://github.com/NVIDIA/spark-rapids/issues/4505
baseGenerators ++ Seq(() => RegexCharacterRange(char, char))
}
generators(rr.nextInt(generators.length))()
}

Expand Down

0 comments on commit ac95bf6

Please sign in to comment.