Skip to content

Commit

Permalink
Add support for \h, \H, \v, \V, and \R character classes (N…
Browse files Browse the repository at this point in the history
…VIDIA#5477)

Signed-off-by: Anthony Chang <antchang@nvidia.com>
  • Loading branch information
anthony-chang committed May 17, 2022
1 parent ff62c97 commit 39960cc
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 2 deletions.
33 changes: 33 additions & 0 deletions integration_tests/src/main/python/string_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,39 @@ def test_regexp_whitespace():
),
conf=_regexp_conf)

# Checks that the CPU and GPU produce the same collected results for regular
# expressions that use the horizontal (\h, \H) and vertical (\v, \V) whitespace
# character classes, across rlike, regexp_extract and regexp_replace.
def test_regexp_horizontal_vertical_whitespace():
# The generator pattern is a non-raw Python string, so the \x.., \u...., \t, \n,
# \r and \f escapes are decoded by Python into the actual characters before the
# pattern reaches mk_str_gen. The triple-quoted literal spans several lines, so
# its embedded line breaks are part of the pattern as well.
gen = mk_str_gen(
'''\xA0\u1680\u180e[abcd]\t\n{1,3} [0-9]\n {1,3}\x0b\t[abcd]\r\f[0-9]{0,10}
[\u2001-\u200a]{1,3}\u202f\u205f\u3000\x85\u2028\u2029
''')
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, gen).selectExpr(
# Each "\\\\" in the Python source is two literal backslashes in the SQL
# expression text (presumably reduced to a single regex backslash by the SQL
# string-literal parser -- confirm against the Spark escape settings in use).
# rlike: class with a fixed repetition count.
'rlike(a, "\\\\h{2}")',
'rlike(a, "\\\\v{3}")',
# rlike: class with "+" repetition between a letter run and a digit run.
'rlike(a, "[abcd]+\\\\h+[0-9]+")',
'rlike(a, "[abcd]+\\\\v+[0-9]+")',
# rlike: negated classes, alone and combined with a bounded repetition.
'rlike(a, "\\\\H")',
'rlike(a, "\\\\V")',
'rlike(a, "[abcd]+\\\\h+\\\\V{2,3}")',
# regexp_extract: classes inside capture groups; the last argument selects the group.
'regexp_extract(a, "([a-d]+)([0-9]+\\\\v)([a-d]+)", 2)',
'regexp_extract(a, "([a-d]+)(\\\\H+)([0-9]+)", 2)',
'regexp_extract(a, "([a-d]+)(\\\\V+)([0-9]+)", 3)',
# regexp_replace: replace runs matched by the class with a marker character.
'regexp_replace(a, "(\\\\v+)", "@")',
'regexp_replace(a, "(\\\\H+)", "#")',
),
conf=_regexp_conf)

def test_regexp_linebreak():
    r"""Compare CPU and GPU results for the \R linebreak matcher.

    The generated strings contain a CR LF pair between two letter runs, followed
    by zero to five single line-break characters and a trailing digit. The same
    data is run through rlike, regexp_extract and regexp_replace using \R.
    """
    # Non-raw literal: Python decodes the \uXXXX escapes into the actual
    # characters before the pattern is handed to the string generator.
    pattern = '[abc]{1,3}\u000D\u000A[def]{1,3}[\u000A\u000B\u000C\u000D\u0085\u2028\u2029]{0,5}[123]'
    gen = mk_str_gen(pattern)

    # SQL expressions under test; each "\\\\" is two literal backslashes in the SQL text.
    exprs = [
        'rlike(a, "\\\\R")',
        'regexp_extract(a, "([a-d]+)(\\\\R)([a-d]+)", 1)',
        'regexp_replace(a, "\\\\R", "")',
    ]

    def run_query(spark):
        # Build the single-column DataFrame and project every expression.
        return unary_op_df(spark, gen).selectExpr(*exprs)

    assert_gpu_and_cpu_are_equal_collect(run_query, conf=_regexp_conf)

def test_regexp_octal_digits():
gen = mk_str_gen('[abcd]\u0000\u0041\u007f\u0080\u00ff[\\\\xa0-\\\\xb0][abcd]')
assert_gpu_and_cpu_are_equal_collect(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,39 @@ class RegexParser(pattern: String) {
// string anchors
consumeExpected(ch)
RegexEscaped(ch)
case 'h' | 'H' =>
// horizontal whitespace
// see https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
// under "Predefined character classes"
val chars: ListBuffer[RegexCharacterClassComponent] = ListBuffer(
RegexChar(' '), RegexChar('\u00A0'), RegexChar('\u1680'), RegexChar('\u180e'),
RegexChar('\u202f'), RegexChar('\u205f'), RegexChar('\u3000')
)
chars += RegexEscaped('t')
chars += RegexCharacterRange('\u2000', '\u200a')
consumeExpected(ch)
RegexCharacterClass(negated = ch.isUpper, characters = chars)
case 'v' | 'V' =>
// vertical whitespace
// see https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
// under "Predefined character classes"
val chars: ListBuffer[RegexCharacterClassComponent] = ListBuffer(
RegexChar('\u000B'), RegexChar('\u0085'), RegexChar('\u2028'), RegexChar('\u2029')
)
chars ++= Seq('n', 'f', 'r').map(RegexEscaped)
consumeExpected(ch)
RegexCharacterClass(negated = ch.isUpper, characters = chars)
case 'R' =>
// linebreak sequence
// see https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
// under "Linebreak matcher"
val l = RegexSequence(ListBuffer(RegexChar('\u000D'), RegexChar('\u000A')))
val r = RegexCharacterClass(false, ListBuffer[RegexCharacterClassComponent](
RegexChar('\u000A'), RegexChar('\u000B'), RegexChar('\u000C'), RegexChar('\u000D'),
RegexChar('\u0085'), RegexChar('\u2028'), RegexChar('\u2029')
))
consumeExpected(ch)
RegexGroup(true, RegexChoice(l, r))
case 's' | 'S' | 'd' | 'D' | 'w' | 'W' =>
// meta sequences
consumeExpected(ch)
Expand Down Expand Up @@ -514,8 +547,10 @@ class CudfRegexTranspiler(mode: RegexMode) {

private def isSupportedRepetitionBase(e: RegexAST): Boolean = {
e match {
case RegexEscaped(ch) if ch != 'd' && ch != 'w' => // example: "\B?"
false
case RegexEscaped(ch) => ch match {
case 'd' | 'w' | 'h' | 'H' | 'v' | 'V' => true
case _ => false
}

case RegexChar(a) if "$^".contains(a) =>
// example: "$*"
Expand Down

0 comments on commit 39960cc

Please sign in to comment.