From 07beaed6657bf701ae7256c0da86b34f0bbcb5ce Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Tue, 19 Dec 2023 05:34:53 -0500
Subject: [PATCH] feat(api): define `RegexSplit` operation and `re_split` API

---
 ibis/expr/operations/strings.py | 15 +++++++++++
 ibis/expr/types/strings.py      | 46 +++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/ibis/expr/operations/strings.py b/ibis/expr/operations/strings.py
index 871dac7f277a..d17f22e412a1 100644
--- a/ibis/expr/operations/strings.py
+++ b/ibis/expr/operations/strings.py
@@ -207,6 +207,21 @@ class RegexExtract(Value):
     dtype = dt.string
 
 
+@public
+class RegexSplit(Value):
+    """Split the string `arg` into an array of strings by the regex `pattern`."""
+
+    arg: Value[dt.String]
+    pattern: Value[dt.String]
+
+    dtype = dt.Array(dt.string)
+
+    @attribute
+    def shape(self):
+        # `pattern` is itself a value expression, so derive the shape from both operands.
+        return rlz.highest_precedence_shape((self.arg, self.pattern))
+
+
 @public
 class RegexReplace(Value):
     arg: Value[dt.String]
diff --git a/ibis/expr/types/strings.py b/ibis/expr/types/strings.py
index 7d22036ed12d..07d4fff48f20 100644
--- a/ibis/expr/types/strings.py
+++ b/ibis/expr/types/strings.py
@@ -1078,6 +1078,52 @@ def re_extract(
         """
         return ops.RegexExtract(self, pattern, index).to_expr()
 
+    @util.backend_sensitive(
+        why="Different backends support different regular expression syntax."
+    )
+    def re_split(self, pattern: str | StringValue) -> ir.ArrayValue:
+        r"""Split a string by a regular expression `pattern`.
+
+        Parameters
+        ----------
+        pattern
+            Regular expression string to split by
+
+        Returns
+        -------
+        ArrayValue
+            Array of strings from splitting by `pattern`
+
+        Examples
+        --------
+        >>> import ibis
+        >>> ibis.options.interactive = True
+        >>> t = ibis.memtable(dict(s=["a.b", "b.....c", "c.........a", "def"]))
+        >>> t.s
+        ┏━━━━━━━━━━━━━┓
+        ┃ s           ┃
+        ┡━━━━━━━━━━━━━┩
+        │ string      │
+        ├─────────────┤
+        │ a.b         │
+        │ b.....c     │
+        │ c.........a │
+        │ def         │
+        └─────────────┘
+        >>> t.s.re_split(r"\.+").name("splits")
+        ┏━━━━━━━━━━━━━━━━━━━━━━┓
+        ┃ splits               ┃
+        ┡━━━━━━━━━━━━━━━━━━━━━━┩
+        │ array<string>        │
+        ├──────────────────────┤
+        │ ['a', 'b']           │
+        │ ['b', 'c']           │
+        │ ['c', 'a']           │
+        │ ['def']              │
+        └──────────────────────┘
+        """
+        return ops.RegexSplit(self, pattern).to_expr()
+
     @util.backend_sensitive(
         why="Different backends support different regular expression syntax."
     )