Skip to content

Commit

Permalink
feat(mssql): add hashbytes and test for binary output hash fns (ibis-…
Browse files Browse the repository at this point in the history
…project#8107)

This adds support for `ops.HashBytes` to `mssql` and also adds a test
for that functionality so it's easier to port when we merge in the epic
split branch.

I've also added a new op, `HexDigest`, which returns the hex digest of
various cryptographic hashing functions, since I imagine this is what
many users are _probably_ after. This newer op (and the corresponding
`hexdigest` method) can also support many more backends, as most of them
default to returning the string hex digest and not the raw binary
output.

I tried to be very accurate in the `notimpl` and `notyet` portions of
both tests and I think I've done that.

For now, this is only exposed for DuckDB, PySpark, and MSSQL, so that we don't
add a huge extra burden for the epic split but still address the user request
in ibis-project#8082.

And I guess now we can commence debate over the method name? 🐎

Resolves ibis-project#8082

---------

Co-authored-by: Jim Crist-Harif <jcristharif@gmail.com>
  • Loading branch information
gforsyth and jcrist committed Feb 1, 2024
1 parent 1beb7cf commit 6d80fb3
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 0 deletions.
7 changes: 7 additions & 0 deletions ibis/backends/duckdb/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,13 @@ def visit_Quantile(self, op, *, arg, quantile, where):
funcname = f"percentile_{suffix}"
return self.agg[funcname](arg, quantile, where=where)

@visit_node.register(ops.HexDigest)
def visit_HexDigest(self, op, *, arg, how):
    """Compile ``ops.HexDigest`` by calling the function named after ``how``.

    Parameters
    ----------
    op
        The operation node being compiled (not inspected here).
    arg
        Already-compiled expression for the value to hash.
    how
        Name of the hash algorithm; only ``"md5"`` and ``"sha256"`` are
        handled by this backend.

    Raises
    ------
    NotImplementedError
        If ``how`` is any other algorithm name.
    """
    # Reject unsupported algorithms up front so the happy path stays flat.
    if how not in ("md5", "sha256"):
        raise NotImplementedError(f"No available hashing function for {how}")

    # The supported algorithm names double as the function names to call.
    hash_function = getattr(self.f, how)
    return hash_function(arg)


_SIMPLE_OPS = {
ops.ArrayPosition: "list_indexof",
Expand Down
30 changes: 30 additions & 0 deletions ibis/backends/mssql/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,36 @@ def visit_Not(self, op, *, arg):
return sge.FALSE if arg == sge.TRUE else sge.TRUE
return self.if_(arg, 1, 0).eq(0)

@staticmethod
def _hashbytes_algorithm(how):
    """Map an ibis hash name to the algorithm name given to ``hashbytes``.

    Shared by ``visit_HashBytes`` and ``visit_HexDigest`` so the two
    translations cannot drift apart.

    Raises
    ------
    NotImplementedError
        If ``how`` has no corresponding algorithm name.
    """
    algorithms = {
        "md5": "md5",
        "sha1": "sha1",
        # The SHA-2 variants are spelled ``sha2_<bits>`` on this backend.
        "sha256": "sha2_256",
        "sha512": "sha2_512",
    }
    algorithm = algorithms.get(how)
    if algorithm is None:
        raise NotImplementedError(f"No available hashing function for {how}")
    return algorithm

@visit_node.register(ops.HashBytes)
def visit_HashBytes(self, op, *, arg, how):
    """Compile ``ops.HashBytes`` to a ``hashbytes(<algorithm>, arg)`` call.

    Parameters
    ----------
    op
        The operation node being compiled (not inspected here).
    arg
        Already-compiled expression for the value to hash.
    how
        One of ``"md5"``, ``"sha1"``, ``"sha256"`` or ``"sha512"``.

    Raises
    ------
    NotImplementedError
        If ``how`` is not one of the supported algorithm names.
    """
    return self.f.hashbytes(self._hashbytes_algorithm(how), arg)

@visit_node.register(ops.HexDigest)
def visit_HexDigest(self, op, *, arg, how):
    """Compile ``ops.HexDigest`` to a lowercase hex string of the hash.

    Parameters
    ----------
    op
        The operation node being compiled (not inspected here).
    arg
        Already-compiled expression for the value to hash.
    how
        One of ``"md5"``, ``"sha1"``, ``"sha256"`` or ``"sha512"``.

    Raises
    ------
    NotImplementedError
        If ``how`` is not one of the supported algorithm names.
    """
    hashbinary = self.f.hashbytes(self._hashbytes_algorithm(how), arg)

    # mssql uppercases the hexdigest which is inconsistent with several other
    # implementations and inconsistent with Python, so lowercase it.
    # NOTE(review): CONVERT style 2 is expected to render the binary as hex
    # characters without a leading "0x" -- confirm against the SQL Server
    # CAST/CONVERT binary-style documentation.
    return self.f.lower(
        self.f.convert(
            sge.Literal(this="VARCHAR(MAX)", is_string=False), hashbinary, 2
        )
    )

@visit_node.register(ops.Any)
@visit_node.register(ops.All)
@visit_node.register(ops.ApproxMedian)
Expand Down
11 changes: 11 additions & 0 deletions ibis/backends/pyspark/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,17 @@ def visit_JoinLink(self, op, **kwargs):
def visit_Undefined(self, op, **_):
    """Reject ``op`` by raising ``OperationNotDefinedError`` with its type name.

    Any keyword arguments are accepted and ignored.
    """
    op_name = type(op).__name__
    raise com.OperationNotDefinedError(op_name)

@visit_node.register(ops.HexDigest)
def visit_HexDigest(self, op, *, arg, how):
    """Compile ``ops.HexDigest`` to the matching ``md5``/``sha1``/``sha2`` call.

    Parameters
    ----------
    op
        The operation node being compiled (not inspected here).
    arg
        Already-compiled expression for the value to hash.
    how
        One of ``"md5"``, ``"sha1"``, ``"sha256"`` or ``"sha512"``.

    Raises
    ------
    NotImplementedError
        If ``how`` is any other algorithm name.
    """
    # Both SHA-2 variants go through one ``sha2`` function that takes the
    # digest size in bits as its second argument.
    sha2_bit_lengths = {"sha256": 256, "sha512": 512}
    if how in sha2_bit_lengths:
        return self.f.sha2(arg, sha2_bit_lengths[how])

    if how == "md5":
        return self.f.md5(arg)
    if how == "sha1":
        return self.f.sha1(arg)

    raise NotImplementedError(f"No available hashing function for {how}")


_SIMPLE_OPS = {
ops.ArrayDistinct: "array_distinct",
Expand Down
14 changes: 14 additions & 0 deletions ibis/expr/operations/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,20 @@ class HashBytes(Value):
shape = rlz.shape_like("arg")


@public
class HexDigest(Value):
    """Operation node for the hash digest of a value, typed as a string."""

    # Value to hash; string and binary inputs are accepted.
    arg: Value[dt.String | dt.Binary]
    # Name of the hash algorithm to apply.
    how: LiteralType["md5", "sha1", "sha256", "sha512"]

    # The result is a string whose shape follows that of ``arg``.
    dtype = dt.str
    shape = rlz.shape_like("arg")


# TODO(kszucs): we should merge the case operations by making the
# cases, results and default optional arguments like they are in
# api.py
Expand Down
34 changes: 34 additions & 0 deletions ibis/expr/types/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,40 @@ def hashbytes(
"""
return ops.HashBytes(self, how).to_expr()

def hexdigest(
    self,
    how: Literal["md5", "sha1", "sha256", "sha512"] = "sha256",
) -> ir.StringValue:
    """Return the hash digest of the input as a hex encoded string.

    Parameters
    ----------
    how
        Hash algorithm to use. The name is lowercased before the
        operation is built.

    Returns
    -------
    StringValue
        Hexadecimal representation of the hash as a string

    Examples
    --------
    >>> import ibis
    >>> ibis.options.interactive = True
    >>> t = ibis.memtable({"species": ["Adelie", "Chinstrap", "Gentoo"]})
    >>> t.species.hexdigest()
    ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
    ┃ HexDigest(species)                                               ┃
    ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
    │ string                                                           │
    ├──────────────────────────────────────────────────────────────────┤
    │ a4d7d46b27480037bc1e513e0e157cbf258baae6ee69e3110d0f9ff418b57a3c │
    │ cb97d113ca69899ae4f1fb581f4a90d86989db77b4a33873d604b0ee412b4cc9 │
    │ b5e90cdff65949fe6bc226823245f7698110e563a12363fc57b3eed3e4a0a612 │
    └──────────────────────────────────────────────────────────────────┘
    """
    # Normalise the algorithm name, build the operation node, then wrap it
    # back into an expression.
    algorithm = how.lower()
    node = ops.HexDigest(self, algorithm)
    return node.to_expr()

def substr(
self,
start: int | ir.IntegerValue,
Expand Down

0 comments on commit 6d80fb3

Please sign in to comment.