From 1b77030fd498e6dc2f58d94ac5d30cd3d122e27c Mon Sep 17 00:00:00 2001
From: Gil Forsyth
Date: Thu, 25 Jan 2024 10:16:57 -0500
Subject: [PATCH 1/4] feat(mssql): add hashbytes and test for binary output
 hash fns

---
 ibis/backends/mssql/registry.py     | 27 +++++++++++++++++++++++++++
 ibis/backends/tests/test_generic.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/ibis/backends/mssql/registry.py b/ibis/backends/mssql/registry.py
index 050b5a5ec947..f8cec2f8e9c3 100644
--- a/ibis/backends/mssql/registry.py
+++ b/ibis/backends/mssql/registry.py
@@ -226,6 +226,32 @@ def _literal(_, op):
     return sa.literal(value)
 
 
+def _hashbytes(translator, op):
+    how = op.how
+
+    arg_formatted = translator.translate(op.arg)
+
+    if how in ("md5", "sha1"):
+        return sa.func.hashbytes(how, arg_formatted)
+    elif how == "sha256":
+        return sa.func.hashbytes("sha2_256", arg_formatted)
+    elif how == "sha512":
+        return sa.func.hashbytes("sha2_512", arg_formatted)
+    else:
+        raise NotImplementedError(how)
+
+
+def _sha256(translator, op):
+    arg_formatted = translator.translate(op.arg)
+    # SO post on getting convert to play nice with VARCHAR in SQLAlchemy
+    # https://stackoverflow.com/questions/20291962/how-to-use-convert-function-in-sqlalchemy
+    return sa.func.convert(
+        sa.literal_column("VARCHAR(MAX)"),
+        sa.func.hashbytes("sha2_256", arg_formatted),
+        2,  # 2 means strip off leading '0x'
+    )
+
+
 operation_registry = sqlalchemy_operation_registry.copy()
 
 operation_registry.update(sqlalchemy_window_functions_registry)
@@ -316,6 +342,7 @@ def _literal(_, op):
     ops.DateTruncate: _timestamp_truncate,
     ops.TimestampBucket: _timestamp_bucket,
     ops.Hash: unary(sa.func.checksum),
+    ops.HashBytes: _hashbytes,
     ops.ExtractMicrosecond: fixed_arity(
         lambda arg: sa.func.datepart(sa.literal_column("microsecond"), arg), 1
     ),
diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py
index ad3b12ef363b..fe1a8ef47600 100644
--- a/ibis/backends/tests/test_generic.py
+++ b/ibis/backends/tests/test_generic.py
@@ -1237,6 +1237,37 @@ def test_hash_consistent(backend, alltypes):
     assert h1.dtype in ("i8", "uint64")  # polars likes returning uint64 for this
 
 
+@pytest.mark.notimpl(["trino", "oracle", "exasol", "snowflake"])
+@pytest.mark.notyet(
+    [
+        "clickhouse",
+        "dask",
+        "datafusion",
+        "druid",
+        "duckdb",
+        "impala",
+        "mysql",
+        "pandas",
+        "polars",
+        "postgres",
+        "pyspark",
+        "sqlite",
+    ]
+)
+def test_hashbytes(backend, alltypes):
+    h1 = alltypes.order_by("id").string_col.hashbytes().execute(limit=10)
+    df = alltypes.order_by("id").execute(limit=10)
+
+    import hashlib
+
+    def hash_256(col):
+        return hashlib.sha256(col.encode()).digest()
+
+    h2 = df["string_col"].apply(hash_256).rename("HashBytes(string_col)")
+
+    backend.assert_series_equal(h1, h2)
+
+
 @pytest.mark.notimpl(
     [
         "pandas",

From 59ee2c1e2035e3f2b619ed35b846c4f55daed63a Mon Sep 17 00:00:00 2001
From: Gil Forsyth
Date: Fri, 26 Jan 2024 11:38:48 -0500
Subject: [PATCH 2/4] fix(clickhouse): basic normalization of hash type

---
 ibis/backends/clickhouse/compiler/values.py | 2 ++
 ibis/backends/tests/test_generic.py         | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ibis/backends/clickhouse/compiler/values.py b/ibis/backends/clickhouse/compiler/values.py
index 8486a6421a09..df169ac36af8 100644
--- a/ibis/backends/clickhouse/compiler/values.py
+++ b/ibis/backends/clickhouse/compiler/values.py
@@ -280,6 +280,8 @@ def _hash(op, *, arg, **_):
 
 @translate_val.register(ops.HashBytes)
 def _hash_bytes(op, *, arg, how, **_):
+    if how in ("md5", "sha1", "sha224", "sha256"):
+        how = how.upper()
     if how not in _SUPPORTED_ALGORITHMS:
         raise com.UnsupportedOperationError(f"Unsupported hash algorithm {how}")
 
diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py
index fe1a8ef47600..712f4ba78623 100644
--- a/ibis/backends/tests/test_generic.py
+++ b/ibis/backends/tests/test_generic.py
@@ -1240,7 +1240,6 @@ def test_hash_consistent(backend, alltypes):
 @pytest.mark.notimpl(["trino", "oracle", "exasol", "snowflake"])
 @pytest.mark.notyet(
     [
-        "clickhouse",
         "dask",
         "datafusion",
         "druid",

From c16977dff212a709d178abfac0b096e427d9dc6d Mon Sep 17 00:00:00 2001
From: Gil Forsyth
Date: Fri, 26 Jan 2024 11:51:03 -0500
Subject: [PATCH 3/4] feat(mssql): add hexdigest

feat(mssql): add hexdigest

Apply suggestions from code review

Co-authored-by: Jim Crist-Harif
---
 ibis/backends/mssql/registry.py     | 28 ++++++++++++++++++++++------
 ibis/backends/tests/test_generic.py | 40 ++++++++++++++++++++++++++++++++
 ibis/expr/operations/generic.py     | 14 ++++++++++++++
 ibis/expr/types/strings.py          | 34 ++++++++++++++++++++++++++++++
 4 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/ibis/backends/mssql/registry.py b/ibis/backends/mssql/registry.py
index f8cec2f8e9c3..3c3e780ef203 100644
--- a/ibis/backends/mssql/registry.py
+++ b/ibis/backends/mssql/registry.py
@@ -241,14 +241,29 @@ def _hashbytes(translator, op):
         raise NotImplementedError(how)
 
 
-def _sha256(translator, op):
-    arg_formatted = translator.translate(op.arg)
+def _hexdigest(translator, op):
     # SO post on getting convert to play nice with VARCHAR in SQLAlchemy
     # https://stackoverflow.com/questions/20291962/how-to-use-convert-function-in-sqlalchemy
-    return sa.func.convert(
-        sa.literal_column("VARCHAR(MAX)"),
-        sa.func.hashbytes("sha2_256", arg_formatted),
-        2,  # 2 means strip off leading '0x'
+    how = op.how
+
+    arg_formatted = translator.translate(op.arg)
+    if how in ("md5", "sha1"):
+        hashbinary = sa.func.hashbytes(how, arg_formatted)
+    elif how == "sha256":
+        hashbinary = sa.func.hashbytes("sha2_256", arg_formatted)
+    elif how == "sha512":
+        hashbinary = sa.func.hashbytes("sha2_512", arg_formatted)
+    else:
+        raise NotImplementedError(how)
+
+    # mssql uppercases the hexdigest, which is inconsistent with several other
+    # implementations and inconsistent with Python, so lowercase it.
+    return sa.func.lower(
+        sa.func.convert(
+            sa.literal_column("VARCHAR(MAX)"),
+            hashbinary,
+            2,  # 2 means strip off leading '0x'
+        )
     )
 
 
@@ -343,6 +358,7 @@ def _sha256(translator, op):
     ops.TimestampBucket: _timestamp_bucket,
     ops.Hash: unary(sa.func.checksum),
     ops.HashBytes: _hashbytes,
+    ops.HexDigest: _hexdigest,
     ops.ExtractMicrosecond: fixed_arity(
         lambda arg: sa.func.datepart(sa.literal_column("microsecond"), arg), 1
     ),
diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py
index 712f4ba78623..b571cafc977e 100644
--- a/ibis/backends/tests/test_generic.py
+++ b/ibis/backends/tests/test_generic.py
@@ -1244,6 +1244,7 @@ def test_hash_consistent(backend, alltypes):
         "datafusion",
         "druid",
         "duckdb",
+        "flink",
         "impala",
         "mysql",
         "pandas",
@@ -1267,6 +1268,45 @@ def hash_256(col):
     h2 = df["string_col"].apply(hash_256).rename("HashBytes(string_col)")
 
     backend.assert_series_equal(h1, h2)
 
 
+@pytest.mark.notimpl(
+    [
+        "bigquery",
+        "clickhouse",
+        "dask",
+        "datafusion",
+        "exasol",
+        "flink",
+        "impala",
+        "mysql",
+        "oracle",
+        "pandas",
+        "polars",
+        "postgres",
+        "snowflake",
+        "trino",
+    ]
+)
+@pytest.mark.notyet(
+    [
+        "druid",
+        "polars",
+        "sqlite",
+    ]
+)
+def test_hexdigest(backend, alltypes):
+    h1 = alltypes.order_by("id").string_col.hexdigest().execute(limit=10)
+    df = alltypes.order_by("id").execute(limit=10)
+
+    import hashlib
+
+    def hash_256(col):
+        return hashlib.sha256(col.encode()).hexdigest()
+
+    h2 = df["string_col"].apply(hash_256).rename("HexDigest(string_col)")
+
+    backend.assert_series_equal(h1, h2)
+
+
 @pytest.mark.notimpl(
     [
         "pandas",
diff --git a/ibis/expr/operations/generic.py b/ibis/expr/operations/generic.py
index 8f2daaf29de9..be349cd45777 100644
--- a/ibis/expr/operations/generic.py
+++ b/ibis/expr/operations/generic.py
@@ -273,6 +273,20 @@ class HashBytes(Value):
     shape = rlz.shape_like("arg")
 
 
+@public
+class HexDigest(Value):
+    arg: Value[dt.String | dt.Binary]
+    how: LiteralType[
+        "md5",
+        "sha1",
+        "sha256",
+        "sha512",
+    ]
+
+    dtype = dt.str
+    shape = rlz.shape_like("arg")
+
+
 # TODO(kszucs): we should merge the case operations by making the
 # cases, results and default optional arguments like they are in
 # api.py
diff --git a/ibis/expr/types/strings.py b/ibis/expr/types/strings.py
index 07d4fff48f20..fc467ef6e1b3 100644
--- a/ibis/expr/types/strings.py
+++ b/ibis/expr/types/strings.py
@@ -465,6 +465,40 @@ def hashbytes(
         """
         return ops.HashBytes(self, how).to_expr()
 
+    def hexdigest(
+        self,
+        how: Literal["md5", "sha1", "sha256", "sha512"] = "sha256",
+    ) -> ir.StringValue:
+        """Return the hash digest of the input as a hex encoded string.
+
+        Parameters
+        ----------
+        how
+            Hash algorithm to use
+
+        Returns
+        -------
+        StringValue
+            Hexadecimal representation of the hash as a string
+
+        Examples
+        --------
+        >>> import ibis
+        >>> ibis.options.interactive = True
+        >>> t = ibis.memtable({"species": ["Adelie", "Chinstrap", "Gentoo"]})
+        >>> t.species.hexdigest()
+        ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+        ┃ HexDigest(species)                                               ┃
+        ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+        │ string                                                           │
+        ├──────────────────────────────────────────────────────────────────┤
+        │ a4d7d46b27480037bc1e513e0e157cbf258baae6ee69e3110d0f9ff418b57a3c │
+        │ cb97d113ca69899ae4f1fb581f4a90d86989db77b4a33873d604b0ee412b4cc9 │
+        │ b5e90cdff65949fe6bc226823245f7698110e563a12363fc57b3eed3e4a0a612 │
+        └──────────────────────────────────────────────────────────────────┘
+        """
+        return ops.HexDigest(self, how.lower()).to_expr()
+
     def substr(
         self,
         start: int | ir.IntegerValue,

From 531d25096354fbc2207d19f53ee11f7dec2bd5c2 Mon Sep 17 00:00:00 2001
From: Gil Forsyth
Date: Fri, 26 Jan 2024 12:02:40 -0500
Subject: [PATCH 4/4] feat(pyspark, duckdb): add hexdigest support

---
 ibis/backends/duckdb/registry.py  | 11 +++++++++++
 ibis/backends/pyspark/compiler.py | 15 +++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/ibis/backends/duckdb/registry.py b/ibis/backends/duckdb/registry.py
index 49b23b978e0d..a0733196c42a 100644
--- a/ibis/backends/duckdb/registry.py
+++ b/ibis/backends/duckdb/registry.py
@@ -397,6 +397,16 @@ def _array_remove(t, op):
     )
 
 
+def _hexdigest(translator, op):
+    how = op.how
+
+    arg_formatted = translator.translate(op.arg)
+    if how in ("md5", "sha256"):
+        return getattr(sa.func, how)(arg_formatted)
+    else:
+        raise NotImplementedError(how)
+
+
 operation_registry.update(
     {
         ops.Array: (
@@ -533,6 +543,7 @@ def _array_remove(t, op):
         ops.MapValues: unary(sa.func.map_values),
         ops.MapMerge: fixed_arity(sa.func.map_concat, 2),
         ops.Hash: unary(sa.func.hash),
+        ops.HexDigest: _hexdigest,
         ops.Median: reduction(sa.func.median),
         ops.First: reduction(sa.func.first),
         ops.Last: reduction(sa.func.last),
diff --git a/ibis/backends/pyspark/compiler.py b/ibis/backends/pyspark/compiler.py
index 47003a437fb4..f1d5e76b84b6 100644
--- a/ibis/backends/pyspark/compiler.py
+++ b/ibis/backends/pyspark/compiler.py
@@ -2065,6 +2065,21 @@ def compile_hash_column(t, op, **kwargs):
     return F.hash(t.translate(op.arg, **kwargs))
 
 
+@compiles(ops.HexDigest)
+def compile_hexdigest_column(t, op, **kwargs):
+    how = op.how
+    arg = t.translate(op.arg, **kwargs)
+
+    if how == "md5":
+        return F.md5(arg)
+    elif how == "sha1":
+        return F.sha1(arg)
+    elif how in ("sha256", "sha512"):
+        return F.sha2(arg, int(how[-3:]))
+    else:
+        raise NotImplementedError(how)
+
+
 @compiles(ops.ArrayZip)
 def compile_zip(t, op, **kwargs):
     return F.arrays_zip(*map(partial(t.translate, **kwargs), op.arg))
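
A quick end-to-end sketch of the new API for anyone trying the series locally
(illustrative only, not part of any patch: the memtable and its values are
made up here, and it assumes a checkout with the series applied plus the
default DuckDB backend, which patch 4 wires up for ops.HexDigest):

    import hashlib

    import ibis

    t = ibis.memtable({"species": ["Adelie", "Chinstrap", "Gentoo"]})

    # hexdigest() defaults to sha256 and returns a lowercase hex string
    # column; DuckDB handles "md5" and "sha256" after patch 4.
    result = t.species.hexdigest(how="sha256").execute()

    # Cross-check against hashlib, mirroring the new test_hexdigest test.
    assert result[0] == hashlib.sha256(b"Adelie").hexdigest()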
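Design note on the lowercasing in patch 3: SQL Server's CONVERT with style 2
yields uppercase hex without the leading '0x', so the mssql translator ends up
emitting SQL shaped roughly like
LOWER(CONVERT(VARCHAR(MAX), HASHBYTES('sha2_256', string_col), 2))
(a hand-written illustration, not actual compiler output). The LOWER call is
what keeps the result comparable with Python's lowercase hexdigest(), which is
exactly what the shared test suite asserts.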