From 1b77030fd498e6dc2f58d94ac5d30cd3d122e27c Mon Sep 17 00:00:00 2001
From: Gil Forsyth
Date: Thu, 25 Jan 2024 10:16:57 -0500
Subject: [PATCH 1/4] feat(mssql): add hashbytes and test for binary output
 hash fns

---
 ibis/backends/mssql/registry.py     | 27 +++++++++++++++++++++++++++
 ibis/backends/tests/test_generic.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/ibis/backends/mssql/registry.py b/ibis/backends/mssql/registry.py
index 050b5a5ec947..f8cec2f8e9c3 100644
--- a/ibis/backends/mssql/registry.py
+++ b/ibis/backends/mssql/registry.py
@@ -226,6 +226,32 @@ def _literal(_, op):
     return sa.literal(value)
 
 
+def _hashbytes(translator, op):
+    how = op.how
+
+    arg_formatted = translator.translate(op.arg)
+
+    if how in ("md5", "sha1"):
+        return sa.func.hashbytes(how, arg_formatted)
+    elif how == "sha256":
+        return sa.func.hashbytes("sha2_256", arg_formatted)
+    elif how == "sha512":
+        return sa.func.hashbytes("sha2_512", arg_formatted)
+    else:
+        raise NotImplementedError(how)
+
+
+def _sha256(translator, op):
+    arg_formatted = translator.translate(op.arg)
+    # SO post on getting convert to play nice with VARCHAR in SQLAlchemy
+    # https://stackoverflow.com/questions/20291962/how-to-use-convert-function-in-sqlalchemy
+    return sa.func.convert(
+        sa.literal_column("VARCHAR(MAX)"),
+        sa.func.hashbytes("sha2_256", arg_formatted),
+        2,  # 2 means strip off leading '0x'
+    )
+
+
 operation_registry = sqlalchemy_operation_registry.copy()
 
 operation_registry.update(sqlalchemy_window_functions_registry)
@@ -316,6 +342,7 @@ def _literal(_, op):
     ops.DateTruncate: _timestamp_truncate,
     ops.TimestampBucket: _timestamp_bucket,
     ops.Hash: unary(sa.func.checksum),
+    ops.HashBytes: _hashbytes,
     ops.ExtractMicrosecond: fixed_arity(
         lambda arg: sa.func.datepart(sa.literal_column("microsecond"), arg), 1
     ),
diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py
index ad3b12ef363b..fe1a8ef47600 100644
--- a/ibis/backends/tests/test_generic.py
+++ b/ibis/backends/tests/test_generic.py
@@ -1237,6 +1237,37 @@ def test_hash_consistent(backend, alltypes):
     assert h1.dtype in ("i8", "uint64")  # polars likes returning uint64 for this
 
 
+@pytest.mark.notimpl(["trino", "oracle", "exasol", "snowflake"])
+@pytest.mark.notyet(
+    [
+        "clickhouse",
+        "dask",
+        "datafusion",
+        "druid",
+        "duckdb",
+        "impala",
+        "mysql",
+        "pandas",
+        "polars",
+        "postgres",
+        "pyspark",
+        "sqlite",
+    ]
+)
+def test_hashbytes(backend, alltypes):
+    h1 = alltypes.order_by("id").string_col.hashbytes().execute(limit=10)
+    df = alltypes.order_by("id").execute(limit=10)
+
+    import hashlib
+
+    def hash_256(col):
+        return hashlib.sha256(col.encode()).digest()
+
+    h2 = df["string_col"].apply(hash_256).rename("HashBytes(string_col)")
+
+    backend.assert_series_equal(h1, h2)
+
+
 @pytest.mark.notimpl(
     [
         "pandas",

From 59ee2c1e2035e3f2b619ed35b846c4f55daed63a Mon Sep 17 00:00:00 2001
From: Gil Forsyth
Date: Fri, 26 Jan 2024 11:38:48 -0500
Subject: [PATCH 2/4] fix(clickhouse): basic normalization of hash type

---
 ibis/backends/clickhouse/compiler/values.py | 2 ++
 ibis/backends/tests/test_generic.py         | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ibis/backends/clickhouse/compiler/values.py b/ibis/backends/clickhouse/compiler/values.py
index 8486a6421a09..df169ac36af8 100644
--- a/ibis/backends/clickhouse/compiler/values.py
+++ b/ibis/backends/clickhouse/compiler/values.py
@@ -280,6 +280,8 @@ def _hash(op, *, arg, **_):
 
 @translate_val.register(ops.HashBytes)
 def _hash_bytes(op, *, arg, how, **_):
+    if how in ("md5", "sha1", "sha224", "sha256"):
+        how = how.upper()
     if how not in _SUPPORTED_ALGORITHMS:
         raise com.UnsupportedOperationError(f"Unsupported hash algorithm {how}")
 
diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py
index fe1a8ef47600..712f4ba78623 100644
--- a/ibis/backends/tests/test_generic.py
+++ b/ibis/backends/tests/test_generic.py
@@ -1240,7 +1240,6 @@ def test_hash_consistent(backend, alltypes):
 @pytest.mark.notimpl(["trino", "oracle", "exasol", "snowflake"])
 @pytest.mark.notyet(
     [
-        "clickhouse",
         "dask",
         "datafusion",
         "druid",

From c16977dff212a709d178abfac0b096e427d9dc6d Mon Sep 17 00:00:00 2001
From: Gil Forsyth
Date: Fri, 26 Jan 2024 11:51:03 -0500
Subject: [PATCH 3/4] feat(mssql): add hexdigest

feat(mssql): add hexdigest

Apply suggestions from code review

Co-authored-by: Jim Crist-Harif
---
 ibis/backends/mssql/registry.py     | 28 ++++++++++++++++++++++------
 ibis/backends/tests/test_generic.py | 40 ++++++++++++++++++++++++++++++++
 ibis/expr/operations/generic.py     | 14 ++++++++++++++
 ibis/expr/types/strings.py          | 34 ++++++++++++++++++++++++++++++
 4 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/ibis/backends/mssql/registry.py b/ibis/backends/mssql/registry.py
index f8cec2f8e9c3..3c3e780ef203 100644
--- a/ibis/backends/mssql/registry.py
+++ b/ibis/backends/mssql/registry.py
@@ -241,14 +241,29 @@ def _hashbytes(translator, op):
         raise NotImplementedError(how)
 
 
-def _sha256(translator, op):
-    arg_formatted = translator.translate(op.arg)
+def _hexdigest(translator, op):
     # SO post on getting convert to play nice with VARCHAR in SQLAlchemy
     # https://stackoverflow.com/questions/20291962/how-to-use-convert-function-in-sqlalchemy
-    return sa.func.convert(
-        sa.literal_column("VARCHAR(MAX)"),
-        sa.func.hashbytes("sha2_256", arg_formatted),
-        2,  # 2 means strip off leading '0x'
+    how = op.how
+
+    arg_formatted = translator.translate(op.arg)
+    if how in ("md5", "sha1"):
+        hashbinary = sa.func.hashbytes(how, arg_formatted)
+    elif how == "sha256":
+        hashbinary = sa.func.hashbytes("sha2_256", arg_formatted)
+    elif how == "sha512":
+        hashbinary = sa.func.hashbytes("sha2_512", arg_formatted)
+    else:
+        raise NotImplementedError(how)
+
+    # mssql uppercases the hexdigest, which is inconsistent with several other
+    # implementations and inconsistent with Python, so lowercase it.
+    return sa.func.lower(
+        sa.func.convert(
+            sa.literal_column("VARCHAR(MAX)"),
+            hashbinary,
+            2,  # 2 means strip off leading '0x'
+        )
     )
 
 
@@ -343,6 +358,7 @@ def _sha256(translator, op):
     ops.TimestampBucket: _timestamp_bucket,
     ops.Hash: unary(sa.func.checksum),
     ops.HashBytes: _hashbytes,
+    ops.HexDigest: _hexdigest,
     ops.ExtractMicrosecond: fixed_arity(
         lambda arg: sa.func.datepart(sa.literal_column("microsecond"), arg), 1
     ),
diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py
index 712f4ba78623..b571cafc977e 100644
--- a/ibis/backends/tests/test_generic.py
+++ b/ibis/backends/tests/test_generic.py
@@ -1244,6 +1244,7 @@ def test_hash_consistent(backend, alltypes):
         "datafusion",
         "druid",
         "duckdb",
+        "flink",
         "impala",
         "mysql",
         "pandas",
@@ -1267,6 +1268,45 @@ def hash_256(col):
     h2 = df["string_col"].apply(hash_256).rename("HashBytes(string_col)")
 
     backend.assert_series_equal(h1, h2)
 
 
+@pytest.mark.notimpl(
+    [
+        "bigquery",
+        "clickhouse",
+        "dask",
+        "datafusion",
+        "exasol",
+        "flink",
+        "impala",
+        "mysql",
+        "oracle",
+        "pandas",
+        "polars",
+        "postgres",
+        "snowflake",
+        "trino",
+    ]
+)
+@pytest.mark.notyet(
+    [
+        "druid",
+        "polars",
+        "sqlite",
+    ]
+)
+def test_hexdigest(backend, alltypes):
+    h1 = alltypes.order_by("id").string_col.hexdigest().execute(limit=10)
+    df = alltypes.order_by("id").execute(limit=10)
+
+    import hashlib
+
+    def hash_256(col):
+        return hashlib.sha256(col.encode()).hexdigest()
+
+    h2 = df["string_col"].apply(hash_256).rename("HexDigest(string_col)")
+
+    backend.assert_series_equal(h1, h2)
+
+
 @pytest.mark.notimpl(
     [
         "pandas",
diff --git a/ibis/expr/operations/generic.py b/ibis/expr/operations/generic.py
index 8f2daaf29de9..be349cd45777 100644
--- a/ibis/expr/operations/generic.py
+++ b/ibis/expr/operations/generic.py
@@ -273,6 +273,20 @@ class HashBytes(Value):
     shape = rlz.shape_like("arg")
 
 
+@public
+class HexDigest(Value):
+    arg: Value[dt.String | dt.Binary]
+    how: LiteralType[
+        "md5",
+        "sha1",
+        "sha256",
+        "sha512",
+    ]
+
+    dtype = dt.str
+    shape = rlz.shape_like("arg")
+
+
 # TODO(kszucs): we should merge the case operations by making the
 # cases, results and default optional arguments like they are in
 # api.py
diff --git a/ibis/expr/types/strings.py b/ibis/expr/types/strings.py
index 07d4fff48f20..fc467ef6e1b3 100644
--- a/ibis/expr/types/strings.py
+++ b/ibis/expr/types/strings.py
@@ -465,6 +465,40 @@ def hashbytes(
         """
         return ops.HashBytes(self, how).to_expr()
 
+    def hexdigest(
+        self,
+        how: Literal["md5", "sha1", "sha256", "sha512"] = "sha256",
+    ) -> ir.StringValue:
+        """Return the hash digest of the input as a hex encoded string.
+
+        Parameters
+        ----------
+        how
+            Hash algorithm to use
+
+        Returns
+        -------
+        StringValue
+            Hexadecimal representation of the hash as a string
+
+        Examples
+        --------
+        >>> import ibis
+        >>> ibis.options.interactive = True
+        >>> t = ibis.memtable({"species": ["Adelie", "Chinstrap", "Gentoo"]})
+        >>> t.species.hexdigest()
+        ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+        ┃ HexDigest(species)                                               ┃
+        ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+        │ string                                                           │
+        ├──────────────────────────────────────────────────────────────────┤
+        │ a4d7d46b27480037bc1e513e0e157cbf258baae6ee69e3110d0f9ff418b57a3c │
+        │ cb97d113ca69899ae4f1fb581f4a90d86989db77b4a33873d604b0ee412b4cc9 │
+        │ b5e90cdff65949fe6bc226823245f7698110e563a12363fc57b3eed3e4a0a612 │
+        └──────────────────────────────────────────────────────────────────┘
+        """
+        return ops.HexDigest(self, how.lower()).to_expr()
+
     def substr(
         self,
         start: int | ir.IntegerValue,

From 531d25096354fbc2207d19f53ee11f7dec2bd5c2 Mon Sep 17 00:00:00 2001
From: Gil Forsyth
Date: Fri, 26 Jan 2024 12:02:40 -0500
Subject: [PATCH 4/4] feat(pyspark, duckdb): add hexdigest support

---
 ibis/backends/duckdb/registry.py  | 11 +++++++++++
 ibis/backends/pyspark/compiler.py | 15 +++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/ibis/backends/duckdb/registry.py b/ibis/backends/duckdb/registry.py
index 49b23b978e0d..a0733196c42a 100644
--- a/ibis/backends/duckdb/registry.py
+++ b/ibis/backends/duckdb/registry.py
@@ -397,6 +397,16 @@ def _array_remove(t, op):
     )
 
 
+def _hexdigest(translator, op):
+    how = op.how
+
+    arg_formatted = translator.translate(op.arg)
+    if how in ("md5", "sha256"):
+        return getattr(sa.func, how)(arg_formatted)
+    else:
+        raise NotImplementedError(how)
+
+
 operation_registry.update(
     {
         ops.Array: (
@@ -533,6 +543,7 @@ def _array_remove(t, op):
         ops.MapValues: unary(sa.func.map_values),
         ops.MapMerge: fixed_arity(sa.func.map_concat, 2),
         ops.Hash: unary(sa.func.hash),
+        ops.HexDigest: _hexdigest,
         ops.Median: reduction(sa.func.median),
         ops.First: reduction(sa.func.first),
         ops.Last: reduction(sa.func.last),
diff --git a/ibis/backends/pyspark/compiler.py b/ibis/backends/pyspark/compiler.py
index 47003a437fb4..f1d5e76b84b6 100644
--- a/ibis/backends/pyspark/compiler.py
+++ b/ibis/backends/pyspark/compiler.py
@@ -2065,6 +2065,21 @@ def compile_hash_column(t, op, **kwargs):
     return F.hash(t.translate(op.arg, **kwargs))
 
 
+@compiles(ops.HexDigest)
+def compile_hexdigest_column(t, op, **kwargs):
+    how = op.how
+    arg = t.translate(op.arg, **kwargs)
+
+    if how == "md5":
+        return F.md5(arg)
+    elif how == "sha1":
+        return F.sha1(arg)
+    elif how in ("sha256", "sha512"):
+        return F.sha2(arg, int(how[-3:]))
+    else:
+        raise NotImplementedError(how)
+
+
 @compiles(ops.ArrayZip)
 def compile_zip(t, op, **kwargs):
     return F.arrays_zip(*map(partial(t.translate, **kwargs), op.arg))
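
A quick end-to-end sketch of the new API for anyone trying the series locally
(illustrative only, not part of any patch: the memtable and its values are
made up here, and it assumes a checkout with the series applied plus the
default DuckDB backend, which patch 4 wires up for ops.HexDigest):

    import hashlib

    import ibis

    t = ibis.memtable({"species": ["Adelie", "Chinstrap", "Gentoo"]})

    # hexdigest() defaults to sha256 and returns a lowercase hex string
    # column; DuckDB handles "md5" and "sha256" after patch 4.
    result = t.species.hexdigest(how="sha256").execute()

    # Cross-check against hashlib, mirroring the new test_hexdigest test.
    assert result[0] == hashlib.sha256(b"Adelie").hexdigest()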
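Design note on the lowercasing in patch 3: SQL Server's CONVERT with style 2
yields uppercase hex without the leading '0x', so the mssql translator ends up
emitting SQL shaped roughly like
LOWER(CONVERT(VARCHAR(MAX), HASHBYTES('sha2_256', string_col), 2))
(a hand-written illustration, not actual compiler output). The LOWER call is
what keeps the result comparable with Python's lowercase hexdigest(), which is
exactly what the shared test suite asserts.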