From 900eccadf8c91181fa512ec8ece3ccaf4f5dbed7 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Tue, 7 May 2024 10:00:01 -0800 Subject: [PATCH] fix: convert the uint64's from some backends' hash() to the desired int64 fixes https://github.com/ibis-project/ibis/issues/9135 --- ibis/backends/clickhouse/compiler.py | 2 +- .../tests/snapshots/test_functions/test_hash/out.sql | 2 +- ibis/backends/duckdb/compiler.py | 9 ++++++++- ibis/backends/polars/compiler.py | 3 ++- ibis/backends/sql/datatypes.py | 2 +- ibis/backends/tests/test_generic.py | 11 +++++++---- 6 files changed, 20 insertions(+), 9 deletions(-) diff --git a/ibis/backends/clickhouse/compiler.py b/ibis/backends/clickhouse/compiler.py index 9b01beef6dce..5f1f4b0b2fde 100644 --- a/ibis/backends/clickhouse/compiler.py +++ b/ibis/backends/clickhouse/compiler.py @@ -235,7 +235,7 @@ def visit_Sign(self, op, *, arg): return self.f.intDivOrZero(arg, self.f.abs(arg)) def visit_Hash(self, op, *, arg): - return self.f.sipHash64(arg) + return self.f.reinterpretAsInt64(self.f.sipHash64(arg)) def visit_HashBytes(self, op, *, arg, how): supported_algorithms = { diff --git a/ibis/backends/clickhouse/tests/snapshots/test_functions/test_hash/out.sql b/ibis/backends/clickhouse/tests/snapshots/test_functions/test_hash/out.sql index bfd18d927565..b61bfcbe186f 100644 --- a/ibis/backends/clickhouse/tests/snapshots/test_functions/test_hash/out.sql +++ b/ibis/backends/clickhouse/tests/snapshots/test_functions/test_hash/out.sql @@ -1,3 +1,3 @@ SELECT - sipHash64("t0"."string_col") AS "Hash(string_col)" + reinterpretAsInt64(sipHash64("t0"."string_col")) AS "Hash(string_col)" FROM "functional_alltypes" AS "t0" \ No newline at end of file diff --git a/ibis/backends/duckdb/compiler.py b/ibis/backends/duckdb/compiler.py index 9655086cc2d6..f5162844607e 100644 --- a/ibis/backends/duckdb/compiler.py +++ b/ibis/backends/duckdb/compiler.py @@ -46,7 +46,6 @@ class DuckDBCompiler(SQLGlotCompiler): ops.BitXor: "bit_xor", ops.EndsWith: "suffix", ops.ExtractIsoYear: "isoyear", - ops.Hash: "hash", ops.IntegerRange: "range", ops.TimestampRange: "range", ops.MapLength: "cardinality", @@ -467,6 +466,14 @@ def visit_HexDigest(self, op, *, arg, how): else: raise NotImplementedError(f"No available hashing function for {how}") + def visit_Hash(self, op, *, arg): + # duckdb's hash() returns a uint64, but ops.Hash is supposed to be int64 + # So do HASH(x)::BITSTRING::BIGINT + raw = self.f.hash(arg) + bitstring = sg.cast(sge.convert(raw), to=sge.DataType.Type.BIT, copy=False) + int64 = sg.cast(bitstring, to=sge.DataType.Type.BIGINT, copy=False) + return int64 + def visit_StringConcat(self, op, *, arg): return reduce(lambda x, y: sge.DPipe(this=x, expression=y), arg) diff --git a/ibis/backends/polars/compiler.py b/ibis/backends/polars/compiler.py index a793db0c609b..bcf4685ba98d 100644 --- a/ibis/backends/polars/compiler.py +++ b/ibis/backends/polars/compiler.py @@ -1206,7 +1206,8 @@ def execute_union(op, **kw): @translate.register(ops.Hash) def execute_hash(op, **kw): - return translate(op.arg, **kw).hash() + # polars' hash() returns a uint64, but we want to return an int64 + return translate(op.arg, **kw).hash().reinterpret(signed=True) def _arg_min_max(op, func, **kw): diff --git a/ibis/backends/sql/datatypes.py b/ibis/backends/sql/datatypes.py index 4e492fb15f52..d74714e5878d 100644 --- a/ibis/backends/sql/datatypes.py +++ b/ibis/backends/sql/datatypes.py @@ -180,7 +180,7 @@ def to_ibis(cls, typ: sge.DataType, nullable: bool | None = None) -> dt.DataType @classmethod def from_ibis(cls, dtype: dt.DataType) -> sge.DataType: - """Convert a sqlglot type to an ibis type.""" + """Convert an Ibis dtype to an sqlglot dtype.""" if method := getattr(cls, f"_from_ibis_{dtype.name}", None): return method(dtype) diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index b6677dc4944c..0d5b269b048c 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -1439,11 +1439,14 @@ def test_distinct_on_keep_is_none(backend, on): "trino", # checksum returns varbinary ] ) -def test_hash_consistent(backend, alltypes): - h1 = alltypes.string_col.hash().execute(limit=10) - h2 = alltypes.string_col.hash().execute(limit=10) +def test_hash(backend, alltypes): + # check that multiple executions return the same result + h1 = alltypes.string_col.hash().execute(limit=20) + h2 = alltypes.string_col.hash().execute(limit=20) backend.assert_series_equal(h1, h2) - assert h1.dtype in ("i8", "uint64") # polars likes returning uint64 for this + # check that the result is a signed 64-bit integer, no nulls + assert h1.dtype == "i8" + assert h1.notnull().all() @pytest.mark.notimpl(["trino", "oracle", "exasol", "snowflake"])