Skip to content

Commit

Permalink
refactor(ops.Hash): remove how from backend-specific hash operation
Browse files Browse the repository at this point in the history
BREAKING CHANGE: The `hash` method on table columns no longer accepts
the `how` argument.  The hashing functions available are highly
backend-dependent and the intention of the hash operation is to provide
a fast, consistent (on the same backend, only) integer value.
If you have been passing in a value for `how`, you can remove it and you
will get the same results as before, as there were no backends with
multiple hash functions working.
  • Loading branch information
gforsyth authored and cpcloud committed Apr 21, 2023
1 parent dc0289c commit 46a55fc
Show file tree
Hide file tree
Showing 15 changed files with 79 additions and 33 deletions.
14 changes: 0 additions & 14 deletions ibis/backends/base/sql/registry/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,19 +189,6 @@ def round(translator, op):
return f'round({arg_formatted})'


# XXX this is not added to operation_registry, but looks like impala is
# using it in the tests, and it works, even if it's not imported anywhere
def hash(translator, op):
    """Render a hash operation as a SQL function call string.

    Only the ``'fnv'`` algorithm is handled, which is emitted as
    ``fnv_hash(<arg>)``; any other value of ``op.how`` raises
    ``NotImplementedError`` carrying that value.
    """
    algorithm = op.how
    # The argument is translated before the algorithm is validated, so any
    # translation error takes precedence over the NotImplementedError.
    rendered_arg = translator.translate(op.arg)

    if algorithm != 'fnv':
        raise NotImplementedError(algorithm)
    return f'fnv_hash({rendered_arg})'


def concat(translator, op):
    """Render a ``concat(...)`` call over every element of ``op.arg``.

    Each element is passed through ``translator.translate`` and the results
    are separated by ``', '``.
    """
    pieces = (translator.translate(item) for item in op.arg)
    return "concat({})".format(', '.join(pieces))
Expand Down Expand Up @@ -272,7 +259,6 @@ def count_star(translator, op):
ops.Round: round,
ops.Sign: sign,
ops.Sqrt: unary('sqrt'),
ops.Hash: hash,
ops.HashBytes: hashbytes,
ops.RandomScalar: lambda *_: 'rand(utc_to_unix_micros(utc_timestamp()))',
ops.Log: log,
Expand Down
10 changes: 2 additions & 8 deletions ibis/backends/bigquery/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,8 @@ def _array_index(translator, op):


def _hash(translator, op):
arg, how = op.args

arg_formatted = translator.translate(arg)

if how == "farm_fingerprint":
return f"farm_fingerprint({arg_formatted})"
else:
raise NotImplementedError(how)
arg_formatted = translator.translate(op.arg)
return f"farm_fingerprint({arg_formatted})"


def _string_find(translator, op):
Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/bigquery/tests/unit/test_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def test_day_of_week(case, dtype, snapshot):
)
def test_hash(case, dtype, snapshot):
var = ibis.literal(case, type=dtype)
expr = var.hash(how="farm_fingerprint")
expr = var.hash()
snapshot.assert_match(to_sql(expr), "out.sql")


Expand Down
6 changes: 6 additions & 0 deletions ibis/backends/clickhouse/compiler/values.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,12 @@ def _sign(op, **kw):

@translate_val.register(ops.Hash)
def _hash(op, **kw):
    """Translate ``ops.Hash`` into a ``sipHash64(<arg>)`` call string.

    The operand is translated with the same keyword arguments this
    handler received.
    """
    return "sipHash64({})".format(translate_val(op.arg, **kw))


@translate_val.register(ops.HashBytes)
def _hash_bytes(op, **kw):
algorithms = {
"MD5",
"halfMD5",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
sipHash64(string_col)
5 changes: 5 additions & 0 deletions ibis/backends/clickhouse/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,3 +462,8 @@ def test_group_concat(alltypes, sep, where_case, translate, snapshot):
where = None if where_case is None else alltypes.bool_col == where_case
expr = alltypes.string_col.group_concat(sep, where)
snapshot.assert_match(translate(expr.op()), "out.sql")


def test_hash(alltypes, translate, snapshot):
    """Snapshot the SQL produced when hashing ``string_col``."""
    hash_op = alltypes.string_col.hash().op()
    generated_sql = translate(hash_op)
    snapshot.assert_match(generated_sql, "out.sql")
1 change: 1 addition & 0 deletions ibis/backends/duckdb/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@ def _map_merge(t, op):
ops.MapKeys: _map_keys,
ops.MapValues: _map_values,
ops.MapMerge: _map_merge,
ops.Hash: unary(sa.func.hash),
}
)

Expand Down
8 changes: 6 additions & 2 deletions ibis/backends/impala/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import ibis.expr.operations as ops
from ibis.backends.base.sql.compiler import Compiler, ExprTranslator, TableSetFormatter
from ibis.backends.base.sql.registry import binary_infix_ops, operation_registry
from ibis.backends.base.sql.registry import binary_infix_ops, operation_registry, unary


class ImpalaTableSetFormatter(TableSetFormatter):
Expand All @@ -17,7 +17,11 @@ def _get_join_type(self, op):


class ImpalaExprTranslator(ExprTranslator):
_registry = {**operation_registry, **binary_infix_ops}
_registry = {
**operation_registry,
**binary_infix_ops,
**{ops.Hash: unary("fnv_hash")},
}
_forbids_frame_clause = (
*ExprTranslator._forbids_frame_clause,
ops.Lag,
Expand Down
1 change: 1 addition & 0 deletions ibis/backends/mssql/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ def _timestamp_truncate(t, op):
),
ops.TimestampTruncate: _timestamp_truncate,
ops.DateTruncate: _timestamp_truncate,
ops.Hash: unary(sa.func.checksum),
}
)

Expand Down
5 changes: 5 additions & 0 deletions ibis/backends/polars/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1024,3 +1024,8 @@ def execute_union(op, **kwargs):
if op.distinct:
return result.unique()
return result


@translate.register(ops.Hash)
def execute_hash(op, **kwargs):
    """Translate ``ops.Hash`` by calling ``.hash()`` on the translated operand.

    The keyword arguments received by this handler are forwarded to the
    nested ``translate`` call. Previously they were dropped, so any
    translation context carried in ``kwargs`` never reached the operand's
    translation, unlike the other hash translators which pass it through.
    """
    arg = translate(op.arg, **kwargs)
    return arg.hash()
5 changes: 5 additions & 0 deletions ibis/backends/pyspark/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2041,3 +2041,8 @@ def compile_array_union(t, op, **kwargs):
left = t.translate(op.left, **kwargs)
right = t.translate(op.right, **kwargs)
return F.array_union(left, right)


@compiles(ops.Hash)
def compile_hash_column(t, op, **kwargs):
    """Compile ``ops.Hash`` by applying ``F.hash`` to the translated operand."""
    column = t.translate(op.arg, **kwargs)
    return F.hash(column)
1 change: 1 addition & 0 deletions ibis/backends/snowflake/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,7 @@ def _group_concat(t, op):
ops.StartsWith: fixed_arity(sa.func.startswith, 2),
ops.EndsWith: fixed_arity(sa.func.endswith, 2),
ops.GroupConcat: _group_concat,
ops.Hash: unary(sa.func.hash),
}
)

Expand Down
24 changes: 24 additions & 0 deletions ibis/backends/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import numpy as np
import pandas as pd
import pandas.testing as tm
import pytest
import sqlalchemy as sa
import toolz
Expand Down Expand Up @@ -1167,3 +1168,26 @@ def test_distinct_on_keep_is_none(backend, on):
.reset_index(drop=True)
)
assert len(result) == len(expected)


@pytest.mark.notimpl(["dask", "pandas", "postgres"])
@pytest.mark.notyet(
    [
        "sqlite",
        "datafusion",
        "druid",  # ???
        "mysql",  # CHECKSUM TABLE but not column
        "trino",  # checksum returns varbinary
    ]
)
def test_hash_consistent(backend, alltypes):
    """Hashing the same column twice must yield identical integer series."""
    first, second = (
        alltypes.string_col.hash().execute(limit=10) for _ in range(2)
    )
    tm.assert_series_equal(first, second)
    # polars likes returning uint64 for this
    accepted_dtypes = ("i8", "uint64")
    assert first.dtype in accepted_dtypes
20 changes: 18 additions & 2 deletions ibis/expr/operations/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,6 @@ class Pi(Constant):
@public
class Hash(Value):
arg = rlz.any
how = rlz.isin({'fnv', 'farm_fingerprint'})

output_dtype = dt.int64
output_shape = rlz.shape_like("arg")
Expand All @@ -273,7 +272,24 @@ class Hash(Value):
@public
class HashBytes(Value):
arg = rlz.one_of({rlz.value(dt.string), rlz.value(dt.binary)})
how = rlz.isin({'md5', 'sha1', 'sha256', 'sha512'})
# TODO: these don't necessarily all belong here
how = rlz.isin(
{
"md5",
"MD5",
"sha1",
"SHA1",
"SHA224",
"sha256",
"SHA256",
"sha512",
"intHash32",
"intHash64",
"cityHash64",
"sipHash64",
"sipHash128",
}
)

output_dtype = dt.binary
output_shape = rlz.shape_like("arg")
Expand Down
9 changes: 3 additions & 6 deletions ibis/expr/types/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,20 +63,17 @@ def type(self) -> dt.DataType:
"""Return the [DataType] of this expression."""
return self.op().output_dtype

def hash(self, how: str = "fnv") -> ir.IntegerValue:
def hash(self) -> ir.IntegerValue:
"""Compute an integer hash value.
Parameters
----------
how
Hash algorithm to use
!!! info "The hashing function used is backend-dependent."
Returns
-------
IntegerValue
The hash value of `self`
"""
return ops.Hash(self, how).to_expr()
return ops.Hash(self).to_expr()

def cast(self, target_type: dt.DataType) -> Value:
"""Cast expression to indicated data type.
Expand Down

0 comments on commit 46a55fc

Please sign in to comment.