Eventual-Inc · universalmind303 · Jun 20, 2024 · Jun 18, 2024 · Jun 20, 2024 · samster25
diff --git a/daft/daft.pyi b/daft/daft.pyi
@@ -1016,6 +1016,7 @@ class PyExpr:
     def mean(self) -> PyExpr: ...
     def min(self) -> PyExpr: ...
     def max(self) -> PyExpr: ...
+    def hash(self, seed: PyExpr | None = None) -> PyExpr: ...
     def any_value(self, ignore_nulls: bool) -> PyExpr: ...
     def agg_list(self) -> PyExpr: ...
     def agg_concat(self) -> PyExpr: ...

diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py
@@ -720,6 +720,16 @@ def between(self, lower: Any, upper: Any) -> Expression:
         expr = self._expr.between(lower._expr, upper._expr)
         return Expression._from_pyexpr(expr)
 
+    def hash(self, seed: Any | None = None) -> Expression:
+        """Hashes the values in the Expression
+
+        ``seed`` is optional; a value that is not already an Expression is wrapped with ``lit``.
+        """
+        if seed is None:
+            return Expression._from_pyexpr(self._expr.hash())
+        seed_expr = seed if isinstance(seed, Expression) else lit(seed)
+        return Expression._from_pyexpr(self._expr.hash(seed_expr._expr))
+
     def name(self) -> builtins.str:
         return self._expr.name()
 

diff --git a/src/daft-dsl/src/functions/hash.rs b/src/daft-dsl/src/functions/hash.rs
@@ -0,0 +1,66 @@
+use common_error::{DaftError, DaftResult};
+use daft_core::{
+    datatypes::{Field, UInt64Array},
+    schema::Schema,
+    DataType, IntoSeries, Series,
+};
+
+use crate::{
+    functions::{FunctionEvaluator, FunctionExpr},
+    Expr, ExprRef,
+};
+
+/// Evaluator backing `FunctionExpr::Hash`; its inputs are `[input]` or `[input, seed]`.
+pub(super) struct HashEvaluator {}
+
+impl FunctionEvaluator for HashEvaluator {
+    fn fn_name(&self) -> &'static str {
+        "hash"
+    }
+
+    /// The result keeps the input field's name and always has type `UInt64`.
+    fn to_field(&self, inputs: &[ExprRef], schema: &Schema, _: &FunctionExpr) -> DaftResult<Field> {
+        match inputs {
+            [input] | [input, _] => Ok(Field::new(input.to_field(schema)?.name, DataType::UInt64)),
+            _ => Err(DaftError::SchemaMismatch(format!(
+                "Expected 1 or 2 input args, got {}",
+                inputs.len()
+            ))),
+        }
+    }
+
+    /// Hashes `input`; an optional seed must be one non-null value castable to `UInt64`.
+    fn evaluate(&self, inputs: &[Series], _: &FunctionExpr) -> DaftResult<Series> {
+        match inputs {
+            [input] => input.hash(None).map(|s| s.into_series()),
+            [input, seed] => {
+                if seed.len() != 1 {
+                    return Err(DaftError::ValueError("Expected 1 seed arg".to_string()));
+                }
+
+                let seed = seed.cast(&DataType::UInt64)?;
+                // A null seed has no value to repeat; report it rather than panic on `unwrap`.
+                let null_seed = || DaftError::ValueError("Expected a non-null seed".to_string());
+                let seed = seed.u64()?.get(0).ok_or_else(null_seed)?;
+                // There's no way to natively extend the array, so we extract the element and repeat it.
+                let seed =
+                    UInt64Array::from_iter("seed", std::iter::repeat(Some(seed)).take(input.len()));
+                input.hash(Some(&seed)).map(|s| s.into_series())
+            }
+            _ => Err(DaftError::ValueError("Expected 1 or 2 input args".to_string())),
+        }
+    }
+}
+
+/// Builds a `FunctionExpr::Hash` expression over `input`, with `seed` as a second input if given.
+pub fn hash(input: ExprRef, seed: Option<ExprRef>) -> ExprRef {
+    // `input` always comes first; the seed, when present, is appended after it.
+    let mut args = vec![input];
+    args.extend(seed);
+
+    let expr = Expr::Function {
+        func: FunctionExpr::Hash,
+        inputs: args,
+    };
+    expr.into()
+}
diff --git a/src/daft-dsl/src/functions/mod.rs b/src/daft-dsl/src/functions/mod.rs
@@ -1,4 +1,5 @@
 pub mod float;
+pub mod hash;
 pub mod image;
 pub mod json;
 pub mod list;
@@ -29,6 +30,7 @@ use self::{float::FloatExpr, uri::UriExpr};
 use common_error::DaftResult;
 use daft_core::datatypes::FieldID;
 use daft_core::{datatypes::Field, schema::Schema, series::Series};
+use hash::HashEvaluator;
 use serde::{Deserialize, Serialize};
 
 #[cfg(feature = "python")]
@@ -52,6 +54,7 @@ pub enum FunctionExpr {
     Python(PythonUDF),
     Partitioning(PartitioningExpr),
     Uri(UriExpr),
+    Hash,
 }
 
 pub trait FunctionEvaluator {
@@ -84,6 +87,7 @@ impl FunctionExpr {
             #[cfg(feature = "python")]
             Python(expr) => expr,
             Partitioning(expr) => expr.get_evaluator(),
+            Hash => &HashEvaluator {},
         }
     }
 }

diff --git a/src/daft-dsl/src/python.rs b/src/daft-dsl/src/python.rs
@@ -813,6 +813,11 @@ impl PyExpr {
         )
         .into())
     }
+
+    pub fn hash(&self, seed: Option<PyExpr>) -> PyResult<Self> {
+        let seed = seed.map(Into::into); // convert the optional seed to the type `hash` expects
+        Ok(crate::functions::hash::hash(self.into(), seed).into())
+    }
 }
 
 impl_bincode_py_state_serialization!(PyExpr);

diff --git a/tests/expressions/test_expressions.py b/tests/expressions/test_expressions.py
@@ -249,6 +249,24 @@ def test_repr_functions_sqrt() -> None:
     assert repr_out == repr(copied)
 
 
+def test_repr_functions_hash() -> None:
+    """An unseeded hash has the expected repr, and a deep copy renders the same."""
+    hashed = col("a").hash()
+    expected = "hash(col(a))"
+    assert repr(hashed) == expected
+    # A deep copy must render identically to the original expression.
+    assert repr(copy.deepcopy(hashed)) == expected
+
+
+def test_repr_functions_hash_2() -> None:
+    """A seeded hash shows the seed literal in its repr, and a deep copy renders the same."""
+    hashed = col("a").hash(lit(1))
+    expected = "hash(col(a), lit(1))"
+    assert repr(hashed) == expected
+    # A deep copy must render identically to the original expression.
+    assert repr(copy.deepcopy(hashed)) == expected
+
+
 def test_expr_structurally_equal() -> None:
     e1 = (col("a").max() == col("b").alias("moo") - 3).is_null()
     e2 = (col("a").max() == col("b").alias("moo") - 3).is_null()