Skip to content

Commit

Permalink
Draft hash_pivot API
Browse files Browse the repository at this point in the history
  • Loading branch information
pitrou committed Jan 15, 2025
1 parent 3b932bb commit 9c7e26d
Show file tree
Hide file tree
Showing 7 changed files with 584 additions and 2 deletions.
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,7 @@ if(ARROW_COMPUTE)
compute/kernels/aggregate_tdigest.cc
compute/kernels/aggregate_var_std.cc
compute/kernels/hash_aggregate.cc
compute/kernels/pivot_internal.cc
compute/kernels/scalar_arithmetic.cc
compute/kernels/scalar_boolean.cc
compute/kernels/scalar_compare.cc
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/acero/groupby_aggregate_node.cc
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,11 @@ Status GroupByNode::Merge() {
DCHECK(state0->agg_states[span_i]);
batch_ctx.SetState(state0->agg_states[span_i].get());

// XXX this resizes each KernelState (state0->agg_states[span_i]) multiple times.
// An alternative would be a two-pass algorithm:
// 1. Compute all transpositions (one per local state) and the final number of
// groups.
// 2. Process all agg kernels, resizing each KernelState only once.
RETURN_NOT_OK(
agg_kernels_[span_i]->resize(&batch_ctx, state0->grouper->num_groups()));
RETURN_NOT_OK(agg_kernels_[span_i]->merge(
Expand Down
69 changes: 69 additions & 0 deletions cpp/src/arrow/compute/api_aggregate.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,75 @@ class ARROW_EXPORT TDigestOptions : public FunctionOptions {
uint32_t min_count;
};

/// \brief Control Pivot kernel behavior
///
/// These options apply to the "pivot" (TODO) and "hash_pivot" (TODO) functions.
///
/// Constraints:
/// - The corresponding `Aggregate::target` must have two FieldRef elements;
/// the first one points to the pivot key column, the second points to the
/// pivoted data column.
/// - The pivot key column must be string-like; its values will be matched
/// against `key_names` in order to dispatch the pivoted data into the
/// output.
///
/// "hash_pivot" example
/// --------------------
///
/// Assuming the following input with schema
/// `{"group": int32, "key": utf8, "value": int16}`:
/// ```
/// group | key | value
/// -----------------------------
/// 1 | height | 11
/// 1 | width | 12
/// 2 | width | 13
/// 3 | height | 14
/// 3 | depth | 15
/// ```
/// and the following settings:
/// - a hash grouping key "group"
/// - Aggregate(
/// .function = "hash_pivot",
/// .options = PivotOptions(.key_names = {"height", "width"}),
/// .target = {"key", "value"},
/// .name = {"props"})
///
/// then the output will have the schema
/// `{"group": int32, "props": struct{"height": int16, "width": int16}}`
/// and the following value:
/// ```
/// group | props
/// | height | width
/// -----------------------------
/// 1 | 11 | 12
/// 2 | null | 13
/// 3 | 14 | null
/// ```
class ARROW_EXPORT PivotOptions : public FunctionOptions {
public:
// Configure the behavior of pivot keys not in `key_names`
enum UnexpectedKeyBehavior {
// Unexpected pivot keys are ignored silently
kIgnore,
// Unexpected pivot keys return a KeyError
kRaise
};
// TODO should duplicate key behavior be configurable as well?

explicit PivotOptions(std::vector<std::string> key_names,
UnexpectedKeyBehavior unexpected_key_behavior = kIgnore);
// Default constructor for serialization
PivotOptions();
static constexpr char const kTypeName[] = "PivotOptions";
static PivotOptions Defaults() { return PivotOptions{}; }

// The values expected in the pivot key column
std::vector<std::string> key_names;
// The behavior when pivot keys not in `key_names` are encountered
UnexpectedKeyBehavior unexpected_key_behavior = kIgnore;
};

/// \brief Control Index kernel behavior
class ARROW_EXPORT IndexOptions : public FunctionOptions {
public:
Expand Down
Loading

0 comments on commit 9c7e26d

Please sign in to comment.