Skip to content

Commit

Permalink
Add Aggregation fuzzer framework (#12667)
Browse files Browse the repository at this point in the history
* impl primitive arrays generator.

* sort out the test record batch generating codes.

* draft for `DataSetsGenerator`.

* tmp

* improve the data generator, and start to impl the session context generator.

* impl context generator.

* tmp

* define the `AggregationFuzzer`.

* add ut for data generator.

* improve comments for `SessionContextGenerator`.

* define `GeneratedSessionContextBuilder` to reduce repeated codes.

* extract the check equality logic for reusing.

* add ut for `SessionContextGenerator`.

* tmp

* finish the main logic of `AggregationFuzzer`.

* try to rewrite some test using the fuzzer.

* fix header.

* expose table name through `AggregationFuzzerBuilder`.

* throw err to aggr fuzzer, and expect them then.

* switch to Arc<str> to slightly improve performance.

* throw more errors to fuzzer.

* print task information before panic.

* improve comments.

* support printing generated session context params in error reporting.

* add todo.

* add some new fuzz case based on `AggregationFuzzer`.

* fix lint.

* print more information in error report.

* fix clippy.

* improve comment of `SessionContextGenerator`.

* just use fixed `data_gen_rounds` and `ctx_gen_rounds` currently, because we will hardly set them.

* improve comments for rounds constants.

* small improvements.

* select sql from some candidates rather than a fixed one.

* make `data_gen_rounds` able to set again, and add more tests.

* add no group cases.

* add fuzz test for basic string aggr.

* make `data_gen_rounds` smaller.

* add comments.

* fix typo.

* fix comment.
  • Loading branch information
Rachelint authored Oct 9, 2024
1 parent 3d347c9 commit 3353c06
Show file tree
Hide file tree
Showing 12 changed files with 1,646 additions and 63 deletions.
302 changes: 302 additions & 0 deletions datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,307 @@ use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tokio::task::JoinSet;

use crate::fuzz_cases::aggregation_fuzzer::{
AggregationFuzzerBuilder, ColumnDescr, DatasetGeneratorConfig,
};

// ========================================================================
// The new aggregation fuzz tests based on [`AggregationFuzzer`]
// ========================================================================

// TODO: write more test cases to cover more `group by`s and `aggregation function`s
// TODO: maybe we can use a macro to simplify test case creation

/// Fuzz test for the basic primitive aggregates (`sum`, `sum(distinct)`, `max`,
/// `min`, `count`, `count(distinct)`, `avg`) with no `GROUP BY` clause.
///
/// The generated table has a single `Int32` column `a` and no sort keys.
#[tokio::test(flavor = "multi_thread")]
async fn test_basic_prim_aggr_no_group() {
    // Dataset description: one `Int32` column, nothing to sort by
    let dataset_config = DatasetGeneratorConfig {
        columns: vec![ColumnDescr::new("a", DataType::Int32)],
        rows_num_range: (512, 1024),
        sort_keys_set: vec![],
    };

    // Register every candidate query against `fuzz_table`, then build the fuzzer
    let aggr_fuzzer = AggregationFuzzerBuilder::default()
        .data_gen_config(dataset_config)
        .data_gen_rounds(16)
        .add_sql("SELECT sum(a) FROM fuzz_table")
        .add_sql("SELECT sum(distinct a) FROM fuzz_table")
        .add_sql("SELECT max(a) FROM fuzz_table")
        .add_sql("SELECT min(a) FROM fuzz_table")
        .add_sql("SELECT count(a) FROM fuzz_table")
        .add_sql("SELECT count(distinct a) FROM fuzz_table")
        .add_sql("SELECT avg(a) FROM fuzz_table")
        .table_name("fuzz_table")
        .build();

    aggr_fuzzer.run().await;
}

/// Fuzz test for the basic primitive aggregates (`sum`, `sum(distinct)`, `max`,
/// `min`, `count`, `count(distinct)`, `avg`) grouped by a single `Int64` column.
///
/// Aggregates run over the `Int32` column `a`, grouped by the `Int64` column `b`.
/// The sort key sets are `[b]` and `[c, b]`.
#[tokio::test(flavor = "multi_thread")]
async fn test_basic_prim_aggr_group_by_single_int64() {
    // Sort key sets handed to the dataset generator
    let orderings = vec![
        vec![String::from("b")],
        vec![String::from("c"), String::from("b")],
    ];

    // Dataset description: aggregated column `a`, plus `b` and `c` (both `Int64`)
    let dataset_config = DatasetGeneratorConfig {
        columns: vec![
            ColumnDescr::new("a", DataType::Int32),
            ColumnDescr::new("b", DataType::Int64),
            ColumnDescr::new("c", DataType::Int64),
        ],
        rows_num_range: (512, 1024),
        sort_keys_set: orderings,
    };

    // Register every candidate query against `fuzz_table`, then build the fuzzer
    let aggr_fuzzer = AggregationFuzzerBuilder::default()
        .data_gen_config(dataset_config)
        .data_gen_rounds(16)
        .add_sql("SELECT b, sum(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, sum(distinct a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, avg(a) FROM fuzz_table GROUP BY b")
        .table_name("fuzz_table")
        .build();

    aggr_fuzzer.run().await;
}

/// Fuzz test for the basic primitive aggregates (`sum`, `sum(distinct)`, `max`,
/// `min`, `count`, `count(distinct)`, `avg`) grouped by a single string column.
///
/// Aggregates run over the `Int32` column `a`, grouped by the `Utf8` column `b`.
/// The sort key sets are `[b]` and `[c, b]`.
#[tokio::test(flavor = "multi_thread")]
async fn test_basic_prim_aggr_group_by_single_string() {
    // Sort key sets handed to the dataset generator
    let orderings = vec![
        vec![String::from("b")],
        vec![String::from("c"), String::from("b")],
    ];

    // Dataset description: aggregated column `a`, `Utf8` column `b`, `Int64` column `c`
    let dataset_config = DatasetGeneratorConfig {
        columns: vec![
            ColumnDescr::new("a", DataType::Int32),
            ColumnDescr::new("b", DataType::Utf8),
            ColumnDescr::new("c", DataType::Int64),
        ],
        rows_num_range: (512, 1024),
        sort_keys_set: orderings,
    };

    // Register every candidate query against `fuzz_table`, then build the fuzzer
    let aggr_fuzzer = AggregationFuzzerBuilder::default()
        .data_gen_config(dataset_config)
        .data_gen_rounds(16)
        .add_sql("SELECT b, sum(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, sum(distinct a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, avg(a) FROM fuzz_table GROUP BY b")
        .table_name("fuzz_table")
        .build();

    aggr_fuzzer.run().await;
}

/// Fuzz test for the basic primitive aggregates (`sum`, `sum(distinct)`, `max`,
/// `min`, `count`, `count(distinct)`, `avg`) grouped by a string column plus an
/// `Int64` column.
///
/// Aggregates run over the `Int32` column `a`, grouped by `b` (`Utf8`) and
/// `c` (`Int64`). The sort key sets are `[b, c]` and `[d, b, c]`.
#[tokio::test(flavor = "multi_thread")]
async fn test_basic_prim_aggr_group_by_mixed_string_int64() {
    // Sort key sets handed to the dataset generator
    let orderings = vec![
        vec![String::from("b"), String::from("c")],
        vec![String::from("d"), String::from("b"), String::from("c")],
    ];

    // Dataset description: aggregated column `a`, group columns `b` and `c`,
    // and an extra `Int32` column `d` that only appears in the sort keys
    let dataset_config = DatasetGeneratorConfig {
        columns: vec![
            ColumnDescr::new("a", DataType::Int32),
            ColumnDescr::new("b", DataType::Utf8),
            ColumnDescr::new("c", DataType::Int64),
            ColumnDescr::new("d", DataType::Int32),
        ],
        rows_num_range: (512, 1024),
        sort_keys_set: orderings,
    };

    // Register every candidate query against `fuzz_table`, then build the fuzzer
    let aggr_fuzzer = AggregationFuzzerBuilder::default()
        .data_gen_config(dataset_config)
        .data_gen_rounds(16)
        .add_sql("SELECT b, c, sum(a) FROM fuzz_table GROUP BY b, c")
        .add_sql("SELECT b, c, sum(distinct a) FROM fuzz_table GROUP BY b,c")
        .add_sql("SELECT b, c, max(a) FROM fuzz_table GROUP BY b, c")
        .add_sql("SELECT b, c, min(a) FROM fuzz_table GROUP BY b, c")
        .add_sql("SELECT b, c, count(a) FROM fuzz_table GROUP BY b, c")
        .add_sql("SELECT b, c, count(distinct a) FROM fuzz_table GROUP BY b, c")
        .add_sql("SELECT b, c, avg(a) FROM fuzz_table GROUP BY b, c")
        .table_name("fuzz_table")
        .build();

    aggr_fuzzer.run().await;
}

/// Fuzz test for the basic string aggregates (`max`, `min`, `count`,
/// `count(distinct)`) with no `GROUP BY` clause.
///
/// The generated table has a single `Utf8` column `a` and no sort keys.
#[tokio::test(flavor = "multi_thread")]
async fn test_basic_string_aggr_no_group() {
    // Dataset description: one `Utf8` column, nothing to sort by
    let dataset_config = DatasetGeneratorConfig {
        columns: vec![ColumnDescr::new("a", DataType::Utf8)],
        rows_num_range: (512, 1024),
        sort_keys_set: vec![],
    };

    // Register every candidate query against `fuzz_table`, then build the fuzzer
    let aggr_fuzzer = AggregationFuzzerBuilder::default()
        .data_gen_config(dataset_config)
        .data_gen_rounds(8)
        .add_sql("SELECT max(a) FROM fuzz_table")
        .add_sql("SELECT min(a) FROM fuzz_table")
        .add_sql("SELECT count(a) FROM fuzz_table")
        .add_sql("SELECT count(distinct a) FROM fuzz_table")
        .table_name("fuzz_table")
        .build();

    aggr_fuzzer.run().await;
}

/// Fuzz test for the basic string aggregates grouped by a single `Int64` column.
///
/// Aggregates run over the `Utf8` column `a`, grouped by the `Int64` column `b`.
/// Only `count` and `count(distinct)` are currently exercised; the `min`/`max`
/// queries are disabled (see the FIXME in the body).
#[tokio::test(flavor = "multi_thread")]
async fn test_basic_string_aggr_group_by_single_int64() {
    // Sort key sets handed to the dataset generator
    let orderings = vec![
        vec![String::from("b")],
        vec![String::from("c"), String::from("b")],
    ];

    // Dataset description: aggregated `Utf8` column `a`, plus `b` and `c` (both `Int64`)
    let dataset_config = DatasetGeneratorConfig {
        columns: vec![
            ColumnDescr::new("a", DataType::Utf8),
            ColumnDescr::new("b", DataType::Int64),
            ColumnDescr::new("c", DataType::Int64),
        ],
        rows_num_range: (512, 1024),
        sort_keys_set: orderings,
    };

    // Register every candidate query against `fuzz_table`, then build the fuzzer
    let aggr_fuzzer = AggregationFuzzerBuilder::default()
        .data_gen_config(dataset_config)
        .data_gen_rounds(8)
        // FIXME: min/max currently fail with
        // ArrowError(InvalidArgumentError("number of columns(1) must match number of fields(2) in schema"))
        // .add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
        // .add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
        .table_name("fuzz_table")
        .build();

    aggr_fuzzer.run().await;
}

/// Fuzz test for the basic string aggregates grouped by a single string column.
///
/// Aggregates run over the `Utf8` column `a`, grouped by the `Utf8` column `b`.
/// Only `count` and `count(distinct)` are currently exercised; the `min`/`max`
/// queries are disabled (see the FIXME in the body).
#[tokio::test(flavor = "multi_thread")]
async fn test_basic_string_aggr_group_by_single_string() {
    // Sort key sets handed to the dataset generator
    let orderings = vec![
        vec![String::from("b")],
        vec![String::from("c"), String::from("b")],
    ];

    // Dataset description: `Utf8` columns `a` and `b`, `Int64` column `c`
    let dataset_config = DatasetGeneratorConfig {
        columns: vec![
            ColumnDescr::new("a", DataType::Utf8),
            ColumnDescr::new("b", DataType::Utf8),
            ColumnDescr::new("c", DataType::Int64),
        ],
        rows_num_range: (512, 1024),
        sort_keys_set: orderings,
    };

    // Register every candidate query against `fuzz_table`, then build the fuzzer
    let aggr_fuzzer = AggregationFuzzerBuilder::default()
        .data_gen_config(dataset_config)
        .data_gen_rounds(16)
        // FIXME: min/max currently fail with
        // ArrowError(InvalidArgumentError("number of columns(1) must match number of fields(2) in schema"))
        // .add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
        // .add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
        .add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
        .table_name("fuzz_table")
        .build();

    aggr_fuzzer.run().await;
}

/// Fuzz test for the basic string aggregates grouped by a string column plus an
/// `Int64` column.
///
/// Aggregates run over the `Utf8` column `a`, grouped by `b` (`Utf8`) and
/// `c` (`Int64`). Only `count` and `count(distinct)` are currently exercised;
/// the `min`/`max` queries are disabled (see the FIXME in the body).
#[tokio::test(flavor = "multi_thread")]
async fn test_basic_string_aggr_group_by_mixed_string_int64() {
    // Sort key sets handed to the dataset generator
    let orderings = vec![
        vec![String::from("b"), String::from("c")],
        vec![String::from("d"), String::from("b"), String::from("c")],
    ];

    // Dataset description: aggregated column `a`, group columns `b` and `c`,
    // and an extra `Int32` column `d` that only appears in the sort keys
    let dataset_config = DatasetGeneratorConfig {
        columns: vec![
            ColumnDescr::new("a", DataType::Utf8),
            ColumnDescr::new("b", DataType::Utf8),
            ColumnDescr::new("c", DataType::Int64),
            ColumnDescr::new("d", DataType::Int32),
        ],
        rows_num_range: (512, 1024),
        sort_keys_set: orderings,
    };

    // Register every candidate query against `fuzz_table`, then build the fuzzer
    let aggr_fuzzer = AggregationFuzzerBuilder::default()
        .data_gen_config(dataset_config)
        .data_gen_rounds(16)
        // FIXME: min/max currently fail with
        // ArrowError(InvalidArgumentError("number of columns(1) must match number of fields(2) in schema"))
        // .add_sql("SELECT b, c, max(a) FROM fuzz_table GROUP BY b, c")
        // .add_sql("SELECT b, c, min(a) FROM fuzz_table GROUP BY b, c")
        .add_sql("SELECT b, c, count(a) FROM fuzz_table GROUP BY b, c")
        .add_sql("SELECT b, c, count(distinct a) FROM fuzz_table GROUP BY b, c")
        .table_name("fuzz_table")
        .build();

    aggr_fuzzer.run().await;
}

// ========================================================================
// The old aggregation fuzz tests
// ========================================================================
/// Tracks if this stream is generating input or output
/// Tests that streaming aggregate and batch (non streaming) aggregate produce
/// same results
#[tokio::test(flavor = "multi_thread")]
Expand Down Expand Up @@ -311,6 +612,7 @@ async fn group_by_string_test(
let actual = extract_result_counts(results);
assert_eq!(expected, actual);
}

async fn verify_ordered_aggregate(frame: &DataFrame, expected_sort: bool) {
struct Visitor {
expected_sort: bool,
Expand Down
Loading

0 comments on commit 3353c06

Please sign in to comment.