Skip to content

Commit

Permalink
Merge pull request #2486 from dathere/joinp-asof-sort-and-match-options
Browse files Browse the repository at this point in the history
feat: additional `joinp` `asof` join sort and match options
  • Loading branch information
jqnatividad authored Jan 28, 2025
2 parents 4402df6 + 6213df0 commit 2d858cf
Show file tree
Hide file tree
Showing 2 changed files with 199 additions and 6 deletions.
31 changes: 25 additions & 6 deletions src/cmd/joinp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,16 @@ joinp options:
ASOF JOIN OPTIONS:
--asof Do an 'asof' join. This is similar to a left inner
join, except we match on nearest key rather than
equal keys. Note that both CSV data sets will be SORTED
AUTOMATICALLY on the join columns.
equal keys (see --allow-exact-matches).
Particularly useful for time series data.
Note that both CSV data sets will be SORTED on the join columns
by default, unless --no-sort is set.
--no-sort Do not sort the CSV data sets on the join columns.
Note that asof joins REQUIRE the join keys to be sorted,
so this option should only be used as a performance optimization
when you know the CSV join keys are already sorted.
If the CSV join keys are not sorted, the asof join will fail or
return incorrect results.
--left_by <arg> Do an 'asof_by' join - a special implementation of the asof
join that searches for the nearest keys within a subgroup
set by the asof_by columns. This specifies the column/s for
Expand Down Expand Up @@ -198,6 +205,10 @@ joinp options:
Suffix with “_saturating” to indicate that dates too
large for their month should saturate at the largest date
(e.g. 2022-02-29 -> 2022-02-28) instead of erroring.
-X, --allow-exact-matches When set, the asof join will allow exact matches.
(i.e. less-than-or-equal-to or greater-than-or-equal-to)
Otherwise, the asof join will only allow nearest matches
(strictly less-than or greater-than) by default.
OUTPUT FORMAT OPTIONS:
--sql-filter <SQL> The SQL expression to apply against the join result.
Expand Down Expand Up @@ -293,10 +304,12 @@ struct Args {
flag_no_optimizations: bool,
flag_ignore_errors: bool,
flag_asof: bool,
flag_no_sort: bool,
flag_left_by: Option<String>,
flag_right_by: Option<String>,
flag_strategy: Option<String>,
flag_tolerance: Option<String>,
flag_allow_exact_matches: bool,
flag_sql_filter: Option<String>,
flag_datetime_format: Option<String>,
flag_date_format: Option<String>,
Expand All @@ -314,7 +327,8 @@ struct Args {
/// Tags a join so the code that executes it can special-case it
/// (compared with `==`, hence the `PartialEq`/`Eq` derive).
#[derive(PartialEq, Eq)]
enum SpecialJoin {
/// NOTE(review): the meaning of the `String` payload is not visible here -
/// presumably the non-equi join expression; confirm against the caller.
NonEqui(String),
/// NOTE(review): this appears to be the pre-change variant kept by the diff
/// rendering; its visible usages move to the two `AsOf*` variants below.
/// Confirm it is really gone before relying on it.
AsOf,
/// Asof join chosen when `--no-sort` is NOT set: the data is sorted on the
/// asof columns before joining, since asof joins require sorted join keys.
AsOfAutoSort,
/// Asof join chosen when `--no-sort` is set: the automatic sort is skipped,
/// so the caller is responsible for the join keys already being sorted.
AsOfNoSort,
/// NOTE(review): presumably "no special handling needed" - confirm.
None,
}

Expand Down Expand Up @@ -483,6 +497,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {

let mut asof_options = AsOfOptions {
strategy,
allow_eq: args.flag_allow_exact_matches,
..Default::default()
};

Expand Down Expand Up @@ -520,7 +535,11 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
JoinType::AsOf(asof_options),
validation,
MaintainOrderJoin::None,
SpecialJoin::AsOf,
if args.flag_no_sort {
SpecialJoin::AsOfNoSort
} else {
SpecialJoin::AsOfAutoSort
},
normalization_form,
)
},
Expand Down Expand Up @@ -702,8 +721,8 @@ impl JoinStruct {
.finish()
.collect()?
} else {
if special_join == SpecialJoin::AsOf {
// it's an asof join
if special_join == SpecialJoin::AsOfAutoSort {
// it's an asof join and --no-sort is not set
// sort by the asof columns, as asof joins require sorted join column data
let left_selcols_vec: Vec<PlSmallStr> =
self.left_sel.split(',').map(PlSmallStr::from_str).collect();
Expand Down
174 changes: 174 additions & 0 deletions tests/test_joinp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2508,3 +2508,177 @@ fn joinp_unicode_normalization_ligatures() {
];
assert_eq!(got, expected);
}

/// Checks that `--allow-exact-matches` toggles whether an asof join may pair a
/// trade with a quote carrying the very same timestamp.
#[test]
fn joinp_asof_allow_exact_matches() {
    // Datetime layout used by both fixtures and passed to every run.
    const DATETIME_FMT: &str = "%Y-%m-%d %H:%M:%S";

    let wrk = Workdir::new("joinp_asof_allow_exact_matches");

    // Trades alternate between timestamps that coincide with a quote ("on quote")
    // and timestamps that fall between two quotes ("between").
    let trades = vec![
        svec!["time", "price"],
        svec!["2024-01-01 10:00:00", "100.0"], // on quote
        svec!["2024-01-01 10:00:03", "100.5"], // between
        svec!["2024-01-01 10:00:05", "101.0"], // on quote
        svec!["2024-01-01 10:00:08", "101.5"], // between
        svec!["2024-01-01 10:00:10", "102.0"], // on quote
        svec!["2024-01-01 10:00:12", "102.5"], // between
        svec!["2024-01-01 10:00:15", "103.0"], // on quote
    ];
    wrk.create("trades.csv", trades);

    // One quote every five seconds; each lines up with an "on quote" trade.
    let quotes = vec![
        svec!["time", "bid"],
        svec!["2024-01-01 10:00:00", "99.5"],
        svec!["2024-01-01 10:00:05", "99.5"],
        svec!["2024-01-01 10:00:10", "101.5"],
        svec!["2024-01-01 10:00:15", "102.25"],
    ];
    wrk.create("quotes.csv", quotes);

    // Builds the asof join command; the optional flag is placed right after the
    // positional arguments, exactly where the hand-written commands put it.
    let asof_join = |match_flag: Option<&str>| {
        let mut joinp = wrk.command("joinp");
        joinp
            .arg("--asof")
            .args(["time", "trades.csv", "time", "quotes.csv"]);
        if let Some(flag) = match_flag {
            joinp.arg(flag);
        }
        joinp
            .arg("--try-parsedates")
            .args(["--datetime-format", DATETIME_FMT]);
        joinp
    };

    // Run 1: exact matches allowed, so an "on quote" trade takes the quote with
    // its own timestamp and a "between" trade takes the previous quote.
    let want_inclusive = vec![
        svec!["time", "price", "bid"],
        svec!["2024-01-01 10:00:00", "100.0", "99.5"],
        svec!["2024-01-01 10:00:03", "100.5", "99.5"],
        svec!["2024-01-01 10:00:05", "101.0", "99.5"],
        svec!["2024-01-01 10:00:08", "101.5", "99.5"],
        svec!["2024-01-01 10:00:10", "102.0", "101.5"],
        svec!["2024-01-01 10:00:12", "102.5", "101.5"],
        svec!["2024-01-01 10:00:15", "103.0", "102.25"],
    ];
    let mut inclusive_cmd = asof_join(Some("--allow-exact-matches"));
    let got_inclusive: Vec<Vec<String>> = wrk.read_stdout(&mut inclusive_cmd);
    assert_eq!(got_inclusive, want_inclusive);

    // Run 2: default (strict) matching, so a quote with the same timestamp is
    // skipped and every trade falls back to the quote before it.
    let want_strict = vec![
        svec!["time", "price", "bid"],
        // nothing earlier than the first quote, so no bid at all
        svec!["2024-01-01 10:00:00", "100.0", ""],
        // quote from 10:00:00
        svec!["2024-01-01 10:00:03", "100.5", "99.5"],
        // quote from 10:00:00 (the 10:00:05 quote is an exact match)
        svec!["2024-01-01 10:00:05", "101.0", "99.5"],
        // quote from 10:00:05
        svec!["2024-01-01 10:00:08", "101.5", "99.5"],
        // quote from 10:00:05 (the 10:00:10 quote is an exact match)
        svec!["2024-01-01 10:00:10", "102.0", "99.5"],
        // quote from 10:00:10
        svec!["2024-01-01 10:00:12", "102.5", "101.5"],
        // quote from 10:00:10 (the 10:00:15 quote is an exact match)
        svec!["2024-01-01 10:00:15", "103.0", "101.5"],
    ];
    let mut strict_cmd = asof_join(None);
    let got_strict: Vec<Vec<String>> = wrk.read_stdout(&mut strict_cmd);
    assert_eq!(got_strict, want_strict);
}

/// Checks asof joins on unsorted inputs: the default auto-sort gives correct
/// results, while `--no-sort` leaves the inputs as-is and gives wrong ones.
#[test]
fn joinp_asof_sortkey_options() {
    // Datetime layout used by both fixtures and passed to every run.
    const DATETIME_FMT: &str = "%Y-%m-%d %H:%M:%S";

    let wrk = Workdir::new("joinp_asof_sortkey_options");

    // Same rows as the joinp_asof_allow_exact_matches fixtures, deliberately shuffled.
    let shuffled_trades = vec![
        svec!["time", "price"],
        svec!["2024-01-01 10:00:10", "102.0"],
        svec!["2024-01-01 10:00:05", "101.0"],
        svec!["2024-01-01 10:00:00", "100.0"],
        svec!["2024-01-01 10:00:15", "103.0"],
        svec!["2024-01-01 10:00:12", "102.5"],
        svec!["2024-01-01 10:00:03", "100.5"],
        svec!["2024-01-01 10:00:08", "101.5"],
    ];
    wrk.create("trades.csv", shuffled_trades);

    let shuffled_quotes = vec![
        svec!["time", "bid"],
        svec!["2024-01-01 10:00:10", "101.5"],
        svec!["2024-01-01 10:00:05", "99.5"],
        svec!["2024-01-01 10:00:00", "99.5"],
        svec!["2024-01-01 10:00:15", "102.25"],
    ];
    wrk.create("quotes.csv", shuffled_quotes);

    // Builds the asof join command; the optional flag is placed right after the
    // positional arguments, exactly where the hand-written commands put it.
    let asof_join = |extra_flag: Option<&str>| {
        let mut joinp = wrk.command("joinp");
        joinp
            .arg("--asof")
            .args(["time", "trades.csv", "time", "quotes.csv"]);
        if let Some(flag) = extra_flag {
            joinp.arg(flag);
        }
        joinp
            .arg("--try-parsedates")
            .args(["--datetime-format", DATETIME_FMT]);
        joinp
    };

    // Run 1: default auto-sort with exact matches allowed. The join columns are
    // sorted first, so the result equals the one for pre-sorted data.
    let want_sorted_inclusive = vec![
        svec!["time", "price", "bid"],
        svec!["2024-01-01 10:00:00", "100.0", "99.5"], // same-timestamp quote
        svec!["2024-01-01 10:00:03", "100.5", "99.5"], // previous quote
        svec!["2024-01-01 10:00:05", "101.0", "99.5"], // same-timestamp quote
        svec!["2024-01-01 10:00:08", "101.5", "99.5"], // previous quote
        svec!["2024-01-01 10:00:10", "102.0", "101.5"], // same-timestamp quote
        svec!["2024-01-01 10:00:12", "102.5", "101.5"], // previous quote
        svec!["2024-01-01 10:00:15", "103.0", "102.25"], // same-timestamp quote
    ];
    let mut sorted_inclusive_cmd = asof_join(Some("--allow-exact-matches"));
    let got_sorted_inclusive: Vec<Vec<String>> = wrk.read_stdout(&mut sorted_inclusive_cmd);
    assert_eq!(got_sorted_inclusive, want_sorted_inclusive);

    // Run 2: default auto-sort with strict matching (no --allow-exact-matches).
    let want_sorted_strict = vec![
        svec!["time", "price", "bid"],
        svec!["2024-01-01 10:00:00", "100.0", ""],
        svec!["2024-01-01 10:00:03", "100.5", "99.5"],
        svec!["2024-01-01 10:00:05", "101.0", "99.5"],
        svec!["2024-01-01 10:00:08", "101.5", "99.5"],
        svec!["2024-01-01 10:00:10", "102.0", "99.5"],
        svec!["2024-01-01 10:00:12", "102.5", "101.5"],
        svec!["2024-01-01 10:00:15", "103.0", "101.5"],
    ];
    let mut sorted_strict_cmd = asof_join(None);
    let got_sorted_strict: Vec<Vec<String>> = wrk.read_stdout(&mut sorted_strict_cmd);
    assert_eq!(got_sorted_strict, want_sorted_strict);

    // Run 3: --no-sort on the shuffled inputs. Rows keep their input order and
    // the bids are WRONG; this pins the documented consequence of skipping the
    // sort when the join keys are not already sorted.
    let want_unsorted = vec![
        svec!["time", "price", "bid"],
        svec!["2024-01-01 10:00:10", "102.0", ""],
        svec!["2024-01-01 10:00:05", "101.0", ""],
        svec!["2024-01-01 10:00:00", "100.0", ""],
        svec!["2024-01-01 10:00:15", "103.0", "99.5"],
        svec!["2024-01-01 10:00:12", "102.5", "99.5"],
        svec!["2024-01-01 10:00:03", "100.5", "99.5"],
        svec!["2024-01-01 10:00:08", "101.5", "99.5"],
    ];
    let mut unsorted_cmd = asof_join(Some("--no-sort"));
    let got_unsorted: Vec<Vec<String>> = wrk.read_stdout(&mut unsorted_cmd);
    assert_eq!(got_unsorted, want_unsorted);
}

0 comments on commit 2d858cf

Please sign in to comment.