From 9a43db29958a0da95fddb537356b14254d277265 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 7 Dec 2024 08:01:29 -0500 Subject: [PATCH 1/4] deps: bump polars to latest upstream --- Cargo.lock | 40 ++++++++++++++++++++-------------------- Cargo.toml | 4 ++-- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c18d2dffe..8afe6b2b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4534,7 +4534,7 @@ dependencies = [ [[package]] name = "polars" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "getrandom", "polars-arrow", @@ -4553,7 +4553,7 @@ dependencies = [ [[package]] name = "polars-arrow" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "ahash", "atoi", @@ -4597,7 +4597,7 @@ dependencies = [ [[package]] name = "polars-compute" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "atoi_simd", "bytemuck", @@ -4618,7 +4618,7 @@ dependencies = [ [[package]] name = "polars-core" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4654,7 +4654,7 @@ dependencies = [ [[package]] name = "polars-error" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "avro-schema", "object_store", @@ -4667,7 +4667,7 @@ dependencies = [ [[package]] name = "polars-expr" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4690,7 +4690,7 @@ dependencies = [ [[package]] name = "polars-io" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "ahash", "async-trait", @@ -4738,7 +4738,7 @@ dependencies = [ [[package]] name = "polars-json" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "ahash", "chrono", @@ -4760,7 +4760,7 @@ dependencies = [ [[package]] name = "polars-lazy" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4787,7 +4787,7 @@ dependencies = [ [[package]] name = "polars-mem-engine" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "futures", "memmap2", @@ -4808,7 +4808,7 @@ dependencies = [ [[package]] name = "polars-ops" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "ahash", "argminmax", @@ -4843,7 +4843,7 @@ dependencies = [ [[package]] name = "polars-parquet" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "ahash", "async-stream", @@ -4881,7 +4881,7 @@ dependencies = [ [[package]] name = "polars-pipe" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -4907,7 +4907,7 @@ dependencies = [ [[package]] name = "polars-plan" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "ahash", "bitflags 2.6.0", @@ -4942,7 +4942,7 @@ dependencies = [ [[package]] name = "polars-row" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "bitflags 2.6.0", "bytemuck", @@ -4955,7 +4955,7 @@ dependencies = [ [[package]] name = "polars-schema" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "indexmap", "polars-error", @@ -4967,7 +4967,7 @@ dependencies = [ [[package]] name = "polars-sql" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "hex", "once_cell", @@ -4988,7 +4988,7 @@ dependencies = [ [[package]] name = "polars-stream" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "atomic-waker", "crossbeam-deque", @@ -5017,7 +5017,7 @@ dependencies = [ [[package]] name = "polars-time" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "atoi", "bytemuck", @@ -5039,7 +5039,7 @@ dependencies = [ [[package]] name = "polars-utils" version = "0.44.2" -source = "git+https://github.com/pola-rs/polars?rev=39550c0#39550c099094d55691d1adfeedcc5afd2efb7f3d" +source = "git+https://github.com/pola-rs/polars?rev=a6ca94d#a6ca94dc920873b0757fd656ed100b576583b936" dependencies = [ "ahash", "bytemuck", diff --git a/Cargo.toml b/Cargo.toml index 52cf457f9..cab5af43b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -329,9 +329,9 @@ strum_macros = { git = "https://github.com/dathere/strum", branch = "bump-phf-to # BUILD NOTE: Be sure to set QSV_POLARS_REV below to the latest commit short hash or tag # of polars/py-polars before building qsv. This allows us to show the polars rev/tag in --version. # if we are using a release version of Rust Polars, leave QSV_POLARS_REV empty -# QSV_POLARS_REV=39550c0 +# QSV_POLARS_REV=a6ca94d # polars = { git = "https://github.com/pola-rs/polars", tag = "py-1.16.0" } -polars = { git = "https://github.com/pola-rs/polars", rev = "39550c0" } +polars = { git = "https://github.com/pola-rs/polars", rev = "a6ca94d" } [features] default = ["mimalloc"] From cd31fdb453f8f0830876b535f9ee943da18c6fee Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 7 Dec 2024 08:01:58 -0500 Subject: [PATCH 2/4] feat: `joinp` add `--maintain-order` option --- src/cmd/joinp.rs | 61 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/src/cmd/joinp.rs b/src/cmd/joinp.rs index 256eeb528..afe25d3e1 100644 --- a/src/cmd/joinp.rs +++ b/src/cmd/joinp.rs @@ -1,14 +1,12 @@ static USAGE: &str = r#" -Joins two sets of CSV data on the specified columns using the Pola.rs engine. +Joins two sets of CSV data on the specified columns using the Polars engine. The default join operation is an 'inner' join. This corresponds to the intersection of rows on the keys specified. Unlike the join command, joinp can process files larger than RAM, is multithreaded, -has join key validation, pre-join filtering, supports asof joins & its output columns -can be coalesced (no duplicate columns). - -However, joinp doesn't have an --ignore-case option. +has join key validation, a maintain row order option, pre-join filtering, supports +asof joins and its output columns can be coalesced (no duplicate columns). Returns the shape of the join result (number of rows, number of columns) to stderr. @@ -76,7 +74,13 @@ joinp options: onetoone - join keys are unique in both left & right data sets. [default: none] - JOIN OPTIONS: + JOIN OPTIONS: + --maintain-order Which row order to preserve, if any. Valid values are: + none, left, right, left_right, right_left + Do not rely on any observed ordering without explicitly + setting this parameter. Not specifying any order can improve + performance. Supported for inner, left, right and full joins. + [default: none] --nulls When set, joins will work on empty fields. Otherwise, empty fields are completely ignored. --streaming When set, the join will be done in a streaming fashion. @@ -228,6 +232,7 @@ struct Args { flag_filter_left: Option, flag_filter_right: Option, flag_validate: Option, + flag_maintain_order: Option, flag_nulls: bool, flag_streaming: bool, flag_try_parsedates: bool, @@ -283,6 +288,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> { s => return fail_incorrectusage_clierror!("Invalid join validation: {s}"), }; + let flag_maintain_order = args + .flag_maintain_order + .unwrap_or_else(|| "none".to_string()) + .to_lowercase(); + let maintain_order = match flag_maintain_order.as_str() { + "none" => MaintainOrderJoin::None, + "left" => MaintainOrderJoin::Left, + "right" => MaintainOrderJoin::Right, + "left_right" => MaintainOrderJoin::LeftRight, + "right_left" => MaintainOrderJoin::RightLeft, + s => return fail_incorrectusage_clierror!("Invalid maintain order option: {s}"), + }; + let join_shape: (usize, usize) = match ( args.flag_left, args.flag_left_anti, @@ -292,27 +310,35 @@ pub fn run(argv: &[&str]) -> CliResult<()> { args.flag_cross, args.flag_asof, ) { + // default inner join (false, false, false, false, false, false, false) => { - join.run(JoinType::Inner, validation, false) + join.run(JoinType::Inner, validation, maintain_order, false) }, + // left join (true, false, false, false, false, false, false) => { - join.run(JoinType::Left, validation, false) + join.run(JoinType::Left, validation, maintain_order, false) }, + // left anti join (false, true, false, false, false, false, false) => { - join.run(JoinType::Anti, validation, false) + join.run(JoinType::Anti, validation, maintain_order, false) }, + // left semi join (false, false, true, false, false, false, false) => { - join.run(JoinType::Semi, validation, false) + join.run(JoinType::Semi, validation, maintain_order, false) }, + // right join (false, false, false, true, false, false, false) => { - join.run(JoinType::Right, validation, false) + join.run(JoinType::Right, validation, maintain_order, false) }, + // full join (false, false, false, false, true, false, false) => { - join.run(JoinType::Full, validation, false) + join.run(JoinType::Full, validation, maintain_order, false) }, + // cross join (false, false, false, false, false, true, false) => { - join.run(JoinType::Cross, validation, false) + join.run(JoinType::Cross, validation, MaintainOrderJoin::None, false) }, + // as of join (false, false, false, false, false, false, true) => { // safety: flag_strategy is always is_some() as it has a default value args.flag_strategy = Some(args.flag_strategy.unwrap().to_lowercase()); @@ -358,7 +384,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> { .collect(), ); } - join.run(JoinType::AsOf(asof_options), validation, true) + join.run( + JoinType::AsOf(asof_options), + validation, + MaintainOrderJoin::None, + true, + ) }, _ => fail_incorrectusage_clierror!("Please pick exactly one join operation."), }?; @@ -394,6 +425,7 @@ impl JoinStruct { mut self, jointype: JoinType, validation: JoinValidation, + maintain_order: MaintainOrderJoin, asof_join: bool, ) -> CliResult<(usize, usize)> { let mut left_selcols: Vec<_> = self @@ -537,6 +569,7 @@ impl JoinStruct { .left_on(left_selcols) .right_on(right_selcols) .how(jointype) + .maintain_order(maintain_order) .coalesce(coalesce_flag) .allow_parallel(true) .validate(validation) From dc62e5fa38c83890e6ef90a7f09d899cf47347f1 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 7 Dec 2024 08:02:20 -0500 Subject: [PATCH 3/4] tests: add `joinp --maintain-order` tests --- tests/test_joinp.rs | 138 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/tests/test_joinp.rs b/tests/test_joinp.rs index 6a141920b..8cb4e82c0 100644 --- a/tests/test_joinp.rs +++ b/tests/test_joinp.rs @@ -1284,3 +1284,141 @@ fn joinp_ignore_case() { ]; assert_eq!(got, expected); } + +#[test] +fn joinp_ignore_case_maintain_order_right() { + let wrk = Workdir::new("joinp_mo_r"); + + // Create test data with mixed case cities + wrk.create( + "cities_mixed.csv", + vec![ + svec!["city", "state"], + svec!["BOSTON", "MA"], + svec!["new york", "NY"], + svec!["San Francisco", "CA"], + svec!["BUFFALO", "NY"], + ], + ); + + wrk.create( + "places_mixed.csv", + vec![ + svec!["city", "place"], + svec!["Boston", "Logan Airport"], + svec!["boston", "Boston Garden"], + svec!["BUFFALO", "Ralph Wilson Stadium"], + svec!["orlando", "Disney World"], + svec!["new York", "Madison Square Garden"], + svec!["san francisco", "Fisherman's Wharf"], + ], + ); + + let mut cmd = wrk.command("joinp"); + cmd.args(&["city", "cities_mixed.csv", "city", "places_mixed.csv"]) + .arg("--ignore-case") + .args(["--maintain-order", "right"]); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["city", "state", "city_right", "place"], + svec!["BOSTON", "MA", "Boston", "Logan Airport"], + svec!["BOSTON", "MA", "boston", "Boston Garden"], + svec!["BUFFALO", "NY", "BUFFALO", "Ralph Wilson Stadium"], + svec!["new york", "NY", "new York", "Madison Square Garden"], + svec!["San Francisco", "CA", "san francisco", "Fisherman's Wharf"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn joinp_ignore_case_maintain_order_left() { + let wrk = Workdir::new("joinp_mo_l"); + + // Create test data with mixed case cities + wrk.create( + "cities_mixed.csv", + vec![ + svec!["city", "state"], + svec!["BOSTON", "MA"], + svec!["new york", "NY"], + svec!["San Francisco", "CA"], + svec!["BUFFALO", "NY"], + ], + ); + + wrk.create( + "places_mixed.csv", + vec![ + svec!["city", "place"], + svec!["Boston", "Logan Airport"], + svec!["boston", "Boston Garden"], + svec!["BUFFALO", "Ralph Wilson Stadium"], + svec!["orlando", "Disney World"], + svec!["new York", "Madison Square Garden"], + svec!["san francisco", "Fisherman's Wharf"], + ], + ); + + let mut cmd = wrk.command("joinp"); + cmd.args(&["city", "cities_mixed.csv", "city", "places_mixed.csv"]) + .arg("--ignore-case") + .args(["--maintain-order", "left"]); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["city", "state", "city_right", "place"], + svec!["BOSTON", "MA", "Boston", "Logan Airport"], + svec!["BOSTON", "MA", "boston", "Boston Garden"], + svec!["new york", "NY", "new York", "Madison Square Garden"], + svec!["San Francisco", "CA", "san francisco", "Fisherman's Wharf"], + svec!["BUFFALO", "NY", "BUFFALO", "Ralph Wilson Stadium"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn joinp_ignore_case_maintain_order_right_left() { + let wrk = Workdir::new("joinp_mo_rl"); + + // Create test data with mixed case cities + wrk.create( + "cities_mixed.csv", + vec![ + svec!["city", "state"], + svec!["BOSTON", "MA"], + svec!["new york", "NY"], + svec!["San Francisco", "CA"], + svec!["BUFFALO", "NY"], + ], + ); + + wrk.create( + "places_mixed.csv", + vec![ + svec!["city", "place"], + svec!["Boston", "Logan Airport"], + svec!["boston", "Boston Garden"], + svec!["BUFFALO", "Ralph Wilson Stadium"], + svec!["orlando", "Disney World"], + svec!["new York", "Madison Square Garden"], + svec!["san francisco", "Fisherman's Wharf"], + ], + ); + + let mut cmd = wrk.command("joinp"); + cmd.args(&["city", "cities_mixed.csv", "city", "places_mixed.csv"]) + .arg("--ignore-case") + .args(["--maintain-order", "left_right"]); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["city", "state", "city_right", "place"], + svec!["BOSTON", "MA", "Boston", "Logan Airport"], + svec!["BOSTON", "MA", "boston", "Boston Garden"], + svec!["new york", "NY", "new York", "Madison Square Garden"], + svec!["San Francisco", "CA", "san francisco", "Fisherman's Wharf"], + svec!["BUFFALO", "NY", "BUFFALO", "Ralph Wilson Stadium"], + ]; + assert_eq!(got, expected); +} From 3d8b27ba955a81dd8edf487998c5d65c7634fa3f Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 7 Dec 2024 08:02:55 -0500 Subject: [PATCH 4/4] docs: update `joinp` description to add `--maintain-order` option --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8043eb756..6bc1105f0 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ | [index](/src/cmd/index.rs#L2) | Create an index (📇) for a CSV. This is very quick (even the 15gb, 28m row NYC 311 dataset takes all of 14 seconds to index) & provides constant time indexing/random access into the CSV. With an index, `count`, `sample` & `slice` work instantaneously; random access mode is enabled in `luau`; and multithreading (🏎️) is enabled for the `frequency`, `split`, `stats`, `schema` & `tojsonl` commands. | | [input](/src/cmd/input.rs#L2) | Read CSV data with special commenting, quoting, trimming, line-skipping & non-UTF8 encoding handling rules. Typically used to "normalize" a CSV for further processing with other qsv commands. | | [join](/src/cmd/join.rs#L2)
👆 | Inner, outer, right, cross, anti & semi joins. Automatically creates a simple, in-memory hash index to make it fast. | -| [joinp](/src/cmd/joinp.rs#L2)✨
🚀🐻‍❄️🪄 | Inner, outer, right, cross, anti, semi & asof joins using the [Pola.rs](https://www.pola.rs) engine. Unlike the `join` command, `joinp` can process files larger than RAM, is multithreaded, has join key validation, pre-join filtering, supports [asof joins](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html) (which is [particularly useful for time series data](https://github.com/dathere/qsv/blob/30cc920d0812a854fcbfedc5db81788a0600c92b/tests/test_joinp.rs#L509-L983)) & its output columns can be coalesced. | +| [joinp](/src/cmd/joinp.rs#L2)✨
🚀🐻‍❄️🪄 | Inner, outer, right, cross, anti, semi & asof joins using the [Pola.rs](https://www.pola.rs) engine. Unlike the `join` command, `joinp` can process files larger than RAM, is multithreaded, has join key validation, a maintain row order option, pre-join filtering, supports [asof joins](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.join_asof.html) (which is [particularly useful for time series data](https://github.com/dathere/qsv/blob/30cc920d0812a854fcbfedc5db81788a0600c92b/tests/test_joinp.rs#L509-L983)) & its output columns can be coalesced. | | [json](/src/cmd/json.rs#L2)
👆 | Convert JSON to CSV. | [jsonl](/src/cmd/jsonl.rs#L2)
🚀🔣 | Convert newline-delimited JSON ([JSONL](https://jsonlines.org/)/[NDJSON](http://ndjson.org/)) to CSV. See `tojsonl` command to convert CSV to JSONL. | [lens](/src/cmd/lens.rs#L2)✨ | Interactively view, search & filter a CSV using the [csvlens](https://github.com/YS-L/csvlens#csvlens) engine.