From 4023c2edbac340d4970cefc2591e607915c0d7ae Mon Sep 17 00:00:00 2001 From: Kartheek Date: Fri, 16 Feb 2024 22:33:01 +0530 Subject: [PATCH 1/4] Bump Polars 0.37 --- datasets/iris.csv | 1 - lib/explorer/series.ex | 14 +- native/explorer/Cargo.lock | 481 ++++++++++++++++++++-------- native/explorer/Cargo.toml | 7 +- native/explorer/src/dataframe.rs | 8 +- native/explorer/src/dataframe/io.rs | 7 + native/explorer/src/encoding.rs | 72 ++--- native/explorer/src/expressions.rs | 10 +- native/explorer/src/lazyframe.rs | 6 +- native/explorer/src/lazyframe/io.rs | 8 +- native/explorer/src/series.rs | 62 +--- test/explorer/series_test.exs | 2 +- 12 files changed, 423 insertions(+), 255 deletions(-) diff --git a/datasets/iris.csv b/datasets/iris.csv index a39534222..94c1b12be 100644 --- a/datasets/iris.csv +++ b/datasets/iris.csv @@ -149,4 +149,3 @@ sepal_length,sepal_width,petal_length,petal_width,species 6.5,3.0,5.2,2.0,Iris-virginica 6.2,3.4,5.4,2.3,Iris-virginica 5.9,3.0,5.1,1.8,Iris-virginica - diff --git a/lib/explorer/series.ex b/lib/explorer/series.ex index 9ae8d70c1..386a7d685 100644 --- a/lib/explorer/series.ex +++ b/lib/explorer/series.ex @@ -1274,13 +1274,19 @@ defmodule Explorer.Series do """ @doc type: :element_wise - def categorise(%Series{dtype: l_dtype} = series, %Series{dtype: dtype} = categories) - when K.and(K.in(l_dtype, [:string | @integer_types]), K.in(dtype, [:string, :category])), + def categorise(%Series{dtype: l_dtype} = series, %Series{dtype: :category} = categories) + when K.in(l_dtype, [:string | @integer_types]), do: apply_series(series, :categorise, [categories]) + def categorise(%Series{dtype: l_dtype} = series, %Series{dtype: :string} = categories) + when K.in(l_dtype, [:string | @integer_types]) do + categories = categories |> distinct() |> cast(:category) + apply_series(series, :categorise, [categories]) + end + def categorise(%Series{dtype: l_dtype} = series, [head | _] = categories) when K.and(K.in(l_dtype, [:string | @integer_types]), is_binary(head)), - do: apply_series(series, :categorise, [from_list(categories, dtype: :string)]) + do: apply_series(series, :categorise, [from_list(categories, dtype: :category)]) # Slice and dice @@ -2086,7 +2092,7 @@ defmodule Explorer.Series do iex> s1 = Explorer.Series.from_list([<<1>>, <<239, 191, 19>>], dtype: :binary) iex> s2 = Explorer.Series.from_list([<<3>>, <<4>>], dtype: :binary) iex> Explorer.Series.format([s1, s2]) - ** (RuntimeError) Polars Error: invalid utf-8 sequence + ** (RuntimeError) Polars Error: invalid utf8 """ @doc type: :shape @spec format([Series.t() | String.t()]) :: Series.t() diff --git a/native/explorer/Cargo.lock b/native/explorer/Cargo.lock index 10e0271e7..d00717542 100644 --- a/native/explorer/Cargo.lock +++ b/native/explorer/Cargo.lock @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.7" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" +checksum = "42cd52102d3df161c77a887b608d7a4897d7cc112886a9537b738a887a03aaff" dependencies = [ "cfg-if", "getrandom", @@ -96,16 +96,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" -[[package]] -name = "arrow-format" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07884ea216994cdc32a2d5f8274a8bee979cfe90274b83f86f440866ee3132c7" -dependencies = [ - "planus", - "serde", -] - [[package]] name = "async-stream" version = "0.3.5" @@ -125,7 +115,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -136,7 +126,7 @@ checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -216,15 +206,15 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.14.0" +version = "3.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +checksum = "d32a994c2b3ca201d9b263612a374263f05e7adde37c4707f693dcd375076d1f" [[package]] name = "bytemuck" -version = "1.14.0" +version = "1.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" +checksum = "a2ef034f05691a48569bd920a96c81b9d91bbad1ab5ac7c4616c1f6ef36cb79f" dependencies = [ "bytemuck_derive", ] @@ -237,7 +227,7 @@ checksum = "965ab7eb5f8f97d2a083c799f3a1b994fc397b2fe2da5d1da1626ce15a39f2b1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -264,9 +254,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +checksum = "5bc015644b92d5890fab7489e49d21f879d5c990186827d42ec511919404f38b" dependencies = [ "android-tzdata", "iana-time-zone", @@ -274,7 +264,29 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.48.5", + "windows-targets 0.52.0", +] + +[[package]] +name = "chrono-tz" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d59ae0466b83e838b81a54256c39d5d7c20b9d7daa10510a242d9b75abd5936e" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf", +] + +[[package]] +name = "chrono-tz-build" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f" +dependencies = [ + "parse-zoneinfo", + "phf", + "phf_codegen", ] [[package]] @@ -304,9 +316,9 @@ checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "crc32fast" -version = "1.3.2" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" dependencies = [ "cfg-if", ] @@ -368,9 +380,9 @@ checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" [[package]] name = "either" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" [[package]] name = "encoding_rs" @@ -390,7 +402,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -413,7 +425,7 @@ dependencies = [ "chrono", "either", "mimalloc", - "object_store", + "object_store 0.8.0", "polars", "polars-ops", "rand", @@ -534,7 +546,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -649,9 +661,15 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" -version = "0.3.4" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" +checksum = "bd5256b483761cd23699d0da46cc6fd2ee3be420bbe6d020ae4a091e70b7e9fd" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "home" @@ -742,9 +760,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.59" +version = "0.1.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -775,9 +793,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.1.0" +version = "2.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" +checksum = "233cf39063f058ea2caae4091bf4a3ef70a653afbc026f5c4a4135d114e3c177" dependencies = [ "equivalent", "hashbrown 0.14.3", @@ -798,6 +816,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.10" @@ -812,18 +839,18 @@ checksum = "9028f49264629065d057f340a86acb84867925865f73bbf8d47b4d149a7e88b8" [[package]] name = "jobserver" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" +checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" dependencies = [ "libc", ] [[package]] name = "js-sys" -version = "0.3.67" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a1d36f1235bc969acba30b7f5990b864423a6068a10f7c90ae8f0112e3a59d1" +checksum = "406cda4b368d531c842222cf9d2600a9a4acce8d29423695379c6868a143a9ee" dependencies = [ "wasm-bindgen", ] @@ -910,9 +937,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.152" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "libm" @@ -1008,9 +1035,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" dependencies = [ "adler", ] @@ -1068,9 +1095,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.17" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" +checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" dependencies = [ "autocfg", "libm", @@ -1108,7 +1135,36 @@ dependencies = [ "futures", "humantime", "hyper", - "itertools", + "itertools 0.11.0", + "parking_lot", + "percent-encoding", + "quick-xml", + "rand", + "reqwest", + "ring", + "serde", + "serde_json", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + +[[package]] +name = "object_store" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d139f545f64630e2e3688fd9f81c470888ab01edeb72d13b4e86c566f1130000" +dependencies = [ + "async-trait", + "base64", + "bytes", + "chrono", + "futures", + "humantime", + "hyper", + "itertools 0.12.1", "parking_lot", "percent-encoding", "quick-xml", @@ -1130,6 +1186,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + [[package]] name = "parking_lot" version = "0.12.1" @@ -1163,12 +1225,59 @@ dependencies = [ "futures", ] +[[package]] +name = "parse-zoneinfo" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c705f256449c60da65e11ff6626e0c16a0a0b96aaa348de61376b249bc340f41" +dependencies = [ + "regex", +] + [[package]] name = "percent-encoding" version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project-lite" version = "0.2.13" @@ -1183,9 +1292,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.29" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "planus" @@ -1198,9 +1307,9 @@ dependencies = [ [[package]] name = "polars" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "938048fcda6a8e2ace6eb168bee1b415a92423ce51e418b853bf08fc40349b6b" +checksum = "e43795c49010cb851d45227caa17769e83760e21d260ba6285c563b754e1652f" dependencies = [ "getrandom", "polars-core", @@ -1214,16 +1323,16 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce68a02f698ff7787c261aea1b4c040a8fe183a8fb200e2436d7f35d95a1b86f" +checksum = "faacd21a2548fa6d50c72d6b8d4649a8e029a0f3c6c5545b7f436f0610e49b0f" dependencies = [ "ahash", - "arrow-format", "atoi", "atoi_simd", "bytemuck", "chrono", + "chrono-tz", "dyn-clone", "either", "ethnum", @@ -1233,9 +1342,11 @@ dependencies = [ "getrandom", "hashbrown 0.14.3", "itoa", + "itoap", "lz4", "multiversion", "num-traits", + "polars-arrow-format", "polars-error", "polars-utils", "ryu", @@ -1246,33 +1357,44 @@ dependencies = [ "zstd", ] +[[package]] +name = "polars-arrow-format" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b0ef2474af9396b19025b189d96e992311e6a47f90c53cd998b36c4c64b84c" +dependencies = [ + "planus", + "serde", +] + [[package]] name = "polars-compute" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b14fbc5f141b29b656a4cec4802632e5bff10bf801c6809c6bbfbd4078a044dd" +checksum = "32d9dc87f8003ae0edeef5ad9ac92b2a345480bbe17adad64496113ae84706dd" dependencies = [ "bytemuck", "num-traits", "polars-arrow", + "polars-error", "polars-utils", "version_check", ] [[package]] name = "polars-core" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0f5efe734b6cbe5f97ea769be8360df5324fade396f1f3f5ad7fe9360ca4a23" +checksum = "befd4d280a82219a01035c4f901319ceba65998c594d0c64f9a439cdee1d7777" dependencies = [ "ahash", "bitflags 2.4.2", "bytemuck", "chrono", + "chrono-tz", "either", "hashbrown 0.14.3", "indexmap", - "itoap", "num-traits", "once_cell", "polars-arrow", @@ -1292,12 +1414,12 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6396de788f99ebfc9968e7b6f523e23000506cde4ba6dfc62ae4ce949002a886" +checksum = "50f2435b02d1ba36d8c1f6a722cad04e4c0b2705a3112c5706e6960d405d7798" dependencies = [ - "arrow-format", - "object_store", + "object_store 0.9.0", + "polars-arrow-format", "regex", "simdutf8", "thiserror", @@ -1305,9 +1427,9 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d0458efe8946f4718fd352f230c0db5a37926bd0d2bd25af79dc24746abaaea" +checksum = "b51fba2cf014cb39c2b38353d601540fb9db643be65abb9ca8ff44b9c4c4a88e" dependencies = [ "ahash", "async-trait", @@ -1322,7 +1444,7 @@ dependencies = [ "memchr", "memmap2", "num-traits", - "object_store", + "object_store 0.9.0", "once_cell", "percent-encoding", "polars-arrow", @@ -1348,9 +1470,9 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea47d46b7a98fa683ef235ad48b783abf61734828e754096cfbdc77404fff9b3" +checksum = "973d1f40ba964e70cf0038779056a7850f649538f72d8828c21bc1a7bce312ed" dependencies = [ "ahash", "chrono", @@ -1369,9 +1491,9 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d7105b40905bb38e8fc4a7fd736594b7491baa12fad3ac492969ca221a1b5d5" +checksum = "d83343e413346f048f3a5ad07c0ea4b5d0bada701a482878213142970b0ddff8" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1395,15 +1517,19 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e09afc456ab11e75e5dcb43e00a01c71f3a46a2781e450054acb6bb096ca78e" +checksum = "6395f5fd5e1adf016fd6403c0a493181c1a349a7a145b2687cdf50a0d630310a" dependencies = [ "ahash", "argminmax", + "base64", "bytemuck", + "chrono", + "chrono-tz", "either", "hashbrown 0.14.3", + "hex", "indexmap", "jsonpath_lib", "memchr", @@ -1420,14 +1546,15 @@ dependencies = [ "regex", "serde_json", "smartstring", + "unicode-reverse", "version_check", ] [[package]] name = "polars-parquet" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba24d67b1f64ab85143033dd46fa090b13c0f74acdf91b0780c16aecf005e3d" +checksum = "b664cac41636cc9f146fba584a8e7c2790d7335a278964529fa3e9b4eae96daf" dependencies = [ "ahash", "async-stream", @@ -1451,9 +1578,9 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b7ead073cc3917027d77b59861a9f071db47125de9314f8907db1a0a3e4100" +checksum = "390a831b864bc57a4cb260b0595030dfb6a4260a3723cf8ca17968ee2078b8ff" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -1477,12 +1604,13 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "384a175624d050c31c473ee11df9d7af5d729ae626375e522158cfb3d150acd0" +checksum = "7fb7d7527be2aa33baace9000f6772eb9df7cd57ec010a4b273435d2dc1349e8" dependencies = [ "ahash", "bytemuck", + "chrono-tz", "once_cell", "percent-encoding", "polars-arrow", @@ -1502,9 +1630,9 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32322f7acbb83db3e9c7697dc821be73d06238da89c817dcc8bc1549a5e9c72f" +checksum = "f4984d97aad3d0db92afe76ebcab10b5e37a1216618b5703ae0d2917ccd6168c" dependencies = [ "polars-arrow", "polars-error", @@ -1513,10 +1641,11 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f0b4c6ddffdfd0453e84bc3918572c633014d661d166654399cf93752aa95b5" +checksum = "77f62a8b8f93146ec1eb2ef340d77eeb174e8010035e449bfdd424d2b1fd944a" dependencies = [ + "hex", "polars-arrow", "polars-core", "polars-error", @@ -1530,12 +1659,13 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee2649fc96bd1b6584e0e4a4b3ca7d22ed3d117a990e63ad438ecb26f7544d0" +checksum = "6d75348a51d0c97f3b83df860ecb35a6ac6c5dafc6278cac4e1ac101d96dc753" dependencies = [ "atoi", "chrono", + "chrono-tz", "now", "once_cell", "polars-arrow", @@ -1549,9 +1679,9 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.36.2" +version = "0.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b174ca4a77ad47d7b91a0460aaae65bbf874c8bfbaaa5308675dadef3976bbda" +checksum = "38f9c955bb1e9b55d835aeb7fe4e4e8826e01abe5f0ada979ceb7d2b9af7b569" dependencies = [ "ahash", "bytemuck", @@ -1574,9 +1704,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.76" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -1695,14 +1825,14 @@ checksum = "5fddb4f8d99b0a2ebafc65a87a69a7b9875e4b1ae1f00db265d300ef7f28bccc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] name = "regex" -version = "1.10.2" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", @@ -1712,9 +1842,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" dependencies = [ "aho-corasick", "memchr", @@ -1729,9 +1859,9 @@ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "reqwest" -version = "0.11.23" +version = "0.11.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37b1ae8d9ac08420c66222fb9096fc5de435c3c48542bc5336c51892cffafb41" +checksum = "c6920094eb85afde5e4a138be3f2de8bbdf28000f0029e72c45025a56b042251" dependencies = [ "base64", "bytes", @@ -1751,10 +1881,12 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rustls", + "rustls-native-certs", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", + "sync_wrapper", "system-configuration", "tokio", "tokio-rustls", @@ -1809,14 +1941,14 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] name = "rustler_sys" -version = "2.3.1" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a7c0740e5322b64e2b952d8f0edce5f90fcf6f6fe74cca3f6e78eb3de5ea858" +checksum = "ff76ba8524729d7c9db2b3e80f2269d1fdef39b5a60624c33fd794797e69b558" dependencies = [ "regex", "unreachable", @@ -1834,6 +1966,18 @@ dependencies = [ "sct", ] +[[package]] +name = "rustls-native-certs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe", + "rustls-pemfile", + "schannel", + "security-framework", +] + [[package]] name = "rustls-pemfile" version = "1.0.4" @@ -1874,6 +2018,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -1890,6 +2043,29 @@ dependencies = [ "untrusted", ] +[[package]] +name = "security-framework" +version = "2.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "seq-macro" version = "0.3.5" @@ -1898,29 +2074,29 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.195" +version = "1.0.196" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63261df402c67811e9ac6def069e4786148c4563f4b50fd4bf30aa370d626b02" +checksum = "870026e60fa08c69f064aa766c10f10b1d62db9ccd4d0abb206472bee0ce3b32" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.195" +version = "1.0.196" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46fe8f8603d81ba86327b23a2e9cdf49e1255fb94a4c5f297f6ee0547178ea2c" +checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] name = "serde_json" -version = "1.0.111" +version = "1.0.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "176e46fa42316f18edd598015a5166857fc835ec732f5215eac6b7bdbf0a84f4" +checksum = "69801b70b1c3dac963ecb03a364ba0ceda9cf60c71cfe475e99864759c8b8a79" dependencies = [ "indexmap", "itoa", @@ -1964,6 +2140,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "slab" version = "0.4.9" @@ -2080,7 +2262,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -2096,15 +2278,21 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.48" +version = "2.0.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" +checksum = "915aea9e586f80826ee59f8453c1101f9d1c4b3964cd2460185ee8e299ada496" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + [[package]] name = "sysinfo" version = "0.30.5" @@ -2148,22 +2336,22 @@ checksum = "cfb5fa503293557c5158bd215fdc225695e567a77e453f5d4452a50a193969bd" [[package]] name = "thiserror" -version = "1.0.56" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" +checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.56" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" +checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -2183,9 +2371,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.35.1" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", @@ -2206,7 +2394,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -2258,7 +2446,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -2297,6 +2485,21 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-reverse" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bea5dacebb0d2d0a69a6700a05b59b3908bf801bf563a49bd27a1b60122962c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "unicode-segmentation" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" + [[package]] name = "unreachable" version = "1.0.0" @@ -2325,9 +2528,9 @@ dependencies = [ [[package]] name = "value-trait" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea87257cfcbedcb9444eda79c59fdfea71217e6305afee8ee33f500375c2ac97" +checksum = "dad8db98c1e677797df21ba03fca7d3bf9bec3ca38db930954e4fe6e1ea27eb4" dependencies = [ "float-cmp", "halfbrown", @@ -2374,9 +2577,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1223296a201415c7fad14792dbefaace9bd52b62d33453ade1c5b5f07555406" +checksum = "c1e124130aee3fb58c5bdd6b639a0509486b0338acaaae0c84a5124b0f588b7f" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -2384,24 +2587,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcdc935b63408d58a32f8cc9738a0bffd8f05cc7c002086c6ef20b7312ad9dcd" +checksum = "c9e7e1900c352b609c8488ad12639a311045f40a35491fb69ba8c12f758af70b" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.40" +version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bde2032aeb86bdfaecc8b261eef3cba735cc426c1f3a3416d1e0791be95fc461" +checksum = "877b9c3f61ceea0e56331985743b13f3d25c406a7098d45180fb5f09bc19ed97" dependencies = [ "cfg-if", "js-sys", @@ -2411,9 +2614,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e4c238561b2d428924c49815533a8b9121c664599558a5d9ec51f8a1740a999" +checksum = "b30af9e2d358182b5c7449424f017eba305ed32a7010509ede96cdc4696c46ed" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2421,28 +2624,28 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bae1abb6806dc1ad9e560ed242107c0f6c84335f1749dd4e8ddb012ebd5e25a7" +checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d91413b1c31d7539ba5ef2451af3f0b833a005eb27a631cec32bc0635a8602b" +checksum = "4f186bd2dcf04330886ce82d6f33dd75a7bfcf69ecf5763b89fcde53b6ac9838" [[package]] name = "wasm-streams" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4609d447824375f43e1ffbc051b50ad8f4b3ae8219680c94452ea05eb240ac7" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" dependencies = [ "futures-util", "js-sys", @@ -2453,9 +2656,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.67" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed" +checksum = "96565907687f7aceb35bc5fc03770a8a0471d82e479f25832f54a0e3f4b28446" dependencies = [ "js-sys", "wasm-bindgen", @@ -2463,9 +2666,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "0.25.3" +version = "0.25.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10" +checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" [[package]] name = "winapi" @@ -2682,7 +2885,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] diff --git a/native/explorer/Cargo.toml b/native/explorer/Cargo.toml index ffad2494e..6be1b7c43 100644 --- a/native/explorer/Cargo.toml +++ b/native/explorer/Cargo.toml @@ -40,7 +40,7 @@ mimalloc = { version = "*", default-features = false } jsonpath_lib = { version = "0.3", git = "https://github.com/ritchie46/jsonpath", branch = "improve_compiled" } [dependencies.polars] -version = "0.36" +version = "0.37" default-features = false features = [ "abs", @@ -57,9 +57,8 @@ features = [ "group_by_list", "ipc", "ipc_streaming", - "horizontal_concat", "lazy", - "lazy_regex", + "regex", "log", "mode", "parquet", @@ -86,7 +85,7 @@ features = [ ] [dependencies.polars-ops] -version = "0.36" +version = "0.37" features = ["abs", "ewma", "cum_agg", "cov"] [features] diff --git a/native/explorer/src/dataframe.rs b/native/explorer/src/dataframe.rs index 0ca0ad0a4..de84d361a 100644 --- a/native/explorer/src/dataframe.rs +++ b/native/explorer/src/dataframe.rs @@ -121,14 +121,14 @@ pub fn df_concat_columns( others: Vec, ) -> Result { let id_column = "__row_count_id__"; - let first = data.clone_inner().lazy().with_row_count(id_column, None); + let first = data.clone_inner().lazy().with_row_index(id_column, None); // We need to be able to handle arbitrary column name overlap. // This builds up a join and suffixes conflicting names with _N where // N is the index of the df in the join array. let (out_df, _) = others .iter() - .map(|data| data.clone_inner().lazy().with_row_count(id_column, None)) + .map(|data| data.clone_inner().lazy().with_row_index(id_column, None)) .fold((first, 1), |(acc_df, count), lazy_df| { let suffix = format!("_{count}"); let new_df = acc_df @@ -142,9 +142,7 @@ pub fn df_concat_columns( (new_df, count + 1) }); - Ok(ExDataFrame::new( - out_df.drop_columns([id_column]).collect()?, - )) + Ok(ExDataFrame::new(out_df.drop([id_column]).collect()?)) } #[rustler::nif(schedule = "DirtyCpu")] diff --git a/native/explorer/src/dataframe/io.rs b/native/explorer/src/dataframe/io.rs index 55b7f6449..f21d0f37c 100644 --- a/native/explorer/src/dataframe/io.rs +++ b/native/explorer/src/dataframe/io.rs @@ -9,6 +9,7 @@ // Today we have the following formats: CSV, NDJSON, Parquet, Apache Arrow and Apache Arrow Stream. // use polars::prelude::*; +use std::num::NonZeroUsize; use rustler::{Binary, Env, NewBinary}; use std::convert::TryFrom; @@ -503,6 +504,9 @@ pub fn df_from_ndjson( ) -> Result { let file = File::open(filename)?; let buf_reader = BufReader::new(file); + let batch_size = NonZeroUsize::new(batch_size).ok_or(ExplorerError::Other( + "\"batch_size\" expected to be non zero.".to_string(), + ))?; let reader = JsonReader::new(buf_reader) .with_json_format(JsonFormat::JsonLines) .with_batch_size(batch_size) @@ -557,6 +561,9 @@ pub fn df_load_ndjson( batch_size: usize, ) -> Result { let cursor = Cursor::new(binary.as_slice()); + let batch_size = NonZeroUsize::new(batch_size).ok_or(ExplorerError::Other( + "\"batch_size\" expected to be non zero.".to_string(), + ))?; let reader = JsonReader::new(cursor) .with_json_format(JsonFormat::JsonLines) .with_batch_size(batch_size) diff --git a/native/explorer/src/encoding.rs b/native/explorer/src/encoding.rs index 43ebc34b5..cc2744630 100644 --- a/native/explorer/src/encoding.rs +++ b/native/explorer/src/encoding.rs @@ -1,5 +1,4 @@ use chrono::prelude::*; -use polars::export::arrow::array::GenericBinaryArray; use polars::prelude::*; use rustler::{Encoder, Env, NewBinary, OwnedBinary, ResourceArc, Term}; use std::collections::HashMap; @@ -15,7 +14,7 @@ use crate::datatypes::{ use crate::ExplorerError; use rustler::types::atom; -use rustler::wrapper::{binary, list, map, NIF_TERM}; +use rustler::wrapper::{list, map, NIF_TERM}; // Encoding helpers @@ -371,54 +370,32 @@ fn time_series_to_list<'b>(s: &Series, env: Env<'b>) -> Result, Explore )) } -fn generic_binary_series_to_list<'a, 'b, T, G>( +fn generic_string_series_to_list<'b>(s: &Series, env: Env<'b>) -> Result, ExplorerError> { + Ok(unsafe_iterator_series_to_list!( + env, + s.str()?.into_iter().map(|option| option.encode(env)) + )) +} + +fn generic_binary_series_to_list<'b>( resource: &ResourceArc, - iter: T, + s: &Series, env: Env<'b>, -) -> Result, ExplorerError> -where - T: Iterator + DoubleEndedIterator, - G: GenericBinaryArray, -{ +) -> Result, ExplorerError> { let env_as_c_arg = env.as_c_arg(); let nil_as_c_arg = atom::nil().to_term(env).as_c_arg(); let acc = unsafe { list::make_list(env_as_c_arg, &[]) }; - - let list = iter.rfold(acc, |acc, array| { - // Create a binary per array buffer - let values = array.values(); - - let binary = unsafe { resource.make_binary_unsafe(env, |_| values) } - .to_term(env) - .as_c_arg(); - - // Offsets have one more element than values and validity, - // so we read the last one as the initial accumulator and skip it. - let len = array.offsets().len(); - let iter = array.offsets()[0..len - 1].iter(); - let mut last_offset = array.offsets()[len - 1] as NIF_TERM; - - let mut validity_iter = match array.validity() { - Some(validity) => validity.iter(), - None => polars::export::arrow::bitmap::utils::BitmapIter::new(&[], 0, 0), - }; - - iter.rfold(acc, |acc, uncast_offset| { - let offset = *uncast_offset as NIF_TERM; - - let term_as_c_arg = if validity_iter.next_back().unwrap_or(true) { - unsafe { - binary::make_subbinary(env_as_c_arg, binary, offset, last_offset - offset) - } - } else { - nil_as_c_arg + let list = s.binary()?.downcast_iter().rfold(acc, |acc, array| { + array.iter().rfold(acc, |acc, v| { + let term_as_c_arg = match v { + Some(values) => unsafe { resource.make_binary_unsafe(env, |_| values) } + .to_term(env) + .as_c_arg(), + None => nil_as_c_arg, }; - - last_offset = offset; unsafe { list::make_list_cell(env_as_c_arg, term_as_c_arg, acc) } }) }); - Ok(unsafe { Term::new(env, list) }) } @@ -624,12 +601,13 @@ pub fn list_from_series(s: ExSeries, env: Env) -> Result { DataType::Time => time_series_to_list(&s, env), DataType::Datetime(time_unit, None) => datetime_series_to_list(&s, *time_unit, env), DataType::Duration(time_unit) => duration_series_to_list(&s, *time_unit, env), - DataType::String => { - generic_binary_series_to_list(&s.resource, s.str()?.downcast_iter(), env) - } - DataType::Binary => { - generic_binary_series_to_list(&s.resource, s.binary()?.downcast_iter(), env) - } + DataType::Binary => generic_binary_series_to_list(&s.resource, &s, env), + DataType::String => generic_string_series_to_list(&s, env), + // generic_binary_series_to_list(&s.resource, s.str()?.downcast_iter(), env) + // } + // DataType::Binary => { + // generic_binary_series_to_list(&s.resource, s.binary()?.downcast_iter(), env) + // } DataType::Categorical(Some(mapping), _) => categorical_series_to_list(&s, env, mapping), DataType::List(_inner_dtype) => s .list()? diff --git a/native/explorer/src/expressions.rs b/native/explorer/src/expressions.rs index 03e3df966..f304c31ed 100644 --- a/native/explorer/src/expressions.rs +++ b/native/explorer/src/expressions.rs @@ -602,7 +602,7 @@ pub fn expr_last(expr: ExExpr) -> ExExpr { #[rustler::nif] pub fn expr_format(exprs: Vec) -> ExExpr { - ExExpr::new(concat_str(ex_expr_to_exprs(exprs), "")) + ExExpr::new(concat_str(ex_expr_to_exprs(exprs), "", true)) //TODO: ignore_nulls } #[rustler::nif] @@ -879,8 +879,12 @@ pub fn expr_rstrip(expr: ExExpr, string: Option) -> ExExpr { #[rustler::nif] pub fn expr_substring(expr: ExExpr, offset: i64, length: Option) -> ExExpr { + let length = match length { + Some(l) => l.lit(), + None => Expr::Literal(LiteralValue::Null), + }; let expr = expr.clone_inner(); - ExExpr::new(expr.str().slice(offset, length)) + ExExpr::new(expr.str().slice(offset.lit(), length)) } #[rustler::nif] @@ -1051,7 +1055,7 @@ pub fn expr_second(expr: ExExpr) -> ExExpr { pub fn expr_join(expr: ExExpr, sep: String) -> ExExpr { let expr = expr.clone_inner(); - ExExpr::new(expr.list().join(sep.lit())) + ExExpr::new(expr.list().join(sep.lit(), true)) //TODO: ignore_nulls } #[rustler::nif] diff --git a/native/explorer/src/lazyframe.rs b/native/explorer/src/lazyframe.rs index c7465e3af..e78f46edd 100644 --- a/native/explorer/src/lazyframe.rs +++ b/native/explorer/src/lazyframe.rs @@ -266,14 +266,14 @@ pub fn lf_concat_columns( others: Vec, ) -> Result { let id_column = "__row_count_id__"; - let first = data.clone_inner().with_row_count(id_column, None); + let first = data.clone_inner().with_row_index(id_column, None); // We need to be able to handle arbitrary column name overlap. // This builds up a join and suffixes conflicting names with _N where // N is the index of the df in the join array. let (out_df, _) = others .iter() - .map(|data| data.clone_inner().with_row_count(id_column, None)) + .map(|data| data.clone_inner().with_row_index(id_column, None)) .fold((first, 1), |(acc_df, count), df| { let suffix = format!("_{count}"); let new_df = acc_df @@ -287,5 +287,5 @@ pub fn lf_concat_columns( (new_df, count + 1) }); - Ok(ExLazyFrame::new(out_df.drop_columns([id_column]))) + Ok(ExLazyFrame::new(out_df.drop([id_column]))) } diff --git a/native/explorer/src/lazyframe/io.rs b/native/explorer/src/lazyframe/io.rs index 794152800..c2ba01dfb 100644 --- a/native/explorer/src/lazyframe/io.rs +++ b/native/explorer/src/lazyframe/io.rs @@ -1,6 +1,7 @@ use polars::prelude::*; use std::fs::File; use std::io::BufWriter; +use std::num::NonZeroUsize; use std::result::Result; use crate::dataframe::io::schema_from_dtypes_pairs; @@ -230,11 +231,14 @@ pub fn lf_from_csv( pub fn lf_from_ndjson( filename: String, infer_schema_length: Option, - batch_size: Option, + batch_size: usize, ) -> Result { + let batch_size = NonZeroUsize::new(batch_size).ok_or(ExplorerError::Other( + "\"batch_size\" expected to be non zero.".to_string(), + ))?; let lf = LazyJsonLineReader::new(filename) .with_infer_schema_length(infer_schema_length) - .with_batch_size(batch_size) + .with_batch_size(Some(batch_size)) .finish()?; Ok(ExLazyFrame::new(lf)) diff --git a/native/explorer/src/series.rs b/native/explorer/src/series.rs index bc487d858..2df21f1d3 100644 --- a/native/explorer/src/series.rs +++ b/native/explorer/src/series.rs @@ -8,7 +8,7 @@ use crate::{ }; use encoding::encode_datetime; -use polars::export::arrow::array::Utf8Array; + use polars::prelude::*; use polars_ops::chunked_array::cov::{cov, pearson_corr}; use polars_ops::prelude::peaks::*; @@ -1346,52 +1346,12 @@ pub fn s_categorise(s: ExSeries, cat: ExSeries) -> Result { - if cat.len() != cat.unique()?.len() { - return Err(ExplorerError::Other( - "categories as strings cannot have duplicated values".into(), - )); - }; - - let utf8s = cat.str()?; - - if utf8s.has_validity() { - Err(ExplorerError::Other( - "categories as strings cannot have nil values".into(), - )) - } else { - let values: Vec> = utf8s.into(); - let array = Utf8Array::::from(values); - let mapping = RevMapping::build_local(array); - - let chunks = if s.dtype() == &DataType::String { - let ids: ChunkedArray = s - .str()? - .into_iter() - .map(|opt_str| opt_str.and_then(|slice| mapping.find(slice))) - .collect(); - - ids - } else { - s.cast(&DataType::UInt32)?.u32()?.clone() - }; - - let categorical_chunks = unsafe { - CategoricalChunked::from_cats_and_rev_map_unchecked( - chunks, - Arc::new(mapping), - CategoricalOrdering::default(), - ) - }; - - Ok(ExSeries::new(categorical_chunks.into_series())) - } - } _ => panic!("Cannot get categories from non categorical or string series"), } } @@ -1590,12 +1550,22 @@ pub fn s_rstrip(s1: ExSeries, pattern: Option<&str>) -> Result, ) -> Result { - let s2 = s1.str()?.str_slice(offset, length).into_series(); - + let length = match length { + Some(l) => l.lit(), + None => Expr::Literal(LiteralValue::Null), + }; + let s2 = s + .clone_inner() + .into_frame() + .lazy() + .select([col(s.name()).str().slice(offset.lit(), length)]) + .collect()? + .column(s.name())? + .clone(); Ok(ExSeries::new(s2)) } @@ -1778,7 +1748,7 @@ pub fn s_atan(s: ExSeries) -> Result { pub fn s_join(s1: ExSeries, separator: &str) -> Result { let s2 = s1 .list()? - .lst_join(&ChunkedArray::new("a", &[separator]))? + .lst_join(&ChunkedArray::new("a", &[separator]), true)? //TODO: ignore_nulls .into_series(); Ok(ExSeries::new(s2)) diff --git a/test/explorer/series_test.exs b/test/explorer/series_test.exs index 35b8cafe0..9566b512a 100644 --- a/test/explorer/series_test.exs +++ b/test/explorer/series_test.exs @@ -2981,7 +2981,7 @@ defmodule Explorer.SeriesTest do s2 = Series.from_list([<<3>>, <<4>>], dtype: :binary) assert_raise RuntimeError, - "Polars Error: invalid utf-8 sequence", + "Polars Error: invalid utf8", fn -> Series.format([s1, s2]) end end From 71850cce7d98ba1460adbd03e5ee10a1f0b8b361 Mon Sep 17 00:00:00 2001 From: Kartheek Date: Sun, 18 Feb 2024 21:00:10 +0530 Subject: [PATCH 2/4] series fixes --- lib/explorer/backend/lazy_series.ex | 7 +++- lib/explorer/polars_backend/series.ex | 22 +++++++++++-- lib/explorer/series.ex | 24 +++++++++++--- native/explorer/src/encoding.rs | 5 --- native/explorer/src/expressions.rs | 4 +-- native/explorer/src/series.rs | 2 +- test/explorer/series_test.exs | 46 +++++++++++++++++++++++++++ 7 files changed, 94 insertions(+), 16 deletions(-) diff --git a/lib/explorer/backend/lazy_series.ex b/lib/explorer/backend/lazy_series.ex index 5bd0b10c1..caa937900 100644 --- a/lib/explorer/backend/lazy_series.ex +++ b/lib/explorer/backend/lazy_series.ex @@ -605,7 +605,12 @@ defmodule Explorer.Backend.LazySeries do @impl true def format(list) do - series_list = Enum.map(list, &series_or_lazy_series!/1) + series_list = + Enum.map(list, fn + s when is_binary(s) -> s + s -> series_or_lazy_series!(s) + end) + data = new(:format, [series_list], :string, aggregations?(series_list)) Backend.Series.new(data, :string) diff --git a/lib/explorer/polars_backend/series.ex b/lib/explorer/polars_backend/series.ex index f35c0183b..c88c010fb 100644 --- a/lib/explorer/polars_backend/series.ex +++ b/lib/explorer/polars_backend/series.ex @@ -132,10 +132,26 @@ defmodule Explorer.PolarsBackend.Series do @impl true def format(list) do - polars_series = for s <- list, do: s.data + {_, df_args, params} = + Enum.reduce(list, {0, [], []}, fn s, {counter, df_args, params} -> + if is_binary(s) do + {counter, df_args, [s | params]} + else + counter = counter + 1 + name = "#{counter}" + column = Explorer.Backend.LazySeries.new(:column, [name], :string) + {counter, [{name, s} | df_args], [column | params]} + end + end) - Shared.apply(:s_format, [polars_series]) - |> Shared.create_series() + df = Explorer.PolarsBackend.DataFrame.from_series(df_args) + format_expr = Explorer.Backend.LazySeries.new(:format, [Enum.reverse(params)], :string) + out_dtypes = Map.put(df.dtypes, "result", :string) + out_names = df.names ++ ["result"] + out_df = %{df | dtypes: out_dtypes, names: out_names} + + Explorer.PolarsBackend.DataFrame.mutate_with(df, out_df, [{"result", format_expr}]) + |> Explorer.PolarsBackend.DataFrame.pull("result") end @impl true diff --git a/lib/explorer/series.ex b/lib/explorer/series.ex index 386a7d685..14ccbfe40 100644 --- a/lib/explorer/series.ex +++ b/lib/explorer/series.ex @@ -1280,13 +1280,27 @@ defmodule Explorer.Series do def categorise(%Series{dtype: l_dtype} = series, %Series{dtype: :string} = categories) when K.in(l_dtype, [:string | @integer_types]) do - categories = categories |> distinct() |> cast(:category) + if nil_count(categories) != 0, + do: + raise( + ArgumentError, + "categories as strings cannot have nil values" + ) + + if count(categories) != n_distinct(categories), + do: + raise( + ArgumentError, + "categories as strings cannot have duplicated values" + ) + + categories = cast(categories, :category) apply_series(series, :categorise, [categories]) end def categorise(%Series{dtype: l_dtype} = series, [head | _] = categories) when K.and(K.in(l_dtype, [:string | @integer_types]), is_binary(head)), - do: apply_series(series, :categorise, [from_list(categories, dtype: :category)]) + do: categorise(series, from_list(categories, dtype: :string)) # Slice and dice @@ -2109,8 +2123,10 @@ defmodule Explorer.Series do %Series{} = s -> cast(s, :string) - value when is_binary(value) -> - from_list([value], dtype: :string) + value when K.or(is_binary(value), K.is_nil(value)) -> + value + + # from_list([value], dtype: :string) other -> raise ArgumentError, diff --git a/native/explorer/src/encoding.rs b/native/explorer/src/encoding.rs index cc2744630..ad036e9db 100644 --- a/native/explorer/src/encoding.rs +++ b/native/explorer/src/encoding.rs @@ -603,11 +603,6 @@ pub fn list_from_series(s: ExSeries, env: Env) -> Result { DataType::Duration(time_unit) => duration_series_to_list(&s, *time_unit, env), DataType::Binary => generic_binary_series_to_list(&s.resource, &s, env), DataType::String => generic_string_series_to_list(&s, env), - // generic_binary_series_to_list(&s.resource, s.str()?.downcast_iter(), env) - // } - // DataType::Binary => { - // generic_binary_series_to_list(&s.resource, s.binary()?.downcast_iter(), env) - // } DataType::Categorical(Some(mapping), _) => categorical_series_to_list(&s, env, mapping), DataType::List(_inner_dtype) => s .list()? diff --git a/native/explorer/src/expressions.rs b/native/explorer/src/expressions.rs index f304c31ed..3a18147e9 100644 --- a/native/explorer/src/expressions.rs +++ b/native/explorer/src/expressions.rs @@ -602,7 +602,7 @@ pub fn expr_last(expr: ExExpr) -> ExExpr { #[rustler::nif] pub fn expr_format(exprs: Vec) -> ExExpr { - ExExpr::new(concat_str(ex_expr_to_exprs(exprs), "", true)) //TODO: ignore_nulls + ExExpr::new(concat_str(ex_expr_to_exprs(exprs), "", true)) } #[rustler::nif] @@ -1055,7 +1055,7 @@ pub fn expr_second(expr: ExExpr) -> ExExpr { pub fn expr_join(expr: ExExpr, sep: String) -> ExExpr { let expr = expr.clone_inner(); - ExExpr::new(expr.list().join(sep.lit(), true)) //TODO: ignore_nulls + ExExpr::new(expr.list().join(sep.lit(), true)) } #[rustler::nif] diff --git a/native/explorer/src/series.rs b/native/explorer/src/series.rs index 2df21f1d3..d07d2d6ad 100644 --- a/native/explorer/src/series.rs +++ b/native/explorer/src/series.rs @@ -1748,7 +1748,7 @@ pub fn s_atan(s: ExSeries) -> Result { pub fn s_join(s1: ExSeries, separator: &str) -> Result { let s2 = s1 .list()? - .lst_join(&ChunkedArray::new("a", &[separator]), true)? //TODO: ignore_nulls + .lst_join(&ChunkedArray::new("a", &[separator]), true)? .into_series(); Ok(ExSeries::new(s2)) diff --git a/test/explorer/series_test.exs b/test/explorer/series_test.exs index 9566b512a..a7f52fa25 100644 --- a/test/explorer/series_test.exs +++ b/test/explorer/series_test.exs @@ -2917,6 +2917,10 @@ defmodule Explorer.SeriesTest do assert Series.format([s1, s2]) |> Series.to_list() == ["ac", "bd"] end + test "with two strings with nulls" do + assert Series.format(["a", nil, "b"]) |> Series.to_list() == ["ab"] + end + test "with two strings" do assert Series.format(["a", "b"]) |> Series.to_list() == ["ab"] end @@ -3728,6 +3732,42 @@ defmodule Explorer.SeriesTest do assert Series.to_list(categorized) == ["a", "c", "b", "a", "c"] assert Series.dtype(categorized) == :category end + + test "raise for string list with nils" do + categories = ["a", "b", "c", nil] + indexes = Series.from_list([0, 2, 1, 0, 2], dtype: :u32) + + assert_raise ArgumentError, + ~r"categories as strings cannot have nil values", + fn -> Series.categorise(indexes, categories) end + end + + test "raise for string list with duplicated" do + categories = ["a", "b", "c", "c"] + indexes = Series.from_list([0, 2, 1, 0, 2], dtype: :u32) + + assert_raise ArgumentError, + ~r"categories as strings cannot have duplicated values", + fn -> Series.categorise(indexes, categories) end + end + + test "raise for string series with nils" do + categories = Series.from_list(["a", "b", "c", nil], dtype: :string) + indexes = Series.from_list([0, 2, 1, 0, 2], dtype: :u32) + + assert_raise ArgumentError, + ~r"categories as strings cannot have nil values", + fn -> Series.categorise(indexes, categories) end + end + + test "raise for string series with duplicated" do + categories = Series.from_list(["a", "b", "c", "c"], dtype: :string) + indexes = Series.from_list([0, 2, 1, 0, 2], dtype: :u32) + + assert_raise ArgumentError, + ~r"categories as strings cannot have duplicated values", + fn -> Series.categorise(indexes, categories) end + end end describe "cast/2" do @@ -5364,6 +5404,12 @@ defmodule Explorer.SeriesTest do assert series |> Series.join("|") |> Series.to_list() == ["1", "1|2"] end + + test "with nulls" do + series = Series.from_list([["1"], ["1", nil, "2"]]) + + assert series |> Series.join("|") |> Series.to_list() == ["1", "1|2"] + end end describe "lengths/1" do From 22e46f52a11eb76b0bba7473400bfadb3ce4cc3c Mon Sep 17 00:00:00 2001 From: Kartheek Date: Sun, 18 Feb 2024 21:23:21 +0530 Subject: [PATCH 3/4] string format fix --- lib/explorer/polars_backend/native.ex | 1 - lib/explorer/polars_backend/series.ex | 2 +- lib/explorer/series.ex | 11 ++++++++--- native/explorer/src/lib.rs | 1 - native/explorer/src/series.rs | 12 ------------ test/explorer/series_test.exs | 2 +- 6 files changed, 10 insertions(+), 19 deletions(-) diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex index 6eb918cce..55dbd5b4a 100644 --- a/lib/explorer/polars_backend/native.ex +++ b/lib/explorer/polars_backend/native.ex @@ -313,7 +313,6 @@ defmodule Explorer.PolarsBackend.Native do def s_fill_missing_with_atom(_s, _value), do: err() def s_fill_missing_with_date(_s, _value), do: err() def s_fill_missing_with_datetime(_s, _value), do: err() - def s_format(_series_list), do: err() def s_greater(_s, _rhs), do: err() def s_greater_equal(_s, _rhs), do: err() def s_head(_s, _length), do: err() diff --git a/lib/explorer/polars_backend/series.ex b/lib/explorer/polars_backend/series.ex index c88c010fb..ba746dd10 100644 --- a/lib/explorer/polars_backend/series.ex +++ b/lib/explorer/polars_backend/series.ex @@ -134,7 +134,7 @@ defmodule Explorer.PolarsBackend.Series do def format(list) do {_, df_args, params} = Enum.reduce(list, {0, [], []}, fn s, {counter, df_args, params} -> - if is_binary(s) do + if is_binary(s) or Kernel.is_nil(s) do {counter, df_args, [s | params]} else counter = counter + 1 diff --git a/lib/explorer/series.ex b/lib/explorer/series.ex index 14ccbfe40..fa19d7b0d 100644 --- a/lib/explorer/series.ex +++ b/lib/explorer/series.ex @@ -2112,7 +2112,14 @@ defmodule Explorer.Series do @spec format([Series.t() | String.t()]) :: Series.t() def format([_ | _] = list) do list = cast_to_string(list) - impl!(list).format(list) + + if impl = impl!(list) do + impl.format(list) + else + [hd | rest] = list + s = Series.from_list([hd], dtype: :string) + impl!([s]).format([s | rest]) + end end defp cast_to_string(list) do @@ -2126,8 +2133,6 @@ defmodule Explorer.Series do value when K.or(is_binary(value), K.is_nil(value)) -> value - # from_list([value], dtype: :string) - other -> raise ArgumentError, "format/1 expects a list of series or strings, got: #{inspect(other)}" diff --git a/native/explorer/src/lib.rs b/native/explorer/src/lib.rs index 6f77fbc56..86648a16c 100644 --- a/native/explorer/src/lib.rs +++ b/native/explorer/src/lib.rs @@ -365,7 +365,6 @@ rustler::init!( s_fill_missing_with_atom, s_fill_missing_with_date, s_fill_missing_with_datetime, - s_format, s_greater, s_greater_equal, s_head, diff --git a/native/explorer/src/series.rs b/native/explorer/src/series.rs index d07d2d6ad..f3eb0e4e6 100644 --- a/native/explorer/src/series.rs +++ b/native/explorer/src/series.rs @@ -259,18 +259,6 @@ pub fn s_slice(series: ExSeries, offset: i64, length: usize) -> Result) -> Result { - let mut iter = series_vec.iter(); - let mut series = iter.next().unwrap().clone_inner().str()?.clone(); - - for s in iter { - series = series.concat(s.str()?); - } - - Ok(ExSeries::new(series.into_series())) -} - #[rustler::nif(schedule = "DirtyCpu")] pub fn s_concat(series_vec: Vec) -> Result { let mut iter = series_vec.iter(); diff --git a/test/explorer/series_test.exs b/test/explorer/series_test.exs index a7f52fa25..bf8838d12 100644 --- a/test/explorer/series_test.exs +++ b/test/explorer/series_test.exs @@ -3111,7 +3111,7 @@ defmodule Explorer.SeriesTest do s4 = Series.from_list(["m", "n", "o", "p"]) assert Series.format([s1, " / ", s2, " - ", s3, " / ", s4]) |> Series.to_list() == - ["a / e - i / m", "b / f - j / n", nil, "d / h - l / p"] + ["a / e - i / m", "b / f - j / n", "c / g - / o", "d / h - l / p"] end end From ad60f89e4246c4528de2097bcfe48717bdac2ae8 Mon Sep 17 00:00:00 2001 From: lkarthee Date: Sun, 25 Feb 2024 08:09:51 +0530 Subject: [PATCH 4/4] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: José Valim --- lib/explorer/polars_backend/series.ex | 2 +- lib/explorer/series.ex | 18 ++++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/lib/explorer/polars_backend/series.ex b/lib/explorer/polars_backend/series.ex index ba746dd10..677ff6452 100644 --- a/lib/explorer/polars_backend/series.ex +++ b/lib/explorer/polars_backend/series.ex @@ -147,7 +147,7 @@ defmodule Explorer.PolarsBackend.Series do df = Explorer.PolarsBackend.DataFrame.from_series(df_args) format_expr = Explorer.Backend.LazySeries.new(:format, [Enum.reverse(params)], :string) out_dtypes = Map.put(df.dtypes, "result", :string) - out_names = df.names ++ ["result"] + out_names = ["result" | df.names] out_df = %{df | dtypes: out_dtypes, names: out_names} Explorer.PolarsBackend.DataFrame.mutate_with(df, out_df, [{"result", format_expr}]) diff --git a/lib/explorer/series.ex b/lib/explorer/series.ex index fa19d7b0d..25af9b668 100644 --- a/lib/explorer/series.ex +++ b/lib/explorer/series.ex @@ -1280,19 +1280,13 @@ defmodule Explorer.Series do def categorise(%Series{dtype: l_dtype} = series, %Series{dtype: :string} = categories) when K.in(l_dtype, [:string | @integer_types]) do - if nil_count(categories) != 0, - do: - raise( - ArgumentError, - "categories as strings cannot have nil values" - ) + if nil_count(categories) != 0 do + raise(ArgumentError, "categories as strings cannot have nil values") + end - if count(categories) != n_distinct(categories), - do: - raise( - ArgumentError, - "categories as strings cannot have duplicated values" - ) + if count(categories) != n_distinct(categories) do + raise(ArgumentError, "categories as strings cannot have duplicated values") + end categories = cast(categories, :category) apply_series(series, :categorise, [categories])