From 741bbf68546ffa2d9342cece82f3b1456f7a1f5b Mon Sep 17 00:00:00 2001
From: Bugen Zhao
Date: Wed, 17 Jul 2024 06:05:00 +0800
Subject: [PATCH 01/21] bump `tonic` to 0.12 and `prost` to 0.13 for `arrow-flight` (#6041)

* bump `tonic` to 0.12 and `prost` to 0.13 for `arrow-flight`

Signed-off-by: Bugen Zhao

* fix example tests

Signed-off-by: Bugen Zhao

---------

Signed-off-by: Bugen Zhao
---
 arrow-flight/Cargo.toml | 11 +++---
 arrow-flight/examples/flight_sql_server.rs | 6 ++--
 arrow-flight/gen/Cargo.toml | 4 +--
 arrow-flight/src/arrow.flight.protocol.rs | 36 +++++--------------
 .../src/sql/arrow.flight.protocol.sql.rs | 12 +++----
 arrow-flight/tests/common/trailers_layer.rs | 32 ++++++-----------
 arrow-integration-testing/Cargo.toml | 4 +--
 7 files changed, 38 insertions(+), 67 deletions(-)

diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml
index 111bf94d804c..3e7d6fa29230 100644
--- a/arrow-flight/Cargo.toml
+++ b/arrow-flight/Cargo.toml
@@ -44,11 +44,11 @@ bytes = { version = "1", default-features = false }
 futures = { version = "0.3", default-features = false, features = ["alloc"] }
 once_cell = { version = "1", optional = true }
 paste = { version = "1.0" }
-prost = { version = "0.12.3", default-features = false, features = ["prost-derive"] }
+prost = { version = "0.13.1", default-features = false, features = ["prost-derive"] }
 # For Timestamp type
-prost-types = { version = "0.12.3", default-features = false }
+prost-types = { version = "0.13.1", default-features = false }
 tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] }
-tonic = { version = "0.11.0", default-features = false, features = ["transport", "codegen", "prost"] }
+tonic = { version = "0.12.0", default-features = false, features = ["transport", "codegen", "prost"] }
 
 # CLI-related dependencies
 anyhow = { version = "1.0", optional = true }
@@ -70,8 +70,9 @@ cli = ["anyhow", "arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subsc
 [dev-dependencies]
 arrow-cast = { workspace = true, features = ["prettyprint"] }
 assert_cmd = "2.0.8"
-http = "0.2.9"
-http-body = "0.4.5"
+http = "1.1.0"
+http-body = "1.0.0"
+hyper-util = "0.1"
 pin-project-lite = "0.2"
 tempfile = "3.3"
 tokio-stream = { version = "0.1", features = ["net"] }
diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs
index 031628eaa833..d5168debc433 100644
--- a/arrow-flight/examples/flight_sql_server.rs
+++ b/arrow-flight/examples/flight_sql_server.rs
@@ -783,7 +783,8 @@ impl ProstMessageExt for FetchResults {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use futures::TryStreamExt;
+    use futures::{TryFutureExt, TryStreamExt};
+    use hyper_util::rt::TokioIo;
     use std::fs;
     use std::future::Future;
     use std::net::SocketAddr;
@@ -843,7 +844,8 @@ mod tests {
             .serve_with_incoming(stream);
 
         let request_future = async {
-            let connector = service_fn(move |_| UnixStream::connect(path.clone()));
+            let connector =
+                service_fn(move |_| UnixStream::connect(path.clone()).map_ok(TokioIo::new));
             let channel = Endpoint::try_from("http://example.com")
                 .unwrap()
                 .connect_with_connector(connector)
diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml
index 7264a527ca8d..a12c683776b4 100644
--- a/arrow-flight/gen/Cargo.toml
+++ b/arrow-flight/gen/Cargo.toml
@@ -33,5 +33,5 @@ publish = false
 # Pin specific version of the tonic-build dependencies to avoid auto-generated
 # (and checked in) arrow.flight.protocol.rs from changing
 proc-macro2 = { version = "=1.0.86", 
default-features = false } -prost-build = { version = "=0.12.6", default-features = false } -tonic-build = { version = "=0.11.0", default-features = false, features = ["transport", "prost"] } +prost-build = { version = "=0.13.1", default-features = false } +tonic-build = { version = "=0.12.0", default-features = false, features = ["transport", "prost"] } diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index bc314de9d19f..8c7292894eab 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -38,7 +38,7 @@ pub struct BasicAuth { pub password: ::prost::alloc::string::String, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct Empty {} /// /// Describes an available action, including both the name used for execution @@ -103,7 +103,7 @@ pub struct Result { /// /// The result should be stored in Result.body. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct CancelFlightInfoResult { #[prost(enumeration = "CancelStatus", tag = "1")] pub status: i32, @@ -1053,19 +1053,17 @@ pub mod flight_service_server { /// can expose a set of actions that are available. #[derive(Debug)] pub struct FlightServiceServer { - inner: _Inner, + inner: Arc, accept_compression_encodings: EnabledCompressionEncodings, send_compression_encodings: EnabledCompressionEncodings, max_decoding_message_size: Option, max_encoding_message_size: Option, } - struct _Inner(Arc); impl FlightServiceServer { pub fn new(inner: T) -> Self { Self::from_arc(Arc::new(inner)) } pub fn from_arc(inner: Arc) -> Self { - let inner = _Inner(inner); Self { inner, accept_compression_encodings: Default::default(), @@ -1128,7 +1126,6 @@ pub mod flight_service_server { Poll::Ready(Ok(())) } fn call(&mut self, req: http::Request) -> Self::Future { - let inner = self.inner.clone(); match req.uri().path() { "/arrow.flight.protocol.FlightService/Handshake" => { #[allow(non_camel_case_types)] @@ -1162,7 +1159,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = HandshakeSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1209,7 +1205,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = ListFlightsSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1255,7 +1250,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = GetFlightInfoSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1302,7 +1296,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = PollFlightInfoSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1348,7 +1341,6 @@ pub mod flight_service_server { let 
max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = GetSchemaSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1395,7 +1387,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = DoGetSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1442,7 +1433,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = DoPutSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1489,7 +1479,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = DoExchangeSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1536,7 +1525,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = DoActionSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1583,7 +1571,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = ListActionsSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1605,8 +1592,11 @@ pub mod flight_service_server { Ok( http::Response::builder() .status(200) - .header("grpc-status", "12") - .header("content-type", "application/grpc") + .header("grpc-status", tonic::Code::Unimplemented as i32) + .header( + http::header::CONTENT_TYPE, + tonic::metadata::GRPC_CONTENT_TYPE, + ) .body(empty_body()) .unwrap(), ) @@ -1627,16 +1617,6 @@ pub mod flight_service_server { } } } - impl Clone for _Inner { - fn clone(&self) -> Self { - Self(Arc::clone(&self.0)) - } - } - impl std::fmt::Debug for _Inner { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.0) - } - } impl tonic::server::NamedService for FlightServiceServer { const NAME: &'static str = "arrow.flight.protocol.FlightService"; } diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index c1f0fac0f6ba..5e6f198df75c 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -101,7 +101,7 @@ pub struct CommandGetSqlInfo { /// > /// The returned data should be ordered by data_type and then by type_name. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct CommandGetXdbcTypeInfo { /// /// Specifies the data type to search for the info. @@ -121,7 +121,7 @@ pub struct CommandGetXdbcTypeInfo { /// > /// The returned data should be ordered by catalog_name. 
#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct CommandGetCatalogs {} /// /// Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. @@ -232,7 +232,7 @@ pub struct CommandGetTables { /// > /// The returned data should be ordered by table_type. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct CommandGetTableTypes {} /// /// Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. @@ -511,7 +511,7 @@ pub struct ActionClosePreparedStatementRequest { /// Request message for the "BeginTransaction" action. /// Begins a transaction. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct ActionBeginTransactionRequest {} /// /// Request message for the "BeginSavepoint" action. @@ -802,7 +802,7 @@ pub struct CommandPreparedStatementUpdate { /// CommandPreparedStatementUpdate was in the request, containing /// results from the update. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct DoPutUpdateResult { /// The number of records updated. A return value of -1 represents /// an unknown updated record count. @@ -862,7 +862,7 @@ pub struct ActionCancelQueryRequest { /// This command is deprecated since 13.0.0. Use the "CancelFlightInfo" /// action with DoAction instead. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct ActionCancelQueryResult { #[prost(enumeration = "action_cancel_query_result::CancelResult", tag = "1")] pub result: i32, diff --git a/arrow-flight/tests/common/trailers_layer.rs b/arrow-flight/tests/common/trailers_layer.rs index b2ab74f7d925..0ccb7df86c74 100644 --- a/arrow-flight/tests/common/trailers_layer.rs +++ b/arrow-flight/tests/common/trailers_layer.rs @@ -21,7 +21,7 @@ use std::task::{Context, Poll}; use futures::ready; use http::{HeaderValue, Request, Response}; -use http_body::SizeHint; +use http_body::{Frame, SizeHint}; use pin_project_lite::pin_project; use tower::{Layer, Service}; @@ -99,31 +99,19 @@ impl http_body::Body for WrappedBody { type Data = B::Data; type Error = B::Error; - fn poll_data( - mut self: Pin<&mut Self>, + fn poll_frame( + self: Pin<&mut Self>, cx: &mut Context<'_>, - ) -> Poll>> { - self.as_mut().project().inner.poll_data(cx) - } - - fn poll_trailers( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll, Self::Error>> { - let result: Result, Self::Error> = - ready!(self.as_mut().project().inner.poll_trailers(cx)); - - let mut trailers = http::header::HeaderMap::new(); - trailers.insert("test-trailer", HeaderValue::from_static("trailer_val")); + ) -> Poll, Self::Error>>> { + let mut result = ready!(self.project().inner.poll_frame(cx)); - match result { - Ok(Some(mut existing)) => { - existing.extend(trailers.iter().map(|(k, v)| (k.clone(), v.clone()))); - Poll::Ready(Ok(Some(existing))) + if let Some(Ok(frame)) = &mut result { + if let Some(trailers) = frame.trailers_mut() { + trailers.insert("test-trailer", HeaderValue::from_static("trailer_val")); } - Ok(None) => Poll::Ready(Ok(Some(trailers))), - Err(e) => Poll::Ready(Err(e)), } + + 
Poll::Ready(result)
     }
 
     fn is_end_stream(&self) -> bool {
diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml
index 032b99f4fbbb..7be56d919852 100644
--- a/arrow-integration-testing/Cargo.toml
+++ b/arrow-integration-testing/Cargo.toml
@@ -42,11 +42,11 @@ async-trait = { version = "0.1.41", default-features = false }
 clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] }
 futures = { version = "0.3", default-features = false }
 hex = { version = "0.4", default-features = false, features = ["std"] }
-prost = { version = "0.12", default-features = false }
+prost = { version = "0.13", default-features = false }
 serde = { version = "1.0", default-features = false, features = ["rc", "derive"] }
 serde_json = { version = "1.0", default-features = false, features = ["std"] }
 tokio = { version = "1.0", default-features = false }
-tonic = { version = "0.11", default-features = false }
+tonic = { version = "0.12", default-features = false }
 tracing-subscriber = { version = "0.3.1", default-features = false, features = ["fmt"], optional = true }
 num = { version = "0.4", default-features = false, features = ["std"] }
 flate2 = { version = "1", default-features = false, features = ["rust_backend"] }

From 8f762482221a4b2104a770d1c132dea3b58c117d Mon Sep 17 00:00:00 2001
From: Xiangpeng Hao
Date: Tue, 16 Jul 2024 18:06:57 -0400
Subject: [PATCH 02/21] Remove `impl<T: AsRef<[u8]>> From<T> for Buffer` that easily accidentally copies data (#6043)

* deprecate auto copy, ask explicit reference

* update comments

* make cargo doc happy
---
 arrow-buffer/src/buffer/immutable.rs | 33 +++++++++++++++++++---------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs
index 52e201ca15a2..c53ef18ba58f 100644
--- a/arrow-buffer/src/buffer/immutable.rs
+++ b/arrow-buffer/src/buffer/immutable.rs
@@ -356,16 +356,29 @@ impl Buffer {
     }
 }
 
-/// Creating a `Buffer` instance by copying the memory from a `AsRef<[u8]>` into a newly
-/// allocated memory region.
-impl<T: AsRef<[u8]>> From<T> for Buffer {
-    fn from(p: T) -> Self {
-        // allocate aligned memory buffer
-        let slice = p.as_ref();
-        let len = slice.len();
-        let mut buffer = MutableBuffer::new(len);
-        buffer.extend_from_slice(slice);
-        buffer.into()
+/// Note that here we deliberately do not implement
+/// `impl<T: AsRef<[u8]>> From<T> for Buffer`
+/// As it would accept `Buffer::from(vec![...])` that would cause an unexpected copy.
+/// Instead, we ask user to be explicit when copying is occurring, e.g., `Buffer::from(vec![...].to_byte_slice())`.
+/// For zero-copy conversion, user should use `Buffer::from_vec(vec![...])`.
+///
+/// Since we removed impl for `AsRef`, we added the following three specific implementations to reduce API breakage.
+/// See for more discussion on this.
+impl From<&[u8]> for Buffer {
+    fn from(p: &[u8]) -> Self {
+        Self::from_slice_ref(p)
+    }
+}
+
+impl<const N: usize> From<[u8; N]> for Buffer {
+    fn from(p: [u8; N]) -> Self {
+        Self::from_slice_ref(p)
+    }
+}
+
+impl<const N: usize> From<&[u8; N]> for Buffer {
+    fn from(p: &[u8; N]) -> Self {
+        Self::from_slice_ref(p)
+    }
 }

From bb5f12bd784af8a9e42843aa572fe5fef89d5752 Mon Sep 17 00:00:00 2001
From: kamille
Date: Wed, 17 Jul 2024 06:09:51 +0800
Subject: [PATCH 03/21] Make display of interval types more pretty (#6006)

* improve display for interval.

* update test in pretty, and fix display problem.

* tmp

* fix tests in arrow-cast.

* fix tests in pretty.

* fix style.
--- arrow-cast/src/cast/mod.rs | 26 ++--- arrow-cast/src/display.rs | 196 +++++++++++++++++++++++++++++-------- arrow-cast/src/pretty.rs | 54 +++++----- 3 files changed, 196 insertions(+), 80 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index c9de714e7d55..3f9c9c8a3356 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -4314,8 +4314,8 @@ mod tests { IntervalUnit::YearMonth, IntervalYearMonthArray, vec![ - Some("1 years 1 mons 0 days 0 hours 0 mins 0.00 secs"), - Some("2 years 7 mons 0 days 0 hours 0 mins 0.00 secs"), + Some("1 years 1 mons"), + Some("2 years 7 mons"), None, None, None, @@ -4338,9 +4338,9 @@ mod tests { IntervalUnit::DayTime, IntervalDayTimeArray, vec![ - Some("0 years 0 mons 390 days 0 hours 0 mins 0.000 secs"), - Some("0 years 0 mons 930 days 0 hours 0 mins 0.000 secs"), - Some("0 years 0 mons 30 days 0 hours 0 mins 0.000 secs"), + Some("390 days"), + Some("930 days"), + Some("30 days"), None, None, ] @@ -4366,16 +4366,16 @@ mod tests { IntervalUnit::MonthDayNano, IntervalMonthDayNanoArray, vec![ - Some("0 years 13 mons 1 days 0 hours 0 mins 0.000000000 secs"), + Some("13 mons 1 days"), None, - Some("0 years 31 mons 35 days 0 hours 0 mins 0.001400000 secs"), - Some("0 years 0 mons 3 days 0 hours 0 mins 0.000000000 secs"), - Some("0 years 0 mons 0 days 0 hours 0 mins 8.000000000 secs"), + Some("31 mons 35 days 0.001400000 secs"), + Some("3 days"), + Some("8.000000000 secs"), None, - Some("0 years 0 mons 1 days 0 hours 0 mins 29.800000000 secs"), - Some("0 years 3 mons 0 days 0 hours 0 mins 1.000000000 secs"), - Some("0 years 0 mons 0 days 0 hours 8 mins 0.000000000 secs"), - Some("0 years 63 mons 9 days 19 hours 9 mins 2.222000000 secs"), + Some("1 days 29.800000000 secs"), + Some("3 mons 1.000000000 secs"), + Some("8 mins"), + Some("63 mons 9 days 19 hours 9 mins 2.222000000 secs"), None, ] ); diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 6a40d036350a..ab172cb240cf 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -654,10 +654,7 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { let years = (interval / 12_f64).floor(); let month = interval - (years * 12_f64); - write!( - f, - "{years} years {month} mons 0 days 0 hours 0 mins 0.00 secs", - )?; + write!(f, "{years} years {month} mons",)?; Ok(()) } } @@ -665,62 +662,181 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { impl<'a> DisplayIndex for &'a PrimitiveArray { fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { let value = self.value(idx); + let mut first_part = true; - let secs = value.milliseconds / 1_000; + if value.days != 0 { + write!(f, "{} days", value.days)?; + first_part = false; + } + + if value.milliseconds != 0 { + let millis_fmt = MillisecondsFormatter { + milliseconds: value.milliseconds, + first_part, + }; + + f.write_fmt(format_args!("{millis_fmt}"))?; + } + + Ok(()) + } +} + +impl<'a> DisplayIndex for &'a PrimitiveArray { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + let value = self.value(idx); + let mut first_part = true; + + if value.months != 0 { + write!(f, "{} mons", value.months)?; + first_part = false; + } + + if value.days != 0 { + if first_part { + write!(f, "{} days", value.days)?; + first_part = false; + } else { + write!(f, " {} days", value.days)?; + } + } + + if value.nanoseconds != 0 { + let nano_fmt = NanosecondsFormatter { + nanoseconds: value.nanoseconds, + first_part, + }; + f.write_fmt(format_args!("{nano_fmt}"))?; + } + + Ok(()) + } +} 
+ +struct NanosecondsFormatter { + nanoseconds: i64, + first_part: bool, +} + +impl Display for NanosecondsFormatter { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let mut first_part = self.first_part; + + let secs = self.nanoseconds / 1_000_000_000; let mins = secs / 60; let hours = mins / 60; let secs = secs - (mins * 60); let mins = mins - (hours * 60); - let milliseconds = value.milliseconds % 1_000; + let nanoseconds = self.nanoseconds % 1_000_000_000; - let secs_sign = if secs < 0 || milliseconds < 0 { - "-" - } else { - "" - }; + if hours != 0 { + if first_part { + write!(f, "{} hours", hours)?; + first_part = false; + } else { + write!(f, " {} hours", hours)?; + } + } + + if mins != 0 { + if first_part { + write!(f, "{} mins", mins)?; + first_part = false; + } else { + write!(f, " {} mins", mins)?; + } + } + + if secs != 0 || nanoseconds != 0 { + let secs_sign = if secs < 0 || nanoseconds < 0 { "-" } else { "" }; + + if first_part { + write!( + f, + "{}{}.{:09} secs", + secs_sign, + secs.abs(), + nanoseconds.abs() + )?; + } else { + write!( + f, + " {}{}.{:09} secs", + secs_sign, + secs.abs(), + nanoseconds.abs() + )?; + } + } - write!( - f, - "0 years 0 mons {} days {} hours {} mins {}{}.{:03} secs", - value.days, - hours, - mins, - secs_sign, - secs.abs(), - milliseconds.abs(), - )?; Ok(()) } } -impl<'a> DisplayIndex for &'a PrimitiveArray { - fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { - let value = self.value(idx); +struct MillisecondsFormatter { + milliseconds: i32, + first_part: bool, +} + +impl Display for MillisecondsFormatter { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let mut first_part = self.first_part; - let secs = value.nanoseconds / 1_000_000_000; + let secs = self.milliseconds / 1_000; let mins = secs / 60; let hours = mins / 60; let secs = secs - (mins * 60); let mins = mins - (hours * 60); - let nanoseconds = value.nanoseconds % 1_000_000_000; - - let secs_sign = if secs < 0 || nanoseconds < 0 { "-" } else { "" }; - - write!( - f, - "0 years {} mons {} days {} hours {} mins {}{}.{:09} secs", - value.months, - value.days, - hours, - mins, - secs_sign, - secs.abs(), - nanoseconds.abs(), - )?; + let milliseconds = self.milliseconds % 1_000; + + if hours != 0 { + if first_part { + write!(f, "{} hours", hours,)?; + first_part = false; + } else { + write!(f, " {} hours", hours,)?; + } + } + + if mins != 0 { + if first_part { + write!(f, "{} mins", mins,)?; + first_part = false; + } else { + write!(f, " {} mins", mins,)?; + } + } + + if secs != 0 || milliseconds != 0 { + let secs_sign = if secs < 0 || milliseconds < 0 { + "-" + } else { + "" + }; + + if first_part { + write!( + f, + "{}{}.{:03} secs", + secs_sign, + secs.abs(), + milliseconds.abs() + )?; + } else { + write!( + f, + " {}{}.{:03} secs", + secs_sign, + secs.abs(), + milliseconds.abs() + )?; + } + } + Ok(()) } } diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs index 9383b9f73f61..f41471e38d5e 100644 --- a/arrow-cast/src/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -986,16 +986,16 @@ mod tests { let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ - "+----------------------------------------------------+", - "| IntervalDayTime |", - "+----------------------------------------------------+", - "| 0 years 0 mons -1 days 0 hours -10 mins 0.000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins -1.001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins -0.001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 
mins 0.001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.010 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.100 secs |", - "+----------------------------------------------------+", + "+------------------+", + "| IntervalDayTime |", + "+------------------+", + "| -1 days -10 mins |", + "| -1.001 secs |", + "| -0.001 secs |", + "| 0.001 secs |", + "| 0.010 secs |", + "| 0.100 secs |", + "+------------------+", ]; let actual: Vec<&str> = table.lines().collect(); @@ -1032,23 +1032,23 @@ mod tests { let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ - "+-----------------------------------------------------------+", - "| IntervalMonthDayNano |", - "+-----------------------------------------------------------+", - "| 0 years -1 mons -1 days 0 hours -10 mins 0.000000000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins -1.000000001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins -0.000000001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000010 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000100 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000001000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000010000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000100000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.001000000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.010000000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.100000000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 1.000000000 secs |", - "+-----------------------------------------------------------+", + "+--------------------------+", + "| IntervalMonthDayNano |", + "+--------------------------+", + "| -1 mons -1 days -10 mins |", + "| -1.000000001 secs |", + "| -0.000000001 secs |", + "| 0.000000001 secs |", + "| 0.000000010 secs |", + "| 0.000000100 secs |", + "| 0.000001000 secs |", + "| 0.000010000 secs |", + "| 0.000100000 secs |", + "| 0.001000000 secs |", + "| 0.010000000 secs |", + "| 0.100000000 secs |", + "| 1.000000000 secs |", + "+--------------------------+", ]; let actual: Vec<&str> = table.lines().collect(); From 756b1fb26d1702f36f446faf9bb40a4869c3e840 Mon Sep 17 00:00:00 2001 From: Jesse Date: Wed, 17 Jul 2024 00:10:59 +0200 Subject: [PATCH 04/21] Update snafu (#5930) --- object_store/Cargo.toml | 2 +- object_store/src/client/get.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 3f7b2c0b2be0..234c7746b33b 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -38,7 +38,7 @@ humantime = "2.1" itertools = "0.13.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" -snafu = "0.7" +snafu = { version = "0.8", default-features = false, features = ["std", "rust_1_61"] } tracing = { version = "0.1" } url = "2.2" walkdir = "2" diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs index b45eaa143702..0fef5785c052 100644 --- a/object_store/src/client/get.rs +++ b/object_store/src/client/get.rs @@ -103,7 +103,7 @@ enum GetResultError { source: crate::client::header::Error, }, - #[snafu(context(false))] + #[snafu(transparent)] InvalidRangeRequest { source: crate::util::InvalidGetRange, }, @@ -386,7 +386,7 @@ mod tests { let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!( err.to_string(), - "InvalidRangeRequest: Wanted range starting at 2, but object was only 2 bytes long" + "Wanted range starting at 
2, but object was only 2 bytes long" ); let resp = make_response( From fe04e09b63fce64c8f54b39ce7200ecea93f87ef Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 16 Jul 2024 15:16:38 -0700 Subject: [PATCH 05/21] Update Parquet thrift generated structures (#6045) * update to latest thrift (as of 11 Jul 2024) from parquet-format * pass None for optional size statistics * escape HTML tags * don't need to escape brackets in arrays --- parquet/regen.sh | 2 +- parquet/src/file/metadata/mod.rs | 5 +- parquet/src/format.rs | 397 ++++++++++++++++++++++++++----- 3 files changed, 339 insertions(+), 65 deletions(-) diff --git a/parquet/regen.sh b/parquet/regen.sh index d1b82108a018..39999c7872cd 100755 --- a/parquet/regen.sh +++ b/parquet/regen.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2 +REVISION=5b564f3c47679526cf72e54f207013f28f53acc4 SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 40922d52bfd4..278d1e464e94 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -790,6 +790,7 @@ impl ColumnChunkMetaData { .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), bloom_filter_offset: self.bloom_filter_offset, bloom_filter_length: self.bloom_filter_length, + size_statistics: None, } } @@ -1004,6 +1005,8 @@ impl ColumnIndexBuilder { self.max_values, self.boundary_order, self.null_counts, + None, + None, ) } } @@ -1052,7 +1055,7 @@ impl OffsetIndexBuilder { .zip(self.first_row_index_array.iter()) .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index)) .collect::>(); - OffsetIndex::new(locations) + OffsetIndex::new(locations, None) } } diff --git a/parquet/src/format.rs b/parquet/src/format.rs index b210d6ec1b7e..6c93097b7359 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -117,12 +117,12 @@ impl ConvertedType { /// a list is converted into an optional field containing a repeated field for its /// values pub const LIST: ConvertedType = ConvertedType(3); - /// an enum is converted into a binary field + /// an enum is converted into a BYTE_ARRAY field pub const ENUM: ConvertedType = ConvertedType(4); /// A decimal value. /// - /// This may be used to annotate binary or fixed primitive types. The - /// underlying byte array stores the unscaled value encoded as two's + /// This may be used to annotate BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY primitive + /// types. The underlying byte array stores the unscaled value encoded as two's /// complement using big-endian byte order (the most significant byte is the /// zeroth element). The value of the decimal is the value * 10^{-scale}. /// @@ -185,7 +185,7 @@ impl ConvertedType { pub const JSON: ConvertedType = ConvertedType(19); /// An embedded BSON document /// - /// A BSON document embedded within a single BINARY column. + /// A BSON document embedded within a single BYTE_ARRAY column. pub const BSON: ConvertedType = ConvertedType(20); /// An interval of time /// @@ -288,9 +288,9 @@ impl From<&ConvertedType> for i32 { pub struct FieldRepetitionType(pub i32); impl FieldRepetitionType { - /// This field is required (can not be null) and each record has exactly 1 value. + /// This field is required (can not be null) and each row has exactly 1 value. 
pub const REQUIRED: FieldRepetitionType = FieldRepetitionType(0); - /// The field is optional (can be null) and each record has 0 or 1 values. + /// The field is optional (can be null) and each row has 0 or 1 values. pub const OPTIONAL: FieldRepetitionType = FieldRepetitionType(1); /// The field is repeated and can contain 0 or more values pub const REPEATED: FieldRepetitionType = FieldRepetitionType(2); @@ -379,12 +379,15 @@ impl Encoding { pub const DELTA_BYTE_ARRAY: Encoding = Encoding(7); /// Dictionary encoding: the ids are encoded using the RLE encoding pub const RLE_DICTIONARY: Encoding = Encoding(8); - /// Encoding for floating-point data. + /// Encoding for fixed-width data (FLOAT, DOUBLE, INT32, INT64, FIXED_LEN_BYTE_ARRAY). /// K byte-streams are created where K is the size in bytes of the data type. - /// The individual bytes of an FP value are scattered to the corresponding stream and + /// The individual bytes of a value are scattered to the corresponding stream and /// the streams are concatenated. /// This itself does not reduce the size of the data but can lead to better compression /// afterwards. + /// + /// Added in 2.8 for FLOAT and DOUBLE. + /// Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. pub const BYTE_STREAM_SPLIT: Encoding = Encoding(9); pub const ENUM_VALUES: &'static [Self] = &[ Self::PLAIN, @@ -634,6 +637,143 @@ impl From<&BoundaryOrder> for i32 { } } +// +// SizeStatistics +// + +/// A structure for capturing metadata for estimating the unencoded, +/// uncompressed size of data written. This is useful for readers to estimate +/// how much memory is needed to reconstruct data in their memory model and for +/// fine grained filter pushdown on nested structures (the histograms contained +/// in this structure can help determine the number of nulls at a particular +/// nesting level and maximum length of lists). +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct SizeStatistics { + /// The number of physical bytes stored for BYTE_ARRAY data values assuming + /// no encoding. This is exclusive of the bytes needed to store the length of + /// each byte array. In other words, this field is equivalent to the `(size + /// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + /// written)`. To determine unencoded sizes of other types readers can use + /// schema information multiplied by the number of non-null and null values. + /// The number of null/non-null values can be inferred from the histograms + /// below. + /// + /// For example, if a column chunk is dictionary-encoded with dictionary + /// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], + /// then this value for that data page should be 7 (1 + 1 + 2 + 3). + /// + /// This field should only be set for types that use BYTE_ARRAY as their + /// physical type. + pub unencoded_byte_array_data_bytes: Option, + /// When present, there is expected to be one element corresponding to each + /// repetition (i.e. size=max repetition_level+1) where each element + /// represents the number of times the repetition level was observed in the + /// data. + /// + /// This field may be omitted if max_repetition_level is 0 without loss + /// of information. + /// + pub repetition_level_histogram: Option>, + /// Same as repetition_level_histogram except for definition levels. + /// + /// This field may be omitted if max_definition_level is 0 or 1 without + /// loss of information. 
+ /// + pub definition_level_histogram: Option>, +} + +impl SizeStatistics { + pub fn new(unencoded_byte_array_data_bytes: F1, repetition_level_histogram: F2, definition_level_histogram: F3) -> SizeStatistics where F1: Into>, F2: Into>>, F3: Into>> { + SizeStatistics { + unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(), + repetition_level_histogram: repetition_level_histogram.into(), + definition_level_histogram: definition_level_histogram.into(), + } + } +} + +impl crate::thrift::TSerializable for SizeStatistics { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option> = None; + let mut f_3: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i64()?; + f_1 = Some(val); + }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_0 = i_prot.read_i64()?; + val.push(list_elem_0); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, + 3 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_1 = i_prot.read_i64()?; + val.push(list_elem_1); + } + i_prot.read_list_end()?; + f_3 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = SizeStatistics { + unencoded_byte_array_data_bytes: f_1, + repetition_level_histogram: f_2, + definition_level_histogram: f_3, + }; + Ok(ret) + } + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("SizeStatistics"); + o_prot.write_struct_begin(&struct_ident)?; + if let Some(fld_var) = self.unencoded_byte_array_data_bytes { + o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes", TType::I64, 1))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.repetition_level_histogram { + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histogram", TType::List, 2))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.definition_level_histogram { + o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histogram", TType::List, 3))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + // // Statistics // @@ -1123,7 +1263,7 @@ impl crate::thrift::TSerializable for NullType { /// To maintain forward-compatibility in v1, implementations using this logical /// type must also set scale and precision on the annotated SchemaElement. /// -/// Allowed for physical types: INT32, INT64, FIXED, and BINARY +/// Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY. 
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DecimalType { pub scale: i32, @@ -1620,7 +1760,7 @@ impl crate::thrift::TSerializable for IntType { /// Embedded JSON logical type annotation /// -/// Allowed for physical types: BINARY +/// Allowed for physical types: BYTE_ARRAY #[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct JsonType { } @@ -1660,7 +1800,7 @@ impl crate::thrift::TSerializable for JsonType { /// Embedded BSON logical type annotation /// -/// Allowed for physical types: BINARY +/// Allowed for physical types: BYTE_ARRAY #[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct BsonType { } @@ -2146,7 +2286,12 @@ impl crate::thrift::TSerializable for SchemaElement { /// Data page header #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DataPageHeader { - /// Number of values, including NULLs, in this data page. * + /// Number of values, including NULLs, in this data page. + /// + /// If a OffsetIndex is present, a page must begin at a row + /// boundary (repetition_level = 0). Otherwise, pages may begin + /// within a row (repetition_level > 0). + /// pub num_values: i32, /// Encoding used for this data page * pub encoding: Encoding, @@ -2154,7 +2299,7 @@ pub struct DataPageHeader { pub definition_level_encoding: Encoding, /// Encoding used for repetition levels * pub repetition_level_encoding: Encoding, - /// Optional statistics for the data in this page* + /// Optional statistics for the data in this page * pub statistics: Option, } @@ -2390,21 +2535,24 @@ pub struct DataPageHeaderV2 { /// Number of NULL values, in this data page. /// Number of non-null = num_values - num_nulls which is also the number of values in the data section * pub num_nulls: i32, - /// Number of rows in this data page. which means pages change on record boundaries (r = 0) * + /// Number of rows in this data page. Every page must begin at a + /// row boundary (repetition_level = 0): rows must **not** be + /// split across page boundaries when using V2 data pages. + /// pub num_rows: i32, /// Encoding used for data in this page * pub encoding: Encoding, - /// length of the definition levels + /// Length of the definition levels pub definition_levels_byte_length: i32, - /// length of the repetition levels + /// Length of the repetition levels pub repetition_levels_byte_length: i32, - /// whether the values are compressed. + /// Whether the values are compressed. /// Which means the section of the page between /// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) /// is compressed with the compression_codec. /// If missing it is considered compressed pub is_compressed: Option, - /// optional statistics for the data in this page * + /// Optional statistics for the data in this page * pub statistics: Option, } @@ -3207,10 +3355,10 @@ impl crate::thrift::TSerializable for KeyValue { // SortingColumn // -/// Wrapper struct to specify sort order +/// Sort order within a RowGroup of a leaf column #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct SortingColumn { - /// The column index (in this row group) * + /// The ordinal position of the column (in this row group) * pub column_idx: i32, /// If true, indicates this column is sorted in descending order. * pub descending: bool, @@ -3417,10 +3565,15 @@ pub struct ColumnMetaData { /// Writers should write this field so readers can read the bloom filter /// in a single I/O. 
pub bloom_filter_length: Option, + /// Optional statistics to help estimate total memory when converted to in-memory + /// representations. The histograms contained in these statistics can + /// also be useful in some cases for more fine-grained nullability/list length + /// filter pushdown. + pub size_statistics: Option, } impl ColumnMetaData { - pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14, bloom_filter_length: F15) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into>, F15: Into> { + pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14, bloom_filter_length: F15, size_statistics: F16) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into>, F15: Into>, F16: Into> { ColumnMetaData { type_, encodings, @@ -3437,6 +3590,7 @@ impl ColumnMetaData { encoding_stats: encoding_stats.into(), bloom_filter_offset: bloom_filter_offset.into(), bloom_filter_length: bloom_filter_length.into(), + size_statistics: size_statistics.into(), } } } @@ -3459,6 +3613,7 @@ impl crate::thrift::TSerializable for ColumnMetaData { let mut f_13: Option> = None; let mut f_14: Option = None; let mut f_15: Option = None; + let mut f_16: Option = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -3474,8 +3629,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_0 = Encoding::read_from_in_protocol(i_prot)?; - val.push(list_elem_0); + let list_elem_2 = Encoding::read_from_in_protocol(i_prot)?; + val.push(list_elem_2); } i_prot.read_list_end()?; f_2 = Some(val); @@ -3484,8 +3639,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_1 = i_prot.read_string()?; - val.push(list_elem_1); + let list_elem_3 = i_prot.read_string()?; + val.push(list_elem_3); } i_prot.read_list_end()?; f_3 = Some(val); @@ -3510,8 +3665,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_2 = KeyValue::read_from_in_protocol(i_prot)?; - val.push(list_elem_2); + let list_elem_4 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_4); } i_prot.read_list_end()?; f_8 = Some(val); @@ -3536,8 +3691,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_3 = PageEncodingStats::read_from_in_protocol(i_prot)?; - val.push(list_elem_3); + let list_elem_5 = PageEncodingStats::read_from_in_protocol(i_prot)?; + val.push(list_elem_5); } 
i_prot.read_list_end()?; f_13 = Some(val); @@ -3550,6 +3705,10 @@ impl crate::thrift::TSerializable for ColumnMetaData { let val = i_prot.read_i32()?; f_15 = Some(val); }, + 16 => { + let val = SizeStatistics::read_from_in_protocol(i_prot)?; + f_16 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -3581,6 +3740,7 @@ impl crate::thrift::TSerializable for ColumnMetaData { encoding_stats: f_13, bloom_filter_offset: f_14, bloom_filter_length: f_15, + size_statistics: f_16, }; Ok(ret) } @@ -3662,6 +3822,11 @@ impl crate::thrift::TSerializable for ColumnMetaData { o_prot.write_i32(fld_var)?; o_prot.write_field_end()? } + if let Some(ref fld_var) = self.size_statistics { + o_prot.write_field_begin(&TFieldIdentifier::new("size_statistics", TType::Struct, 16))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -3741,8 +3906,8 @@ impl crate::thrift::TSerializable for EncryptionWithColumnKey { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_4 = i_prot.read_string()?; - val.push(list_elem_4); + let list_elem_6 = i_prot.read_string()?; + val.push(list_elem_6); } i_prot.read_list_end()?; f_1 = Some(val); @@ -3881,11 +4046,19 @@ pub struct ColumnChunk { /// metadata. This path is relative to the current file. /// pub file_path: Option, - /// Byte offset in file_path to the ColumnMetaData * + /// Deprecated: Byte offset in file_path to the ColumnMetaData + /// + /// Past use of this field has been inconsistent, with some implementations + /// using it to point to the ColumnMetaData and some using it to point to + /// the first page in the column chunk. In many cases, the ColumnMetaData at this + /// location is wrong. This field is now deprecated and should not be used. + /// Writers should set this field to 0 if no ColumnMetaData has been written outside + /// the footer. pub file_offset: i64, - /// Column metadata for this chunk. This is the same content as what is at - /// file_path/file_offset. Having it here has it replicated in the file - /// metadata. + /// Column metadata for this chunk. Some writers may also replicate this at the + /// location pointed to by file_path/file_offset. + /// Note: while marked as optional, this field is in fact required by most major + /// Parquet implementations. As such, writers MUST populate this field. /// pub meta_data: Option, /// File offset of ColumnChunk's OffsetIndex * @@ -4107,8 +4280,8 @@ impl crate::thrift::TSerializable for RowGroup { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_5 = ColumnChunk::read_from_in_protocol(i_prot)?; - val.push(list_elem_5); + let list_elem_7 = ColumnChunk::read_from_in_protocol(i_prot)?; + val.push(list_elem_7); } i_prot.read_list_end()?; f_1 = Some(val); @@ -4125,8 +4298,8 @@ impl crate::thrift::TSerializable for RowGroup { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_6 = SortingColumn::read_from_in_protocol(i_prot)?; - val.push(list_elem_6); + let list_elem_8 = SortingColumn::read_from_in_protocol(i_prot)?; + val.push(list_elem_8); } i_prot.read_list_end()?; f_4 = Some(val); @@ -4331,8 +4504,9 @@ pub struct PageLocation { /// Size of the page, including header. 
Sum of compressed_page_size and header /// length pub compressed_page_size: i32, - /// Index within the RowGroup of the first row of the page; this means pages - /// change on record boundaries (r = 0). + /// Index within the RowGroup of the first row of the page. When an + /// OffsetIndex is present, pages must begin on row boundaries + /// (repetition_level = 0). pub first_row_index: i64, } @@ -4409,17 +4583,28 @@ impl crate::thrift::TSerializable for PageLocation { // OffsetIndex // +/// Optional offsets for each data page in a ColumnChunk. +/// +/// Forms part of the page index, along with ColumnIndex. +/// +/// OffsetIndex may be present even if ColumnIndex is not. #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct OffsetIndex { /// PageLocations, ordered by increasing PageLocation.offset. It is required /// that page_locations\[i\].first_row_index < page_locations\[i+1\].first_row_index. pub page_locations: Vec, + /// Unencoded/uncompressed size for BYTE_ARRAY types. + /// + /// See documention for unencoded_byte_array_data_bytes in SizeStatistics for + /// more details on this field. + pub unencoded_byte_array_data_bytes: Option>, } impl OffsetIndex { - pub fn new(page_locations: Vec) -> OffsetIndex { + pub fn new(page_locations: Vec, unencoded_byte_array_data_bytes: F2) -> OffsetIndex where F2: Into>> { OffsetIndex { page_locations, + unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(), } } } @@ -4428,6 +4613,7 @@ impl crate::thrift::TSerializable for OffsetIndex { fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; + let mut f_2: Option> = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -4439,12 +4625,22 @@ impl crate::thrift::TSerializable for OffsetIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_7 = PageLocation::read_from_in_protocol(i_prot)?; - val.push(list_elem_7); + let list_elem_9 = PageLocation::read_from_in_protocol(i_prot)?; + val.push(list_elem_9); } i_prot.read_list_end()?; f_1 = Some(val); }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_10 = i_prot.read_i64()?; + val.push(list_elem_10); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -4455,6 +4651,7 @@ impl crate::thrift::TSerializable for OffsetIndex { verify_required_field_exists("OffsetIndex.page_locations", &f_1)?; let ret = OffsetIndex { page_locations: f_1.expect("auto-generated code should have checked for presence of required fields"), + unencoded_byte_array_data_bytes: f_2, }; Ok(ret) } @@ -4468,6 +4665,15 @@ impl crate::thrift::TSerializable for OffsetIndex { } o_prot.write_list_end()?; o_prot.write_field_end()?; + if let Some(ref fld_var) = self.unencoded_byte_array_data_bytes { + o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes", TType::List, 2))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -4477,8 +4683,14 @@ impl crate::thrift::TSerializable for OffsetIndex { // ColumnIndex // -/// Description for ColumnIndex. 
-/// Each ``\[i\] refers to the page at OffsetIndex.page_locations\[i\] +/// Optional statistics for each data page in a ColumnChunk. +/// +/// Forms part the page index, along with OffsetIndex. +/// +/// If this structure is present, OffsetIndex must also be present. +/// +/// For each field in this structure, ``\[i\] refers to the page at +/// OffsetIndex.page_locations\[i\] #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct ColumnIndex { /// A list of Boolean values to determine the validity of the corresponding @@ -4504,16 +4716,33 @@ pub struct ColumnIndex { pub boundary_order: BoundaryOrder, /// A list containing the number of null values for each page * pub null_counts: Option>, + /// Contains repetition level histograms for each page + /// concatenated together. The repetition_level_histogram field on + /// SizeStatistics contains more details. + /// + /// When present the length should always be (number of pages * + /// (max_repetition_level + 1)) elements. + /// + /// Element 0 is the first element of the histogram for the first page. + /// Element (max_repetition_level + 1) is the first element of the histogram + /// for the second page. + /// + pub repetition_level_histograms: Option>, + /// Same as repetition_level_histograms except for definitions levels. + /// + pub definition_level_histograms: Option>, } impl ColumnIndex { - pub fn new(null_pages: Vec, min_values: Vec>, max_values: Vec>, boundary_order: BoundaryOrder, null_counts: F5) -> ColumnIndex where F5: Into>> { + pub fn new(null_pages: Vec, min_values: Vec>, max_values: Vec>, boundary_order: BoundaryOrder, null_counts: F5, repetition_level_histograms: F6, definition_level_histograms: F7) -> ColumnIndex where F5: Into>>, F6: Into>>, F7: Into>> { ColumnIndex { null_pages, min_values, max_values, boundary_order, null_counts: null_counts.into(), + repetition_level_histograms: repetition_level_histograms.into(), + definition_level_histograms: definition_level_histograms.into(), } } } @@ -4526,6 +4755,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let mut f_3: Option>> = None; let mut f_4: Option = None; let mut f_5: Option> = None; + let mut f_6: Option> = None; + let mut f_7: Option> = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -4537,8 +4768,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_8 = i_prot.read_bool()?; - val.push(list_elem_8); + let list_elem_11 = i_prot.read_bool()?; + val.push(list_elem_11); } i_prot.read_list_end()?; f_1 = Some(val); @@ -4547,8 +4778,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec> = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_9 = i_prot.read_bytes()?; - val.push(list_elem_9); + let list_elem_12 = i_prot.read_bytes()?; + val.push(list_elem_12); } i_prot.read_list_end()?; f_2 = Some(val); @@ -4557,8 +4788,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec> = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_10 = i_prot.read_bytes()?; - val.push(list_elem_10); + let list_elem_13 = i_prot.read_bytes()?; + val.push(list_elem_13); } i_prot.read_list_end()?; f_3 = Some(val); @@ -4571,12 +4802,32 @@ impl 
crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_11 = i_prot.read_i64()?; - val.push(list_elem_11); + let list_elem_14 = i_prot.read_i64()?; + val.push(list_elem_14); } i_prot.read_list_end()?; f_5 = Some(val); }, + 6 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_15 = i_prot.read_i64()?; + val.push(list_elem_15); + } + i_prot.read_list_end()?; + f_6 = Some(val); + }, + 7 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_16 = i_prot.read_i64()?; + val.push(list_elem_16); + } + i_prot.read_list_end()?; + f_7 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -4594,6 +4845,8 @@ impl crate::thrift::TSerializable for ColumnIndex { max_values: f_3.expect("auto-generated code should have checked for presence of required fields"), boundary_order: f_4.expect("auto-generated code should have checked for presence of required fields"), null_counts: f_5, + repetition_level_histograms: f_6, + definition_level_histograms: f_7, }; Ok(ret) } @@ -4633,6 +4886,24 @@ impl crate::thrift::TSerializable for ColumnIndex { o_prot.write_list_end()?; o_prot.write_field_end()? } + if let Some(ref fld_var) = self.repetition_level_histograms { + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histograms", TType::List, 6))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.definition_level_histograms { + o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histograms", TType::List, 7))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? 
+ } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -4992,8 +5263,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_12 = SchemaElement::read_from_in_protocol(i_prot)?; - val.push(list_elem_12); + let list_elem_17 = SchemaElement::read_from_in_protocol(i_prot)?; + val.push(list_elem_17); } i_prot.read_list_end()?; f_2 = Some(val); @@ -5006,8 +5277,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_13 = RowGroup::read_from_in_protocol(i_prot)?; - val.push(list_elem_13); + let list_elem_18 = RowGroup::read_from_in_protocol(i_prot)?; + val.push(list_elem_18); } i_prot.read_list_end()?; f_4 = Some(val); @@ -5016,8 +5287,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_14 = KeyValue::read_from_in_protocol(i_prot)?; - val.push(list_elem_14); + let list_elem_19 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_19); } i_prot.read_list_end()?; f_5 = Some(val); @@ -5030,8 +5301,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_15 = ColumnOrder::read_from_in_protocol(i_prot)?; - val.push(list_elem_15); + let list_elem_20 = ColumnOrder::read_from_in_protocol(i_prot)?; + val.push(list_elem_20); } i_prot.read_list_end()?; f_7 = Some(val); From 2e7f7ef9b68f69e8707d01dbbf5ad0e57f190979 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Jul 2024 18:30:13 -0400 Subject: [PATCH 06/21] =?UTF-8?q?Revert=20"Revert=20"Write=20Bloom=20filte?= =?UTF-8?q?rs=20between=20row=20groups=20instead=20of=20the=20end=20=20(#?= =?UTF-8?q?=E2=80=A6"=20(#5933)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 22e0b4432c9838f2536284015271d3de9a165135. 
--- parquet/Cargo.toml | 8 ++ parquet/examples/write_parquet.rs | 131 ++++++++++++++++++++++++++ parquet/src/arrow/arrow_writer/mod.rs | 28 +++++- parquet/src/arrow/async_writer/mod.rs | 4 +- parquet/src/file/metadata/mod.rs | 5 + parquet/src/file/properties.rs | 36 +++++++ parquet/src/file/writer.rs | 117 ++++++++++++++--------- 7 files changed, 277 insertions(+), 52 deletions(-) create mode 100644 parquet/examples/write_parquet.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 2cc12a81dea5..7391d0964646 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -67,6 +67,7 @@ hashbrown = { version = "0.14", default-features = false } twox-hash = { version = "1.6", default-features = false } paste = { version = "1.0" } half = { version = "2.1", default-features = false, features = ["num-traits"] } +sysinfo = { version = "0.30.12", optional = true, default-features = false } [dev-dependencies] base64 = { version = "0.22", default-features = false, features = ["std"] } @@ -114,12 +115,19 @@ async = ["futures", "tokio"] object_store = ["dep:object_store", "async"] # Group Zstd dependencies zstd = ["dep:zstd", "zstd-sys"] +# Display memory in example/write_parquet.rs +sysinfo = ["dep:sysinfo"] [[example]] name = "read_parquet" required-features = ["arrow"] path = "./examples/read_parquet.rs" +[[example]] +name = "write_parquet" +required-features = ["cli", "sysinfo"] +path = "./examples/write_parquet.rs" + [[example]] name = "async_read_parquet" required-features = ["arrow", "async"] diff --git a/parquet/examples/write_parquet.rs b/parquet/examples/write_parquet.rs new file mode 100644 index 000000000000..d2ef550df840 --- /dev/null +++ b/parquet/examples/write_parquet.rs @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::fs::File; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use arrow::array::{StructArray, UInt64Builder}; +use arrow::datatypes::DataType::UInt64; +use arrow::datatypes::{Field, Schema}; +use clap::{Parser, ValueEnum}; +use parquet::arrow::ArrowWriter as ParquetWriter; +use parquet::basic::Encoding; +use parquet::errors::Result; +use parquet::file::properties::{BloomFilterPosition, WriterProperties}; +use sysinfo::{MemoryRefreshKind, Pid, ProcessRefreshKind, RefreshKind, System}; + +#[derive(ValueEnum, Clone)] +enum BloomFilterPositionArg { + End, + AfterRowGroup, +} + +#[derive(Parser)] +#[command(version)] +/// Writes sequences of integers, with a Bloom Filter, while logging timing and memory usage. 
+struct Args { + #[arg(long, default_value_t = 1000)] + /// Number of batches to write + iterations: u64, + + #[arg(long, default_value_t = 1000000)] + /// Number of rows in each batch + batch: u64, + + #[arg(long, value_enum, default_value_t=BloomFilterPositionArg::AfterRowGroup)] + /// Where to write Bloom Filters + bloom_filter_position: BloomFilterPositionArg, + + /// Path to the file to write + path: PathBuf, +} + +fn now() -> String { + chrono::Local::now().format("%Y-%m-%d %H:%M:%S").to_string() +} + +fn mem(system: &mut System) -> String { + let pid = Pid::from(std::process::id() as usize); + system.refresh_process_specifics(pid, ProcessRefreshKind::new().with_memory()); + system + .process(pid) + .map(|proc| format!("{}MB", proc.memory() / 1_000_000)) + .unwrap_or("N/A".to_string()) +} + +fn main() -> Result<()> { + let args = Args::parse(); + + let bloom_filter_position = match args.bloom_filter_position { + BloomFilterPositionArg::End => BloomFilterPosition::End, + BloomFilterPositionArg::AfterRowGroup => BloomFilterPosition::AfterRowGroup, + }; + + let properties = WriterProperties::builder() + .set_column_bloom_filter_enabled("id".into(), true) + .set_column_encoding("id".into(), Encoding::DELTA_BINARY_PACKED) + .set_bloom_filter_position(bloom_filter_position) + .build(); + let schema = Arc::new(Schema::new(vec![Field::new("id", UInt64, false)])); + // Create parquet file that will be read. + let file = File::create(args.path).unwrap(); + let mut writer = ParquetWriter::try_new(file, schema.clone(), Some(properties))?; + + let mut system = + System::new_with_specifics(RefreshKind::new().with_memory(MemoryRefreshKind::everything())); + eprintln!( + "{} Writing {} batches of {} rows. RSS = {}", + now(), + args.iterations, + args.batch, + mem(&mut system) + ); + + let mut array_builder = UInt64Builder::new(); + let mut last_log = Instant::now(); + for i in 0..args.iterations { + if Instant::now() - last_log > Duration::new(10, 0) { + last_log = Instant::now(); + eprintln!( + "{} Iteration {}/{}. RSS = {}", + now(), + i + 1, + args.iterations, + mem(&mut system) + ); + } + for j in 0..args.batch { + array_builder.append_value(i + j); + } + writer.write( + &StructArray::new( + schema.fields().clone(), + vec![Arc::new(array_builder.finish())], + None, + ) + .into(), + )?; + } + writer.flush()?; + writer.close()?; + + eprintln!("{} Done. 
RSS = {}", now(), mem(&mut system)); + + Ok(()) +} diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index cf46f3b64a57..070d740094f5 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -43,7 +43,7 @@ use crate::column::writer::{ }; use crate::data_type::{ByteArray, FixedLenByteArray}; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::{ColumnChunkMetaData, KeyValue, RowGroupMetaDataPtr}; +use crate::file::metadata::{ColumnChunkMetaData, KeyValue, RowGroupMetaData}; use crate::file::properties::{WriterProperties, WriterPropertiesPtr}; use crate::file::reader::{ChunkReader, Length}; use crate::file::writer::{SerializedFileWriter, SerializedRowGroupWriter}; @@ -204,7 +204,7 @@ impl ArrowWriter { } /// Returns metadata for any flushed row groups - pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] { + pub fn flushed_row_groups(&self) -> &[RowGroupMetaData] { self.writer.flushed_row_groups() } @@ -1097,7 +1097,9 @@ mod tests { use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index::Index; use crate::file::page_index::index_reader::read_pages_locations; - use crate::file::properties::{EnabledStatistics, ReaderProperties, WriterVersion}; + use crate::file::properties::{ + BloomFilterPosition, EnabledStatistics, ReaderProperties, WriterVersion, + }; use crate::file::serialized_reader::ReadOptionsBuilder; use crate::file::{ reader::{FileReader, SerializedFileReader}, @@ -1745,6 +1747,7 @@ mod tests { values: ArrayRef, schema: SchemaRef, bloom_filter: bool, + bloom_filter_position: BloomFilterPosition, } impl RoundTripOptions { @@ -1755,6 +1758,7 @@ mod tests { values, schema: Arc::new(schema), bloom_filter: false, + bloom_filter_position: BloomFilterPosition::AfterRowGroup, } } } @@ -1774,6 +1778,7 @@ mod tests { values, schema, bloom_filter, + bloom_filter_position, } = options; let encodings = match values.data_type() { @@ -1814,6 +1819,7 @@ mod tests { .set_dictionary_page_size_limit(dictionary_size.max(1)) .set_encoding(*encoding) .set_bloom_filter_enabled(bloom_filter) + .set_bloom_filter_position(bloom_filter_position) .build(); files.push(roundtrip_opts(&expected_batch, props)) @@ -2171,6 +2177,22 @@ mod tests { values_required::(many_vecs_iter); } + #[test] + fn i32_column_bloom_filter_at_end() { + let array = Arc::new(Int32Array::from_iter(0..SMALL_SIZE as i32)); + let mut options = RoundTripOptions::new(array, false); + options.bloom_filter = true; + options.bloom_filter_position = BloomFilterPosition::End; + + let files = one_column_roundtrip_with_options(options); + check_bloom_filter( + files, + "col".to_string(), + (0..SMALL_SIZE as i32).collect(), + (SMALL_SIZE as i32 + 1..SMALL_SIZE as i32 + 10).collect(), + ); + } + #[test] fn i32_column_bloom_filter() { let array = Arc::new(Int32Array::from_iter(0..SMALL_SIZE as i32)); diff --git a/parquet/src/arrow/async_writer/mod.rs b/parquet/src/arrow/async_writer/mod.rs index edeb0fec00b7..274d8fef8976 100644 --- a/parquet/src/arrow/async_writer/mod.rs +++ b/parquet/src/arrow/async_writer/mod.rs @@ -54,7 +54,7 @@ use crate::{ arrow::arrow_writer::ArrowWriterOptions, arrow::ArrowWriter, errors::{ParquetError, Result}, - file::{metadata::RowGroupMetaDataPtr, properties::WriterProperties}, + file::{metadata::RowGroupMetaData, properties::WriterProperties}, format::{FileMetaData, KeyValue}, }; use arrow_array::RecordBatch; @@ -172,7 +172,7 @@ impl AsyncArrowWriter { } /// Returns metadata for any flushed row 
groups - pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] { + pub fn flushed_row_groups(&self) -> &[RowGroupMetaData] { self.sync_writer.flushed_row_groups() } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 278d1e464e94..39b6de41fa58 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -358,6 +358,11 @@ impl RowGroupMetaData { &self.columns } + /// Returns mutable slice of column chunk metadata. + pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] { + &mut self.columns + } + /// Number of rows in this row group. pub fn num_rows(&self) -> i64 { self.num_rows diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 654b5e23f9a9..8454625ff724 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -45,6 +45,8 @@ pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Pag pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; /// Default value for [`WriterProperties::max_row_group_size`] pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; +/// Default value for [`WriterProperties::bloom_filter_position`] +pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup; /// Default value for [`WriterProperties::created_by`] pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION")); /// Default value for [`WriterProperties::column_index_truncate_length`] @@ -88,6 +90,24 @@ impl FromStr for WriterVersion { } } +/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should +/// write Bloom filters +/// +/// Basic constant, which is not part of the Thrift definition. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BloomFilterPosition { + /// Write Bloom Filters of each row group right after the row group + /// + /// This saves memory by writing it as soon as it is computed, at the cost + /// of data locality for readers + AfterRowGroup, + /// Write Bloom Filters at the end of the file + /// + /// This allows better data locality for readers, at the cost of memory usage + /// for writers. + End, +} + /// Reference counted writer properties. pub type WriterPropertiesPtr = Arc; @@ -132,6 +152,7 @@ pub struct WriterProperties { data_page_row_count_limit: usize, write_batch_size: usize, max_row_group_size: usize, + bloom_filter_position: BloomFilterPosition, writer_version: WriterVersion, created_by: String, pub(crate) key_value_metadata: Option>, @@ -219,6 +240,11 @@ impl WriterProperties { self.max_row_group_size } + /// Returns maximum number of rows in a row group. + pub fn bloom_filter_position(&self) -> BloomFilterPosition { + self.bloom_filter_position + } + /// Returns configured writer version. 
pub fn writer_version(&self) -> WriterVersion { self.writer_version @@ -340,6 +366,7 @@ pub struct WriterPropertiesBuilder { data_page_row_count_limit: usize, write_batch_size: usize, max_row_group_size: usize, + bloom_filter_position: BloomFilterPosition, writer_version: WriterVersion, created_by: String, key_value_metadata: Option>, @@ -359,6 +386,7 @@ impl WriterPropertiesBuilder { data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT, write_batch_size: DEFAULT_WRITE_BATCH_SIZE, max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE, + bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION, writer_version: DEFAULT_WRITER_VERSION, created_by: DEFAULT_CREATED_BY.to_string(), key_value_metadata: None, @@ -378,6 +406,7 @@ impl WriterPropertiesBuilder { data_page_row_count_limit: self.data_page_row_count_limit, write_batch_size: self.write_batch_size, max_row_group_size: self.max_row_group_size, + bloom_filter_position: self.bloom_filter_position, writer_version: self.writer_version, created_by: self.created_by, key_value_metadata: self.key_value_metadata, @@ -489,6 +518,12 @@ impl WriterPropertiesBuilder { self } + /// Sets where in the final file Bloom Filters are written (default `AfterRowGroup`) + pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self { + self.bloom_filter_position = value; + self + } + /// Sets "created by" property (defaults to `parquet-rs version `). pub fn set_created_by(mut self, value: String) -> Self { self.created_by = value; @@ -1054,6 +1089,7 @@ mod tests { ); assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE); assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE); + assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION); assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION); assert_eq!(props.created_by(), DEFAULT_CREATED_BY); assert_eq!(props.key_value_metadata(), None); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 7806384cdb52..eb633f31c477 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -34,8 +34,9 @@ use crate::column::{ }; use crate::data_type::DataType; use crate::errors::{ParquetError, Result}; +use crate::file::properties::{BloomFilterPosition, WriterPropertiesPtr}; use crate::file::reader::ChunkReader; -use crate::file::{metadata::*, properties::WriterPropertiesPtr, PARQUET_MAGIC}; +use crate::file::{metadata::*, PARQUET_MAGIC}; use crate::schema::types::{self, ColumnDescPtr, SchemaDescPtr, SchemaDescriptor, TypePtr}; /// A wrapper around a [`Write`] that keeps track of the number @@ -115,9 +116,10 @@ pub type OnCloseColumnChunk<'a> = Box Result<() /// - the row group metadata /// - the column index for each column chunk /// - the offset index for each column chunk -pub type OnCloseRowGroup<'a> = Box< +pub type OnCloseRowGroup<'a, W> = Box< dyn FnOnce( - RowGroupMetaDataPtr, + &'a mut TrackedWrite, + RowGroupMetaData, Vec>, Vec>, Vec>, @@ -143,7 +145,7 @@ pub struct SerializedFileWriter { schema: TypePtr, descr: SchemaDescPtr, props: WriterPropertiesPtr, - row_groups: Vec, + row_groups: Vec, bloom_filters: Vec>>, column_indexes: Vec>>, offset_indexes: Vec>>, @@ -197,18 +199,29 @@ impl SerializedFileWriter { self.row_group_index += 1; + let bloom_filter_position = self.properties().bloom_filter_position(); let row_groups = &mut self.row_groups; let row_bloom_filters = &mut self.bloom_filters; let row_column_indexes = &mut self.column_indexes; let row_offset_indexes = &mut self.offset_indexes; - let on_close = - |metadata, 
row_group_bloom_filter, row_group_column_index, row_group_offset_index| { - row_groups.push(metadata); - row_bloom_filters.push(row_group_bloom_filter); - row_column_indexes.push(row_group_column_index); - row_offset_indexes.push(row_group_offset_index); - Ok(()) + let on_close = move |buf, + mut metadata, + row_group_bloom_filter, + row_group_column_index, + row_group_offset_index| { + row_bloom_filters.push(row_group_bloom_filter); + row_column_indexes.push(row_group_column_index); + row_offset_indexes.push(row_group_offset_index); + // write bloom filters out immediately after the row group if requested + match bloom_filter_position { + BloomFilterPosition::AfterRowGroup => { + write_bloom_filters(buf, row_bloom_filters, &mut metadata)? + } + BloomFilterPosition::End => (), }; + row_groups.push(metadata); + Ok(()) + }; let row_group_writer = SerializedRowGroupWriter::new( self.descr.clone(), @@ -221,7 +234,7 @@ impl SerializedFileWriter { } /// Returns metadata for any flushed row groups - pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] { + pub fn flushed_row_groups(&self) -> &[RowGroupMetaData] { &self.row_groups } @@ -273,34 +286,6 @@ impl SerializedFileWriter { Ok(()) } - /// Serialize all the bloom filter to the file - fn write_bloom_filters(&mut self, row_groups: &mut [RowGroup]) -> Result<()> { - // iter row group - // iter each column - // write bloom filter to the file - for (row_group_idx, row_group) in row_groups.iter_mut().enumerate() { - for (column_idx, column_chunk) in row_group.columns.iter_mut().enumerate() { - match &self.bloom_filters[row_group_idx][column_idx] { - Some(bloom_filter) => { - let start_offset = self.buf.bytes_written(); - bloom_filter.write(&mut self.buf)?; - let end_offset = self.buf.bytes_written(); - // set offset and index for bloom filter - let column_chunk_meta = column_chunk - .meta_data - .as_mut() - .expect("can't have bloom filter without column metadata"); - column_chunk_meta.bloom_filter_offset = Some(start_offset as i64); - column_chunk_meta.bloom_filter_length = - Some((end_offset - start_offset) as i32); - } - None => {} - } - } - } - Ok(()) - } - /// Serialize all the column index to the file fn write_column_indexes(&mut self, row_groups: &mut [RowGroup]) -> Result<()> { // iter row group @@ -331,6 +316,11 @@ impl SerializedFileWriter { self.finished = true; let num_rows = self.row_groups.iter().map(|x| x.num_rows()).sum(); + // write out any remaining bloom filters after all row groups + for row_group in &mut self.row_groups { + write_bloom_filters(&mut self.buf, &mut self.bloom_filters, row_group)?; + } + let mut row_groups = self .row_groups .as_slice() @@ -338,7 +328,6 @@ impl SerializedFileWriter { .map(|v| v.to_thrift()) .collect::>(); - self.write_bloom_filters(&mut row_groups)?; // Write column indexes and offset indexes self.write_column_indexes(&mut row_groups)?; self.write_offset_indexes(&mut row_groups)?; @@ -443,6 +432,40 @@ impl SerializedFileWriter { } } +/// Serialize all the bloom filters of the given row group to the given buffer, +/// and returns the updated row group metadata. 
+fn write_bloom_filters( + buf: &mut TrackedWrite, + bloom_filters: &mut [Vec>], + row_group: &mut RowGroupMetaData, +) -> Result<()> { + // iter row group + // iter each column + // write bloom filter to the file + + let row_group_idx: u16 = row_group + .ordinal() + .expect("Missing row group ordinal") + .try_into() + .expect("Negative row group ordinal"); + let row_group_idx = row_group_idx as usize; + for (column_idx, column_chunk) in row_group.columns_mut().iter_mut().enumerate() { + if let Some(bloom_filter) = bloom_filters[row_group_idx][column_idx].take() { + let start_offset = buf.bytes_written(); + bloom_filter.write(&mut *buf)?; + let end_offset = buf.bytes_written(); + // set offset and index for bloom filter + *column_chunk = column_chunk + .clone() + .into_builder() + .set_bloom_filter_offset(Some(start_offset as i64)) + .set_bloom_filter_length(Some((end_offset - start_offset) as i32)) + .build()?; + } + } + Ok(()) +} + /// Parquet row group writer API. /// Provides methods to access column writers in an iterator-like fashion, order is /// guaranteed to match the order of schema leaves (column descriptors). @@ -468,7 +491,7 @@ pub struct SerializedRowGroupWriter<'a, W: Write> { offset_indexes: Vec>, row_group_index: i16, file_offset: i64, - on_close: Option>, + on_close: Option>, } impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { @@ -485,7 +508,7 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { properties: WriterPropertiesPtr, buf: &'a mut TrackedWrite, row_group_index: i16, - on_close: Option>, + on_close: Option>, ) -> Self { let num_columns = schema_descr.num_columns(); let file_offset = buf.bytes_written() as i64; @@ -669,12 +692,12 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { .set_file_offset(self.file_offset) .build()?; - let metadata = Arc::new(row_group_metadata); - self.row_group_metadata = Some(metadata.clone()); + self.row_group_metadata = Some(Arc::new(row_group_metadata.clone())); if let Some(on_close) = self.on_close.take() { on_close( - metadata, + self.buf, + row_group_metadata, self.bloom_filters, self.column_indexes, self.offset_indexes, @@ -1446,7 +1469,7 @@ mod tests { assert_eq!(flushed.len(), idx + 1); assert_eq!(Some(idx as i16), last_group.ordinal()); assert_eq!(Some(row_group_file_offset as i64), last_group.file_offset()); - assert_eq!(flushed[idx].as_ref(), last_group.as_ref()); + assert_eq!(&flushed[idx], last_group.as_ref()); } let file_metadata = file_writer.close().unwrap(); From effccc13a9ef6e650049a160734d5ededb2a5383 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 16 Jul 2024 18:50:01 -0400 Subject: [PATCH 07/21] Revert "Update snafu (#5930)" (#6069) This reverts commit 756b1fb26d1702f36f446faf9bb40a4869c3e840. 
--- object_store/Cargo.toml | 2 +- object_store/src/client/get.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 234c7746b33b..3f7b2c0b2be0 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -38,7 +38,7 @@ humantime = "2.1" itertools = "0.13.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" -snafu = { version = "0.8", default-features = false, features = ["std", "rust_1_61"] } +snafu = "0.7" tracing = { version = "0.1" } url = "2.2" walkdir = "2" diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs index 0fef5785c052..b45eaa143702 100644 --- a/object_store/src/client/get.rs +++ b/object_store/src/client/get.rs @@ -103,7 +103,7 @@ enum GetResultError { source: crate::client::header::Error, }, - #[snafu(transparent)] + #[snafu(context(false))] InvalidRangeRequest { source: crate::util::InvalidGetRange, }, @@ -386,7 +386,7 @@ mod tests { let err = get_result::(&path, Some(get_range.clone()), resp).unwrap_err(); assert_eq!( err.to_string(), - "Wanted range starting at 2, but object was only 2 bytes long" + "InvalidRangeRequest: Wanted range starting at 2, but object was only 2 bytes long" ); let resp = make_response( From 649d09d0ff3d4e36a0f10ec27c2e864a4e0b8792 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 17 Jul 2024 22:22:47 +0200 Subject: [PATCH 08/21] Update pyo3 requirement from 0.21.1 to 0.22.1 (fixed) (#6075) * Update pyo3 requirement from 0.21.1 to 0.22.1 Updates the requirements on [pyo3](https://github.com/pyo3/pyo3) to permit the latest version. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/main/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/compare/v0.21.1...v0.22.1) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] * refactor: remove deprecated `FromPyArrow::from_pyarrow` "GIL Refs" are being phased out. 
* chore: update `pyo3` in integration tests --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-pyarrow-integration-testing/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- arrow/src/pyarrow.rs | 5 ----- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 6f07d42d88c1..0834f2d13384 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -34,4 +34,4 @@ crate-type = ["cdylib"] [dependencies] arrow = { path = "../arrow", features = ["pyarrow"] } -pyo3 = { version = "0.21.1", features = ["extension-module"] } +pyo3 = { version = "0.22", features = ["extension-module"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 9d3c431b3048..b8f40e1b4b99 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -54,7 +54,7 @@ arrow-select = { workspace = true } arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -pyo3 = { version = "0.21.1", default-features = false, optional = true } +pyo3 = { version = "0.22.1", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 1733067c738a..43cdb4fe0919 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -83,11 +83,6 @@ fn to_py_err(err: ArrowError) -> PyErr { } pub trait FromPyArrow: Sized { - #[deprecated(since = "52.0.0", note = "Use from_pyarrow_bound")] - fn from_pyarrow(value: &PyAny) -> PyResult { - Self::from_pyarrow_bound(&value.as_borrowed()) - } - fn from_pyarrow_bound(value: &Bound) -> PyResult; } From 05e681d7669acc0f91facf621a98c00f3d985c71 Mon Sep 17 00:00:00 2001 From: kamille Date: Fri, 19 Jul 2024 05:59:03 +0800 Subject: [PATCH 09/21] remove repeated codes to make the codes more concise. 
(#6080) --- arrow-cast/src/display.rs | 121 +++++++++++++------------------------- 1 file changed, 40 insertions(+), 81 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index ab172cb240cf..312e7973963e 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -662,17 +662,17 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { impl<'a> DisplayIndex for &'a PrimitiveArray { fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { let value = self.value(idx); - let mut first_part = true; + let mut prefix = ""; if value.days != 0 { - write!(f, "{} days", value.days)?; - first_part = false; + write!(f, "{prefix}{} days", value.days)?; + prefix = " "; } if value.milliseconds != 0 { let millis_fmt = MillisecondsFormatter { milliseconds: value.milliseconds, - first_part, + prefix, }; f.write_fmt(format_args!("{millis_fmt}"))?; @@ -685,26 +685,22 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { impl<'a> DisplayIndex for &'a PrimitiveArray { fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { let value = self.value(idx); - let mut first_part = true; + let mut prefix = ""; if value.months != 0 { - write!(f, "{} mons", value.months)?; - first_part = false; + write!(f, "{prefix}{} mons", value.months)?; + prefix = " "; } if value.days != 0 { - if first_part { - write!(f, "{} days", value.days)?; - first_part = false; - } else { - write!(f, " {} days", value.days)?; - } + write!(f, "{prefix}{} days", value.days)?; + prefix = " "; } if value.nanoseconds != 0 { let nano_fmt = NanosecondsFormatter { nanoseconds: value.nanoseconds, - first_part, + prefix, }; f.write_fmt(format_args!("{nano_fmt}"))?; } @@ -713,14 +709,14 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { } } -struct NanosecondsFormatter { +struct NanosecondsFormatter<'a> { nanoseconds: i64, - first_part: bool, + prefix: &'a str, } -impl Display for NanosecondsFormatter { +impl<'a> Display for NanosecondsFormatter<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let mut first_part = self.first_part; + let mut prefix = self.prefix; let secs = self.nanoseconds / 1_000_000_000; let mins = secs / 60; @@ -732,57 +728,38 @@ impl Display for NanosecondsFormatter { let nanoseconds = self.nanoseconds % 1_000_000_000; if hours != 0 { - if first_part { - write!(f, "{} hours", hours)?; - first_part = false; - } else { - write!(f, " {} hours", hours)?; - } + write!(f, "{prefix}{} hours", hours)?; + prefix = " "; } if mins != 0 { - if first_part { - write!(f, "{} mins", mins)?; - first_part = false; - } else { - write!(f, " {} mins", mins)?; - } + write!(f, "{prefix}{} mins", mins)?; + prefix = " "; } if secs != 0 || nanoseconds != 0 { let secs_sign = if secs < 0 || nanoseconds < 0 { "-" } else { "" }; - - if first_part { - write!( - f, - "{}{}.{:09} secs", - secs_sign, - secs.abs(), - nanoseconds.abs() - )?; - } else { - write!( - f, - " {}{}.{:09} secs", - secs_sign, - secs.abs(), - nanoseconds.abs() - )?; - } + write!( + f, + "{prefix}{}{}.{:09} secs", + secs_sign, + secs.abs(), + nanoseconds.abs() + )?; } Ok(()) } } -struct MillisecondsFormatter { +struct MillisecondsFormatter<'a> { milliseconds: i32, - first_part: bool, + prefix: &'a str, } -impl Display for MillisecondsFormatter { +impl<'a> Display for MillisecondsFormatter<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let mut first_part = self.first_part; + let mut prefix = self.prefix; let secs = self.milliseconds / 1_000; let mins = secs / 60; @@ -794,21 +771,13 @@ impl Display for 
MillisecondsFormatter { let milliseconds = self.milliseconds % 1_000; if hours != 0 { - if first_part { - write!(f, "{} hours", hours,)?; - first_part = false; - } else { - write!(f, " {} hours", hours,)?; - } + write!(f, "{prefix}{} hours", hours,)?; + prefix = " "; } if mins != 0 { - if first_part { - write!(f, "{} mins", mins,)?; - first_part = false; - } else { - write!(f, " {} mins", mins,)?; - } + write!(f, "{prefix}{} mins", mins,)?; + prefix = " "; } if secs != 0 || milliseconds != 0 { @@ -818,23 +787,13 @@ impl Display for MillisecondsFormatter { "" }; - if first_part { - write!( - f, - "{}{}.{:03} secs", - secs_sign, - secs.abs(), - milliseconds.abs() - )?; - } else { - write!( - f, - " {}{}.{:03} secs", - secs_sign, - secs.abs(), - milliseconds.abs() - )?; - } + write!( + f, + "{prefix}{}{}.{:03} secs", + secs_sign, + secs.abs(), + milliseconds.abs() + )?; } Ok(()) From e40b3115183c4e31132377839cc03a76244e10a7 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 19 Jul 2024 13:29:52 -0700 Subject: [PATCH 10/21] Add `unencoded_byte_array_data_bytes` to `ParquetMetaData` (#6068) * update to latest thrift (as of 11 Jul 2024) from parquet-format * pass None for optional size statistics * escape HTML tags * don't need to escape brackets in arrays * add support for unencoded_byte_array_data_bytes * add comments * change sig of ColumnMetrics::update_variable_length_bytes() * rename ParquetOffsetIndex to OffsetSizeIndex * rename some functions * suggestion from review Co-authored-by: Andrew Lamb * add Default trait to ColumnMetrics as suggested in review * rename OffsetSizeIndex to OffsetIndexMetaData --------- Co-authored-by: Andrew Lamb --- parquet/src/arrow/arrow_writer/byte_array.rs | 19 ++++- parquet/src/arrow/async_reader/metadata.rs | 8 +- parquet/src/arrow/async_reader/mod.rs | 1 + parquet/src/column/writer/encoder.rs | 8 ++ parquet/src/column/writer/mod.rs | 54 +++++++++---- parquet/src/data_type.rs | 11 +++ parquet/src/file/metadata/memory.rs | 1 + parquet/src/file/metadata/mod.rs | 77 ++++++++++++++++-- parquet/src/file/page_index/index_reader.rs | 53 ++++++++++-- parquet/src/file/page_index/mod.rs | 1 + parquet/src/file/page_index/offset_index.rs | 50 ++++++++++++ parquet/src/file/serialized_reader.rs | 15 +++- parquet/src/file/writer.rs | 85 +++++++++++++++++++- 13 files changed, 349 insertions(+), 34 deletions(-) create mode 100644 parquet/src/file/page_index/offset_index.rs diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index fc37ebfb4510..2d23ad8510f9 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -96,6 +96,7 @@ macro_rules! 
downcast_op { struct FallbackEncoder { encoder: FallbackEncoderImpl, num_values: usize, + variable_length_bytes: i64, } /// The fallback encoder in use @@ -152,6 +153,7 @@ impl FallbackEncoder { Ok(Self { encoder, num_values: 0, + variable_length_bytes: 0, }) } @@ -168,7 +170,8 @@ impl FallbackEncoder { let value = values.value(*idx); let value = value.as_ref(); buffer.extend_from_slice((value.len() as u32).as_bytes()); - buffer.extend_from_slice(value) + buffer.extend_from_slice(value); + self.variable_length_bytes += value.len() as i64; } } FallbackEncoderImpl::DeltaLength { buffer, lengths } => { @@ -177,6 +180,7 @@ impl FallbackEncoder { let value = value.as_ref(); lengths.put(&[value.len() as i32]).unwrap(); buffer.extend_from_slice(value); + self.variable_length_bytes += value.len() as i64; } } FallbackEncoderImpl::Delta { @@ -205,6 +209,7 @@ impl FallbackEncoder { buffer.extend_from_slice(&value[prefix_length..]); prefix_lengths.put(&[prefix_length as i32]).unwrap(); suffix_lengths.put(&[suffix_length as i32]).unwrap(); + self.variable_length_bytes += value.len() as i64; } } } @@ -269,12 +274,17 @@ impl FallbackEncoder { } }; + // Capture value of variable_length_bytes and reset for next page + let variable_length_bytes = Some(self.variable_length_bytes); + self.variable_length_bytes = 0; + Ok(DataPageValues { buf: buf.into(), num_values: std::mem::take(&mut self.num_values), encoding, min_value, max_value, + variable_length_bytes, }) } } @@ -321,6 +331,7 @@ impl Storage for ByteArrayStorage { struct DictEncoder { interner: Interner, indices: Vec, + variable_length_bytes: i64, } impl DictEncoder { @@ -336,6 +347,7 @@ impl DictEncoder { let value = values.value(*idx); let interned = self.interner.intern(value.as_ref()); self.indices.push(interned); + self.variable_length_bytes += value.as_ref().len() as i64; } } @@ -384,12 +396,17 @@ impl DictEncoder { self.indices.clear(); + // Capture value of variable_length_bytes and reset for next page + let variable_length_bytes = Some(self.variable_length_bytes); + self.variable_length_bytes = 0; + DataPageValues { buf: encoder.consume().into(), num_values, encoding: Encoding::RLE_DICTIONARY, min_value, max_value, + variable_length_bytes, } } } diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 9224ea3f68a8..4a3489a2084d 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -20,7 +20,9 @@ use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index::Index; -use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; +use crate::file::page_index::index_reader::{ + acc_range, decode_column_index, decode_page_locations, +}; use bytes::Bytes; use futures::future::BoxFuture; use futures::FutureExt; @@ -177,7 +179,9 @@ impl MetadataLoader { x.columns() .iter() .map(|c| match c.offset_index_range() { - Some(r) => decode_offset_index(&data[r.start - offset..r.end - offset]), + Some(r) => { + decode_page_locations(&data[r.start - offset..r.end - offset]) + } None => Err(general_err!("missing offset index")), }) .collect::>>() diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index e4205b7ef2ce..5a790fa6aff0 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -1538,6 +1538,7 @@ mod tests { 
vec![row_group_meta], None, Some(vec![offset_index.clone()]), + None, ); let metadata = Arc::new(metadata); diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index b6c8212608b8..9d01c09040de 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -63,6 +63,7 @@ pub struct DataPageValues { pub encoding: Encoding, pub min_value: Option, pub max_value: Option, + pub variable_length_bytes: Option, } /// A generic encoder of [`ColumnValues`] to data and dictionary pages used by @@ -131,6 +132,7 @@ pub struct ColumnValueEncoderImpl { min_value: Option, max_value: Option, bloom_filter: Option, + variable_length_bytes: Option, } impl ColumnValueEncoderImpl { @@ -150,6 +152,10 @@ impl ColumnValueEncoderImpl { update_min(&self.descr, &min, &mut self.min_value); update_max(&self.descr, &max, &mut self.max_value); } + + if let Some(var_bytes) = T::T::variable_length_bytes(slice) { + *self.variable_length_bytes.get_or_insert(0) += var_bytes; + } } // encode the values into bloom filter if enabled @@ -203,6 +209,7 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { bloom_filter, min_value: None, max_value: None, + variable_length_bytes: None, }) } @@ -296,6 +303,7 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { num_values: std::mem::take(&mut self.num_values), min_value: self.min_value.take(), max_value: self.max_value.take(), + variable_length_bytes: self.variable_length_bytes.take(), }) } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 8594e59714dc..0e227a157d06 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -192,7 +192,8 @@ struct PageMetrics { } // Metrics per column writer -struct ColumnMetrics { +#[derive(Default)] +struct ColumnMetrics { total_bytes_written: u64, total_rows_written: u64, total_uncompressed_size: u64, @@ -204,6 +205,20 @@ struct ColumnMetrics { max_column_value: Option, num_column_nulls: u64, column_distinct_count: Option, + variable_length_bytes: Option, +} + +impl ColumnMetrics { + fn new() -> Self { + Default::default() + } + + /// Sum the provided page variable_length_bytes into the chunk variable_length_bytes + fn update_variable_length_bytes(&mut self, variable_length_bytes: Option) { + if let Some(var_bytes) = variable_length_bytes { + *self.variable_length_bytes.get_or_insert(0) += var_bytes; + } + } } /// Typed column writer for a primitive column. 
@@ -276,19 +291,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { num_buffered_rows: 0, num_page_nulls: 0, }, - column_metrics: ColumnMetrics { - total_bytes_written: 0, - total_rows_written: 0, - total_uncompressed_size: 0, - total_compressed_size: 0, - total_num_values: 0, - dictionary_page_offset: None, - data_page_offset: None, - min_column_value: None, - max_column_value: None, - num_column_nulls: 0, - column_distinct_count: None, - }, + column_metrics: ColumnMetrics::::new(), column_index_builder: ColumnIndexBuilder::new(), offset_index_builder: OffsetIndexBuilder::new(), encodings, @@ -634,7 +637,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } /// Update the column index and offset index when adding the data page - fn update_column_offset_index(&mut self, page_statistics: Option<&ValueStatistics>) { + fn update_column_offset_index( + &mut self, + page_statistics: Option<&ValueStatistics>, + page_variable_length_bytes: Option, + ) { // update the column index let null_page = (self.page_metrics.num_buffered_rows as u64) == self.page_metrics.num_page_nulls; @@ -708,6 +715,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // update the offset index self.offset_index_builder .append_row_count(self.page_metrics.num_buffered_rows as i64); + + self.offset_index_builder + .append_unencoded_byte_array_data_bytes(page_variable_length_bytes); } /// Determine if we should allow truncating min/max values for this column's statistics @@ -783,7 +793,15 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { }; // update column and offset index - self.update_column_offset_index(page_statistics.as_ref()); + self.update_column_offset_index( + page_statistics.as_ref(), + values_data.variable_length_bytes, + ); + + // Update variable_length_bytes in column_metrics + self.column_metrics + .update_variable_length_bytes(values_data.variable_length_bytes); + let page_statistics = page_statistics.map(Statistics::from); let compressed_page = match self.props.writer_version() { @@ -993,7 +1011,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { stats => stats, }; - builder = builder.set_statistics(statistics); + builder = builder + .set_statistics(statistics) + .set_unencoded_byte_array_data_bytes(self.column_metrics.variable_length_bytes); } let metadata = builder.build()?; diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index b85a75cfd410..01e92115c45b 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -644,6 +644,13 @@ pub(crate) mod private { (std::mem::size_of::(), 1) } + /// Return the number of variable length bytes in a given slice of data + /// + /// Returns the sum of lengths for BYTE_ARRAY data, and None for all other data types + fn variable_length_bytes(_: &[Self]) -> Option { + None + } + /// Return the value as i64 if possible /// /// This is essentially the same as `std::convert::TryInto` but can't be @@ -956,6 +963,10 @@ pub(crate) mod private { Ok(num_values) } + fn variable_length_bytes(values: &[Self]) -> Option { + Some(values.iter().map(|x| x.len() as i64).sum()) + } + fn skip(decoder: &mut PlainDecoderDetails, num_values: usize) -> Result { let data = decoder .data diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 57b2f7eec0c2..a1b40a8de95a 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -97,6 +97,7 @@ impl HeapSize for ColumnChunkMetaData { + self.compression.heap_size() + 
self.statistics.heap_size() + self.encoding_stats.heap_size() + + self.unencoded_byte_array_data_bytes.heap_size() } } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 39b6de41fa58..d86b1ce572fb 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -36,7 +36,7 @@ use std::sync::Arc; use crate::format::{ BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup, - SortingColumn, + SizeStatistics, SortingColumn, }; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; @@ -96,6 +96,8 @@ pub struct ParquetMetaData { column_index: Option, /// Offset index for all each page in each column chunk offset_index: Option, + /// `unencoded_byte_array_data_bytes` from the offset index + unencoded_byte_array_data_bytes: Option>>>>, } impl ParquetMetaData { @@ -107,6 +109,7 @@ impl ParquetMetaData { row_groups, column_index: None, offset_index: None, + unencoded_byte_array_data_bytes: None, } } @@ -117,12 +120,14 @@ impl ParquetMetaData { row_groups: Vec, column_index: Option, offset_index: Option, + unencoded_byte_array_data_bytes: Option>>>>, ) -> Self { ParquetMetaData { file_metadata, row_groups, column_index, offset_index, + unencoded_byte_array_data_bytes, } } @@ -179,6 +184,19 @@ impl ParquetMetaData { self.offset_index.as_ref() } + /// Returns `unencoded_byte_array_data_bytes` from the offset indexes in this file, if loaded + /// + /// This value represents the output size of the total bytes in this file, which can be useful for + /// allocating an appropriately sized output buffer. + /// + /// Returns `None` if the parquet file does not have a `OffsetIndex` or + /// [ArrowReaderOptions::with_page_index] was set to false. + /// + /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index + pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec>>>> { + self.unencoded_byte_array_data_bytes.as_ref() + } + /// Estimate of the bytes allocated to store `ParquetMetadata` /// /// # Notes: @@ -199,6 +217,7 @@ impl ParquetMetaData { + self.row_groups.heap_size() + self.column_index.heap_size() + self.offset_index.heap_size() + + self.unencoded_byte_array_data_bytes.heap_size() } /// Override the column index @@ -543,6 +562,7 @@ pub struct ColumnChunkMetaData { offset_index_length: Option, column_index_offset: Option, column_index_length: Option, + unencoded_byte_array_data_bytes: Option, } /// Represents common operations for a column chunk. @@ -695,6 +715,14 @@ impl ColumnChunkMetaData { Some(offset..(offset + length)) } + /// Returns the number of bytes of variable length data after decoding. + /// + /// Only set for BYTE_ARRAY columns. This field may not be set by older + /// writers. + pub fn unencoded_byte_array_data_bytes(&self) -> Option { + self.unencoded_byte_array_data_bytes + } + /// Method to convert from Thrift. 
pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { if cc.meta_data.is_none() { @@ -732,6 +760,12 @@ impl ColumnChunkMetaData { let offset_index_length = cc.offset_index_length; let column_index_offset = cc.column_index_offset; let column_index_length = cc.column_index_length; + let unencoded_byte_array_data_bytes = if let Some(size_stats) = col_metadata.size_statistics + { + size_stats.unencoded_byte_array_data_bytes + } else { + None + }; let result = ColumnChunkMetaData { column_descr, @@ -753,6 +787,7 @@ impl ColumnChunkMetaData { offset_index_length, column_index_offset, column_index_length, + unencoded_byte_array_data_bytes, }; Ok(result) } @@ -776,6 +811,16 @@ impl ColumnChunkMetaData { /// Method to convert to Thrift `ColumnMetaData` pub fn to_column_metadata_thrift(&self) -> ColumnMetaData { + let size_statistics = if self.unencoded_byte_array_data_bytes.is_some() { + Some(SizeStatistics { + unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes, + repetition_level_histogram: None, + definition_level_histogram: None, + }) + } else { + None + }; + ColumnMetaData { type_: self.column_type().into(), encodings: self.encodings().iter().map(|&v| v.into()).collect(), @@ -795,7 +840,7 @@ impl ColumnChunkMetaData { .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), bloom_filter_offset: self.bloom_filter_offset, bloom_filter_length: self.bloom_filter_length, - size_statistics: None, + size_statistics, } } @@ -831,6 +876,7 @@ impl ColumnChunkMetaDataBuilder { offset_index_length: None, column_index_offset: None, column_index_length: None, + unencoded_byte_array_data_bytes: None, }) } @@ -942,6 +988,12 @@ impl ColumnChunkMetaDataBuilder { self } + /// Sets optional length of variable length data in bytes. + pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option) -> Self { + self.0.unencoded_byte_array_data_bytes = value; + self + } + /// Builds column chunk metadata. 
pub fn build(self) -> Result { Ok(self.0) @@ -1021,6 +1073,7 @@ pub struct OffsetIndexBuilder { offset_array: Vec, compressed_page_size_array: Vec, first_row_index_array: Vec, + unencoded_byte_array_data_bytes_array: Option>, current_first_row_index: i64, } @@ -1036,6 +1089,7 @@ impl OffsetIndexBuilder { offset_array: Vec::new(), compressed_page_size_array: Vec::new(), first_row_index_array: Vec::new(), + unencoded_byte_array_data_bytes_array: None, current_first_row_index: 0, } } @@ -1051,6 +1105,17 @@ impl OffsetIndexBuilder { self.compressed_page_size_array.push(compressed_page_size); } + pub fn append_unencoded_byte_array_data_bytes( + &mut self, + unencoded_byte_array_data_bytes: Option, + ) { + if let Some(val) = unencoded_byte_array_data_bytes { + self.unencoded_byte_array_data_bytes_array + .get_or_insert(Vec::new()) + .push(val); + } + } + /// Build and get the thrift metadata of offset index pub fn build_to_thrift(self) -> OffsetIndex { let locations = self @@ -1060,7 +1125,7 @@ impl OffsetIndexBuilder { .zip(self.first_row_index_array.iter()) .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index)) .collect::>(); - OffsetIndex::new(locations, None) + OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array) } } @@ -1211,6 +1276,7 @@ mod tests { .set_offset_index_length(Some(25)) .set_column_index_offset(Some(8000)) .set_column_index_length(Some(25)) + .set_unencoded_byte_array_data_bytes(Some(2000)) .build() .unwrap(); @@ -1298,7 +1364,7 @@ mod tests { column_orders, ); let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta.clone()); - let base_expected_size = 1320; + let base_expected_size = 1376; assert_eq!(parquet_meta.memory_size(), base_expected_size); let mut column_index = ColumnIndexBuilder::new(); @@ -1315,9 +1381,10 @@ mod tests { vec![PageLocation::new(1, 2, 3)], vec![PageLocation::new(1, 2, 3)], ]]), + Some(vec![vec![Some(vec![10, 20, 30])]]), ); - let bigger_expected_size = 2304; + let bigger_expected_size = 2464; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 2ddf826fb022..7959cb95c052 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -22,6 +22,7 @@ use crate::data_type::Int96; use crate::errors::ParquetError; use crate::file::metadata::ColumnChunkMetaData; use crate::file::page_index::index::{Index, NativeIndex}; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; use crate::format::{ColumnIndex, OffsetIndex, PageLocation}; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; @@ -45,9 +46,9 @@ pub(crate) fn acc_range(a: Option>, b: Option>) -> Opt /// Returns an empty vector if this row group does not contain a /// [`ColumnIndex`]. /// -/// See [Column Index Documentation] for more details. +/// See [Page Index Documentation] for more details. /// -/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +/// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub fn read_columns_indexes( reader: &R, chunks: &[ColumnChunkMetaData], @@ -81,9 +82,9 @@ pub fn read_columns_indexes( /// Return an empty vector if this row group does not contain an /// [`OffsetIndex]`. 
/// -/// See [Column Index Documentation] for more details. +/// See [Page Index Documentation] for more details. /// -/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +/// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub fn read_pages_locations( reader: &R, chunks: &[ColumnChunkMetaData], @@ -100,6 +101,42 @@ pub fn read_pages_locations( let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?; let get = |r: Range| &bytes[(r.start - fetch.start)..(r.end - fetch.start)]; + chunks + .iter() + .map(|c| match c.offset_index_range() { + Some(r) => decode_page_locations(get(r)), + None => Err(general_err!("missing offset index")), + }) + .collect() +} + +/// Reads per-column [`OffsetIndexMetaData`] for all columns of a row group by +/// decoding [`OffsetIndex`] . +/// +/// Returns a vector of `offset_index[column_number]`. +/// +/// Returns an empty vector if this row group does not contain an +/// [`OffsetIndex`]. +/// +/// See [Page Index Documentation] for more details. +/// +/// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +pub fn read_offset_indexes( + reader: &R, + chunks: &[ColumnChunkMetaData], +) -> Result, ParquetError> { + let fetch = chunks + .iter() + .fold(None, |range, c| acc_range(range, c.offset_index_range())); + + let fetch = match fetch { + Some(r) => r, + None => return Ok(vec![]), + }; + + let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?; + let get = |r: Range| &bytes[(r.start - fetch.start)..(r.end - fetch.start)]; + chunks .iter() .map(|c| match c.offset_index_range() { @@ -109,7 +146,13 @@ pub fn read_pages_locations( .collect() } -pub(crate) fn decode_offset_index(data: &[u8]) -> Result, ParquetError> { +pub(crate) fn decode_offset_index(data: &[u8]) -> Result { + let mut prot = TCompactSliceInputProtocol::new(data); + let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; + OffsetIndexMetaData::try_new(offset) +} + +pub(crate) fn decode_page_locations(data: &[u8]) -> Result, ParquetError> { let mut prot = TCompactSliceInputProtocol::new(data); let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; Ok(offset.page_locations) diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs index 9372645d76ee..a8077896db34 100644 --- a/parquet/src/file/page_index/mod.rs +++ b/parquet/src/file/page_index/mod.rs @@ -21,3 +21,4 @@ pub mod index; pub mod index_reader; +pub mod offset_index; diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs new file mode 100644 index 000000000000..9138620e3fcd --- /dev/null +++ b/parquet/src/file/page_index/offset_index.rs @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`OffsetIndexMetaData`] structure holding decoded [`OffsetIndex`] information + +use crate::errors::ParquetError; +use crate::format::{OffsetIndex, PageLocation}; + +/// [`OffsetIndex`] information for a column chunk. Contains offsets and sizes for each page +/// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY columns. +#[derive(Debug, Clone, PartialEq)] +pub struct OffsetIndexMetaData { + pub page_locations: Vec, + pub unencoded_byte_array_data_bytes: Option>, +} + +impl OffsetIndexMetaData { + /// Creates a new [`OffsetIndexMetaData`] from an [`OffsetIndex`]. + pub(crate) fn try_new(index: OffsetIndex) -> Result { + Ok(Self { + page_locations: index.page_locations, + unencoded_byte_array_data_bytes: index.unencoded_byte_array_data_bytes, + }) + } + + /// Vector of [`PageLocation`] objects, one per page in the chunk. + pub fn page_locations(&self) -> &Vec { + &self.page_locations + } + + /// Optional vector of unencoded page sizes, one per page in the chunk. Only defined + /// for BYTE_ARRAY columns. + pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec> { + self.unencoded_byte_array_data_bytes.as_ref() + } +} diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index ac7d2d287488..65b6ebf2ec98 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -211,12 +211,22 @@ impl SerializedFileReader { if options.enable_page_index { let mut columns_indexes = vec![]; let mut offset_indexes = vec![]; + let mut unenc_byte_sizes = vec![]; for rg in &mut filtered_row_groups { let column_index = index_reader::read_columns_indexes(&chunk_reader, rg.columns())?; - let offset_index = index_reader::read_pages_locations(&chunk_reader, rg.columns())?; + let offset_index = index_reader::read_offset_indexes(&chunk_reader, rg.columns())?; + + // split offset_index into two vectors to not break API + let mut page_locations = vec![]; + let mut unenc_bytes = vec![]; + offset_index.into_iter().for_each(|index| { + page_locations.push(index.page_locations); + unenc_bytes.push(index.unencoded_byte_array_data_bytes); + }); columns_indexes.push(column_index); - offset_indexes.push(offset_index); + offset_indexes.push(page_locations); + unenc_byte_sizes.push(unenc_bytes); } Ok(Self { @@ -226,6 +236,7 @@ impl SerializedFileReader { filtered_row_groups, Some(columns_indexes), Some(offset_indexes), + Some(unenc_byte_sizes), )), props: Arc::new(options.props), }) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index eb633f31c477..b109a2da8eb0 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -659,7 +659,8 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { .set_total_uncompressed_size(metadata.uncompressed_size()) .set_num_values(metadata.num_values()) .set_data_page_offset(map_offset(src_data_offset)) - .set_dictionary_page_offset(src_dictionary_offset.map(map_offset)); + .set_dictionary_page_offset(src_dictionary_offset.map(map_offset)) + .set_unencoded_byte_array_data_bytes(metadata.unencoded_byte_array_data_bytes()); if let Some(statistics) = metadata.statistics() { builder = builder.set_statistics(statistics.clone()) @@ -827,7 +828,7 @@ mod tests { use crate::column::page::{Page, PageReader}; use crate::column::reader::get_typed_column_reader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; - use 
crate::data_type::{BoolType, Int32Type}; + use crate::data_type::{BoolType, ByteArrayType, Int32Type}; use crate::file::page_index::index::Index; use crate::file::properties::EnabledStatistics; use crate::file::serialized_reader::ReadOptionsBuilder; @@ -840,6 +841,7 @@ mod tests { use crate::record::{Row, RowAccessor}; use crate::schema::parser::parse_message_type; use crate::schema::types::{ColumnDescriptor, ColumnPath}; + use crate::util::test_common::rand_gen::RandGen; #[test] fn test_row_group_writer_error_not_all_columns_written() { @@ -1851,4 +1853,83 @@ mod tests { let b_idx = &column_index[0][1]; assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); } + + #[test] + fn test_byte_array_size_statistics() { + let message_type = " + message test_schema { + OPTIONAL BYTE_ARRAY a (UTF8); + } + "; + let schema = Arc::new(parse_message_type(message_type).unwrap()); + let data = ByteArrayType::gen_vec(32, 7); + let def_levels = [1, 1, 1, 1, 0, 1, 0, 1, 0, 1]; + let unenc_size: i64 = data.iter().map(|x| x.len() as i64).sum(); + let file: File = tempfile::tempfile().unwrap(); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .build(), + ); + + let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&data, Some(&def_levels), None) + .unwrap(); + col_writer.close().unwrap(); + row_group_writer.close().unwrap(); + let file_metadata = writer.close().unwrap(); + + assert_eq!(file_metadata.row_groups.len(), 1); + assert_eq!(file_metadata.row_groups[0].columns.len(), 1); + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let meta_data = file_metadata.row_groups[0].columns[0] + .meta_data + .as_ref() + .unwrap(); + assert!(meta_data.size_statistics.is_some()); + let size_stats = meta_data.size_statistics.as_ref().unwrap(); + + assert!(size_stats.repetition_level_histogram.is_none()); + assert!(size_stats.definition_level_histogram.is_none()); + assert!(size_stats.unencoded_byte_array_data_bytes.is_some()); + assert_eq!( + unenc_size, + size_stats.unencoded_byte_array_data_bytes.unwrap() + ); + + // check that the read metadata is also correct + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + + let rfile_metadata = reader.metadata().file_metadata(); + assert_eq!(rfile_metadata.num_rows(), file_metadata.num_rows); + assert_eq!(reader.num_row_groups(), 1); + let rowgroup = reader.get_row_group(0).unwrap(); + assert_eq!(rowgroup.num_columns(), 1); + let column = rowgroup.metadata().column(0); + assert!(column.unencoded_byte_array_data_bytes().is_some()); + assert_eq!( + unenc_size, + column.unencoded_byte_array_data_bytes().unwrap() + ); + + assert!(reader + .metadata() + .unencoded_byte_array_data_bytes() + .is_some()); + let unenc_sizes = reader.metadata().unencoded_byte_array_data_bytes().unwrap(); + assert_eq!(unenc_sizes.len(), 1); + assert_eq!(unenc_sizes[0].len(), 1); + assert!(unenc_sizes[0][0].is_some()); + let page_sizes = unenc_sizes[0][0].as_ref().unwrap(); + assert_eq!(page_sizes.len(), 1); + assert_eq!(page_sizes[0], unenc_size); + } } From 41b8f66a0c9ee3037106fff37c283c6cb228c75f Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 19 Jul 2024 13:45:56 
-0700 Subject: [PATCH 11/21] deprecate read_page_locations --- parquet/src/arrow/arrow_reader/mod.rs | 4 +- parquet/src/arrow/arrow_reader/statistics.rs | 14 +++-- parquet/src/arrow/arrow_writer/mod.rs | 14 ++--- parquet/src/arrow/async_reader/metadata.rs | 8 +-- parquet/src/arrow/async_reader/mod.rs | 24 +++++---- parquet/src/bin/parquet-index.rs | 12 +++-- parquet/src/file/metadata/memory.rs | 7 +++ parquet/src/file/metadata/mod.rs | 54 ++++++++----------- parquet/src/file/page_index/index_reader.rs | 1 + parquet/src/file/serialized_reader.rs | 56 ++++++++------------ parquet/src/file/writer.rs | 18 +++---- parquet/tests/arrow_writer_layout.rs | 10 ++-- 12 files changed, 109 insertions(+), 113 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index e9edf7cb751d..22a4e5a90aa3 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -394,7 +394,7 @@ impl ArrowReaderMetadata { let offset_index = metadata .row_groups() .iter() - .map(|rg| index_reader::read_pages_locations(reader, rg.columns())) + .map(|rg| index_reader::read_offset_indexes(reader, rg.columns())) .collect::>>()?; metadata.set_offset_index(Some(offset_index)) @@ -689,7 +689,7 @@ impl Iterator for ReaderPageIterator { // To avoid `i[rg_idx][self.oolumn_idx`] panic, we need to filter out empty `i[rg_idx]`. let page_locations = offset_index .filter(|i| !i[rg_idx].is_empty()) - .map(|i| i[rg_idx][self.column_idx].clone()); + .map(|i| i[rg_idx][self.column_idx].page_locations.clone()); let total_rows = rg.num_rows() as usize; let reader = self.reader.clone(); diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index d536792b827b..6e2a6fafca25 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -1349,7 +1349,9 @@ impl<'a> StatisticsConverter<'a> { let iter = row_group_indices.into_iter().map(|rg_index| { let column_page_index_per_row_group_per_column = &column_page_index[*rg_index][parquet_index]; - let num_data_pages = &column_offset_index[*rg_index][parquet_index].len(); + let num_data_pages = &column_offset_index[*rg_index][parquet_index] + .page_locations() + .len(); (*num_data_pages, column_page_index_per_row_group_per_column) }); @@ -1378,7 +1380,9 @@ impl<'a> StatisticsConverter<'a> { let iter = row_group_indices.into_iter().map(|rg_index| { let column_page_index_per_row_group_per_column = &column_page_index[*rg_index][parquet_index]; - let num_data_pages = &column_offset_index[*rg_index][parquet_index].len(); + let num_data_pages = &column_offset_index[*rg_index][parquet_index] + .page_locations() + .len(); (*num_data_pages, column_page_index_per_row_group_per_column) }); @@ -1408,7 +1412,9 @@ impl<'a> StatisticsConverter<'a> { let iter = row_group_indices.into_iter().map(|rg_index| { let column_page_index_per_row_group_per_column = &column_page_index[*rg_index][parquet_index]; - let num_data_pages = &column_offset_index[*rg_index][parquet_index].len(); + let num_data_pages = &column_offset_index[*rg_index][parquet_index] + .page_locations() + .len(); (*num_data_pages, column_page_index_per_row_group_per_column) }); @@ -1450,7 +1456,7 @@ impl<'a> StatisticsConverter<'a> { let mut row_count_total = Vec::new(); for rg_idx in row_group_indices { - let page_locations = &column_offset_index[*rg_idx][parquet_index]; + let page_locations = &column_offset_index[*rg_idx][parquet_index].page_locations(); let row_count_per_page = 
page_locations .windows(2) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 070d740094f5..8f7b514ccf71 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1096,7 +1096,7 @@ mod tests { use crate::data_type::AsBytes; use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index::Index; - use crate::file::page_index::index_reader::read_pages_locations; + use crate::file::page_index::index_reader::read_offset_indexes; use crate::file::properties::{ BloomFilterPosition, EnabledStatistics, ReaderProperties, WriterVersion, }; @@ -1669,16 +1669,16 @@ mod tests { "Expected a dictionary page" ); - let page_locations = read_pages_locations(&file, column).unwrap(); + let offset_indexes = read_offset_indexes(&file, column).unwrap(); - let offset_index = page_locations[0].clone(); + let page_locations = offset_indexes[0].page_locations.clone(); // We should fallback to PLAIN encoding after the first row and our max page size is 1 bytes // so we expect one dictionary encoded page and then a page per row thereafter. assert_eq!( - offset_index.len(), + page_locations.len(), 10, - "Expected 9 pages but got {offset_index:#?}" + "Expected 9 pages but got {page_locations:#?}" ); } @@ -3020,8 +3020,8 @@ mod tests { assert_eq!(index.len(), 1); assert_eq!(index[0].len(), 2); // 2 columns - assert_eq!(index[0][0].len(), 1); // 1 page - assert_eq!(index[0][1].len(), 1); // 1 page + assert_eq!(index[0][0].page_locations().len(), 1); // 1 page + assert_eq!(index[0][1].page_locations().len(), 1); // 1 page } #[test] diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 4a3489a2084d..9224ea3f68a8 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -20,9 +20,7 @@ use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index::Index; -use crate::file::page_index::index_reader::{ - acc_range, decode_column_index, decode_page_locations, -}; +use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; use bytes::Bytes; use futures::future::BoxFuture; use futures::FutureExt; @@ -179,9 +177,7 @@ impl MetadataLoader { x.columns() .iter() .map(|c| match c.offset_index_range() { - Some(r) => { - decode_page_locations(&data[r.start - offset..r.end - offset]) - } + Some(r) => decode_offset_index(&data[r.start - offset..r.end - offset]), None => Err(general_err!("missing offset index")), }) .collect::>>() diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 5a790fa6aff0..5695dbc10fe1 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -106,9 +106,10 @@ use crate::column::page::{PageIterator, PageReader}; use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::{ParquetMetaData, RowGroupMetaData}; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::{ChunkReader, Length, SerializedPageReader}; use crate::file::FOOTER_SIZE; -use crate::format::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash, PageLocation}; +use crate::format::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash}; mod metadata; pub use metadata::*; @@ -489,7 +490,7 @@ where // 
TODO: calling build_array multiple times is wasteful let meta = self.metadata.row_group(row_group_idx); - let page_locations = self + let offset_index = self .metadata .offset_index() .map(|x| x[row_group_idx].as_slice()); @@ -499,7 +500,7 @@ where // schema: meta.schema_descr_ptr(), row_count: meta.num_rows() as usize, column_chunks: vec![None; meta.columns().len()], - page_locations, + offset_index, }; if let Some(filter) = self.filter.as_mut() { @@ -703,7 +704,7 @@ where /// An in-memory collection of column chunks struct InMemoryRowGroup<'a> { metadata: &'a RowGroupMetaData, - page_locations: Option<&'a [Vec]>, + offset_index: Option<&'a [OffsetIndexMetaData]>, column_chunks: Vec>>, row_count: usize, } @@ -716,7 +717,7 @@ impl<'a> InMemoryRowGroup<'a> { projection: &ProjectionMask, selection: Option<&RowSelection>, ) -> Result<()> { - if let Some((selection, page_locations)) = selection.zip(self.page_locations) { + if let Some((selection, offset_index)) = selection.zip(self.offset_index) { // If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the // `RowSelection` let mut page_start_offsets: Vec> = vec![]; @@ -734,14 +735,14 @@ impl<'a> InMemoryRowGroup<'a> { // then we need to also fetch a dictionary page. let mut ranges = vec![]; let (start, _len) = chunk_meta.byte_range(); - match page_locations[idx].first() { + match offset_index[idx].page_locations.first() { Some(first) if first.offset as u64 != start => { ranges.push(start as usize..first.offset as usize); } _ => (), } - ranges.extend(selection.scan_ranges(&page_locations[idx])); + ranges.extend(selection.scan_ranges(&offset_index[idx].page_locations)); page_start_offsets.push(ranges.iter().map(|range| range.start).collect()); ranges @@ -812,7 +813,9 @@ impl<'a> RowGroups for InMemoryRowGroup<'a> { "Invalid column index {i}, column was not fetched" ))), Some(data) => { - let page_locations = self.page_locations.map(|index| index[i].clone()); + let page_locations = self + .offset_index + .map(|index| index[i].page_locations.clone()); let page_reader: Box = Box::new(SerializedPageReader::new( data.clone(), self.metadata.column(i), @@ -1529,7 +1532,7 @@ mod tests { let metadata = parse_metadata(&data).unwrap(); let offset_index = - index_reader::read_pages_locations(&data, metadata.row_group(0).columns()) + index_reader::read_offset_indexes(&data, metadata.row_group(0).columns()) .expect("reading offset index"); let row_group_meta = metadata.row_group(0).clone(); @@ -1538,7 +1541,6 @@ mod tests { vec![row_group_meta], None, Some(vec![offset_index.clone()]), - None, ); let metadata = Arc::new(metadata); @@ -1575,7 +1577,7 @@ mod tests { }; let mut skip = true; - let mut pages = offset_index[0].iter().peekable(); + let mut pages = offset_index[0].page_locations.iter().peekable(); // Setup `RowSelection` so that we can skip every other page, selecting the last page let mut selectors = vec![]; diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index 86e08b6dafa3..1a9b74dd78fb 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -37,6 +37,7 @@ use clap::Parser; use parquet::errors::{ParquetError, Result}; use parquet::file::page_index::index::{Index, PageIndex}; +use parquet::file::page_index::offset_index::OffsetIndexMetaData; use parquet::file::reader::{FileReader, SerializedFileReader}; use parquet::file::serialized_reader::ReadOptionsBuilder; use parquet::format::PageLocation; @@ -93,7 +94,8 @@ impl Args { )) })?; - let row_counts = 
compute_row_counts(offset_index, row_group.num_rows()); + let row_counts = + compute_row_counts(offset_index.page_locations.as_slice(), row_group.num_rows()); match &column_indices[column_idx] { Index::NONE => println!("NO INDEX"), Index::BOOLEAN(v) => print_index(&v.indexes, offset_index, &row_counts)?, @@ -131,20 +133,20 @@ fn compute_row_counts(offset_index: &[PageLocation], rows: i64) -> Vec { /// Prints index information for a single column chunk fn print_index( column_index: &[PageIndex], - offset_index: &[PageLocation], + offset_index: &OffsetIndexMetaData, row_counts: &[i64], ) -> Result<()> { - if column_index.len() != offset_index.len() { + if column_index.len() != offset_index.page_locations.len() { return Err(ParquetError::General(format!( "Index length mismatch, got {} and {}", column_index.len(), - offset_index.len() + offset_index.page_locations.len() ))); } for (idx, ((c, o), row_count)) in column_index .iter() - .zip(offset_index) + .zip(offset_index.page_locations()) .zip(row_counts) .enumerate() { diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index a1b40a8de95a..a7d3d4ab8f93 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -23,6 +23,7 @@ use crate::data_type::private::ParquetValueType; use crate::file::metadata::{ColumnChunkMetaData, FileMetaData, KeyValue, RowGroupMetaData}; use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::page_index::index::{Index, NativeIndex, PageIndex}; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::statistics::{Statistics, ValueStatistics}; use crate::format::{BoundaryOrder, PageLocation, SortingColumn}; use std::sync::Arc; @@ -144,6 +145,12 @@ impl HeapSize for Statistics { } } +impl HeapSize for OffsetIndexMetaData { + fn heap_size(&self) -> usize { + self.page_locations.heap_size() + self.unencoded_byte_array_data_bytes.heap_size() + } +} + impl HeapSize for Index { fn heap_size(&self) -> usize { match self { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index d86b1ce572fb..9a3ca605857f 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -44,6 +44,7 @@ use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::page_encoding_stats::{self, PageEncodingStats}; use crate::file::page_index::index::Index; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::statistics::{self, Statistics}; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, @@ -56,20 +57,19 @@ use crate::schema::types::{ /// [`Index`] corresponding to column `column_number` of row group /// `row_group_number`. /// -/// For example `column_index[2][3]` holds the [`Index`] for the forth +/// For example `column_index[2][3]` holds the [`Index`] for the fourth /// column in the third row group of the parquet file. pub type ParquetColumnIndex = Vec>; -/// [`PageLocation`] for each data page of each row group of each column. +/// [`OffsetIndexMetaData`] for each row group of each column. /// -/// `offset_index[row_group_number][column_number][page_number]` holds -/// the [`PageLocation`] corresponding to page `page_number` of column +/// `offset_index[row_group_number][column_number]` holds +/// the [`OffsetIndexMetaData`] corresponding to column /// `column_number`of row group `row_group_number`. 
/// -/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for -/// the fifth page of the forth column in the third row group of the -/// parquet file. -pub type ParquetOffsetIndex = Vec>>; +/// For example `offset_index[2][3]` holds the [`OffsetIndexMetaData`] for +/// the fourth column in the third row group of the parquet file. +pub type ParquetOffsetIndex = Vec>; /// Parsed metadata for a single Parquet file /// @@ -94,10 +94,8 @@ pub struct ParquetMetaData { row_groups: Vec, /// Page level index for each page in each column chunk column_index: Option, - /// Offset index for all each page in each column chunk + /// Offset index for each page in each column chunk offset_index: Option, - /// `unencoded_byte_array_data_bytes` from the offset index - unencoded_byte_array_data_bytes: Option>>>>, } impl ParquetMetaData { @@ -109,7 +107,6 @@ impl ParquetMetaData { row_groups, column_index: None, offset_index: None, - unencoded_byte_array_data_bytes: None, } } @@ -120,14 +117,12 @@ impl ParquetMetaData { row_groups: Vec, column_index: Option, offset_index: Option, - unencoded_byte_array_data_bytes: Option>>>>, ) -> Self { ParquetMetaData { file_metadata, row_groups, column_index, offset_index, - unencoded_byte_array_data_bytes, } } @@ -184,19 +179,6 @@ impl ParquetMetaData { self.offset_index.as_ref() } - /// Returns `unencoded_byte_array_data_bytes` from the offset indexes in this file, if loaded - /// - /// This value represents the output size of the total bytes in this file, which can be useful for - /// allocating an appropriately sized output buffer. - /// - /// Returns `None` if the parquet file does not have a `OffsetIndex` or - /// [ArrowReaderOptions::with_page_index] was set to false. - /// - /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index - pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec>>>> { - self.unencoded_byte_array_data_bytes.as_ref() - } - /// Estimate of the bytes allocated to store `ParquetMetadata` /// /// # Notes: @@ -217,7 +199,6 @@ impl ParquetMetaData { + self.row_groups.heap_size() + self.column_index.heap_size() + self.offset_index.heap_size() - + self.unencoded_byte_array_data_bytes.heap_size() } /// Override the column index @@ -1364,7 +1345,7 @@ mod tests { column_orders, ); let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta.clone()); - let base_expected_size = 1376; + let base_expected_size = 1352; assert_eq!(parquet_meta.memory_size(), base_expected_size); let mut column_index = ColumnIndexBuilder::new(); @@ -1373,18 +1354,25 @@ mod tests { let native_index = NativeIndex::::try_new(column_index).unwrap(); // Now, add in OffsetIndex + let mut offset_index = OffsetIndexBuilder::new(); + offset_index.append_row_count(1); + offset_index.append_offset_and_size(2, 3); + offset_index.append_unencoded_byte_array_data_bytes(Some(10)); + offset_index.append_row_count(1); + offset_index.append_offset_and_size(2, 3); + offset_index.append_unencoded_byte_array_data_bytes(Some(10)); + let offset_index = offset_index.build_to_thrift(); + let parquet_meta = ParquetMetaData::new_with_page_index( file_metadata, row_group_meta, Some(vec![vec![Index::BOOLEAN(native_index)]]), Some(vec![vec![ - vec![PageLocation::new(1, 2, 3)], - vec![PageLocation::new(1, 2, 3)], + OffsetIndexMetaData::try_new(offset_index).unwrap() ]]), - Some(vec![vec![Some(vec![10, 20, 30])]]), ); - let bigger_expected_size = 2464; + let 
bigger_expected_size = 2400; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 7959cb95c052..395e9afe122c 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -85,6 +85,7 @@ pub fn read_columns_indexes( /// See [Page Index Documentation] for more details. /// /// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +#[deprecated(since = "53.0.0", note = "Use read_offset_indexes")] pub fn read_pages_locations( reader: &R, chunks: &[ColumnChunkMetaData], diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 65b6ebf2ec98..70aea6fd5ad3 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -28,6 +28,7 @@ use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; use crate::errors::{ParquetError, Result}; use crate::file::page_index::index_reader; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::{ footer, metadata::*, @@ -211,22 +212,12 @@ impl SerializedFileReader { if options.enable_page_index { let mut columns_indexes = vec![]; let mut offset_indexes = vec![]; - let mut unenc_byte_sizes = vec![]; for rg in &mut filtered_row_groups { let column_index = index_reader::read_columns_indexes(&chunk_reader, rg.columns())?; let offset_index = index_reader::read_offset_indexes(&chunk_reader, rg.columns())?; - - // split offset_index into two vectors to not break API - let mut page_locations = vec![]; - let mut unenc_bytes = vec![]; - offset_index.into_iter().for_each(|index| { - page_locations.push(index.page_locations); - unenc_bytes.push(index.unencoded_byte_array_data_bytes); - }); columns_indexes.push(column_index); - offset_indexes.push(page_locations); - unenc_byte_sizes.push(unenc_bytes); + offset_indexes.push(offset_index); } Ok(Self { @@ -236,7 +227,6 @@ impl SerializedFileReader { filtered_row_groups, Some(columns_indexes), Some(offset_indexes), - Some(unenc_byte_sizes), )), props: Arc::new(options.props), }) @@ -296,7 +286,7 @@ impl FileReader for SerializedFileReader { pub struct SerializedRowGroupReader<'a, R: ChunkReader> { chunk_reader: Arc, metadata: &'a RowGroupMetaData, - page_locations: Option<&'a [Vec]>, + offset_index: Option<&'a [OffsetIndexMetaData]>, props: ReaderPropertiesPtr, bloom_filters: Vec>, } @@ -306,7 +296,7 @@ impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { pub fn new( chunk_reader: Arc, metadata: &'a RowGroupMetaData, - page_locations: Option<&'a [Vec]>, + offset_index: Option<&'a [OffsetIndexMetaData]>, props: ReaderPropertiesPtr, ) -> Result { let bloom_filters = if props.read_bloom_filter() { @@ -321,7 +311,7 @@ impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { Ok(Self { chunk_reader, metadata, - page_locations, + offset_index, props, bloom_filters, }) @@ -341,7 +331,7 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<' fn get_column_page_reader(&self, i: usize) -> Result> { let col = self.metadata.column(i); - let page_locations = self.page_locations.map(|x| x[i].clone()); + let page_locations = self.offset_index.map(|x| x[i].page_locations.clone()); let props = Arc::clone(&self.props); Ok(Box::new(SerializedPageReader::new_with_properties( 
@@ -787,7 +777,7 @@ mod tests { use crate::data_type::private::ParquetValueType; use crate::data_type::{AsBytes, FixedLenByteArrayType}; use crate::file::page_index::index::{Index, NativeIndex}; - use crate::file::page_index::index_reader::{read_columns_indexes, read_pages_locations}; + use crate::file::page_index::index_reader::{read_columns_indexes, read_offset_indexes}; use crate::file::writer::SerializedFileWriter; use crate::record::RowAccessor; use crate::schema::parser::parse_message_type; @@ -1325,7 +1315,7 @@ mod tests { // only one row group assert_eq!(offset_indexes.len(), 1); let offset_index = &offset_indexes[0]; - let page_offset = &offset_index[0][0]; + let page_offset = &offset_index[0].page_locations()[0]; assert_eq!(4, page_offset.offset); assert_eq!(152, page_offset.compressed_page_size); @@ -1348,8 +1338,8 @@ mod tests { b.reverse(); assert_eq!(a, b); - let a = read_pages_locations(&test_file, columns).unwrap(); - let mut b = read_pages_locations(&test_file, &reversed).unwrap(); + let a = read_offset_indexes(&test_file, columns).unwrap(); + let mut b = read_offset_indexes(&test_file, &reversed).unwrap(); b.reverse(); assert_eq!(a, b); } @@ -1386,7 +1376,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 0), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[0].len(), 325); + assert_eq!(row_group_offset_indexes[0].page_locations.len(), 325); } else { unreachable!() }; @@ -1394,7 +1384,7 @@ mod tests { assert!(&column_index[0][1].is_sorted()); if let Index::BOOLEAN(index) = &column_index[0][1] { assert_eq!(index.indexes.len(), 82); - assert_eq!(row_group_offset_indexes[1].len(), 82); + assert_eq!(row_group_offset_indexes[1].page_locations.len(), 82); } else { unreachable!() }; @@ -1407,7 +1397,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 2), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[2].len(), 325); + assert_eq!(row_group_offset_indexes[2].page_locations.len(), 325); } else { unreachable!() }; @@ -1420,7 +1410,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 3), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[3].len(), 325); + assert_eq!(row_group_offset_indexes[3].page_locations.len(), 325); } else { unreachable!() }; @@ -1433,7 +1423,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 4), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[4].len(), 325); + assert_eq!(row_group_offset_indexes[4].page_locations.len(), 325); } else { unreachable!() }; @@ -1446,7 +1436,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 5), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[5].len(), 528); + assert_eq!(row_group_offset_indexes[5].page_locations.len(), 528); } else { unreachable!() }; @@ -1459,7 +1449,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 6), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[6].len(), 325); + assert_eq!(row_group_offset_indexes[6].page_locations.len(), 325); } else { unreachable!() }; @@ -1472,7 +1462,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 7), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[7].len(), 528); + assert_eq!(row_group_offset_indexes[7].page_locations.len(), 528); } else { unreachable!() }; @@ -1485,7 +1475,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 8), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[8].len(), 974); + 
assert_eq!(row_group_offset_indexes[8].page_locations.len(), 974); } else { unreachable!() }; @@ -1498,7 +1488,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 9), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[9].len(), 352); + assert_eq!(row_group_offset_indexes[9].page_locations.len(), 352); } else { unreachable!() }; @@ -1506,7 +1496,7 @@ mod tests { //Notice: min_max values for each page for this col not exits. assert!(!&column_index[0][10].is_sorted()); if let Index::NONE = &column_index[0][10] { - assert_eq!(row_group_offset_indexes[10].len(), 974); + assert_eq!(row_group_offset_indexes[10].page_locations.len(), 974); } else { unreachable!() }; @@ -1519,7 +1509,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 11), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[11].len(), 325); + assert_eq!(row_group_offset_indexes[11].page_locations.len(), 325); } else { unreachable!() }; @@ -1532,7 +1522,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 12), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[12].len(), 325); + assert_eq!(row_group_offset_indexes[12].page_locations.len(), 325); } else { unreachable!() }; diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index b109a2da8eb0..f7dde59f4f42 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1920,15 +1920,15 @@ mod tests { column.unencoded_byte_array_data_bytes().unwrap() ); - assert!(reader - .metadata() - .unencoded_byte_array_data_bytes() - .is_some()); - let unenc_sizes = reader.metadata().unencoded_byte_array_data_bytes().unwrap(); - assert_eq!(unenc_sizes.len(), 1); - assert_eq!(unenc_sizes[0].len(), 1); - assert!(unenc_sizes[0][0].is_some()); - let page_sizes = unenc_sizes[0][0].as_ref().unwrap(); + assert!(reader.metadata().offset_index().is_some()); + let offset_index = reader.metadata().offset_index().unwrap(); + assert_eq!(offset_index.len(), 1); + assert_eq!(offset_index[0].len(), 1); + assert!(offset_index[0][0].unencoded_byte_array_data_bytes.is_some()); + let page_sizes = offset_index[0][0] + .unencoded_byte_array_data_bytes + .as_ref() + .unwrap(); assert_eq!(page_sizes.len(), 1); assert_eq!(page_sizes[0], unenc_size); } diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index cd124031cfdc..3e0f6ce3a8b3 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -89,12 +89,15 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { for (column_index, column_layout) in offset_index.iter().zip(&row_group_layout.columns) { assert_eq!( - column_index.len(), + column_index.page_locations.len(), column_layout.pages.len(), "index page count mismatch" ); - for (idx, (page, page_layout)) in - column_index.iter().zip(&column_layout.pages).enumerate() + for (idx, (page, page_layout)) in column_index + .page_locations + .iter() + .zip(&column_layout.pages) + .enumerate() { assert_eq!( page.compressed_page_size as usize, @@ -102,6 +105,7 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { "index page {idx} size mismatch" ); let next_first_row_index = column_index + .page_locations .get(idx + 1) .map(|x| x.first_row_index) .unwrap_or_else(|| row_group.num_rows()); From c4c0a4d4a40efcf77b65d8ba29b6fb57d249f1e3 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 19 Jul 2024 14:22:36 -0700 Subject: [PATCH 12/21] add level histograms to metadata --- 
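Note: the level histograms added below always have length max_level + 1, where hist[i]
counts how many values were observed at level i, and they are only tracked when
max_level > 0. A minimal self-contained sketch of that bookkeeping follows; the helper
name and main() are illustrative only (the patch's own helpers are new_histogram and
update_*_level_histogram), not the parquet-rs API:

/// Build a level histogram for one page: `hist[i]` counts values at level `i`.
/// Returns `None` when `max_level == 0`, mirroring this patch's `new_histogram`.
fn page_level_histogram(levels: &[i16], max_level: i16) -> Option<Vec<i64>> {
    if max_level == 0 {
        return None; // nothing to record for required, non-repeated columns
    }
    let mut hist = vec![0i64; max_level as usize + 1];
    for &level in levels {
        hist[level as usize] += 1;
    }
    Some(hist)
}

fn main() {
    // Definition levels for an OPTIONAL column (max_def_level = 1):
    // three nulls (level 0) and seven non-null values (level 1).
    let def_levels = [1i16, 1, 1, 1, 0, 1, 0, 1, 0, 1];
    let hist = page_level_histogram(&def_levels, 1).unwrap();
    assert_eq!(hist, vec![3, 7]); // same [3, 7] shape the new writer tests check
}

For a chunk with several pages, the per-page histograms are summed element-wise into
the chunk-level histogram before being written to the column chunk metadata.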
parquet/src/column/writer/mod.rs | 156 +++++++++++++++++++++++++-- parquet/src/file/metadata/memory.rs | 2 + parquet/src/file/metadata/mod.rs | 91 ++++++++++++++-- parquet/src/file/page_index/index.rs | 87 ++++++++++++--- parquet/src/file/writer.rs | 144 ++++++++++++++++++++++++- 5 files changed, 445 insertions(+), 35 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 0e227a157d06..054e087c72c8 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -183,12 +183,91 @@ pub struct ColumnCloseResult { pub offset_index: Option, } +/// Creates a vector to hold level histogram data. Length will be `max_level + 1`. +/// Because histograms are not necessary when `max_level == 0`, this will return +/// `None` in that case. +fn new_histogram(max_level: i16) -> Option> { + if max_level > 0 { + Some(vec![0; max_level as usize + 1]) + } else { + None + } +} + +/// Sum `page_histogram` into `chunk_histogram` +fn update_histogram(chunk_histogram: &mut Option>, page_histogram: &Option>) { + if page_histogram.is_some() && chunk_histogram.is_some() { + let chunk_hist = chunk_histogram.as_mut().unwrap(); + let page_hist = page_histogram.as_ref().unwrap(); + for i in 0..page_hist.len() { + chunk_hist[i] += page_hist[i] + } + } +} + // Metrics per page #[derive(Default)] struct PageMetrics { num_buffered_values: u32, num_buffered_rows: u32, num_page_nulls: u64, + repetition_level_histogram: Option>, + definition_level_histogram: Option>, +} + +impl PageMetrics { + fn new() -> Self { + Default::default() + } + + /// Initialize the repetition level histogram + fn with_repetition_level_histogram(mut self, max_level: i16) -> Self { + self.repetition_level_histogram = new_histogram(max_level); + self + } + + /// Initialize the definition level histogram + fn with_definition_level_histogram(mut self, max_level: i16) -> Self { + self.definition_level_histogram = new_histogram(max_level); + self + } + + /// Sets all elements of `histogram` to 0 + fn reset_histogram(histogram: &mut Option>) { + if let Some(ref mut hist) = histogram { + for v in hist { + *v = 0 + } + } + } + + /// Resets the state of this `PageMetrics` to the initial state. + /// If histograms have been initialized their contents will be reset to zero. 
+ fn new_page(&mut self) { + self.num_buffered_values = 0; + self.num_buffered_rows = 0; + self.num_page_nulls = 0; + PageMetrics::reset_histogram(&mut self.repetition_level_histogram); + PageMetrics::reset_histogram(&mut self.definition_level_histogram); + } + + /// Updates histogram values using provided repetition levels + fn update_repetition_level_histogram(&mut self, levels: &[i16]) { + if let Some(ref mut rep_hist) = self.repetition_level_histogram { + for &level in levels { + rep_hist[level as usize] += 1; + } + } + } + + /// Updates histogram values using provided definition levels + fn update_definition_level_histogram(&mut self, levels: &[i16]) { + if let Some(ref mut def_hist) = self.definition_level_histogram { + for &level in levels { + def_hist[level as usize] += 1; + } + } + } } // Metrics per column writer @@ -206,6 +285,8 @@ struct ColumnMetrics { num_column_nulls: u64, column_distinct_count: Option, variable_length_bytes: Option, + repetition_level_histogram: Option>, + definition_level_histogram: Option>, } impl ColumnMetrics { @@ -213,6 +294,31 @@ impl ColumnMetrics { Default::default() } + /// Initialize the repetition level histogram + fn with_repetition_level_histogram(mut self, max_level: i16) -> Self { + self.repetition_level_histogram = new_histogram(max_level); + self + } + + /// Initialize the definition level histogram + fn with_definition_level_histogram(mut self, max_level: i16) -> Self { + self.definition_level_histogram = new_histogram(max_level); + self + } + + /// Sum the provided PageMetrics histograms into the chunk histograms. Does nothing if + /// page histograms are not initialized. + fn update_from_page_metrics(&mut self, page_metrics: &PageMetrics) { + update_histogram( + &mut self.definition_level_histogram, + &page_metrics.definition_level_histogram, + ); + update_histogram( + &mut self.repetition_level_histogram, + &page_metrics.repetition_level_histogram, + ); + } + /// Sum the provided page variable_length_bytes into the chunk variable_length_bytes fn update_variable_length_bytes(&mut self, variable_length_bytes: Option) { if let Some(var_bytes) = variable_length_bytes { @@ -275,6 +381,19 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Used for level information encodings.insert(Encoding::RLE); + let mut page_metrics = PageMetrics::new(); + let mut column_metrics = ColumnMetrics::::new(); + + // Initialize level histograms if collecting page or chunk statistics + if statistics_enabled != EnabledStatistics::None { + page_metrics = page_metrics + .with_repetition_level_histogram(descr.max_rep_level()) + .with_definition_level_histogram(descr.max_def_level()); + column_metrics = column_metrics + .with_repetition_level_histogram(descr.max_rep_level()) + .with_definition_level_histogram(descr.max_def_level()) + } + Self { descr, props, @@ -286,12 +405,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { def_levels_sink: vec![], rep_levels_sink: vec![], data_pages: VecDeque::new(), - page_metrics: PageMetrics { - num_buffered_values: 0, - num_buffered_rows: 0, - num_page_nulls: 0, - }, - column_metrics: ColumnMetrics::::new(), + page_metrics, + column_metrics, column_index_builder: ColumnIndexBuilder::new(), offset_index_builder: OffsetIndexBuilder::new(), encodings, @@ -541,6 +656,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } + // Update histogram + self.page_metrics.update_definition_level_histogram(levels); + self.def_levels_sink.extend_from_slice(levels); values_to_write } else { @@ -569,6 
+687,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { self.page_metrics.num_buffered_rows += (level == 0) as u32 } + // Update histogram + self.page_metrics.update_repetition_level_histogram(levels); + self.rep_levels_sink.extend_from_slice(levels); } else { // Each value is exactly one row. @@ -712,6 +833,15 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } } + + // update histograms + if self.column_index_builder.valid() { + self.column_index_builder.append_histograms( + &self.page_metrics.repetition_level_histogram, + &self.page_metrics.definition_level_histogram, + ); + } + // update the offset index self.offset_index_builder .append_row_count(self.page_metrics.num_buffered_rows as i64); @@ -798,7 +928,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { values_data.variable_length_bytes, ); - // Update variable_length_bytes in column_metrics + // Update histograms and variable_length_bytes in column_metrics + self.column_metrics + .update_from_page_metrics(&self.page_metrics); self.column_metrics .update_variable_length_bytes(values_data.variable_length_bytes); @@ -905,7 +1037,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Reset state. self.rep_levels_sink.clear(); self.def_levels_sink.clear(); - self.page_metrics = PageMetrics::default(); + self.page_metrics.new_page(); Ok(()) } @@ -1013,7 +1145,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { builder = builder .set_statistics(statistics) - .set_unencoded_byte_array_data_bytes(self.column_metrics.variable_length_bytes); + .set_unencoded_byte_array_data_bytes(self.column_metrics.variable_length_bytes) + .set_repetition_level_histogram( + self.column_metrics.repetition_level_histogram.take(), + ) + .set_definition_level_histogram( + self.column_metrics.definition_level_histogram.take(), + ); } let metadata = builder.build()?; diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index a7d3d4ab8f93..d4401974cd99 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -99,6 +99,8 @@ impl HeapSize for ColumnChunkMetaData { + self.statistics.heap_size() + self.encoding_stats.heap_size() + self.unencoded_byte_array_data_bytes.heap_size() + + self.repetition_level_histogram.heap_size() + + self.definition_level_histogram.heap_size() } } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 9a3ca605857f..ab0dbcff2c6d 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -544,6 +544,8 @@ pub struct ColumnChunkMetaData { column_index_offset: Option, column_index_length: Option, unencoded_byte_array_data_bytes: Option, + repetition_level_histogram: Option>, + definition_level_histogram: Option>, } /// Represents common operations for a column chunk. @@ -704,6 +706,24 @@ impl ColumnChunkMetaData { self.unencoded_byte_array_data_bytes } + /// Returns the repetition level histogram. + /// + /// The returned value `vec[i]` is how many values are at repetition level `i`. For example, + /// `vec[0]` indicates how many rows the page contains. + /// This field may not be set by older writers. + pub fn repetition_level_histogram(&self) -> Option<&Vec> { + self.repetition_level_histogram.as_ref() + } + + /// Returns the definition level histogram. + /// + /// The returned value `vec[i]` is how many values are at definition level `i`. 
For example, + /// `vec[max_definition_level-1]` indicates how many non-null values are present in the page. + /// This field may not be set by older writers. + pub fn definition_level_histogram(&self) -> Option<&Vec> { + self.definition_level_histogram.as_ref() + } + /// Method to convert from Thrift. pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { if cc.meta_data.is_none() { @@ -741,11 +761,18 @@ impl ColumnChunkMetaData { let offset_index_length = cc.offset_index_length; let column_index_offset = cc.column_index_offset; let column_index_length = cc.column_index_length; - let unencoded_byte_array_data_bytes = if let Some(size_stats) = col_metadata.size_statistics - { - size_stats.unencoded_byte_array_data_bytes + let ( + unencoded_byte_array_data_bytes, + repetition_level_histogram, + definition_level_histogram, + ) = if let Some(size_stats) = col_metadata.size_statistics { + ( + size_stats.unencoded_byte_array_data_bytes, + size_stats.repetition_level_histogram, + size_stats.definition_level_histogram, + ) } else { - None + (None, None, None) }; let result = ColumnChunkMetaData { @@ -769,6 +796,8 @@ impl ColumnChunkMetaData { column_index_offset, column_index_length, unencoded_byte_array_data_bytes, + repetition_level_histogram, + definition_level_histogram, }; Ok(result) } @@ -792,11 +821,14 @@ impl ColumnChunkMetaData { /// Method to convert to Thrift `ColumnMetaData` pub fn to_column_metadata_thrift(&self) -> ColumnMetaData { - let size_statistics = if self.unencoded_byte_array_data_bytes.is_some() { + let size_statistics = if self.unencoded_byte_array_data_bytes.is_some() + || self.repetition_level_histogram.is_some() + || self.definition_level_histogram.is_some() + { Some(SizeStatistics { unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes, - repetition_level_histogram: None, - definition_level_histogram: None, + repetition_level_histogram: self.repetition_level_histogram.clone(), + definition_level_histogram: self.definition_level_histogram.clone(), }) } else { None @@ -858,6 +890,8 @@ impl ColumnChunkMetaDataBuilder { column_index_offset: None, column_index_length: None, unencoded_byte_array_data_bytes: None, + repetition_level_histogram: None, + definition_level_histogram: None, }) } @@ -975,6 +1009,18 @@ impl ColumnChunkMetaDataBuilder { self } + /// Sets optional repetition level histogram + pub fn set_repetition_level_histogram(mut self, value: Option>) -> Self { + self.0.repetition_level_histogram = value; + self + } + + /// Sets optional repetition level histogram + pub fn set_definition_level_histogram(mut self, value: Option>) -> Self { + self.0.definition_level_histogram = value; + self + } + /// Builds column chunk metadata. 
pub fn build(self) -> Result { Ok(self.0) @@ -988,6 +1034,8 @@ pub struct ColumnIndexBuilder { max_values: Vec>, null_counts: Vec, boundary_order: BoundaryOrder, + repetition_level_histograms: Option>, + definition_level_histograms: Option>, // If one page can't get build index, need to ignore all index in this column valid: bool, } @@ -1006,6 +1054,8 @@ impl ColumnIndexBuilder { max_values: Vec::new(), null_counts: Vec::new(), boundary_order: BoundaryOrder::UNORDERED, + repetition_level_histograms: None, + definition_level_histograms: None, valid: true, } } @@ -1023,6 +1073,23 @@ impl ColumnIndexBuilder { self.null_counts.push(null_count); } + pub fn append_histograms( + &mut self, + repetition_level_histogram: &Option>, + definition_level_histogram: &Option>, + ) { + if let Some(ref rep_lvl_hist) = repetition_level_histogram { + let hist = self.repetition_level_histograms.get_or_insert(Vec::new()); + hist.reserve(rep_lvl_hist.len()); + hist.extend(rep_lvl_hist); + } + if let Some(ref def_lvl_hist) = definition_level_histogram { + let hist = self.definition_level_histograms.get_or_insert(Vec::new()); + hist.reserve(def_lvl_hist.len()); + hist.extend(def_lvl_hist); + } + } + pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) { self.boundary_order = boundary_order; } @@ -1043,8 +1110,8 @@ impl ColumnIndexBuilder { self.max_values, self.boundary_order, self.null_counts, - None, - None, + self.repetition_level_histograms, + self.definition_level_histograms, ) } } @@ -1258,6 +1325,8 @@ mod tests { .set_column_index_offset(Some(8000)) .set_column_index_length(Some(25)) .set_unencoded_byte_array_data_bytes(Some(2000)) + .set_repetition_level_histogram(Some(vec![100, 100])) + .set_definition_level_histogram(Some(vec![0, 200])) .build() .unwrap(); @@ -1345,7 +1414,7 @@ mod tests { column_orders, ); let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta.clone()); - let base_expected_size = 1352; + let base_expected_size = 1448; assert_eq!(parquet_meta.memory_size(), base_expected_size); let mut column_index = ColumnIndexBuilder::new(); @@ -1372,7 +1441,7 @@ mod tests { ]]), ); - let bigger_expected_size = 2400; + let bigger_expected_size = 2784; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 71fb47afa960..6c81d4d1df03 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -41,6 +41,17 @@ pub struct PageIndex { pub max: Option, /// Null values in the page pub null_count: Option, + /// Repetition level histogram for the page + /// + /// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`. + /// For example, `repetition_level_histogram[0]` indicates how many rows the page contains. + pub repetition_level_histogram: Option>, + /// Definition level histogram for the page + /// + /// `definition_level_histogram[i]` is a count of how many values are at definition level `i`. + /// For example, `definition_level_histogram[max_definition_level-1]` indicates how many + /// non-null values are present in the page. 
+ pub definition_level_histogram: Option>, } impl PageIndex { @@ -53,6 +64,12 @@ impl PageIndex { pub fn null_count(&self) -> Option { self.null_count } + pub fn repetition_level_histogram(&self) -> Option<&Vec> { + self.repetition_level_histogram.as_ref() + } + pub fn definition_level_histogram(&self) -> Option<&Vec> { + self.definition_level_histogram.as_ref() + } } impl PageIndex @@ -141,26 +158,57 @@ impl NativeIndex { .map(|x| x.into_iter().map(Some).collect::>()) .unwrap_or_else(|| vec![None; len]); + // histograms are a 1D array encoding a 2D num_pages X num_levels matrix. + let to_page_histograms = |opt_hist: Option>| { + if let Some(hist) = opt_hist { + // TODO: should we assert (hist.len() % len) == 0? + let num_levels = hist.len() / len; + let mut res = Vec::with_capacity(len); + for i in 0..len { + let page_idx = i * num_levels; + let page_hist = hist[page_idx..page_idx + num_levels].to_vec(); + res.push(Some(page_hist)); + } + res + } else { + vec![None; len] + } + }; + + let rep_hists: Vec>> = + to_page_histograms(index.repetition_level_histograms); + let def_hists: Vec>> = + to_page_histograms(index.definition_level_histograms); + let indexes = index .min_values .iter() .zip(index.max_values.into_iter()) .zip(index.null_pages.into_iter()) .zip(null_counts.into_iter()) - .map(|(((min, max), is_null), null_count)| { - let (min, max) = if is_null { - (None, None) - } else { - let min = min.as_slice(); - let max = max.as_slice(); - (Some(from_le_slice::(min)), Some(from_le_slice::(max))) - }; - Ok(PageIndex { - min, - max, - null_count, - }) - }) + .zip(rep_hists.into_iter()) + .zip(def_hists.into_iter()) + .map( + |( + ((((min, max), is_null), null_count), repetition_level_histogram), + definition_level_histogram, + )| { + let (min, max) = if is_null { + (None, None) + } else { + let min = min.as_slice(); + let max = max.as_slice(); + (Some(from_le_slice::(min)), Some(from_le_slice::(max))) + }; + Ok(PageIndex { + min, + max, + null_count, + repetition_level_histogram, + definition_level_histogram, + }) + }, + ) .collect::, ParquetError>>()?; Ok(Self { @@ -180,6 +228,8 @@ mod tests { min: Some(-123), max: Some(234), null_count: Some(0), + repetition_level_histogram: Some(vec![1, 2]), + definition_level_histogram: Some(vec![1, 2, 3]), }; assert_eq!(page_index.min().unwrap(), &-123); @@ -187,6 +237,11 @@ mod tests { assert_eq!(page_index.min_bytes().unwrap(), (-123).as_bytes()); assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes()); assert_eq!(page_index.null_count().unwrap(), 0); + assert_eq!(page_index.repetition_level_histogram(), Some(&vec![1, 2])); + assert_eq!( + page_index.definition_level_histogram(), + Some(&vec![1, 2, 3]) + ); } #[test] @@ -195,6 +250,8 @@ mod tests { min: None, max: None, null_count: None, + repetition_level_histogram: None, + definition_level_histogram: None, }; assert_eq!(page_index.min(), None); @@ -202,5 +259,7 @@ mod tests { assert_eq!(page_index.min_bytes(), None); assert_eq!(page_index.max_bytes(), None); assert_eq!(page_index.null_count(), None); + assert_eq!(page_index.repetition_level_histogram(), None); + assert_eq!(page_index.definition_level_histogram(), None); } } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index f7dde59f4f42..edbf3fc93867 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -662,6 +662,12 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { .set_dictionary_page_offset(src_dictionary_offset.map(map_offset)) 
.set_unencoded_byte_array_data_bytes(metadata.unencoded_byte_array_data_bytes()); + if let Some(rep_hist) = metadata.repetition_level_histogram() { + builder = builder.set_repetition_level_histogram(Some(rep_hist.clone())) + } + if let Some(def_hist) = metadata.definition_level_histogram() { + builder = builder.set_definition_level_histogram(Some(def_hist.clone())) + } if let Some(statistics) = metadata.statistics() { builder = builder.set_statistics(statistics.clone()) } @@ -1888,6 +1894,12 @@ mod tests { assert_eq!(file_metadata.row_groups[0].columns.len(), 1); assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let check_def_hist = |def_hist: &Vec| { + assert_eq!(def_hist.len(), 2); + assert_eq!(def_hist[0], 3); + assert_eq!(def_hist[1], 7); + }; + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); let meta_data = file_metadata.row_groups[0].columns[0] .meta_data @@ -1897,12 +1909,13 @@ mod tests { let size_stats = meta_data.size_statistics.as_ref().unwrap(); assert!(size_stats.repetition_level_histogram.is_none()); - assert!(size_stats.definition_level_histogram.is_none()); + assert!(size_stats.definition_level_histogram.is_some()); assert!(size_stats.unencoded_byte_array_data_bytes.is_some()); assert_eq!( unenc_size, size_stats.unencoded_byte_array_data_bytes.unwrap() ); + check_def_hist(size_stats.definition_level_histogram.as_ref().unwrap()); // check that the read metadata is also correct let options = ReadOptionsBuilder::new().with_page_index().build(); @@ -1914,12 +1927,31 @@ mod tests { let rowgroup = reader.get_row_group(0).unwrap(); assert_eq!(rowgroup.num_columns(), 1); let column = rowgroup.metadata().column(0); + assert!(column.definition_level_histogram().is_some()); + assert!(column.repetition_level_histogram().is_none()); assert!(column.unencoded_byte_array_data_bytes().is_some()); + check_def_hist(column.definition_level_histogram().unwrap()); assert_eq!( unenc_size, column.unencoded_byte_array_data_bytes().unwrap() ); + // check histogram in column index as well + assert!(reader.metadata().column_index().is_some()); + let column_index = reader.metadata().column_index().unwrap(); + assert_eq!(column_index.len(), 1); + assert_eq!(column_index[0].len(), 1); + let col_idx = if let Index::BYTE_ARRAY(index) = &column_index[0][0] { + assert_eq!(index.indexes.len(), 1); + &index.indexes[0] + } else { + unreachable!() + }; + + assert!(col_idx.repetition_level_histogram().is_none()); + assert!(col_idx.definition_level_histogram().is_some()); + check_def_hist(col_idx.definition_level_histogram().unwrap()); + assert!(reader.metadata().offset_index().is_some()); let offset_index = reader.metadata().offset_index().unwrap(); assert_eq!(offset_index.len(), 1); @@ -1932,4 +1964,114 @@ mod tests { assert_eq!(page_sizes.len(), 1); assert_eq!(page_sizes[0], unenc_size); } + + #[test] + fn test_size_statistics_with_repetition_and_nulls() { + let message_type = " + message test_schema { + OPTIONAL group i32_list (LIST) { + REPEATED group list { + OPTIONAL INT32 element; + } + } + } + "; + // column is: + // row 0: [1, 2] + // row 1: NULL + // row 2: [4, NULL] + // row 3: [] + // row 4: [7, 8, 9, 10] + let schema = Arc::new(parse_message_type(message_type).unwrap()); + let data = [1, 2, 4, 7, 8, 9, 10]; + let def_levels = [3, 3, 0, 3, 2, 1, 3, 3, 3, 3]; + let rep_levels = [0, 1, 0, 0, 1, 0, 0, 1, 1, 1]; + let file = tempfile::tempfile().unwrap(); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + 
.build(), + ); + let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&data, Some(&def_levels), Some(&rep_levels)) + .unwrap(); + col_writer.close().unwrap(); + row_group_writer.close().unwrap(); + let file_metadata = writer.close().unwrap(); + + assert_eq!(file_metadata.row_groups.len(), 1); + assert_eq!(file_metadata.row_groups[0].columns.len(), 1); + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + + let check_def_hist = |def_hist: &Vec| { + assert_eq!(def_hist.len(), 4); + assert_eq!(def_hist[0], 1); + assert_eq!(def_hist[1], 1); + assert_eq!(def_hist[2], 1); + assert_eq!(def_hist[3], 7); + }; + + let check_rep_hist = |rep_hist: &Vec| { + assert_eq!(rep_hist.len(), 2); + assert_eq!(rep_hist[0], 5); + assert_eq!(rep_hist[1], 5); + }; + + // check that histograms are set properly in the write and read metadata + // also check that unencoded_byte_array_data_bytes is not set + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let meta_data = file_metadata.row_groups[0].columns[0] + .meta_data + .as_ref() + .unwrap(); + assert!(meta_data.size_statistics.is_some()); + let size_stats = meta_data.size_statistics.as_ref().unwrap(); + assert!(size_stats.repetition_level_histogram.is_some()); + assert!(size_stats.definition_level_histogram.is_some()); + assert!(size_stats.unencoded_byte_array_data_bytes.is_none()); + check_def_hist(size_stats.definition_level_histogram.as_ref().unwrap()); + check_rep_hist(size_stats.repetition_level_histogram.as_ref().unwrap()); + + // check that the read metadata is also correct + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + + let rfile_metadata = reader.metadata().file_metadata(); + assert_eq!(rfile_metadata.num_rows(), file_metadata.num_rows); + assert_eq!(reader.num_row_groups(), 1); + let rowgroup = reader.get_row_group(0).unwrap(); + assert_eq!(rowgroup.num_columns(), 1); + let column = rowgroup.metadata().column(0); + assert!(column.definition_level_histogram().is_some()); + assert!(column.repetition_level_histogram().is_some()); + assert!(column.unencoded_byte_array_data_bytes().is_none()); + check_def_hist(column.definition_level_histogram().unwrap()); + check_rep_hist(column.repetition_level_histogram().unwrap()); + + // check histogram in column index as well + assert!(reader.metadata().column_index().is_some()); + let column_index = reader.metadata().column_index().unwrap(); + assert_eq!(column_index.len(), 1); + assert_eq!(column_index[0].len(), 1); + let col_idx = if let Index::INT32(index) = &column_index[0][0] { + assert_eq!(index.indexes.len(), 1); + &index.indexes[0] + } else { + unreachable!() + }; + + check_def_hist(col_idx.definition_level_histogram().unwrap()); + check_rep_hist(col_idx.repetition_level_histogram().unwrap()); + + assert!(reader.metadata().offset_index().is_some()); + let offset_index = reader.metadata().offset_index().unwrap(); + assert_eq!(offset_index.len(), 1); + assert_eq!(offset_index[0].len(), 1); + assert!(offset_index[0][0].unencoded_byte_array_data_bytes.is_none()); + } } From 1507c467681b180f701a95e085a3b83f453942a1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 22 Jul 2024 16:16:49 -0700 Subject: [PATCH 13/21] add to_thrift() to OffsetIndexMetaData --- 
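Note: with this change the decoded OffsetIndexMetaData can be converted back into the
thrift OffsetIndex it came from, so the two representations round-trip losslessly. A
hedged sketch of that round trip is below; the local struct is a stand-in because
try_new()/to_thrift() are crate-private, and only parquet::format::{OffsetIndex,
PageLocation} are assumed to be the real public types:

use parquet::format::{OffsetIndex, PageLocation};

// Local stand-in mirroring the fields of parquet's OffsetIndexMetaData.
struct OffsetIndexMeta {
    page_locations: Vec<PageLocation>,
    unencoded_byte_array_data_bytes: Option<Vec<i64>>,
}

impl OffsetIndexMeta {
    // Decoding direction, mirroring OffsetIndexMetaData::try_new.
    fn from_thrift(index: OffsetIndex) -> Self {
        Self {
            page_locations: index.page_locations,
            unencoded_byte_array_data_bytes: index.unencoded_byte_array_data_bytes,
        }
    }
    // Encoding direction, mirroring the to_thrift() added in this patch.
    fn to_thrift(&self) -> OffsetIndex {
        OffsetIndex::new(
            self.page_locations.clone(),
            self.unencoded_byte_array_data_bytes.clone(),
        )
    }
}

fn main() {
    // One page starting at offset 4 with a 152 byte compressed size, plus an
    // arbitrary per-page unencoded byte count for illustration.
    let thrift = OffsetIndex::new(vec![PageLocation::new(4, 152, 0)], Some(vec![163i64]));
    let decoded = OffsetIndexMeta::from_thrift(thrift.clone());
    assert_eq!(decoded.to_thrift(), thrift); // lossless round trip
}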
parquet/src/file/page_index/offset_index.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 9138620e3fcd..2ae3464141ca 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -47,4 +47,13 @@ impl OffsetIndexMetaData { pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec> { self.unencoded_byte_array_data_bytes.as_ref() } + + // TODO: remove annotation after merge + #[allow(dead_code)] + pub(crate) fn to_thrift(&self) -> OffsetIndex { + OffsetIndex::new( + self.page_locations.clone(), + self.unencoded_byte_array_data_bytes.clone(), + ) + } } From 81c34ac39161e1b8c866ecc20b411a13d9aa1d5d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 15:48:41 -0400 Subject: [PATCH 14/21] Update pyo3 requirement from 0.21.1 to 0.22.2 (#6085) Updates the requirements on [pyo3](https://github.com/pyo3/pyo3) to permit the latest version. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/v0.22.2/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/compare/v0.21.1...v0.22.2) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index b8f40e1b4b99..65a1e4073774 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -54,7 +54,7 @@ arrow-select = { workspace = true } arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -pyo3 = { version = "0.22.1", default-features = false, optional = true } +pyo3 = { version = "0.22.2", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] From 3bc998792b19ba20285ca82765991f03c7fa845a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 23 Jul 2024 12:54:10 -0700 Subject: [PATCH 15/21] Deprecate read_page_locations() and simplify offset index in `ParquetMetaData` (#6095) * deprecate read_page_locations * add to_thrift() to OffsetIndexMetaData --- parquet/src/arrow/arrow_reader/mod.rs | 4 +- parquet/src/arrow/arrow_reader/statistics.rs | 14 +++-- parquet/src/arrow/arrow_writer/mod.rs | 14 ++--- parquet/src/arrow/async_reader/metadata.rs | 8 +-- parquet/src/arrow/async_reader/mod.rs | 24 +++++---- parquet/src/bin/parquet-index.rs | 12 +++-- parquet/src/file/metadata/memory.rs | 7 +++ parquet/src/file/metadata/mod.rs | 54 ++++++++----------- parquet/src/file/page_index/index_reader.rs | 1 + parquet/src/file/page_index/offset_index.rs | 9 ++++ parquet/src/file/serialized_reader.rs | 56 ++++++++------------ parquet/src/file/writer.rs | 18 +++---- parquet/tests/arrow_writer_layout.rs | 10 ++-- 13 files changed, 118 insertions(+), 113 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index e9edf7cb751d..22a4e5a90aa3 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -394,7 +394,7 @@ impl ArrowReaderMetadata { let offset_index = metadata .row_groups() .iter() - .map(|rg| index_reader::read_pages_locations(reader, rg.columns())) + .map(|rg| 
index_reader::read_offset_indexes(reader, rg.columns())) .collect::>>()?; metadata.set_offset_index(Some(offset_index)) @@ -689,7 +689,7 @@ impl Iterator for ReaderPageIterator { // To avoid `i[rg_idx][self.oolumn_idx`] panic, we need to filter out empty `i[rg_idx]`. let page_locations = offset_index .filter(|i| !i[rg_idx].is_empty()) - .map(|i| i[rg_idx][self.column_idx].clone()); + .map(|i| i[rg_idx][self.column_idx].page_locations.clone()); let total_rows = rg.num_rows() as usize; let reader = self.reader.clone(); diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index d536792b827b..6e2a6fafca25 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -1349,7 +1349,9 @@ impl<'a> StatisticsConverter<'a> { let iter = row_group_indices.into_iter().map(|rg_index| { let column_page_index_per_row_group_per_column = &column_page_index[*rg_index][parquet_index]; - let num_data_pages = &column_offset_index[*rg_index][parquet_index].len(); + let num_data_pages = &column_offset_index[*rg_index][parquet_index] + .page_locations() + .len(); (*num_data_pages, column_page_index_per_row_group_per_column) }); @@ -1378,7 +1380,9 @@ impl<'a> StatisticsConverter<'a> { let iter = row_group_indices.into_iter().map(|rg_index| { let column_page_index_per_row_group_per_column = &column_page_index[*rg_index][parquet_index]; - let num_data_pages = &column_offset_index[*rg_index][parquet_index].len(); + let num_data_pages = &column_offset_index[*rg_index][parquet_index] + .page_locations() + .len(); (*num_data_pages, column_page_index_per_row_group_per_column) }); @@ -1408,7 +1412,9 @@ impl<'a> StatisticsConverter<'a> { let iter = row_group_indices.into_iter().map(|rg_index| { let column_page_index_per_row_group_per_column = &column_page_index[*rg_index][parquet_index]; - let num_data_pages = &column_offset_index[*rg_index][parquet_index].len(); + let num_data_pages = &column_offset_index[*rg_index][parquet_index] + .page_locations() + .len(); (*num_data_pages, column_page_index_per_row_group_per_column) }); @@ -1450,7 +1456,7 @@ impl<'a> StatisticsConverter<'a> { let mut row_count_total = Vec::new(); for rg_idx in row_group_indices { - let page_locations = &column_offset_index[*rg_idx][parquet_index]; + let page_locations = &column_offset_index[*rg_idx][parquet_index].page_locations(); let row_count_per_page = page_locations .windows(2) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 070d740094f5..8f7b514ccf71 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1096,7 +1096,7 @@ mod tests { use crate::data_type::AsBytes; use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index::Index; - use crate::file::page_index::index_reader::read_pages_locations; + use crate::file::page_index::index_reader::read_offset_indexes; use crate::file::properties::{ BloomFilterPosition, EnabledStatistics, ReaderProperties, WriterVersion, }; @@ -1669,16 +1669,16 @@ mod tests { "Expected a dictionary page" ); - let page_locations = read_pages_locations(&file, column).unwrap(); + let offset_indexes = read_offset_indexes(&file, column).unwrap(); - let offset_index = page_locations[0].clone(); + let page_locations = offset_indexes[0].page_locations.clone(); // We should fallback to PLAIN encoding after the first row and our max page size is 1 bytes // so we expect one dictionary encoded page and then a page per row 
thereafter. assert_eq!( - offset_index.len(), + page_locations.len(), 10, - "Expected 9 pages but got {offset_index:#?}" + "Expected 9 pages but got {page_locations:#?}" ); } @@ -3020,8 +3020,8 @@ mod tests { assert_eq!(index.len(), 1); assert_eq!(index[0].len(), 2); // 2 columns - assert_eq!(index[0][0].len(), 1); // 1 page - assert_eq!(index[0][1].len(), 1); // 1 page + assert_eq!(index[0][0].page_locations().len(), 1); // 1 page + assert_eq!(index[0][1].page_locations().len(), 1); // 1 page } #[test] diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 4a3489a2084d..9224ea3f68a8 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -20,9 +20,7 @@ use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index::Index; -use crate::file::page_index::index_reader::{ - acc_range, decode_column_index, decode_page_locations, -}; +use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; use bytes::Bytes; use futures::future::BoxFuture; use futures::FutureExt; @@ -179,9 +177,7 @@ impl MetadataLoader { x.columns() .iter() .map(|c| match c.offset_index_range() { - Some(r) => { - decode_page_locations(&data[r.start - offset..r.end - offset]) - } + Some(r) => decode_offset_index(&data[r.start - offset..r.end - offset]), None => Err(general_err!("missing offset index")), }) .collect::>>() diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 5a790fa6aff0..5695dbc10fe1 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -106,9 +106,10 @@ use crate::column::page::{PageIterator, PageReader}; use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::{ParquetMetaData, RowGroupMetaData}; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::{ChunkReader, Length, SerializedPageReader}; use crate::file::FOOTER_SIZE; -use crate::format::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash, PageLocation}; +use crate::format::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash}; mod metadata; pub use metadata::*; @@ -489,7 +490,7 @@ where // TODO: calling build_array multiple times is wasteful let meta = self.metadata.row_group(row_group_idx); - let page_locations = self + let offset_index = self .metadata .offset_index() .map(|x| x[row_group_idx].as_slice()); @@ -499,7 +500,7 @@ where // schema: meta.schema_descr_ptr(), row_count: meta.num_rows() as usize, column_chunks: vec![None; meta.columns().len()], - page_locations, + offset_index, }; if let Some(filter) = self.filter.as_mut() { @@ -703,7 +704,7 @@ where /// An in-memory collection of column chunks struct InMemoryRowGroup<'a> { metadata: &'a RowGroupMetaData, - page_locations: Option<&'a [Vec]>, + offset_index: Option<&'a [OffsetIndexMetaData]>, column_chunks: Vec>>, row_count: usize, } @@ -716,7 +717,7 @@ impl<'a> InMemoryRowGroup<'a> { projection: &ProjectionMask, selection: Option<&RowSelection>, ) -> Result<()> { - if let Some((selection, page_locations)) = selection.zip(self.page_locations) { + if let Some((selection, offset_index)) = selection.zip(self.offset_index) { // If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the // `RowSelection` 
let mut page_start_offsets: Vec> = vec![]; @@ -734,14 +735,14 @@ impl<'a> InMemoryRowGroup<'a> { // then we need to also fetch a dictionary page. let mut ranges = vec![]; let (start, _len) = chunk_meta.byte_range(); - match page_locations[idx].first() { + match offset_index[idx].page_locations.first() { Some(first) if first.offset as u64 != start => { ranges.push(start as usize..first.offset as usize); } _ => (), } - ranges.extend(selection.scan_ranges(&page_locations[idx])); + ranges.extend(selection.scan_ranges(&offset_index[idx].page_locations)); page_start_offsets.push(ranges.iter().map(|range| range.start).collect()); ranges @@ -812,7 +813,9 @@ impl<'a> RowGroups for InMemoryRowGroup<'a> { "Invalid column index {i}, column was not fetched" ))), Some(data) => { - let page_locations = self.page_locations.map(|index| index[i].clone()); + let page_locations = self + .offset_index + .map(|index| index[i].page_locations.clone()); let page_reader: Box = Box::new(SerializedPageReader::new( data.clone(), self.metadata.column(i), @@ -1529,7 +1532,7 @@ mod tests { let metadata = parse_metadata(&data).unwrap(); let offset_index = - index_reader::read_pages_locations(&data, metadata.row_group(0).columns()) + index_reader::read_offset_indexes(&data, metadata.row_group(0).columns()) .expect("reading offset index"); let row_group_meta = metadata.row_group(0).clone(); @@ -1538,7 +1541,6 @@ mod tests { vec![row_group_meta], None, Some(vec![offset_index.clone()]), - None, ); let metadata = Arc::new(metadata); @@ -1575,7 +1577,7 @@ mod tests { }; let mut skip = true; - let mut pages = offset_index[0].iter().peekable(); + let mut pages = offset_index[0].page_locations.iter().peekable(); // Setup `RowSelection` so that we can skip every other page, selecting the last page let mut selectors = vec![]; diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index 86e08b6dafa3..1a9b74dd78fb 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -37,6 +37,7 @@ use clap::Parser; use parquet::errors::{ParquetError, Result}; use parquet::file::page_index::index::{Index, PageIndex}; +use parquet::file::page_index::offset_index::OffsetIndexMetaData; use parquet::file::reader::{FileReader, SerializedFileReader}; use parquet::file::serialized_reader::ReadOptionsBuilder; use parquet::format::PageLocation; @@ -93,7 +94,8 @@ impl Args { )) })?; - let row_counts = compute_row_counts(offset_index, row_group.num_rows()); + let row_counts = + compute_row_counts(offset_index.page_locations.as_slice(), row_group.num_rows()); match &column_indices[column_idx] { Index::NONE => println!("NO INDEX"), Index::BOOLEAN(v) => print_index(&v.indexes, offset_index, &row_counts)?, @@ -131,20 +133,20 @@ fn compute_row_counts(offset_index: &[PageLocation], rows: i64) -> Vec { /// Prints index information for a single column chunk fn print_index( column_index: &[PageIndex], - offset_index: &[PageLocation], + offset_index: &OffsetIndexMetaData, row_counts: &[i64], ) -> Result<()> { - if column_index.len() != offset_index.len() { + if column_index.len() != offset_index.page_locations.len() { return Err(ParquetError::General(format!( "Index length mismatch, got {} and {}", column_index.len(), - offset_index.len() + offset_index.page_locations.len() ))); } for (idx, ((c, o), row_count)) in column_index .iter() - .zip(offset_index) + .zip(offset_index.page_locations()) .zip(row_counts) .enumerate() { diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs 
index a1b40a8de95a..a7d3d4ab8f93 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -23,6 +23,7 @@ use crate::data_type::private::ParquetValueType; use crate::file::metadata::{ColumnChunkMetaData, FileMetaData, KeyValue, RowGroupMetaData}; use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::page_index::index::{Index, NativeIndex, PageIndex}; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::statistics::{Statistics, ValueStatistics}; use crate::format::{BoundaryOrder, PageLocation, SortingColumn}; use std::sync::Arc; @@ -144,6 +145,12 @@ impl HeapSize for Statistics { } } +impl HeapSize for OffsetIndexMetaData { + fn heap_size(&self) -> usize { + self.page_locations.heap_size() + self.unencoded_byte_array_data_bytes.heap_size() + } +} + impl HeapSize for Index { fn heap_size(&self) -> usize { match self { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index d86b1ce572fb..9a3ca605857f 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -44,6 +44,7 @@ use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::page_encoding_stats::{self, PageEncodingStats}; use crate::file::page_index::index::Index; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::statistics::{self, Statistics}; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, @@ -56,20 +57,19 @@ use crate::schema::types::{ /// [`Index`] corresponding to column `column_number` of row group /// `row_group_number`. /// -/// For example `column_index[2][3]` holds the [`Index`] for the forth +/// For example `column_index[2][3]` holds the [`Index`] for the fourth /// column in the third row group of the parquet file. pub type ParquetColumnIndex = Vec>; -/// [`PageLocation`] for each data page of each row group of each column. +/// [`OffsetIndexMetaData`] for each row group of each column. /// -/// `offset_index[row_group_number][column_number][page_number]` holds -/// the [`PageLocation`] corresponding to page `page_number` of column +/// `offset_index[row_group_number][column_number]` holds +/// the [`OffsetIndexMetaData`] corresponding to column /// `column_number`of row group `row_group_number`. /// -/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for -/// the fifth page of the forth column in the third row group of the -/// parquet file. -pub type ParquetOffsetIndex = Vec>>; +/// For example `offset_index[2][3]` holds the [`OffsetIndexMetaData`] for +/// the fourth column in the third row group of the parquet file. 
+pub type ParquetOffsetIndex = Vec>; /// Parsed metadata for a single Parquet file /// @@ -94,10 +94,8 @@ pub struct ParquetMetaData { row_groups: Vec, /// Page level index for each page in each column chunk column_index: Option, - /// Offset index for all each page in each column chunk + /// Offset index for each page in each column chunk offset_index: Option, - /// `unencoded_byte_array_data_bytes` from the offset index - unencoded_byte_array_data_bytes: Option>>>>, } impl ParquetMetaData { @@ -109,7 +107,6 @@ impl ParquetMetaData { row_groups, column_index: None, offset_index: None, - unencoded_byte_array_data_bytes: None, } } @@ -120,14 +117,12 @@ impl ParquetMetaData { row_groups: Vec, column_index: Option, offset_index: Option, - unencoded_byte_array_data_bytes: Option>>>>, ) -> Self { ParquetMetaData { file_metadata, row_groups, column_index, offset_index, - unencoded_byte_array_data_bytes, } } @@ -184,19 +179,6 @@ impl ParquetMetaData { self.offset_index.as_ref() } - /// Returns `unencoded_byte_array_data_bytes` from the offset indexes in this file, if loaded - /// - /// This value represents the output size of the total bytes in this file, which can be useful for - /// allocating an appropriately sized output buffer. - /// - /// Returns `None` if the parquet file does not have a `OffsetIndex` or - /// [ArrowReaderOptions::with_page_index] was set to false. - /// - /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index - pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec>>>> { - self.unencoded_byte_array_data_bytes.as_ref() - } - /// Estimate of the bytes allocated to store `ParquetMetadata` /// /// # Notes: @@ -217,7 +199,6 @@ impl ParquetMetaData { + self.row_groups.heap_size() + self.column_index.heap_size() + self.offset_index.heap_size() - + self.unencoded_byte_array_data_bytes.heap_size() } /// Override the column index @@ -1364,7 +1345,7 @@ mod tests { column_orders, ); let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta.clone()); - let base_expected_size = 1376; + let base_expected_size = 1352; assert_eq!(parquet_meta.memory_size(), base_expected_size); let mut column_index = ColumnIndexBuilder::new(); @@ -1373,18 +1354,25 @@ mod tests { let native_index = NativeIndex::::try_new(column_index).unwrap(); // Now, add in OffsetIndex + let mut offset_index = OffsetIndexBuilder::new(); + offset_index.append_row_count(1); + offset_index.append_offset_and_size(2, 3); + offset_index.append_unencoded_byte_array_data_bytes(Some(10)); + offset_index.append_row_count(1); + offset_index.append_offset_and_size(2, 3); + offset_index.append_unencoded_byte_array_data_bytes(Some(10)); + let offset_index = offset_index.build_to_thrift(); + let parquet_meta = ParquetMetaData::new_with_page_index( file_metadata, row_group_meta, Some(vec![vec![Index::BOOLEAN(native_index)]]), Some(vec![vec![ - vec![PageLocation::new(1, 2, 3)], - vec![PageLocation::new(1, 2, 3)], + OffsetIndexMetaData::try_new(offset_index).unwrap() ]]), - Some(vec![vec![Some(vec![10, 20, 30])]]), ); - let bigger_expected_size = 2464; + let bigger_expected_size = 2400; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 7959cb95c052..395e9afe122c 100644 --- 
a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -85,6 +85,7 @@ pub fn read_columns_indexes( /// See [Page Index Documentation] for more details. /// /// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +#[deprecated(since = "53.0.0", note = "Use read_offset_indexes")] pub fn read_pages_locations( reader: &R, chunks: &[ColumnChunkMetaData], diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index 9138620e3fcd..2ae3464141ca 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -47,4 +47,13 @@ impl OffsetIndexMetaData { pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec> { self.unencoded_byte_array_data_bytes.as_ref() } + + // TODO: remove annotation after merge + #[allow(dead_code)] + pub(crate) fn to_thrift(&self) -> OffsetIndex { + OffsetIndex::new( + self.page_locations.clone(), + self.unencoded_byte_array_data_bytes.clone(), + ) + } } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 65b6ebf2ec98..70aea6fd5ad3 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -28,6 +28,7 @@ use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; use crate::errors::{ParquetError, Result}; use crate::file::page_index::index_reader; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::{ footer, metadata::*, @@ -211,22 +212,12 @@ impl SerializedFileReader { if options.enable_page_index { let mut columns_indexes = vec![]; let mut offset_indexes = vec![]; - let mut unenc_byte_sizes = vec![]; for rg in &mut filtered_row_groups { let column_index = index_reader::read_columns_indexes(&chunk_reader, rg.columns())?; let offset_index = index_reader::read_offset_indexes(&chunk_reader, rg.columns())?; - - // split offset_index into two vectors to not break API - let mut page_locations = vec![]; - let mut unenc_bytes = vec![]; - offset_index.into_iter().for_each(|index| { - page_locations.push(index.page_locations); - unenc_bytes.push(index.unencoded_byte_array_data_bytes); - }); columns_indexes.push(column_index); - offset_indexes.push(page_locations); - unenc_byte_sizes.push(unenc_bytes); + offset_indexes.push(offset_index); } Ok(Self { @@ -236,7 +227,6 @@ impl SerializedFileReader { filtered_row_groups, Some(columns_indexes), Some(offset_indexes), - Some(unenc_byte_sizes), )), props: Arc::new(options.props), }) @@ -296,7 +286,7 @@ impl FileReader for SerializedFileReader { pub struct SerializedRowGroupReader<'a, R: ChunkReader> { chunk_reader: Arc, metadata: &'a RowGroupMetaData, - page_locations: Option<&'a [Vec]>, + offset_index: Option<&'a [OffsetIndexMetaData]>, props: ReaderPropertiesPtr, bloom_filters: Vec>, } @@ -306,7 +296,7 @@ impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { pub fn new( chunk_reader: Arc, metadata: &'a RowGroupMetaData, - page_locations: Option<&'a [Vec]>, + offset_index: Option<&'a [OffsetIndexMetaData]>, props: ReaderPropertiesPtr, ) -> Result { let bloom_filters = if props.read_bloom_filter() { @@ -321,7 +311,7 @@ impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { Ok(Self { chunk_reader, metadata, - page_locations, + offset_index, props, bloom_filters, }) @@ -341,7 +331,7 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<' fn get_column_page_reader(&self, i: 
usize) -> Result> { let col = self.metadata.column(i); - let page_locations = self.page_locations.map(|x| x[i].clone()); + let page_locations = self.offset_index.map(|x| x[i].page_locations.clone()); let props = Arc::clone(&self.props); Ok(Box::new(SerializedPageReader::new_with_properties( @@ -787,7 +777,7 @@ mod tests { use crate::data_type::private::ParquetValueType; use crate::data_type::{AsBytes, FixedLenByteArrayType}; use crate::file::page_index::index::{Index, NativeIndex}; - use crate::file::page_index::index_reader::{read_columns_indexes, read_pages_locations}; + use crate::file::page_index::index_reader::{read_columns_indexes, read_offset_indexes}; use crate::file::writer::SerializedFileWriter; use crate::record::RowAccessor; use crate::schema::parser::parse_message_type; @@ -1325,7 +1315,7 @@ mod tests { // only one row group assert_eq!(offset_indexes.len(), 1); let offset_index = &offset_indexes[0]; - let page_offset = &offset_index[0][0]; + let page_offset = &offset_index[0].page_locations()[0]; assert_eq!(4, page_offset.offset); assert_eq!(152, page_offset.compressed_page_size); @@ -1348,8 +1338,8 @@ mod tests { b.reverse(); assert_eq!(a, b); - let a = read_pages_locations(&test_file, columns).unwrap(); - let mut b = read_pages_locations(&test_file, &reversed).unwrap(); + let a = read_offset_indexes(&test_file, columns).unwrap(); + let mut b = read_offset_indexes(&test_file, &reversed).unwrap(); b.reverse(); assert_eq!(a, b); } @@ -1386,7 +1376,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 0), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[0].len(), 325); + assert_eq!(row_group_offset_indexes[0].page_locations.len(), 325); } else { unreachable!() }; @@ -1394,7 +1384,7 @@ mod tests { assert!(&column_index[0][1].is_sorted()); if let Index::BOOLEAN(index) = &column_index[0][1] { assert_eq!(index.indexes.len(), 82); - assert_eq!(row_group_offset_indexes[1].len(), 82); + assert_eq!(row_group_offset_indexes[1].page_locations.len(), 82); } else { unreachable!() }; @@ -1407,7 +1397,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 2), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[2].len(), 325); + assert_eq!(row_group_offset_indexes[2].page_locations.len(), 325); } else { unreachable!() }; @@ -1420,7 +1410,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 3), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[3].len(), 325); + assert_eq!(row_group_offset_indexes[3].page_locations.len(), 325); } else { unreachable!() }; @@ -1433,7 +1423,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 4), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[4].len(), 325); + assert_eq!(row_group_offset_indexes[4].page_locations.len(), 325); } else { unreachable!() }; @@ -1446,7 +1436,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 5), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[5].len(), 528); + assert_eq!(row_group_offset_indexes[5].page_locations.len(), 528); } else { unreachable!() }; @@ -1459,7 +1449,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 6), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[6].len(), 325); + assert_eq!(row_group_offset_indexes[6].page_locations.len(), 325); } else { unreachable!() }; @@ -1472,7 +1462,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 7), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[7].len(), 528); + 
assert_eq!(row_group_offset_indexes[7].page_locations.len(), 528); } else { unreachable!() }; @@ -1485,7 +1475,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 8), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[8].len(), 974); + assert_eq!(row_group_offset_indexes[8].page_locations.len(), 974); } else { unreachable!() }; @@ -1498,7 +1488,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 9), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[9].len(), 352); + assert_eq!(row_group_offset_indexes[9].page_locations.len(), 352); } else { unreachable!() }; @@ -1506,7 +1496,7 @@ mod tests { //Notice: min_max values for each page for this col not exits. assert!(!&column_index[0][10].is_sorted()); if let Index::NONE = &column_index[0][10] { - assert_eq!(row_group_offset_indexes[10].len(), 974); + assert_eq!(row_group_offset_indexes[10].page_locations.len(), 974); } else { unreachable!() }; @@ -1519,7 +1509,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 11), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[11].len(), 325); + assert_eq!(row_group_offset_indexes[11].page_locations.len(), 325); } else { unreachable!() }; @@ -1532,7 +1522,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 12), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[12].len(), 325); + assert_eq!(row_group_offset_indexes[12].page_locations.len(), 325); } else { unreachable!() }; diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index b109a2da8eb0..f7dde59f4f42 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1920,15 +1920,15 @@ mod tests { column.unencoded_byte_array_data_bytes().unwrap() ); - assert!(reader - .metadata() - .unencoded_byte_array_data_bytes() - .is_some()); - let unenc_sizes = reader.metadata().unencoded_byte_array_data_bytes().unwrap(); - assert_eq!(unenc_sizes.len(), 1); - assert_eq!(unenc_sizes[0].len(), 1); - assert!(unenc_sizes[0][0].is_some()); - let page_sizes = unenc_sizes[0][0].as_ref().unwrap(); + assert!(reader.metadata().offset_index().is_some()); + let offset_index = reader.metadata().offset_index().unwrap(); + assert_eq!(offset_index.len(), 1); + assert_eq!(offset_index[0].len(), 1); + assert!(offset_index[0][0].unencoded_byte_array_data_bytes.is_some()); + let page_sizes = offset_index[0][0] + .unencoded_byte_array_data_bytes + .as_ref() + .unwrap(); assert_eq!(page_sizes.len(), 1); assert_eq!(page_sizes[0], unenc_size); } diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index cd124031cfdc..3e0f6ce3a8b3 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -89,12 +89,15 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { for (column_index, column_layout) in offset_index.iter().zip(&row_group_layout.columns) { assert_eq!( - column_index.len(), + column_index.page_locations.len(), column_layout.pages.len(), "index page count mismatch" ); - for (idx, (page, page_layout)) in - column_index.iter().zip(&column_layout.pages).enumerate() + for (idx, (page, page_layout)) in column_index + .page_locations + .iter() + .zip(&column_layout.pages) + .enumerate() { assert_eq!( page.compressed_page_size as usize, @@ -102,6 +105,7 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { "index page {idx} size mismatch" ); let next_first_row_index = column_index + .page_locations .get(idx + 1) 
.map(|x| x.first_row_index) .unwrap_or_else(|| row_group.num_rows()); From bf4dadb7d6698f5a0dec472618986c296ee083e0 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 24 Jul 2024 20:40:25 -0700 Subject: [PATCH 16/21] move valid test into ColumnIndexBuilder::append_histograms --- parquet/src/column/writer/mod.rs | 14 ++++++-------- parquet/src/file/metadata/mod.rs | 5 +++++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 054e087c72c8..f8bed10269a7 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -834,15 +834,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } - // update histograms - if self.column_index_builder.valid() { - self.column_index_builder.append_histograms( - &self.page_metrics.repetition_level_histogram, - &self.page_metrics.definition_level_histogram, - ); - } + // Append page histograms to the `ColumnIndex` histograms + self.column_index_builder.append_histograms( + &self.page_metrics.repetition_level_histogram, + &self.page_metrics.definition_level_histogram, + ); - // update the offset index + // Update the offset index self.offset_index_builder .append_row_count(self.page_metrics.num_buffered_rows as i64); diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index ab0dbcff2c6d..e95d645b7891 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1073,11 +1073,16 @@ impl ColumnIndexBuilder { self.null_counts.push(null_count); } + /// Append the given page-level histograms to the [`ColumnIndex`] histograms. + /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state. pub fn append_histograms( &mut self, repetition_level_histogram: &Option>, definition_level_histogram: &Option>, ) { + if !self.valid { + return; + } if let Some(ref rep_lvl_hist) = repetition_level_histogram { let hist = self.repetition_level_histograms.get_or_insert(Vec::new()); hist.reserve(rep_lvl_hist.len()); From 40024640a5ea2d584d24bb746f6a6b62a702badf Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 25 Jul 2024 13:32:33 -0700 Subject: [PATCH 17/21] move update_histogram() inside ColumnMetrics --- parquet/src/column/writer/mod.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index f8bed10269a7..75b4e990625d 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -194,17 +194,6 @@ fn new_histogram(max_level: i16) -> Option> { } } -/// Sum `page_histogram` into `chunk_histogram` -fn update_histogram(chunk_histogram: &mut Option>, page_histogram: &Option>) { - if page_histogram.is_some() && chunk_histogram.is_some() { - let chunk_hist = chunk_histogram.as_mut().unwrap(); - let page_hist = page_histogram.as_ref().unwrap(); - for i in 0..page_hist.len() { - chunk_hist[i] += page_hist[i] - } - } -} - // Metrics per page #[derive(Default)] struct PageMetrics { @@ -306,14 +295,25 @@ impl ColumnMetrics { self } + /// Sum `page_histogram` into `chunk_histogram` + fn update_histogram(chunk_histogram: &mut Option>, page_histogram: &Option>) { + if page_histogram.is_some() && chunk_histogram.is_some() { + let chunk_hist = chunk_histogram.as_mut().unwrap(); + let page_hist = page_histogram.as_ref().unwrap(); + for i in 0..page_hist.len() { + chunk_hist[i] += page_hist[i] + } + } + } + /// Sum the provided PageMetrics histograms into the chunk 
histograms. Does nothing if /// page histograms are not initialized. fn update_from_page_metrics(&mut self, page_metrics: &PageMetrics) { - update_histogram( + ColumnMetrics::::update_histogram( &mut self.definition_level_histogram, &page_metrics.definition_level_histogram, ); - update_histogram( + ColumnMetrics::::update_histogram( &mut self.repetition_level_histogram, &page_metrics.repetition_level_histogram, ); From a6353d176ed649cf4f80d21672acb45e73262a9c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 25 Jul 2024 17:34:48 -0400 Subject: [PATCH 18/21] Update parquet/src/column/writer/mod.rs Co-authored-by: Ed Seidl --- parquet/src/column/writer/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 643f34fbdd0f..2c0c957d87d3 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -298,7 +298,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { num_page_nulls: 0, }, column_metrics: ColumnMetrics::::new(), - column_index_builder: ColumnIndexBuilder::new(), + column_index_builder, offset_index_builder: OffsetIndexBuilder::new(), encodings, data_page_boundary_ascending: true, From 978264d49dc27225b570378e491ad69ae19f45cb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 26 Jul 2024 07:57:46 -0400 Subject: [PATCH 19/21] Implement LevelHistograms as a struct --- parquet/src/column/writer/mod.rs | 63 ++++--------- parquet/src/file/metadata/mod.rs | 149 ++++++++++++++++++++++++++++--- parquet/src/file/writer.rs | 12 +-- 3 files changed, 160 insertions(+), 64 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 40ae3b20f2b7..681267b7b491 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -33,7 +33,7 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::*; use crate::encodings::levels::LevelEncoder; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::{ColumnIndexBuilder, OffsetIndexBuilder}; +use crate::file::metadata::{ColumnIndexBuilder, LevelHistogram, OffsetIndexBuilder}; use crate::file::properties::EnabledStatistics; use crate::file::statistics::{Statistics, ValueStatistics}; use crate::file::{ @@ -183,25 +183,14 @@ pub struct ColumnCloseResult { pub offset_index: Option, } -/// Creates a vector to hold level histogram data. Length will be `max_level + 1`. -/// Because histograms are not necessary when `max_level == 0`, this will return -/// `None` in that case. 
-fn new_histogram(max_level: i16) -> Option> { - if max_level > 0 { - Some(vec![0; max_level as usize + 1]) - } else { - None - } -} - // Metrics per page #[derive(Default)] struct PageMetrics { num_buffered_values: u32, num_buffered_rows: u32, num_page_nulls: u64, - repetition_level_histogram: Option>, - definition_level_histogram: Option>, + repetition_level_histogram: Option, + definition_level_histogram: Option, } impl PageMetrics { @@ -211,50 +200,37 @@ impl PageMetrics { /// Initialize the repetition level histogram fn with_repetition_level_histogram(mut self, max_level: i16) -> Self { - self.repetition_level_histogram = new_histogram(max_level); + self.repetition_level_histogram = LevelHistogram::try_new(max_level); self } /// Initialize the definition level histogram fn with_definition_level_histogram(mut self, max_level: i16) -> Self { - self.definition_level_histogram = new_histogram(max_level); + self.definition_level_histogram = LevelHistogram::try_new(max_level); self } - /// Sets all elements of `histogram` to 0 - fn reset_histogram(histogram: &mut Option>) { - if let Some(ref mut hist) = histogram { - for v in hist { - *v = 0 - } - } - } - /// Resets the state of this `PageMetrics` to the initial state. /// If histograms have been initialized their contents will be reset to zero. fn new_page(&mut self) { self.num_buffered_values = 0; self.num_buffered_rows = 0; self.num_page_nulls = 0; - PageMetrics::reset_histogram(&mut self.repetition_level_histogram); - PageMetrics::reset_histogram(&mut self.definition_level_histogram); + self.repetition_level_histogram.as_mut().map(LevelHistogram::reset); + self.definition_level_histogram.as_mut().map(LevelHistogram::reset); } /// Updates histogram values using provided repetition levels fn update_repetition_level_histogram(&mut self, levels: &[i16]) { if let Some(ref mut rep_hist) = self.repetition_level_histogram { - for &level in levels { - rep_hist[level as usize] += 1; - } + rep_hist.update_from_levels(levels); } } /// Updates histogram values using provided definition levels fn update_definition_level_histogram(&mut self, levels: &[i16]) { if let Some(ref mut def_hist) = self.definition_level_histogram { - for &level in levels { - def_hist[level as usize] += 1; - } + def_hist.update_from_levels(levels); } } } @@ -274,8 +250,8 @@ struct ColumnMetrics { num_column_nulls: u64, column_distinct_count: Option, variable_length_bytes: Option, - repetition_level_histogram: Option>, - definition_level_histogram: Option>, + repetition_level_histogram: Option, + definition_level_histogram: Option, } impl ColumnMetrics { @@ -285,24 +261,23 @@ impl ColumnMetrics { /// Initialize the repetition level histogram fn with_repetition_level_histogram(mut self, max_level: i16) -> Self { - self.repetition_level_histogram = new_histogram(max_level); + self.repetition_level_histogram = LevelHistogram::try_new(max_level); self } /// Initialize the definition level histogram fn with_definition_level_histogram(mut self, max_level: i16) -> Self { - self.definition_level_histogram = new_histogram(max_level); + self.definition_level_histogram = LevelHistogram::try_new(max_level); self } /// Sum `page_histogram` into `chunk_histogram` - fn update_histogram(chunk_histogram: &mut Option>, page_histogram: &Option>) { - if page_histogram.is_some() && chunk_histogram.is_some() { - let chunk_hist = chunk_histogram.as_mut().unwrap(); - let page_hist = page_histogram.as_ref().unwrap(); - for i in 0..page_hist.len() { - chunk_hist[i] += page_hist[i] - } + fn 
update_histogram( + chunk_histogram: &mut Option, + page_histogram: &Option, + ) { + if let (Some(page_hist), Some(chunk_hist)) = (page_histogram, chunk_histogram) { + chunk_hist.add(page_hist); } } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index c6895fad5bbd..864746ef8a83 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -557,8 +557,114 @@ pub struct ColumnChunkMetaData { column_index_offset: Option, column_index_length: Option, unencoded_byte_array_data_bytes: Option, - repetition_level_histogram: Option>, - definition_level_histogram: Option>, + repetition_level_histogram: Option, + definition_level_histogram: Option, +} + +/// Histograms for repetition and definition levels. +/// +/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of +/// values at level `i`. +/// +/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the +/// number of rows with level 1, and so on. +/// +#[derive(Debug, Clone, PartialEq)] +pub struct LevelHistogram { + inner: Vec, +} + +impl LevelHistogram { + /// Creates a new level histogram data. + /// + /// Length will be `max_level + 1`. + /// + /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case) + pub fn try_new(max_level: i16) -> Option { + if max_level > 0 { + Some(Self { + inner: vec![0; max_level as usize + 1], + }) + } else { + None + } + } + /// Returns a reference to the the histogram's values. + pub fn values(&self) -> &[i64] { + &self.inner + } + + /// Return the inner vector, consuming self + pub fn into_inner(self) -> Vec { + self.inner + } + + /// Returns the histogram value at the given index. + /// + /// The value of `i` is the number of values with level `i`. For example, + /// `get(1)` returns the number of values with level 1. + /// + /// Returns `None` if the index is out of bounds. + pub fn get(&self, index: usize) -> Option { + self.inner.get(index).copied() + } + + /// Adds the values from the other histogram to this histogram + /// + /// # Panics + /// If the histograms have different lengths + pub fn add(&mut self, other: &Self) { + assert_eq!(self.len(), other.len()); + for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) { + *dst += src; + } + } + + /// return the length of the histogram + pub fn len(&self) -> usize { + self.inner.len() + } + + /// returns if the histogram is empty + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Sets the values of all histogram levels to 0. + pub fn reset(&mut self) { + for value in self.inner.iter_mut() { + *value = 0; + } + } + + /// Updates histogram values using provided repetition levels + /// + /// # Panics + /// if any of the levels is greater than the length of the histogram ( + /// the argument supplied to [`Self::try_new`]) + pub fn update_from_levels(&mut self, levels: &[i16]) { + for &level in levels { + self.inner[level as usize] += 1; + } + } +} + +impl From> for LevelHistogram { + fn from(inner: Vec) -> Self { + Self { inner } + } +} + +impl From for Vec { + fn from(value: LevelHistogram) -> Self { + value.into_inner() + } +} + +impl HeapSize for LevelHistogram { + fn heap_size(&self) -> usize { + self.inner.heap_size() + } } /// Represents common operations for a column chunk. @@ -724,7 +830,7 @@ impl ColumnChunkMetaData { /// The returned value `vec[i]` is how many values are at repetition level `i`. For example, /// `vec[0]` indicates how many rows the page contains. 
/// This field may not be set by older writers. - pub fn repetition_level_histogram(&self) -> Option<&Vec> { + pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> { self.repetition_level_histogram.as_ref() } @@ -733,7 +839,7 @@ impl ColumnChunkMetaData { /// The returned value `vec[i]` is how many values are at definition level `i`. For example, /// `vec[max_definition_level-1]` indicates how many non-null values are present in the page. /// This field may not be set by older writers. - pub fn definition_level_histogram(&self) -> Option<&Vec> { + pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> { self.definition_level_histogram.as_ref() } @@ -788,6 +894,9 @@ impl ColumnChunkMetaData { (None, None, None) }; + let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from); + let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from); + let result = ColumnChunkMetaData { column_descr, encodings, @@ -838,10 +947,20 @@ impl ColumnChunkMetaData { || self.repetition_level_histogram.is_some() || self.definition_level_histogram.is_some() { + let repetition_level_histogram = self + .repetition_level_histogram + .as_ref() + .map(|hist| hist.clone().into_inner()); + + let definition_level_histogram = self + .definition_level_histogram + .as_ref() + .map(|hist| hist.clone().into_inner()); + Some(SizeStatistics { unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes, - repetition_level_histogram: self.repetition_level_histogram.clone(), - definition_level_histogram: self.definition_level_histogram.clone(), + repetition_level_histogram, + definition_level_histogram, }) } else { None @@ -1023,13 +1142,13 @@ impl ColumnChunkMetaDataBuilder { } /// Sets optional repetition level histogram - pub fn set_repetition_level_histogram(mut self, value: Option>) -> Self { + pub fn set_repetition_level_histogram(mut self, value: Option) -> Self { self.0.repetition_level_histogram = value; self } /// Sets optional repetition level histogram - pub fn set_definition_level_histogram(mut self, value: Option>) -> Self { + pub fn set_definition_level_histogram(mut self, value: Option) -> Self { self.0.definition_level_histogram = value; self } @@ -1049,7 +1168,9 @@ pub struct ColumnIndexBuilder { max_values: Vec>, null_counts: Vec, boundary_order: BoundaryOrder, + /// contains the concatenation of the histograms of all pages repetition_level_histograms: Option>, + /// contains the concatenation of the histograms of all pages definition_level_histograms: Option>, /// Is the information in the builder valid? /// @@ -1099,8 +1220,8 @@ impl ColumnIndexBuilder { /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state. 
pub fn append_histograms( &mut self, - repetition_level_histogram: &Option>, - definition_level_histogram: &Option>, + repetition_level_histogram: &Option, + definition_level_histogram: &Option, ) { if !self.valid { return; @@ -1108,12 +1229,12 @@ impl ColumnIndexBuilder { if let Some(ref rep_lvl_hist) = repetition_level_histogram { let hist = self.repetition_level_histograms.get_or_insert(Vec::new()); hist.reserve(rep_lvl_hist.len()); - hist.extend(rep_lvl_hist); + hist.extend(rep_lvl_hist.values()); } if let Some(ref def_lvl_hist) = definition_level_histogram { let hist = self.definition_level_histograms.get_or_insert(Vec::new()); hist.reserve(def_lvl_hist.len()); - hist.extend(def_lvl_hist); + hist.extend(def_lvl_hist.values()); } } @@ -1358,8 +1479,8 @@ mod tests { .set_column_index_offset(Some(8000)) .set_column_index_length(Some(25)) .set_unencoded_byte_array_data_bytes(Some(2000)) - .set_repetition_level_histogram(Some(vec![100, 100])) - .set_definition_level_histogram(Some(vec![0, 200])) + .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100]))) + .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200]))) .build() .unwrap(); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index e27a414507db..f2e8f74a378c 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1895,7 +1895,7 @@ mod tests { assert_eq!(file_metadata.row_groups[0].columns.len(), 1); assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); - let check_def_hist = |def_hist: &Vec| { + let check_def_hist = |def_hist: &[i64]| { assert_eq!(def_hist.len(), 2); assert_eq!(def_hist[0], 3); assert_eq!(def_hist[1], 7); @@ -1931,7 +1931,7 @@ mod tests { assert!(column.definition_level_histogram().is_some()); assert!(column.repetition_level_histogram().is_none()); assert!(column.unencoded_byte_array_data_bytes().is_some()); - check_def_hist(column.definition_level_histogram().unwrap()); + check_def_hist(column.definition_level_histogram().unwrap().values()); assert_eq!( unenc_size, column.unencoded_byte_array_data_bytes().unwrap() @@ -2009,7 +2009,7 @@ mod tests { assert_eq!(file_metadata.row_groups[0].columns.len(), 1); assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); - let check_def_hist = |def_hist: &Vec| { + let check_def_hist = |def_hist: &[i64]| { assert_eq!(def_hist.len(), 4); assert_eq!(def_hist[0], 1); assert_eq!(def_hist[1], 1); @@ -2017,7 +2017,7 @@ mod tests { assert_eq!(def_hist[3], 7); }; - let check_rep_hist = |rep_hist: &Vec| { + let check_rep_hist = |rep_hist: &[i64]| { assert_eq!(rep_hist.len(), 2); assert_eq!(rep_hist[0], 5); assert_eq!(rep_hist[1], 5); @@ -2051,8 +2051,8 @@ mod tests { assert!(column.definition_level_histogram().is_some()); assert!(column.repetition_level_histogram().is_some()); assert!(column.unencoded_byte_array_data_bytes().is_none()); - check_def_hist(column.definition_level_histogram().unwrap()); - check_rep_hist(column.repetition_level_histogram().unwrap()); + check_def_hist(column.definition_level_histogram().unwrap().values()); + check_rep_hist(column.repetition_level_histogram().unwrap().values()); // check histogram in column index as well assert!(reader.metadata().column_index().is_some()); From 70af5ddee63f7018dc1a451e9877fd0f5dfaa9ab Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 26 Jul 2024 09:42:57 -0700 Subject: [PATCH 20/21] formatting --- parquet/src/column/writer/mod.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 681267b7b491..54d8fd3cc13e 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -216,8 +216,12 @@ impl PageMetrics { self.num_buffered_values = 0; self.num_buffered_rows = 0; self.num_page_nulls = 0; - self.repetition_level_histogram.as_mut().map(LevelHistogram::reset); - self.definition_level_histogram.as_mut().map(LevelHistogram::reset); + self.repetition_level_histogram + .as_mut() + .map(LevelHistogram::reset); + self.definition_level_histogram + .as_mut() + .map(LevelHistogram::reset); } /// Updates histogram values using provided repetition levels From e774bbe540bbc4b0fb8fd1b9c016bdaef3d1c666 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 26 Jul 2024 10:11:46 -0700 Subject: [PATCH 21/21] fix error in docs --- parquet/src/file/metadata/mod.rs | 2 +- parquet/src/file/page_index/index.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 864746ef8a83..cd3555de828c 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -837,7 +837,7 @@ impl ColumnChunkMetaData { /// Returns the definition level histogram. /// /// The returned value `vec[i]` is how many values are at definition level `i`. For example, - /// `vec[max_definition_level-1]` indicates how many non-null values are present in the page. + /// `vec[max_definition_level]` indicates how many non-null values are present in the page. /// This field may not be set by older writers. pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> { self.definition_level_histogram.as_ref() diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 197da2bb153b..68412572b5f2 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -44,7 +44,7 @@ pub struct PageIndex { /// Definition level histogram for the page /// /// `definition_level_histogram[i]` is a count of how many values are at definition level `i`. - /// For example, `definition_level_histogram[max_definition_level-1]` indicates how many + /// For example, `definition_level_histogram[max_definition_level]` indicates how many /// non-null values are present in the page. pub definition_level_histogram: Option>, }
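
Taken together, the last three patches in this series replace the bare `Vec<i64>` level histograms with the `LevelHistogram` wrapper added to `parquet/src/file/metadata/mod.rs`. A minimal usage sketch, assuming the series is applied and that the type remains exported as `parquet::file::metadata::LevelHistogram` (the level values below are made up for illustration):

    use parquet::file::metadata::LevelHistogram;

    fn main() {
        // A column with max_definition_level = 1 gets a two-bucket histogram
        // (bucket 0 counts nulls, bucket 1 counts non-null values);
        // try_new returns None when the max level is 0.
        let mut page_hist = LevelHistogram::try_new(1).expect("non-zero max level");
        page_hist.update_from_levels(&[0, 1, 1, 1]); // one null, three non-null values

        // Page-level histograms are accumulated into the chunk-level histogram.
        let mut chunk_hist = LevelHistogram::try_new(1).unwrap();
        chunk_hist.add(&page_hist);
        assert_eq!(chunk_hist.values(), &[1, 3]);
        assert_eq!(chunk_hist.get(1), Some(3));
    }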