Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added serde support for RowGroupMetaData. #202

Merged
merged 2 commits into from
Nov 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ flate2 = { version = "^1.0", optional = true, default-features = false }
lz4 = { version = "1.23.3", optional = true }
zstd = { version = "^0.11", optional = true, default-features = false }
lz4_flex = { version = "^0.9.2", optional = true }
serde = { version = "^1.0", optional = true }
serde_derive = { version = "^1.0", optional = true }

xxhash-rust = { version="0.8.3", optional = true, features = ["xxh64"] }

Expand All @@ -48,6 +50,7 @@ snappy = ["snap"]
gzip = ["flate2/rust_backend"]
gzip_zlib_ng = ["flate2/zlib-ng"]
bloom_filter = ["xxhash-rust"]
serde_types = ["serde", "serde_derive"]

[[bench]]
name = "decode_bitpacking"
Expand Down
51 changes: 51 additions & 0 deletions src/metadata/column_chunk_metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,66 @@ use crate::error::{Error, Result};
use crate::schema::types::PhysicalType;
use crate::statistics::{deserialize_statistics, Statistics};

#[cfg(feature = "serde_types")]
youngsofun marked this conversation as resolved.
Show resolved Hide resolved
mod serde_types {
pub use parquet_format_safe::thrift::protocol::{
TCompactInputProtocol, TCompactOutputProtocol,
};
pub use serde::de::Error as DeserializeError;
pub use serde::ser::Error as SerializeError;
pub use serde::{Deserialize, Deserializer, Serializer};
pub use serde_derive::{Deserialize, Serialize};
pub use std::io::Cursor;
}
#[cfg(feature = "serde_types")]
use serde_types::*;

/// Metadata for a column chunk.
// This contains the `ColumnDescriptor` associated with the chunk so that deserializers have
// access to the descriptor (e.g. physical, converted, logical).
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub struct ColumnChunkMetaData {
#[cfg_attr(
feature = "serde_types",
serde(serialize_with = "serialize_column_chunk")
)]
#[cfg_attr(
feature = "serde_types",
serde(deserialize_with = "deserialize_column_chunk")
)]
column_chunk: ColumnChunk,
column_descr: ColumnDescriptor,
}

#[cfg(feature = "serde_types")]
fn serialize_column_chunk<S>(
column_chunk: &ColumnChunk,
serializer: S,
) -> std::result::Result<S::Ok, S::Error>
where
S: Serializer,
{
let mut buf = vec![];
let cursor = Cursor::new(&mut buf[..]);
let mut protocol = TCompactOutputProtocol::new(cursor);
column_chunk
.write_to_out_protocol(&mut protocol)
.map_err(S::Error::custom)?;
serializer.serialize_bytes(&buf)
}

#[cfg(feature = "serde_types")]
fn deserialize_column_chunk<'de, D>(deserializer: D) -> std::result::Result<ColumnChunk, D::Error>
where
D: Deserializer<'de>,
{
let buf = Vec::<u8>::deserialize(deserializer)?;
let mut cursor = Cursor::new(&buf[..]);
let mut protocol = TCompactInputProtocol::new(&mut cursor, usize::MAX);
ColumnChunk::read_from_in_protocol(&mut protocol).map_err(D::Error::custom)
}

// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
/// Returns a new [`ColumnChunkMetaData`]
Expand Down
4 changes: 4 additions & 0 deletions src/metadata/column_descriptor.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
use crate::schema::types::{ParquetType, PrimitiveType};
#[cfg(feature = "serde_types")]
use serde_derive::{Deserialize, Serialize};

/// A descriptor of a parquet column. It contains the necessary information to deserialize
/// a parquet column.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub struct Descriptor {
/// The [`PrimitiveType`] of this column
pub primitive_type: PrimitiveType,
Expand All @@ -18,6 +21,7 @@ pub struct Descriptor {
/// This encapsulates information such as definition and repetition levels and is used to
/// re-assemble nested data.
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub struct ColumnDescriptor {
/// The descriptor this columns' leaf.
pub descriptor: Descriptor,
Expand Down
3 changes: 3 additions & 0 deletions src/metadata/column_order.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
use super::sort::SortOrder;
#[cfg(feature = "serde_types")]
use serde_derive::{Deserialize, Serialize};

/// Column order that specifies what method was used to aggregate min/max values for
/// statistics.
///
/// If column order is undefined, then it is the legacy behaviour and all values should
/// be compared as signed values/bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub enum ColumnOrder {
/// Column uses the order defined by its logical or physical type
/// (if there is no logical type), parquet-format 2.4.0+.
Expand Down
3 changes: 3 additions & 0 deletions src/metadata/row_metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@ use crate::{
error::{Error, Result},
write::ColumnOffsetsMetadata,
};
#[cfg(feature = "serde_types")]
use serde_derive::{Deserialize, Serialize};

/// Metadata for a row group.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub struct RowGroupMetaData {
columns: Vec<ColumnChunkMetaData>,
num_rows: usize,
Expand Down
4 changes: 4 additions & 0 deletions src/metadata/schema_descriptor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@ use crate::{error::Result, schema::types::FieldInfo};

use super::column_descriptor::{ColumnDescriptor, Descriptor};

#[cfg(feature = "serde_types")]
use serde_derive::{Deserialize, Serialize};

/// A schema descriptor. This encapsulates the top-level schemas for all the columns,
/// as well as all descriptors for all the primitive columns.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub struct SchemaDescriptor {
name: String,
// The top-level schema (the "message" type).
Expand Down
4 changes: 4 additions & 0 deletions src/metadata/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ use crate::schema::types::{
IntegerType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType,
};

#[cfg(feature = "serde_types")]
use serde_derive::{Deserialize, Serialize};

/// Sort order for page and column statistics.
///
/// Types are associated with sort orders and column stats are aggregated using a sort
Expand All @@ -11,6 +14,7 @@ use crate::schema::types::{
/// See reference in
/// <https://github.com/apache/parquet-cpp/blob/master/src/parquet/types.h>
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub enum SortOrder {
/// Signed (either value or legacy byte-wise) comparison.
Signed,
Expand Down
7 changes: 7 additions & 0 deletions src/parquet_bridge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@ use super::thrift_format::{
};

use crate::error::Error;
#[cfg(feature = "serde_types")]
use serde_derive::{Deserialize, Serialize};

/// The repetition of a parquet field
#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub enum Repetition {
/// When the field has no null values
Required,
Expand Down Expand Up @@ -433,6 +436,7 @@ impl DataPageHeaderExt for DataPageHeaderV2 {
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub enum TimeUnit {
Milliseconds,
Microseconds,
Expand Down Expand Up @@ -461,6 +465,7 @@ impl From<TimeUnit> for ParquetTimeUnit {

/// Enum of all valid logical integer types
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub enum IntegerType {
Int8,
Int16,
Expand All @@ -473,6 +478,7 @@ pub enum IntegerType {
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub enum PrimitiveLogicalType {
String,
Enum,
Expand All @@ -494,6 +500,7 @@ pub enum PrimitiveLogicalType {
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub enum GroupLogicalType {
Map,
List,
Expand Down
4 changes: 4 additions & 0 deletions src/schema/types/basic_type.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
use super::super::Repetition;

#[cfg(feature = "serde_types")]
use serde_derive::{Deserialize, Serialize};

/// Common type information.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub struct FieldInfo {
/// The field name
pub name: String,
Expand Down
4 changes: 4 additions & 0 deletions src/schema/types/converted_type.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
use crate::error::Error;
use parquet_format_safe::ConvertedType;
#[cfg(feature = "serde_types")]
use serde_derive::{Deserialize, Serialize};

#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub enum PrimitiveConvertedType {
Utf8,
/// an enum is converted into a binary field
Expand Down Expand Up @@ -89,6 +92,7 @@ pub enum PrimitiveConvertedType {
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub enum GroupConvertedType {
/// a map is converted as an optional field containing a repeated key/value pair
Map,
Expand Down
4 changes: 4 additions & 0 deletions src/schema/types/parquet_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@ use super::{
spec, FieldInfo, GroupConvertedType, GroupLogicalType, PhysicalType, PrimitiveConvertedType,
PrimitiveLogicalType,
};
#[cfg(feature = "serde_types")]
use serde_derive::{Deserialize, Serialize};

/// The complete description of a parquet column
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub struct PrimitiveType {
/// The fields' generic information
pub field_info: FieldInfo,
Expand Down Expand Up @@ -41,6 +44,7 @@ impl PrimitiveType {
/// Representation of a Parquet type describing primitive and nested fields,
/// including the top-level schema of the parquet file.
#[derive(Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub enum ParquetType {
PrimitiveType(PrimitiveType),
GroupType {
Expand Down
3 changes: 3 additions & 0 deletions src/schema/types/physical_type.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
use parquet_format_safe::Type;

use crate::error::Error;
#[cfg(feature = "serde_types")]
use serde_derive::{Deserialize, Serialize};

/// The set of all physical types representable in Parquet
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))]
pub enum PhysicalType {
Boolean,
Int32,
Expand Down