Skip to content

Commit

Permalink
doc: document new method
Browse files Browse the repository at this point in the history
  • Loading branch information
wjones127 committed Jan 4, 2023
1 parent 39b25b2 commit 92c63de
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 43 deletions.
57 changes: 17 additions & 40 deletions python/deltalake/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,12 +448,12 @@ def get_add_actions_df(self, flatten: bool = False) -> pyarrow.RecordBatch:
Add actions represent the files that currently make up the table. This
data is a low-level representation parsed from the transaction log.
:param flatten: whether to use a flattened schema in output (default False).
When False, all partition values are in a single map column,
`partition_values`. When True, each partition column becomes its own
field, with the prefix `partition_` in the name.
:param flatten: whether to flatten the schema. Partition value columns are
given the prefix `partition.`, statistics (null_count, min, and max) are
given the prefixes `null_count.`, `min.`, and `max.`, and tags are given
the prefix `tags.`. Nested field names are concatenated with `.`.
:returns: a PyArrow Table containing the add action data.
:returns: a PyArrow RecordBatch containing the add action data.
Examples:
Expand All @@ -462,38 +462,15 @@ def get_add_actions_df(self, flatten: bool = False) -> pyarrow.RecordBatch:
>>> data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
>>> write_deltalake("tmp", data, partition_by=["x"])
>>> dt = DeltaTable("tmp")
>>> dt.get_add_actions_df()
pyarrow.Table
path: string
size_bytes: int64
modification_time: timestamp[ms]
data_change: bool
stats: string
partition_values: map<string, string>
child 0, entries: struct<key: string not null, value: string> not null
child 0, key: string not null
child 1, value: string
----
path: [["x=2/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet","x=3/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet","x=1/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet"]]
size_bytes: [[565,565,565]]
modification_time: [[1970-01-20 08:20:49.399,1970-01-20 08:20:49.399,1970-01-20 08:20:49.399]]
data_change: [[true,true,true]]
stats: [["{"numRecords": 1, "minValues": {"y": 5}, "maxValues": {"y": 5}, "nullCount": {"y": 0}}","{"numRecords": 1, "minValues": {"y": 6}, "maxValues": {"y": 6}, "nullCount": {"y": 0}}","{"numRecords": 1, "minValues": {"y": 4}, "maxValues": {"y": 4}, "nullCount": {"y": 0}}"]]
partition_values: [[keys:["x"]values:["2"],keys:["x"]values:["3"],keys:["x"]values:["1"]]]
>>> dt.get_add_actions_df(flatten=True)
pyarrow.Table
path: string
size_bytes: int64
modification_time: timestamp[ms]
data_change: bool
stats: string
partition_x: string
----
path: [["x=2/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet","x=3/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet","x=1/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet"]]
size_bytes: [[565,565,565]]
modification_time: [[1970-01-20 08:20:49.399,1970-01-20 08:20:49.399,1970-01-20 08:20:49.399]]
data_change: [[true,true,true]]
stats: [["{"numRecords": 1, "minValues": {"y": 5}, "maxValues": {"y": 5}, "nullCount": {"y": 0}}","{"numRecords": 1, "minValues": {"y": 6}, "maxValues": {"y": 6}, "nullCount": {"y": 0}}","{"numRecords": 1, "minValues": {"y": 4}, "maxValues": {"y": 4}, "nullCount": {"y": 0}}"]]
partition_x: [["2","3","1"]]
"""
return self._table.get_add_actions_df(flatten, parse_stats)
>>> dt.get_add_actions_df().to_pandas()
path size_bytes modification_time data_change partition_values num_records null_count min max
0 x=2/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 2} 1 {'y': 0} {'y': 5} {'y': 5}
1 x=3/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 3} 1 {'y': 0} {'y': 6} {'y': 6}
2 x=1/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 1} 1 {'y': 0} {'y': 4} {'y': 4}
>>> dt.get_add_actions_df(flatten=True).to_pandas()
path size_bytes modification_time data_change partition.x num_records null_count.y min.y max.y
0 x=2/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 2 1 0 5 5
1 x=3/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 3 1 0 6 6
2 x=1/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 1 1 0 4 4
"""
return self._table.get_add_actions_df(flatten)
1 change: 1 addition & 0 deletions rust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
#![deny(warnings)]
#![deny(missing_docs)]
#![allow(rustdoc::invalid_html_tags)]

#[cfg(all(feature = "parquet", feature = "parquet2"))]
compile_error!(
Expand Down
32 changes: 30 additions & 2 deletions rust/src/table_state_arrow.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
//! Methods to get Delta Table state in Arrow structures
//!
//! See [crate::table_state::DeltaTableState].
use crate::action::{ColumnCountStat, ColumnValueStat, Stats};
use crate::table_state::DeltaTableState;
Expand All @@ -21,7 +23,33 @@ use std::collections::{HashMap, HashSet, VecDeque};
use std::sync::Arc;

impl DeltaTableState {
/// Get the add actions as a RecordBatch
/// Get an [arrow::record_batch::RecordBatch] containing add action data.
///
/// # Arguments
///
/// * `flatten` - whether to flatten the schema. Partition value columns are
///   given the prefix `partition.`, statistics (null_count, min, and max) are
///   given the prefixes `null_count.`, `min.`, and `max.`, and tags are given
///   the prefix `tags.`. Nested field names are concatenated with `.`.
///
/// # Data schema
///
/// Each row represents a file that is a part of the selected table's state.
///
/// * `path` (String): relative or absolute to a file.
/// * `size_bytes` (Int64): size of file in bytes.
/// * `modification_time` (Millisecond Timestamp): time the file was created.
/// * `data_change` (Boolean): false if the file represents data moved from
///   other files in the same transaction.
/// * `partition.{partition column name}` (matches column type): value of
/// partition the file corresponds to.
/// * `null_count.{col_name}` (Int64): number of null values for column in
/// this file.
/// * `min.{col_name}` (matches column type): minimum value of column in file
/// (if available).
/// * `max.{col_name}` (matches column type): maximum value of column in file
/// (if available).
/// * `tags.{tag_key}` (String): value of a metadata tag for the file.
pub fn add_actions_table(
&self,
flatten: bool,
Expand Down Expand Up @@ -160,7 +188,7 @@ impl DeltaTableState {
.into_iter()
.zip(metadata.partition_columns.iter())
.map(|(array, name)| {
let name: Cow<str> = Cow::Owned(format!("partition_{}", name));
let name: Cow<str> = Cow::Owned(format!("partition.{}", name));
(name, array)
})
.collect()
Expand Down
2 changes: 1 addition & 1 deletion rust/tests/add_actions_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ async fn test_with_partitions() {
1627990384000, 1627990384000
]))),
("data_change", Arc::new(array::BooleanArray::from(vec![true, true]))),
("partition_k", Arc::new(array::StringArray::from(vec![Some("A"), None]))),
("partition.k", Arc::new(array::StringArray::from(vec![Some("A"), None]))),
];
let expected = RecordBatch::try_from_iter(expected_columns.clone()).unwrap();

Expand Down

0 comments on commit 92c63de

Please sign in to comment.