Skip to content

Commit

Permalink
doc: document new method
Browse files Browse the repository at this point in the history
  • Loading branch information
wjones127 committed Jan 4, 2023
1 parent 39b25b2 commit 92c63de
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 43 deletions.
57 changes: 17 additions & 40 deletions python/deltalake/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,12 +448,12 @@ def get_add_actions_df(self, flatten: bool = False) -> pyarrow.RecordBatch:
Add actions represent the files that currently make up the table. This
data is a low-level representation parsed from the transaction log.
:param flatten: whether to use a flattened schema in output (default False).
When False, all partition values are in a single map column,
`partition_values`. When True, each partition column becomes its own
field, with the prefix `partition_` in the name.
:param flatten: whether to flatten the schema. Partition value columns are
given the prefix `partition.`, statistics (null_count, min, and max) are
given the prefixes `null_count.`, `min.`, and `max.`, and tags are given
the prefix `tags.`. Nested field names are concatenated with `.`.
:returns: a PyArrow Table containing the add action data.
:returns: a PyArrow RecordBatch containing the add action data.
Examples:
Expand All @@ -462,38 +462,15 @@ def get_add_actions_df(self, flatten: bool = False) -> pyarrow.RecordBatch:
>>> data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
>>> write_deltalake("tmp", data, partition_by=["x"])
>>> dt = DeltaTable("tmp")
>>> dt.get_add_actions_df()
pyarrow.Table
path: string
size_bytes: int64
modification_time: timestamp[ms]
data_change: bool
stats: string
partition_values: map<string, string>
child 0, entries: struct<key: string not null, value: string> not null
child 0, key: string not null
child 1, value: string
----
path: [["x=2/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet","x=3/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet","x=1/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet"]]
size_bytes: [[565,565,565]]
modification_time: [[1970-01-20 08:20:49.399,1970-01-20 08:20:49.399,1970-01-20 08:20:49.399]]
data_change: [[true,true,true]]
stats: [["{"numRecords": 1, "minValues": {"y": 5}, "maxValues": {"y": 5}, "nullCount": {"y": 0}}","{"numRecords": 1, "minValues": {"y": 6}, "maxValues": {"y": 6}, "nullCount": {"y": 0}}","{"numRecords": 1, "minValues": {"y": 4}, "maxValues": {"y": 4}, "nullCount": {"y": 0}}"]]
partition_values: [[keys:["x"]values:["2"],keys:["x"]values:["3"],keys:["x"]values:["1"]]]
>>> dt.get_add_actions_df(flatten=True)
pyarrow.Table
path: string
size_bytes: int64
modification_time: timestamp[ms]
data_change: bool
stats: string
partition_x: string
----
path: [["x=2/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet","x=3/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet","x=1/0-6e6952dd-ed03-4488-810e-ebe76a1a1200-0.parquet"]]
size_bytes: [[565,565,565]]
modification_time: [[1970-01-20 08:20:49.399,1970-01-20 08:20:49.399,1970-01-20 08:20:49.399]]
data_change: [[true,true,true]]
stats: [["{"numRecords": 1, "minValues": {"y": 5}, "maxValues": {"y": 5}, "nullCount": {"y": 0}}","{"numRecords": 1, "minValues": {"y": 6}, "maxValues": {"y": 6}, "nullCount": {"y": 0}}","{"numRecords": 1, "minValues": {"y": 4}, "maxValues": {"y": 4}, "nullCount": {"y": 0}}"]]
partition_x: [["2","3","1"]]
"""
return self._table.get_add_actions_df(flatten, parse_stats)
>>> dt.get_add_actions_df().to_pandas()
path size_bytes modification_time data_change partition_values num_records null_count min max
0 x=2/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 2} 1 {'y': 0} {'y': 5} {'y': 5}
1 x=3/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 3} 1 {'y': 0} {'y': 6} {'y': 6}
2 x=1/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 1} 1 {'y': 0} {'y': 4} {'y': 4}
>>> dt.get_add_actions_df(flatten=True).to_pandas()
path size_bytes modification_time data_change partition.x num_records null_count.y min.y max.y
0 x=2/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 2 1 0 5 5
1 x=3/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 3 1 0 6 6
2 x=1/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 1 1 0 4 4
"""
return self._table.get_add_actions_df(flatten)
1 change: 1 addition & 0 deletions rust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
#![deny(warnings)]
#![deny(missing_docs)]
#![allow(rustdoc::invalid_html_tags)]

#[cfg(all(feature = "parquet", feature = "parquet2"))]
compile_error!(
Expand Down
32 changes: 30 additions & 2 deletions rust/src/table_state_arrow.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
//! Methods to get Delta Table state in Arrow structures
//!
//! See [crate::table_state::DeltaTableState].
use crate::action::{ColumnCountStat, ColumnValueStat, Stats};
use crate::table_state::DeltaTableState;
Expand All @@ -21,7 +23,33 @@ use std::collections::{HashMap, HashSet, VecDeque};
use std::sync::Arc;

impl DeltaTableState {
/// Get the add actions as a RecordBatch
/// Get an [arrow::record_batch::RecordBatch] containing add action data.
///
/// # Arguments
///
/// * `flatten` - whether to flatten the schema. Partition value columns are
///   given the prefix `partition.`, statistics (null_count, min, and max) are
///   given the prefixes `null_count.`, `min.`, and `max.`, and tags are given
///   the prefix `tags.`. Nested field names are concatenated with `.`.
///
/// # Data schema
///
/// Each row represents a file that is a part of the selected table's state.
///
/// * `path` (String): relative or absolute to a file.
/// * `size_bytes` (Int64): size of file in bytes.
/// * `modification_time` (Millisecond Timestamp): time the file was created.
/// * `data_change` (Boolean): false if the file represents data moved from
///   other files in the same transaction.
/// * `partition.{partition column name}` (matches column type): value of
/// partition the file corresponds to.
/// * `null_count.{col_name}` (Int64): number of null values for column in
/// this file.
/// * `min.{col_name}` (matches column type): minimum value of column in file
/// (if available).
/// * `max.{col_name}` (matches column type): maximum value of column in file
/// (if available).
/// * `tags.{tag_key}` (String): value of a metadata tag for the file.
pub fn add_actions_table(
&self,
flatten: bool,
Expand Down Expand Up @@ -160,7 +188,7 @@ impl DeltaTableState {
.into_iter()
.zip(metadata.partition_columns.iter())
.map(|(array, name)| {
let name: Cow<str> = Cow::Owned(format!("partition_{}", name));
let name: Cow<str> = Cow::Owned(format!("partition.{}", name));
(name, array)
})
.collect()
Expand Down
2 changes: 1 addition & 1 deletion rust/tests/add_actions_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ async fn test_with_partitions() {
1627990384000, 1627990384000
]))),
("data_change", Arc::new(array::BooleanArray::from(vec![true, true]))),
("partition_k", Arc::new(array::StringArray::from(vec![Some("A"), None]))),
("partition.k", Arc::new(array::StringArray::from(vec![Some("A"), None]))),
];
let expected = RecordBatch::try_from_iter(expected_columns.clone()).unwrap();

Expand Down

0 comments on commit 92c63de

Please sign in to comment.