From 2d5d9a4061ba15b397e51e8685d95ef43074336c Mon Sep 17 00:00:00 2001 From: Manish Date: Sat, 9 Oct 2021 19:51:40 +0200 Subject: [PATCH 1/2] Record Batch support for delta log stats_parsed (#435) --- rust/src/action.rs | 181 +++++++++++++++++- .../_delta_log/00000000000000000000.crc | 1 + .../_delta_log/00000000000000000000.json | 3 + .../_delta_log/00000000000000000001.crc | 1 + .../_delta_log/00000000000000000001.json | 2 + .../00000000000000000002.checkpoint.parquet | Bin 0 -> 12792 bytes .../_delta_log/00000000000000000002.crc | 1 + .../_delta_log/00000000000000000002.json | 9 + .../00000000000000000003.checkpoint.parquet | Bin 0 -> 14005 bytes .../_delta_log/00000000000000000003.crc | 1 + .../_delta_log/00000000000000000003.json | 10 + .../_delta_log/_last_checkpoint | 1 + ...4520-a078-12f3357a77f5-c000.snappy.parquet | Bin 0 -> 514 bytes ...45fd-843d-22dc39b6486a-c000.snappy.parquet | Bin 0 -> 552 bytes ...4f3d-a188-8317bba46a58-c000.snappy.parquet | Bin 0 -> 514 bytes ...4d8c-8595-727f54980600-c000.snappy.parquet | Bin 0 -> 514 bytes ...4ce8-8aa2-533924d86f26-c000.snappy.parquet | Bin 0 -> 521 bytes ...4004-8e65-18c44af19d07-c000.snappy.parquet | Bin 0 -> 514 bytes ...49e5-9c4f-e8e9760cf2c2-c000.snappy.parquet | Bin 0 -> 514 bytes ...4739-bf99-7ece54e88b8d-c000.snappy.parquet | Bin 0 -> 514 bytes ...4c20-84ce-ce06c9007223-c000.snappy.parquet | Bin 0 -> 521 bytes rust/tests/write_exploration.rs | 2 +- 22 files changed, 210 insertions(+), 2 deletions(-) create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000000.crc create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000000.json create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000001.crc create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000001.json create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000002.checkpoint.parquet create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000002.crc create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000002.json create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.checkpoint.parquet create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.crc create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.json create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/_last_checkpoint create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/part-00000-5d345c6a-525d-4520-a078-12f3357a77f5-c000.snappy.parquet create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/part-00000-e5b7d6de-4830-45fd-843d-22dc39b6486a-c000.snappy.parquet create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/part-00001-f9dc9a5d-8503-4f3d-a188-8317bba46a58-c000.snappy.parquet create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/part-00002-930505f3-0d84-4d8c-8595-727f54980600-c000.snappy.parquet create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/part-00003-5bafc8ea-a7a9-4ce8-8aa2-533924d86f26-c000.snappy.parquet create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/part-00004-19ec0ebc-a688-4004-8e65-18c44af19d07-c000.snappy.parquet create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/part-00005-e1389522-dd44-49e5-9c4f-e8e9760cf2c2-c000.snappy.parquet create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/part-00006-a928894d-b601-4739-bf99-7ece54e88b8d-c000.snappy.parquet create mode 100644 rust/tests/data/simple_table_with_stats_parsed_optimized/part-00007-b41961a2-330a-4c20-84ce-ce06c9007223-c000.snappy.parquet diff --git a/rust/src/action.rs b/rust/src/action.rs index 285cf6100a..401d3d64cc 100644 --- a/rust/src/action.rs +++ b/rust/src/action.rs @@ -7,8 +7,13 @@ use percent_encoding::percent_decode; use serde::{Deserialize, Serialize}; use serde_json::{Map, Value}; use std::collections::HashMap; - +use arrow::record_batch::RecordBatch; use super::schema::*; +use std::sync::Arc; +use std::collections::hash_map::Entry; +use arrow::array::ArrayRef; +use arrow::array::Int64Array; +use arrow::datatypes::Field; /// Error returned when an invalid Delta log action is encountered. #[derive(thiserror::Error, Debug)] @@ -150,6 +155,21 @@ pub struct StatsParsed { pub null_count: HashMap, } +/// RecordBatch representation of the stats from parquet +#[derive(Debug)] +pub struct StatRecordBatch { + /// Number of records in the file associated with the log action. + pub num_records: DeltaDataTypeLong, + + // start of per column stats + /// Contains a value smaller than all values present in the file for all columns. + pub min_values: RecordBatch, + /// Contains a value larger than all values present in the file for all columns. + pub max_values: RecordBatch, + /// The number of null values for all columns. + pub null_counts: RecordBatch, +} + /// Delta log action that describes a parquet data file that is part of the table. #[derive(Serialize, Deserialize, Clone, Debug, Default)] #[serde(rename_all = "camelCase")] @@ -302,6 +322,113 @@ impl Add { .map_or(Ok(None), |s| serde_json::from_str(s)) } + fn get_stat_row_rb(&self, row: &parquet::record::Row, cols: &[String]) -> arrow::error::Result { + let mut stat_col_map: HashMap<&String, Vec>> = HashMap::new(); + for col in cols { + let data: Vec> = vec![]; + stat_col_map.insert(col, data); + } + for (_, (name, value)) in row.get_column_iter().enumerate() { + for col in cols { + if name == col { + match stat_col_map.entry(name) { + Entry::Occupied(mut e) => { + let val = value.to_string(); // all values of type String for now + e.get_mut().push(Some(val)); + } + Entry::Vacant(_) => {} + } + } + } + } + // now we have column name and corresponding vector inside a hashmap + // we can create a record batch out of it + let mut rb_vec: Vec<(&str, ArrayRef)> = vec![]; + for col in cols { + match stat_col_map.entry(col) { + Entry::Occupied(e) => { + // TODO: i64s only for now. should support all possible column types + let my_vec: Vec = e.get().iter().map(|v| { + v.as_ref().unwrap().parse::().unwrap() + }).collect::>(); + let a : ArrayRef = Arc::new(Int64Array::from(my_vec)); + rb_vec.push( + (col.as_str(), a) + ) + } + Entry::Vacant(_) => {} + } + } + RecordBatch::try_from_iter(rb_vec) + } + + /// The stats_parsed can be converted into the record batches and the original mem-heavy data structure + /// can eventually be eliminated entirely. + pub fn get_stats_as_record_batch(&self, batch_columns: &[String]) -> Result, parquet::errors::ParquetError> { + self.stats_parsed.as_ref().map_or(Ok(None), |record| { + let mut min_val_vec = vec![]; + let mut max_val_vec = vec![]; + let mut num_records = 0; + for (i, (name, _)) in record.get_column_iter().enumerate() { + match name.as_str() { + "numRecords" => match record.get_long(i) { + Ok(v) => { + num_records = v; + } + _ => { + log::error!("Expect type of stats_parsed field numRecords to be long, got: {}", record); + } + } + "minValues" => match record.get_group(i) { + Ok(row) => { + min_val_vec.push( + self.get_stat_row_rb(row, batch_columns).unwrap() + ); + } + _ => { + log::error!("Expect type of stats_parsed field minRecords to be struct, got: {}", record); + } + } + "maxValues" => match record.get_group(i) { + Ok(row) => { + max_val_vec.push( + self.get_stat_row_rb(row, batch_columns).unwrap() + ); + } + _ => { + log::error!("Expect type of stats_parsed field maxRecords to be struct, got: {}", record); + } + } + _ => { + log::warn!( + "Unexpected field name `{}` for stats_parsed: {:?}", + name, + record, + ); + } + } + } + + let schema = Arc::new(self.new_schema(batch_columns)); + let stats = StatRecordBatch{ + num_records, + min_values: RecordBatch::concat(&schema, &min_val_vec).unwrap(), + max_values: RecordBatch::concat(&schema, &max_val_vec).unwrap(), + null_counts: RecordBatch::new_empty(schema) + }; + Ok(Some(stats)) + }) + } + + /// Creates the schema for the Record Batch for the columns provided + fn new_schema(&self, batch_columns: &[String]) -> arrow::datatypes::Schema { + let mut fields: Vec = vec![]; + for col_name in batch_columns { + fields.push(Field::new(col_name, arrow::datatypes::DataType::Int64, false)) + } + arrow::datatypes::Schema::new(fields) + } + /// Returns the composite HashMap representation of stats contained in the action if present. /// Since stats are defined as optional in the protocol, this may be None. pub fn get_stats_parsed(&self) -> Result, parquet::errors::ParquetError> { @@ -892,6 +1019,58 @@ mod tests { assert_eq!(add_action.stats, None); } + #[test] + fn test_load_table_stats_as_record_batch() { + let path = "./tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.checkpoint.parquet"; + let preader = SerializedFileReader::new(File::open(path).unwrap()).unwrap(); + let mut iter = preader.get_row_iter(None).unwrap(); + let record = iter.nth(3).unwrap(); + let add_record = record.get_group(1).unwrap(); + let action = Add::from_parquet_record(&add_record).unwrap(); + + // Verify normal stats + let stats = action.get_stats().unwrap().unwrap(); + assert_eq!(stats.num_records, 10); + + // Verify stats rb + let cols = vec!["id".to_string()]; + let schema = Arc::new(arrow::datatypes::Schema::new(vec![ + Field::new("id", arrow::datatypes::DataType::Int64, false), + ])); + + let stats_rb = action.get_stats_as_record_batch(&cols).unwrap().unwrap(); + assert_eq!(stats_rb.num_records, 10); + + let min_batch = stats_rb.min_values; + let min_arr = Int64Array::from(vec![1]); + let expected_min_batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(min_arr)] + ).unwrap(); + assert_eq!(min_batch.num_columns(), 1); + assert_eq!(min_batch.num_rows(), 1); + assert_eq!(min_batch, expected_min_batch); + + let max_batch = stats_rb.max_values; + let max_arr = Int64Array::from(vec![10]); + let expected_max_batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(max_arr)] + ).unwrap(); + assert_eq!(max_batch, expected_max_batch); + + // TODO + let null_count_batch = stats_rb.null_counts; + assert_eq!(null_count_batch.num_columns(), 1); + let expected_null_count_batch = RecordBatch::new_empty( + schema.clone() + ); + assert_eq!(null_count_batch.num_columns(), 1); + assert_eq!(null_count_batch.num_rows(), 0); + assert_eq!(null_count_batch, expected_null_count_batch); + + } + #[test] fn test_load_table_stats() { let action = Add { diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000000.crc b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000000.crc new file mode 100644 index 0000000000..40da51e092 --- /dev/null +++ b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"tableSizeBytes":0,"numFiles":0,"numMetadata":1,"numProtocol":1,"numTransactions":0} diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000000.json b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..2fd9b5b089 --- /dev/null +++ b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} +{"metaData":{"id":"86813337-feec-4527-886b-384bdd6f7f6a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1633794687045}} +{"commitInfo":{"timestamp":1633794687150,"userId":"906424503339340","userName":"contact@manishgill.com","operation":"CREATE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{}"},"notebook":{"notebookId":"2167422995023007"},"clusterId":"0604-105630-ajar222","isolationLevel":"SnapshotIsolation","isBlindAppend":true,"operationMetrics":{}}} diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000001.crc b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000001.crc new file mode 100644 index 0000000000..40da51e092 --- /dev/null +++ b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000001.crc @@ -0,0 +1 @@ +{"tableSizeBytes":0,"numFiles":0,"numMetadata":1,"numProtocol":1,"numTransactions":0} diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000001.json b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..c3e71f909e --- /dev/null +++ b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"metaData":{"id":"86813337-feec-4527-886b-384bdd6f7f6a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true"},"createdTime":1633794687045}} +{"commitInfo":{"timestamp":1633794691673,"userId":"906424503339340","userName":"contact@manishgill.com","operation":"SET TBLPROPERTIES","operationParameters":{"properties":"{\"delta.checkpoint.writeStatsAsStruct\":\"true\"}"},"notebook":{"notebookId":"2167422995023007"},"clusterId":"0604-105630-ajar222","readVersion":0,"isolationLevel":"SnapshotIsolation","isBlindAppend":true,"operationMetrics":{}}} diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000002.checkpoint.parquet b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000002.checkpoint.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0c3c92fadb2de481bcc755d8f7aea979eeedb063 GIT binary patch literal 12792 zcmds84{#LK8Q;BZl1oS+;JxjpMw=etjFP>%zk7SkX+=Z`gMlF_MO%mN?e4`JE_d4_d)6v$MT57FhBkfqKmNFC>tW0N|I#cV=ijlU8XdR|H$fzA#JI0pUXsPylZ+GAB z{Q>S`fZCg|xBK>ezwdkB_r33Z?|aAErR#!-L2al59eeG$U$%B34?H#@cbDX07^cbN zf~$v_wJu+8LHnD~-Uoj5&o&1uu6Nm}k}rw@_{79yBp8>1N=OWc1Bw_?Ljh5asJf`? z0ZENRk5DKqs?gP6$SR|wTm3+L%a~piY>*D*-ALYEzB)N1yHmW48k<;-s8eDHB5cvX;a%- zF7yo+-SO-<|Ja^D3$WcDVdW0!6qLf@SUeI7h69akpaoh&k{k&~BT;G9XW{iK5H^G_ zVSI_;OB5~xTNow=^2#WICSRL;8GOefZwti`Ivfme)CvHHu8bvS6Bd){zsJLIjU5Xv z@*w_G0-zpjMJHY8ogcmZ3iV%zvs~}m+q~Jak#)M7%WDPil2D&_B%NKaWX5!Oy3L!` zyi4Na(5P%FZ43sWH9MBcEX$2$i&odb__(m96fsx@F<2GxG=MOk2OFoBQI}W)4KrlH zqHZek{43lb?R|p`7aCtR$gt?mL9oaGzJo?E(EDH;cyvd{`*sX61FKm^GT!cD;G2n& z2TUZBJKVPRn&mFsF%A!0FmU7btU1qKz5m1S+Vc#i82Ol!8wVFWyMOTGJKkt=&NJ)1 z;I=66JgaO4FmrZKeKe23vztNd(T(V(?@T;P0F6~u;IJI8G14 z;2xMw1N_5&g?$RYcEdl6LGah3sXM>DgM=dszZKru`1TVtFJ4^@zwx4LOVKe6=&S?c z#G@0By$BJ}g=dz#3);yCSbrKq@x41A`g?;@SJrz0%Sb+0SJu7V3N7schbT-3AfC4r zqy|RV6TW}wnMdYW0n%JL+WN3XJj5;}DXO{S1!0;*DfIrkFFej#!+<#7BTI5H48c)M z>AH#&qnIemQc?`dk))=$%D0P;Br?5@4{$zyx4wkrW(f0pihvPv5tj zuJdN@+R{zX@Z71nS1;;AZ~Xh@JIxM_HCySWbpzwJZUyTGn0Qtber<4zecPl0);JOQ z*E>`HA?r7urqvP=0Z53T_4Pi?qTjOUp+npM;Btt3!)@N;)=}NN#9Jul$JC;?&znl? z8N9fB5yG}If?vV1(r0x`XN&rxN!bG^MD(>M}ll)3|W%ph2zY z4*VR-4Q5L{M;-SDc&t_X^Do<3v*JxZI&%N_o~GfxnjhihhxkzjjiTr8-~Myz%1vFu zLQT&U6~B73u5KF50dN1U`E*eqEGosq(!wC|m6|$%t!51i#FdU3ag|P>h0s`ote}(- zlF&b}oR&_dRRtpsrAMHd0nu7hae8*m zFhlhMnHw)0*MgxeH-V8`2Iue!WIfE$^n#jCkKz#< zZO)OdGqQM_;rkHi(%E4Hu7kSlvRr0tB#UvIu+MUtbfJjNhFzr@OfsTpiv~@*n#-or z!((~knDx3Ic7r);UdMBZI5pN5jIo$ga~V8n>{%V)YwL6cM!nvehgS0!aLA_3yr;9P z^OVo$?m}&yQO4cW*^4K&F`1Ez_wd*7*ZJHDDCc!{K_mfq2nfD`F2p&yTeF=K43rbx z3rVDt!Wj`L)xB&v*)braazW4_!GiREZVvyU+jk5o{+FtMP95Rhe_Q7wrm1sf1BhN8 z-0Yx+gBcmq5Wm6qa%;H`_QN)!CY(kiL}H_D;J@VCKc^h&E-0FU5jbLHtW6v%3Fp^Y z{%zKGS6iZvdQ@j0?D9D5;t-n*x0r?;)ybeD0y~&Zw*#}vf)n2IRkku!n7s~(3}ZPV zjK&ey37aIJ^1VJUQC6CAT4=I^XqGcAL{@?dv-I%0T>L(lZ?XfsT5UC_D$-6bI1?{w zx*mQn$N$OYn_hsqov~t6g~?Iacsi#`BUzS+HV(G)CmVf75r+K)6pWl^I&?AhDjCnD zlEv1s$U2UBMKEd)&vc3bCB+gKv zaSrL^rDueL{Cs^OfO99@l$iPf-Je7p!{oPb!Rivj`g$4f-%fHI{_IHEM42@x( zS$DvO-rKF!CggUn?R{p3V#jN|jBNLoN2H@bVl4|MNoD>*G>qwwR?zQQXvYAPzB~+I z!W|3C86U4CteOu|-p4B-cJ+`MjpMt?x`q%xz_0V|>M56Ix(+BBDm**}o2~$xTvVY` zJJ4foOfD+tf48rEDAi-3!SZfjIsJ*V(U%E*q5|Q-Ww!6o+|TmQLk;}l2H%0p615eU zYU>4?p4luvIcN8_#Z?AFvR2|wEBV#FZHvowc(@voGrOg3jY=3j31ec(@=!*b3!ga!kBGAmrpxp73Y48M|Es$vsGHc!^3 ziI@)C{fJa<_kFQSu+rUC6^EQ`;D6uX+ZRjJRH~|}6If>U>RC>0omx`+03EYrg5`h6 z`c5r@eBrdUEGL_ZguP@?t2j5A#0I-relNt11f&U8b;`a26;x?0i5~~Aq)AUN-LY2e zR0*qXEPp%eJ9s4wz%Fe83&&Fl^r5Rx16{J)bwD4w3ZU(p7lyWe(77U65@PfB!@0$2B;ka4!EeIG6Po8z$oGP!!l9Yw(UAZ{{`l}~%H zV2cRb1BzXXMsV?w64b2_TKqBUl@y3_9#KZ25%%{(^gPATIN7RKZCsYi7L|0iFaReh z#wkkeqsRMsDulN_>y#HuXNqH2S8ar3E|<|2?5k%yAF|DeYqFI5f@u@icc~P0JBPC* zOi}1q&Tz73myIb|y|$#k|FDkeYT-Mh6TO;EN{ifdP#0b%x)YwUOts9hvxcva-@%?a z#>qfk-cOIZwuFh=i&ceAdcj_%?(s{ z&>sx=gCbt>lKHf{sSxk(O^&5ATDO{#R5=t*$x$_&l;RrvNIn>ih4qN6hr&`YBCD}L aT$e+a!=Wwwpz=KbF|A(sg)ID^kN*NAL*-lm literal 0 HcmV?d00001 diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000002.crc b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000002.crc new file mode 100644 index 0000000000..0cc27b65d0 --- /dev/null +++ b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000002.crc @@ -0,0 +1 @@ +{"tableSizeBytes":4126,"numFiles":8,"numMetadata":1,"numProtocol":1,"numTransactions":0} diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000002.json b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..8d99d65719 --- /dev/null +++ b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000002.json @@ -0,0 +1,9 @@ +{"add":{"path":"part-00000-5d345c6a-525d-4520-a078-12f3357a77f5-c000.snappy.parquet","partitionValues":{},"size":514,"modificationTime":1633794713000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1},\"maxValues\":{\"id\":1},\"nullCount\":{\"id\":0}}","tags":{"INSERTION_TIME":"1633794713000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"add":{"path":"part-00001-f9dc9a5d-8503-4f3d-a188-8317bba46a58-c000.snappy.parquet","partitionValues":{},"size":514,"modificationTime":1633794713000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2},\"maxValues\":{\"id\":2},\"nullCount\":{\"id\":0}}","tags":{"INSERTION_TIME":"1633794713000001","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"add":{"path":"part-00002-930505f3-0d84-4d8c-8595-727f54980600-c000.snappy.parquet","partitionValues":{},"size":514,"modificationTime":1633794713000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3},\"maxValues\":{\"id\":3},\"nullCount\":{\"id\":0}}","tags":{"INSERTION_TIME":"1633794713000002","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"add":{"path":"part-00003-5bafc8ea-a7a9-4ce8-8aa2-533924d86f26-c000.snappy.parquet","partitionValues":{},"size":521,"modificationTime":1633794714000,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"id\":4},\"maxValues\":{\"id\":5},\"nullCount\":{\"id\":0}}","tags":{"INSERTION_TIME":"1633794713000003","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"add":{"path":"part-00004-19ec0ebc-a688-4004-8e65-18c44af19d07-c000.snappy.parquet","partitionValues":{},"size":514,"modificationTime":1633794713000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":6},\"maxValues\":{\"id\":6},\"nullCount\":{\"id\":0}}","tags":{"INSERTION_TIME":"1633794713000004","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"add":{"path":"part-00005-e1389522-dd44-49e5-9c4f-e8e9760cf2c2-c000.snappy.parquet","partitionValues":{},"size":514,"modificationTime":1633794713000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":7},\"maxValues\":{\"id\":7},\"nullCount\":{\"id\":0}}","tags":{"INSERTION_TIME":"1633794713000005","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"add":{"path":"part-00006-a928894d-b601-4739-bf99-7ece54e88b8d-c000.snappy.parquet","partitionValues":{},"size":514,"modificationTime":1633794713000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":8},\"maxValues\":{\"id\":8},\"nullCount\":{\"id\":0}}","tags":{"INSERTION_TIME":"1633794713000006","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"add":{"path":"part-00007-b41961a2-330a-4c20-84ce-ce06c9007223-c000.snappy.parquet","partitionValues":{},"size":521,"modificationTime":1633794713000,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"id\":9},\"maxValues\":{\"id\":10},\"nullCount\":{\"id\":0}}","tags":{"INSERTION_TIME":"1633794713000007","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"commitInfo":{"timestamp":1633794714030,"userId":"906424503339340","userName":"contact@manishgill.com","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"2167422995023007"},"clusterId":"0604-105630-ajar222","readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"8","numOutputBytes":"4126","numOutputRows":"10"}}} diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.checkpoint.parquet b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.checkpoint.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c6d600ac80eb4dabfc72efcc2c6b23f7c611b3d8 GIT binary patch literal 14005 zcmds84UiPY6`r}<+uef$54|4e$X0SWVLXu8+TYn(Ypa66gTq2ZI0+^pWOim2x7gb~ zclQutNTFi1Or$D=q=-@#gOM_X0Hq0*mPOFSNYFADqexMNBIQr~ixNX96iV~DXQpTN z2X6Mf^Cw$(b2B~N-+TSu>-S#2-dl9V;xJ-R8)`?#-giQv94jrC1h2_#^7^bV8FfU5v@mkQj?+G*OO4H8B#=)Mzp- z#biklRp=Qk<(0v~wLutr?La)b z?dOO7JJG=_%kV4%IOH#koG^{@gQr(UtI^4Qw;Tdt@bKtGAh?d6j%yb6&%b*9*Lzp= zU$J;zZ~uxVaL-Skzi>(K;{N&9EL;I^djah7Kr>uh+L;#C_uKt&KN%h?dv!odJv1Pe zK^nUIwm<*KTS2=uBEiy#$Kn#-9+slfL^75LN69h@0anx^>0^D1m_BG>+TqvHvxN<{ zLJNffN{_SXNfy1cd-orF4$)kDb9a7dV6m-e#U1l5f_=s55R693kT<M}{Nd>xUO2U_(8XMi`{CAl52)CEJbC8+ryP-j_rOYIj4Plyo}l>x*>Dnm zxMy%b6XW+=h7YznEX;#x@Z^dy>?dfhz`$<+LqIdpt8YAUy#s=ffVVI?e8nF;IpfaR z3N07)IoPs#0PL-;Xpa{?@vjGVI6Q;obwq`Pn2v>$Qdo(I(P&5!V`?M>5lYoXRS!vO zG89TgBGC~eR9uV3;wpr)cqFbtm;(?cl#s=6BomFs6G|eHiPr-GUZ3z7VWDfs%&M?3 zTF}Tm=0qwiW|EqkRDc~h9*T;w3`AWeEX$%C4JXoRB_=6xxgJjtn>sgciHON)C?1Mu zqGCvsV`5B`RX|F{#Y6-q6HCe=3F7}6P(2viYxw|=qnh?ZAf1vOSBT|8FacC;~;Md1@5y$EwNfz(7-rIMax2H}k z=zU@j@fld{OgrG=7q+~>|2p8tnZq?30ANNb)jkKwMVz_@TcGb zWYwN(GINDi^DjX1D1bIT@}CnB_&Df>6I0%Az32Vb2rRSOR+voNEGjd$`A$ISN6XNu zziv9@HL1@0WkZ2@CJLfQo_^pU2v8vWz!bf%lK`{`=Ssf5>sl~Z%_q)s&1}n#9RPHs z34NbMw?A{oOOsOQLM)Eou<{7e`l>HOFuw}2h6uhy@g;^Yakwm8!!QYupNvxI?|Yxy z-CV)%2#G@=XR+zUY%;BzIFF~L{lben5O*=oebSJSy#MdNJ?hNKrpb~VjzSV5W^`S} z9xWltvXmC1axATBQYMj+6y8-Y@Y=5WxkG~~g!dCGz@cCo{JP;3W_cl)+_tBG`rri1 zoXt&9YZpPoGIHIk2hBpSo;mrhbFKQUor`q?^R{kz>jsE)EFkpS;1&lha}+K(p z{p2)NL%sFJ!w|w~W4)hc(Vtj!|ACEvar%nuZ|*Ly9n`xo?=F>#Lu$EuR(B?==WtH= z4ai}X0sIP*!C6+fY`(0o)Qix7lRzb%gAOoKhg4B7E1FVPASYb6ZuvTX{GdUs;|D&D ze1qCD!_;uEhex;4pK|Gat_5$}(I1~Z^&~h6WA|Urj<7Q#c9cVd==D9XpQU!;4e*y} zdakSl)f;qm)nEZ=2X887%leYCQZ8LlT0(5aRVUEZm|+3C(hJ5$l`f!5pwR`dAe0D_ z(5bsl|B5EMFij6OUE>OjI-UTv^V*?14A~(c!Vk9r#8IxymS5gd)Mzbcd^zT zpw?*xC^H$y9?a?ss8b<2@BQJ6&$n2LgXri;zcG(?Y7Rpug@_<9qh>0O5#D_G^!rY} zJor6Nt{B07szku;sHyYpSNhsY;FDIZD1rJ16f}W}H}TEoHF@NP!qW66z6HuO^R?;F z+XGL&)q1g%E#!@-i8-ZIzHSg4fv%Bpil(7`q~w%@ir7fCJG$#CNOhL;T{};tcQu3UQ&zMK|Eta7v4;vIG~gbQvsyGVqtYw z(~HJiUx9QTmB!l)+XqLN&95}z+NsITDddI*@)*~PZPt^^mdaRd*i?L=k^wzmHpWa+ z3;9fTbIXKdzG`jkC061Br^RsA*nF08-xT)N*v}>?3Q=>w zdc|IwP=1i*cCx~z)|6dJw(oG@w2H=dYlUU*gFN1eToVKwRQ654RBFn}gi1|o1(i3U zZp2s%0Qh&r6oQ}TGQyh^Q&4Gk7Mq5apWEjgWq#87(6bm;*WiY4ZgOu!r+hc0muSLfFM4kf|#AzK$Irr zHn7~CtZyIOyh>o^2T*3a9 z=|ox{oD7)k*Cv%j&Cj#k5mwkT9R%gN!r3_s3Sl0c89_>GWE1ZRwl_5i)^n)u36?{( zDxXdrAHK8(q;7^w7{~tk@TCA(Q5AK@FZ~Co48#~ z!qLmHF}ph>twZjv;0WZXPy}wLhdbIN+!e$G&mX92&Fr-hreh?zbbn#3`2)?|v(3V5 zp_IFFqq|<9o&>Vs$1%c1Fa&`?v+dL$PY(`LNy~=MIXF{jg^2!8j!-Q|Xy| zExN}d#1O4YL+PBef; zZOlL$PE@1ql-~5gu*w~L9fLgB%stmE9G{(XRStF42{bcyHB+awc3oL3t#eJ# zu}ZeE+*VfDbtU9r=d71H$sFvZIhehEoP`Xc17rJluoohjRROi{Km>JKEBwc)xipdN zp;eKZY6u~M!l}7#bJP#!oddd}qT+!5U>-o*byeWr@#N&1+qi&(`pN=6P$;ep zDuW7?Sc9b68qDb{6?N_Wh5bnhj(B*!rh{2$NVo(mQWHwCRD z8GhRr+?+H(HHaScxam*Iz;pyYL4so0NGC(H=&_PH9~&T7Xk@sf(A*fra`PKZ=RCZ! zS%ke7#daeRT;Qey8WspGFd8*+3PdRnD1*=lm79KgW@kv86k+N%&MD-}N;Y3w2&Z+% zNuB%I<4O%>qPsEeR2Sq&i(^;tZG?28kkb`xtLHs~wAF}HS~~nwlO{expjB)Jrs))R3>t zSZ=amC+|4nq#QR4VjPOn@or{p9h}lZ4HGrbS}6df`ibN$4OoQpV3%^+@O0@)HqU}7 z5}mhFEGlcE**p)fPR`(9c+`1W#i4R7CL15(kQ1<_4-O z91MqoVG&2XbTO;0DkZym(nHyt)}>}7RgOe6a$Jq3rKASmpbp0qQ9UN>k*E}o$!a2$ Z)aA$w_+mhSVLss>CJ$fUTm=8H_CKNHG~xgN literal 0 HcmV?d00001 diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.crc b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.crc new file mode 100644 index 0000000000..97241378fe --- /dev/null +++ b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.crc @@ -0,0 +1 @@ +{"tableSizeBytes":552,"numFiles":1,"numMetadata":1,"numProtocol":1,"numTransactions":0} diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.json b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..02198358f4 --- /dev/null +++ b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/00000000000000000003.json @@ -0,0 +1,10 @@ +{"remove":{"path":"part-00000-5d345c6a-525d-4520-a078-12f3357a77f5-c000.snappy.parquet","deletionTimestamp":1633796137327,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{},"size":514,"tags":{"INSERTION_TIME":"1633794713000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"remove":{"path":"part-00001-f9dc9a5d-8503-4f3d-a188-8317bba46a58-c000.snappy.parquet","deletionTimestamp":1633796137327,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{},"size":514,"tags":{"INSERTION_TIME":"1633794713000001","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"remove":{"path":"part-00002-930505f3-0d84-4d8c-8595-727f54980600-c000.snappy.parquet","deletionTimestamp":1633796137327,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{},"size":514,"tags":{"INSERTION_TIME":"1633794713000002","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"remove":{"path":"part-00003-5bafc8ea-a7a9-4ce8-8aa2-533924d86f26-c000.snappy.parquet","deletionTimestamp":1633796137327,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{},"size":521,"tags":{"INSERTION_TIME":"1633794713000003","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"remove":{"path":"part-00004-19ec0ebc-a688-4004-8e65-18c44af19d07-c000.snappy.parquet","deletionTimestamp":1633796137327,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{},"size":514,"tags":{"INSERTION_TIME":"1633794713000004","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"remove":{"path":"part-00005-e1389522-dd44-49e5-9c4f-e8e9760cf2c2-c000.snappy.parquet","deletionTimestamp":1633796137327,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{},"size":514,"tags":{"INSERTION_TIME":"1633794713000005","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"remove":{"path":"part-00006-a928894d-b601-4739-bf99-7ece54e88b8d-c000.snappy.parquet","deletionTimestamp":1633796137327,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{},"size":514,"tags":{"INSERTION_TIME":"1633794713000006","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"remove":{"path":"part-00007-b41961a2-330a-4c20-84ce-ce06c9007223-c000.snappy.parquet","deletionTimestamp":1633796137327,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{},"size":521,"tags":{"INSERTION_TIME":"1633794713000007","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"add":{"path":"part-00000-e5b7d6de-4830-45fd-843d-22dc39b6486a-c000.snappy.parquet","partitionValues":{},"size":552,"modificationTime":1633796137000,"dataChange":false,"stats":"{\"numRecords\":10,\"minValues\":{\"id\":1},\"maxValues\":{\"id\":10},\"nullCount\":{\"id\":0}}","tags":{"INSERTION_TIME":"1633794713000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"commitInfo":{"timestamp":1633796137867,"userId":"906424503339340","userName":"contact@manishgill.com","operation":"OPTIMIZE","operationParameters":{"predicate":"[]","zOrderBy":"[]","batchId":"0","auto":false},"notebook":{"notebookId":"2167422995023007"},"clusterId":"0604-105630-ajar222","readVersion":2,"isolationLevel":"SnapshotIsolation","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"8","numRemovedBytes":"4126","p25FileSize":"552","minFileSize":"552","numAddedFiles":"1","maxFileSize":"552","p75FileSize":"552","p50FileSize":"552","numAddedBytes":"552"}}} diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/_last_checkpoint b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/_last_checkpoint new file mode 100644 index 0000000000..1b0987022d --- /dev/null +++ b/rust/tests/data/simple_table_with_stats_parsed_optimized/_delta_log/_last_checkpoint @@ -0,0 +1 @@ +{"version":3,"size":11} diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00000-5d345c6a-525d-4520-a078-12f3357a77f5-c000.snappy.parquet b/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00000-5d345c6a-525d-4520-a078-12f3357a77f5-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5622279bd8ea5a9670b8fd923444c8742c5f993b GIT binary patch literal 514 zcmaJ;!AiqG5Z$g?f_ikr1_FjfOAGDNG;Nv`FM-tAFFw zFYvd-iLD?MybR2nxAWfH>7HJj1d#(m$>-bq*Pc!=v_+!mbi05MT2&R5|8xNF9$~0k zg`7gLRz2TOCn6a=q~4>91i=6vV~PVGsaVrBy`;cmt%a1LmZJo#Sbb4nOUlVVGMWxn zU=wHn86)kI>f>a{#YBJ%|54h-4n-X6>pLCB{02VREz1&06lSp9J10 zb&Ya=7KFax4Q$V8SOcf!S$*5}o31olZA&&C*|2Q0>3HqBE1gC~({=dL54c?yuKx|@ CXMddl literal 0 HcmV?d00001 diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00000-e5b7d6de-4830-45fd-843d-22dc39b6486a-c000.snappy.parquet b/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00000-e5b7d6de-4830-45fd-843d-22dc39b6486a-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0321190a84d894fb0193164a8a4801c4c7f8e06b GIT binary patch literal 552 zcmaJ<-AcnS7)|S(Rd(5w6atQ*W5O2JbwBO!M({#GL}Uo!A6c8tV!O?@Q^wpod8?P= zy^r9l_*Ewe120ZOPENk>oFpfErxr%|5?|ww=g*e{#X2nWSfeDjfe<1&1x5pv|Ez%E zTPlpYl5Hv*Pb)Cx0IPsJU=5%F)&W@+teTaBLNXTd-A&@%%1B@p$ZeIVfsYK`q$VwC z6l6`L<;{%6s-YWiit(g@F3$P7!Ap)%`IL>WM@f{+=m2xevhb3t|@lsY{ z{Xm9(!a5f-HWX2|75G3eWWg|%0~rG`oP?q1g&=^Fvci6Avd(NiznUYmxn$!f$-YQs zH;7~w=4|1r8pXv$jH1P@8n>#Mzg`@8cZpjp_a;H;7rlPVbLw{AX?k|A<@ya*TFtgC c8;-2oEvw;p?V2l{`i@B{-03U4@Cp2pFBb@cg8%>k literal 0 HcmV?d00001 diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00001-f9dc9a5d-8503-4f3d-a188-8317bba46a58-c000.snappy.parquet b/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00001-f9dc9a5d-8503-4f3d-a188-8317bba46a58-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..03d8d843fb35abff7fb53e575291af4a72f97e06 GIT binary patch literal 514 zcmaJ;+e*Vg5Z$g?s`%)J4FoKUmKNHjX?jVDFM{-K`?;Fm}1{U8rBWnC>3C_-a>`Eo}mP5SbNc4ONEnvWOM_p zz$VZDGDi9(H73!Ji>Uw^{-ekPH577wan*O)6ztLs!jp<(e=?>!7B_hV(aHuRo&_A- z&r+4}X!6WMl?YEHg5EAzGM_5eWpNUvGGSFV@KxZ&tb4a$V-aQ}-vf9}^T&xAstAB_ z8U(@(fB-L5mHqBy-NkZwzeMzK!^SGfnMhRM4^pLM#XiEqRGuVH*1+cH}d5u zc8X;;^#iXc2W@FLtbyH zj6$$cJ>O5JA|5>?@=-;CU;vLX#et7>Y?y{w(m>c~A+2C!D8V|`U-Z|KcJhynVL}v8 z0uLZ#WL#2n5)ZkU3b5fnjyzODAr}`{JqfQIMQv{1GgpZzT^2#Pg$+Dt(vZC*pAh7WxMVvw^1=n6Q1-lhfo)O{~M2+ Be{%o; literal 0 HcmV?d00001 diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00003-5bafc8ea-a7a9-4ce8-8aa2-533924d86f26-c000.snappy.parquet b/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00003-5bafc8ea-a7a9-4ce8-8aa2-533924d86f26-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..29287123782875d6501ca936c5694e1085120fe4 GIT binary patch literal 521 zcmaJ<%SyvQ6rH4L$wJ&sn81Kxz|ulHG)*5#aU-};5Ghgw5s_&!t-&O1lT<0C8yEh7 z;79lgevIGZjaCo}ZZ6z&&)jp)WroL>Hb$6Y3xB@8y(<)J&{nWciB?7k5lw??pxU1f zfc=gNwNxsG1r4ia<8&{Zi}c|x^Y3LWunORzO4KkwhGtTeR&@%rF48MzfnwFrjAzAo zQL6gUKVl{YL$C`}3c7~Ob3$k7l#96lCH^hSBRLiR>g;0V^$55oJA_A>X5lO$Wry3m zjYxfmku4$)0TxS{^K|yaW0{LUtdnk6DGMqvQ( zmKG+toXQk{WEn+b9037d$_D%0$p-7q=5~X~!H$ieBqt)5qcE05n5(U;W}I5v7?#cL z7Pp&)zi}G+51D7x#>+4YEPvAT-L^AvyS_8-c|pgMcDL`yjw{zvW4{T{kI( Mn|>$=4dD}f0ZTc5yZ`_I literal 0 HcmV?d00001 diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00004-19ec0ebc-a688-4004-8e65-18c44af19d07-c000.snappy.parquet b/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00004-19ec0ebc-a688-4004-8e65-18c44af19d07-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..15e007fcf456da5ef8cc230ea621ee3352f89190 GIT binary patch literal 514 zcmaJ;-AV#M7#*iFi7uOE78a7BL_r7F)pb|83A!kVkdPoEV%!~Vad*|(wJ@;<=|y^l z9-;^6G5R(Q0->7^zB6ae&rJ99+9Ze^5K2DYzCL#if?>^*fWp7eq2EbF)rWi&Sx4qiAxo&dpkuudf1c zl(@xme-?y((Hq#F)363k%d`5n>o;9#w%V3#I1K9UE+6UH8)tZv-zCM1~AOL?lhKS=M%2+bMI*=kX1E zF)zFk{Hha#fj1v~Cnx78>787e1d)A0$>+!W>z+X{ta(x}=;k&;=x1uE{AUAr=Mcl( z$Ym6Q_3GJfG7<6UK9LV96a)i!gemqtq+#9AjZy)G^%g4R^$aCg!`id3pJCmnCUDlPRmRfv*BDVcpvWi$s_m`5wS4njfWVsA2%3 zSr7;}00P`pRragNx{KxVZi(o@T8y72ry^B-KU7(n>s6{+7#CMHiY7Pf+^l8!+}M|+ z#3`2DnICvXIcQ6}VGZn-wA{AiH63NPI+kkMs$sRwrY$>lN7;>vZW!>SZ#jf|@cUnN CAAoQG literal 0 HcmV?d00001 diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00006-a928894d-b601-4739-bf99-7ece54e88b8d-c000.snappy.parquet b/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00006-a928894d-b601-4739-bf99-7ece54e88b8d-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4b111fe98d82203042e4c4425cc4078913f1db3e GIT binary patch literal 514 zcmaJ;Z%e{J96m3T5xwlj4F<_kqM(O!IydP}(2Ify2?-)1Zrf?YZR$1)6Z=TLjXp-- zqYu$jL5AcQ-#F-q)1b}!R zhQbYj05@fw{c5t_a<#f&A$qtKW8LITWO5KhGB0zpN!5sw@}@@F;6{@hjXa;51l~Ax z%2jtBgnro@TAtl9hjzy^UCZ&?jx;)5Q?_l{GA*NRd)=lZ?N&|Gb$HVE0z!TG{co7K BfB^si literal 0 HcmV?d00001 diff --git a/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00007-b41961a2-330a-4c20-84ce-ce06c9007223-c000.snappy.parquet b/rust/tests/data/simple_table_with_stats_parsed_optimized/part-00007-b41961a2-330a-4c20-84ce-ce06c9007223-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7d233ecd842dd9c02ebbbf414ca6d43fa4b9163f GIT binary patch literal 521 zcmaJMm3bRsVDV z9JCdv#bQ3pX;{(gXZz_)B$NBpdytX93V=rnQ34-ns!nxUDN~?zP`RY%C{{F8doE}% zg-ZGOA2FSRA=m{fIbB8i1)*(i{qpR)e#ip3(zM}ZIU zh8Dz`9LofNcoBxe4S@hJWu5)*Wc}58eYZyBaLdL|l2eh%VGzkY%=N}qBT9@-48!DR zlbem)pPK~UBz26cy9h$x@J3zFZdoI{<5_Ok@!O6xJ3ULbZP~KAX5039O-I_TnocR) L^sRu<06xJN4*h_q literal 0 HcmV?d00001 diff --git a/rust/tests/write_exploration.rs b/rust/tests/write_exploration.rs index 1d661bd7ad..15dcf550be 100644 --- a/rust/tests/write_exploration.rs +++ b/rust/tests/write_exploration.rs @@ -292,7 +292,7 @@ pub fn create_add( partition_values: partition_values.to_owned(), partition_values_parsed: None, - modification_time: modification_time, + modification_time, data_change: true, // TODO: calculate additional stats From 0b7c8235ae609dd504e4773ec28c638076648a4b Mon Sep 17 00:00:00 2001 From: Manish Date: Sat, 9 Oct 2021 20:14:46 +0200 Subject: [PATCH 2/2] Ran cargo fmt --- rust/src/action.rs | 70 +++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/rust/src/action.rs b/rust/src/action.rs index 401d3d64cc..733b0fc881 100644 --- a/rust/src/action.rs +++ b/rust/src/action.rs @@ -2,18 +2,18 @@ #![allow(non_snake_case, non_camel_case_types)] +use super::schema::*; +use arrow::array::ArrayRef; +use arrow::array::Int64Array; +use arrow::datatypes::Field; +use arrow::record_batch::RecordBatch; use parquet::record::{ListAccessor, MapAccessor, RowAccessor}; use percent_encoding::percent_decode; use serde::{Deserialize, Serialize}; use serde_json::{Map, Value}; +use std::collections::hash_map::Entry; use std::collections::HashMap; -use arrow::record_batch::RecordBatch; -use super::schema::*; use std::sync::Arc; -use std::collections::hash_map::Entry; -use arrow::array::ArrayRef; -use arrow::array::Int64Array; -use arrow::datatypes::Field; /// Error returned when an invalid Delta log action is encountered. #[derive(thiserror::Error, Debug)] @@ -322,7 +322,11 @@ impl Add { .map_or(Ok(None), |s| serde_json::from_str(s)) } - fn get_stat_row_rb(&self, row: &parquet::record::Row, cols: &[String]) -> arrow::error::Result { + fn get_stat_row_rb( + &self, + row: &parquet::record::Row, + cols: &[String], + ) -> arrow::error::Result { let mut stat_col_map: HashMap<&String, Vec>> = HashMap::new(); for col in cols { let data: Vec> = vec![]; @@ -348,13 +352,13 @@ impl Add { match stat_col_map.entry(col) { Entry::Occupied(e) => { // TODO: i64s only for now. should support all possible column types - let my_vec: Vec = e.get().iter().map(|v| { - v.as_ref().unwrap().parse::().unwrap() - }).collect::>(); - let a : ArrayRef = Arc::new(Int64Array::from(my_vec)); - rb_vec.push( - (col.as_str(), a) - ) + let my_vec: Vec = e + .get() + .iter() + .map(|v| v.as_ref().unwrap().parse::().unwrap()) + .collect::>(); + let a: ArrayRef = Arc::new(Int64Array::from(my_vec)); + rb_vec.push((col.as_str(), a)) } Entry::Vacant(_) => {} } @@ -364,7 +368,10 @@ impl Add { /// The stats_parsed can be converted into the record batches and the original mem-heavy data structure /// can eventually be eliminated entirely. - pub fn get_stats_as_record_batch(&self, batch_columns: &[String]) -> Result, parquet::errors::ParquetError> { + pub fn get_stats_as_record_batch( + &self, + batch_columns: &[String], + ) -> Result, parquet::errors::ParquetError> { self.stats_parsed.as_ref().map_or(Ok(None), |record| { let mut min_val_vec = vec![]; let mut max_val_vec = vec![]; @@ -424,7 +431,11 @@ impl Add { fn new_schema(&self, batch_columns: &[String]) -> arrow::datatypes::Schema { let mut fields: Vec = vec![]; for col_name in batch_columns { - fields.push(Field::new(col_name, arrow::datatypes::DataType::Int64, false)) + fields.push(Field::new( + col_name, + arrow::datatypes::DataType::Int64, + false, + )) } arrow::datatypes::Schema::new(fields) } @@ -1032,43 +1043,38 @@ mod tests { let stats = action.get_stats().unwrap().unwrap(); assert_eq!(stats.num_records, 10); - // Verify stats rb + // Verify stats rb let cols = vec!["id".to_string()]; - let schema = Arc::new(arrow::datatypes::Schema::new(vec![ - Field::new("id", arrow::datatypes::DataType::Int64, false), - ])); + let schema = Arc::new(arrow::datatypes::Schema::new(vec![Field::new( + "id", + arrow::datatypes::DataType::Int64, + false, + )])); let stats_rb = action.get_stats_as_record_batch(&cols).unwrap().unwrap(); assert_eq!(stats_rb.num_records, 10); let min_batch = stats_rb.min_values; let min_arr = Int64Array::from(vec![1]); - let expected_min_batch = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(min_arr)] - ).unwrap(); + let expected_min_batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(min_arr)]).unwrap(); assert_eq!(min_batch.num_columns(), 1); assert_eq!(min_batch.num_rows(), 1); assert_eq!(min_batch, expected_min_batch); let max_batch = stats_rb.max_values; let max_arr = Int64Array::from(vec![10]); - let expected_max_batch = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(max_arr)] - ).unwrap(); + let expected_max_batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(max_arr)]).unwrap(); assert_eq!(max_batch, expected_max_batch); // TODO let null_count_batch = stats_rb.null_counts; assert_eq!(null_count_batch.num_columns(), 1); - let expected_null_count_batch = RecordBatch::new_empty( - schema.clone() - ); + let expected_null_count_batch = RecordBatch::new_empty(schema.clone()); assert_eq!(null_count_batch.num_columns(), 1); assert_eq!(null_count_batch.num_rows(), 0); assert_eq!(null_count_batch, expected_null_count_batch); - } #[test]