From a59e3bea399f5914e0ebbe528b12c074ab4451fc Mon Sep 17 00:00:00 2001 From: hhj Date: Mon, 16 Oct 2023 22:56:37 +0800 Subject: [PATCH 1/2] fix: add null_regex for string type in csv --- arrow-csv/src/reader/mod.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 2ba49cadc73f..ff512216cdc0 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -791,7 +791,14 @@ fn parse( } DataType::Utf8 => Ok(Arc::new( rows.iter() - .map(|row| Some(row.get(i))) + .map(|row| { + let s = row.get(i); + if null_regex.is_null(s) { + None + } else { + Some(s) + } + }) .collect::<StringArray>(), ) as ArrayRef), DataType::Dictionary(key_type, value_type) @@ -1495,7 +1502,7 @@ mod tests { let schema = Arc::new(Schema::new(vec![ Field::new("c_int", DataType::UInt64, false), Field::new("c_float", DataType::Float32, true), - Field::new("c_string", DataType::Utf8, false), + Field::new("c_string", DataType::Utf8, true), Field::new("c_bool", DataType::Boolean, false), ])); @@ -1596,8 +1603,7 @@ mod tests { assert!(batch.column(0).is_null(1)); assert!(batch.column(1).is_null(2)); assert!(batch.column(3).is_null(4)); - // String won't be empty - assert!(!batch.column(2).is_null(3)); + assert!(batch.column(2).is_null(3)); assert!(!batch.column(2).is_null(4)); } @@ -2237,8 +2243,8 @@ mod tests { fn err_test(csv: &[u8], expected: &str) { let schema = Arc::new(Schema::new(vec![ - Field::new("text1", DataType::Utf8, false), - Field::new("text2", DataType::Utf8, false), + Field::new("text1", DataType::Utf8, true), + Field::new("text2", DataType::Utf8, true), ])); let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv)); let b = ReaderBuilder::new(schema) From c631eb6350a755cd08815ffa2b35cccb82307a83 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Mon, 16 Oct 2023 23:09:37 +0800 Subject: [PATCH 2/2] Update arrow-csv/src/reader/mod.rs Co-authored-by: Raphael Taylor-Davies 
<1781103+tustvold@users.noreply.github.com> --- arrow-csv/src/reader/mod.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index ff512216cdc0..1106b16bc46f 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -793,11 +793,7 @@ fn parse( rows.iter() .map(|row| { let s = row.get(i); - if null_regex.is_null(s) { - None - } else { - Some(s) - } + (!null_regex.is_null(s)).then_some(s) }) .collect::<StringArray>(), ) as ArrayRef),