From c407625d6203460b6dbc6e9faeff3fd64cfeebce Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 5 Jun 2024 16:27:48 +1000 Subject: [PATCH 1/3] fix: Column selection wasn't applied when reading CSV with no rows --- crates/polars-io/src/csv/read/read_impl.rs | 33 ++++++++++++++-------- py-polars/tests/unit/io/test_csv.py | 21 ++++++++++++++ 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/crates/polars-io/src/csv/read/read_impl.rs b/crates/polars-io/src/csv/read/read_impl.rs index b2ae43f6eef4..4c4d8232f510 100644 --- a/crates/polars-io/src/csv/read/read_impl.rs +++ b/crates/polars-io/src/csv/read/read_impl.rs @@ -64,17 +64,18 @@ pub(crate) fn cast_columns( }; if parallel { - let cols = df - .get_columns() - .iter() - .map(|s| { - if let Some(fld) = to_cast.iter().find(|fld| fld.name().as_str() == s.name()) { - cast_fn(s, fld) - } else { - Ok(s.clone()) - } - }) - .collect::<PolarsResult<Vec<_>>>()?; + let cols = POOL.install(|| { + df.get_columns() + .into_par_iter() + .map(|s| { + if let Some(fld) = to_cast.iter().find(|fld| fld.name().as_str() == s.name()) { + cast_fn(s, fld) + } else { + Ok(s.clone()) + } + }) + .collect::<PolarsResult<Vec<_>>>() + })?; *df = unsafe { DataFrame::new_no_checks(cols) } } else { // cast to the original dtypes in the schema @@ -473,7 +474,15 @@ impl<'a> CoreReader<'a> { // An empty file with a schema should return an empty DataFrame with that schema if bytes.is_empty() { - let mut df = DataFrame::from(self.schema.as_ref()); + let schema = &projection + .iter() + .map(|&i| self.schema.get_at_index(i).unwrap()) + .map(|(name, dtype)| Field { + name: name.clone(), + dtype: dtype.clone(), + }) + .collect::<Schema>(); + let mut df = DataFrame::from(schema); if let Some(ref row_index) = self.row_index { df.insert_column(0, Series::new_empty(&row_index.name, &IDX_DTYPE))?; } diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 37fed6539e84..d6a2c8d549cb 100644 --- a/py-polars/tests/unit/io/test_csv.py +++
b/py-polars/tests/unit/io/test_csv.py @@ -2160,3 +2160,24 @@ def test_read_csv_dtypes_deprecated() -> None: schema={"a": pl.Int8, "b": pl.Int8, "c": pl.Int8}, ) assert_frame_equal(df, expected) + + +def test_projection_applied_on_file_with_no_rows_16606(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + path = tmp_path / "data.csv" + + data = """\ +a,b,c,d +""" + + with path.open("w") as f: + f.write(data) + + columns = ["a", "b"] + + out = pl.read_csv(path, columns=columns).columns + assert out == columns + + out = pl.scan_csv(path).select(columns).collect().columns + assert out == columns From 3739a433d0af43d8f96c9ea950bc0debb653c0fa Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 5 Jun 2024 16:32:25 +1000 Subject: [PATCH 2/3] maybe elide alloc --- crates/polars-io/src/csv/read/read_impl.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/crates/polars-io/src/csv/read/read_impl.rs b/crates/polars-io/src/csv/read/read_impl.rs index 4c4d8232f510..0eacb504d3c9 100644 --- a/crates/polars-io/src/csv/read/read_impl.rs +++ b/crates/polars-io/src/csv/read/read_impl.rs @@ -474,14 +474,18 @@ impl<'a> CoreReader<'a> { // An empty file with a schema should return an empty DataFrame with that schema if bytes.is_empty() { - let schema = &projection - .iter() - .map(|&i| self.schema.get_at_index(i).unwrap()) - .map(|(name, dtype)| Field { - name: name.clone(), - dtype: dtype.clone(), - }) - .collect::<Schema>(); + let schema = if projection.len() == self.schema.len() { + self.schema.as_ref() + } else { + &projection + .iter() + .map(|&i| self.schema.get_at_index(i).unwrap()) + .map(|(name, dtype)| Field { + name: name.clone(), + dtype: dtype.clone(), + }) + .collect::<Schema>() + }; let mut df = DataFrame::from(schema); if let Some(ref row_index) = self.row_index { df.insert_column(0, Series::new_empty(&row_index.name, &IDX_DTYPE))?; From d64dde8312204c6cf73d8a64c5dc2e2334fb1856 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Wed, 5 Jun 2024
16:34:47 +1000 Subject: [PATCH 3/3] clippy --- crates/polars-io/src/csv/read/read_impl.rs | 23 +++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/crates/polars-io/src/csv/read/read_impl.rs b/crates/polars-io/src/csv/read/read_impl.rs index 0eacb504d3c9..3fa509189a60 100644 --- a/crates/polars-io/src/csv/read/read_impl.rs +++ b/crates/polars-io/src/csv/read/read_impl.rs @@ -474,19 +474,20 @@ impl<'a> CoreReader<'a> { // An empty file with a schema should return an empty DataFrame with that schema if bytes.is_empty() { - let schema = if projection.len() == self.schema.len() { - self.schema.as_ref() + let mut df = if projection.len() == self.schema.len() { + DataFrame::from(self.schema.as_ref()) } else { - &projection - .iter() - .map(|&i| self.schema.get_at_index(i).unwrap()) - .map(|(name, dtype)| Field { - name: name.clone(), - dtype: dtype.clone(), - }) - .collect::<Schema>() + DataFrame::from( + &projection + .iter() + .map(|&i| self.schema.get_at_index(i).unwrap()) + .map(|(name, dtype)| Field { + name: name.clone(), + dtype: dtype.clone(), + }) + .collect::<Schema>(), + ) }; - let mut df = DataFrame::from(schema); if let Some(ref row_index) = self.row_index { df.insert_column(0, Series::new_empty(&row_index.name, &IDX_DTYPE))?; }