Skip to content

Commit

Permalink
fix: Decompress moved out of schema initialization (#15550)
Browse files Browse the repository at this point in the history
  • Loading branch information
leoforney authored Apr 10, 2024
1 parent 8f40509 commit c758416
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 34 deletions.
65 changes: 31 additions & 34 deletions crates/polars-io/src/csv/read_impl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,43 +174,40 @@ impl<'a> CoreReader<'a> {
// check if schema should be inferred
let separator = separator.unwrap_or(b',');

// Decompress the input before resolving the schema: inference on still-compressed
// bytes would be wrong, and a caller-supplied schema skips inference entirely, so
// decompression must not live inside the inference branch.
#[cfg(any(feature = "decompress", feature = "decompress-fast"))]
{
let total_n_rows =
n_rows.map(|n| skip_rows + (has_header as usize) + skip_rows_after_header + n);
if let Some(b) =
decompress(&reader_bytes, total_n_rows, separator, quote_char, eol_char)
{
reader_bytes = ReaderBytes::Owned(b);
}
}

let mut schema = match schema {
Some(schema) => schema,
None => {
{
// We keep track of the inferred schema bool
// In case the file is compressed this schema inference is wrong and has to be done
// again after decompression.
#[cfg(any(feature = "decompress", feature = "decompress-fast"))]
{
let total_n_rows = n_rows.map(|n| {
skip_rows + (has_header as usize) + skip_rows_after_header + n
});
if let Some(b) =
decompress(&reader_bytes, total_n_rows, separator, quote_char, eol_char)
{
reader_bytes = ReaderBytes::Owned(b);
}
}

let (inferred_schema, _, _) = infer_file_schema(
&reader_bytes,
separator,
max_records,
has_header,
schema_overwrite.as_deref(),
&mut skip_rows,
skip_rows_after_header,
comment_prefix.as_ref(),
quote_char,
eol_char,
null_values.as_ref(),
try_parse_dates,
raise_if_empty,
&mut n_threads,
)?;
Arc::new(inferred_schema)
}
let (inferred_schema, _, _) = infer_file_schema(
&reader_bytes,
separator,
max_records,
has_header,
schema_overwrite.as_deref(),
&mut skip_rows,
skip_rows_after_header,
comment_prefix.as_ref(),
quote_char,
eol_char,
null_values.as_ref(),
try_parse_dates,
raise_if_empty,
&mut n_threads,
)?;
Arc::new(inferred_schema)
},
};
if let Some(dtypes) = dtype_overwrite {
Expand Down
5 changes: 5 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,11 @@ def test_compressed_csv(io_files_path: Path) -> None:
out = pl.read_csv(str(csv_file), truncate_ragged_lines=True)
assert_frame_equal(out, expected)

# now with schema defined
schema = {"a": pl.Int64, "b": pl.Utf8, "c": pl.Float64}
out = pl.read_csv(str(csv_file), schema=schema, truncate_ragged_lines=True)
assert_frame_equal(out, expected)

# now with column projection
out = pl.read_csv(csv_bytes, columns=["a", "b"])
expected = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
Expand Down

0 comments on commit c758416

Please sign in to comment.