feat(rust, python): support timezone in csv writer #6722

Merged (30 commits) on Feb 10, 2023
Changes from 4 commits
Commits (30)
b8ada23
support timezones in csv writer
Feb 8, 2023
1267e8e
lint
Feb 8, 2023
3ae3f29
simplify
Feb 8, 2023
55d09a9
clippy
Feb 8, 2023
2e786d2
fix(python): respect 'None' in from_dicts (#6726)
ritchie46 Feb 8, 2023
0532e03
fix(rust, python): arrow map dtype conversion (#6732)
ritchie46 Feb 8, 2023
870a818
feat(python): don't require pyarrow for utf8 -> numpy conversion (#6733)
ritchie46 Feb 8, 2023
91f765f
feat(python): scan_ds predicate pushdown for string cmp (#6734)
ritchie46 Feb 8, 2023
44a7c5b
feat(rust, python): Support an ignore_nulls param for EWM calculation…
yuntai Feb 9, 2023
262114c
fix(rust,python): Improve error message in DataFrame constructor (#6715)
stinodego Feb 9, 2023
d43500e
feat(python): Improved assert equal messages (#6737)
stinodego Feb 9, 2023
0a1c1bc
test(python): Reorganize benchmark test folder (#6695)
stinodego Feb 9, 2023
d3633fb
feat(python): Improve numpy support: conversion of numpy arrays with …
ghuls Feb 9, 2023
80cce18
feat(rust, python): add argmin/max for utf8 data (#6746)
ritchie46 Feb 9, 2023
dd1dca7
chore(rust): update arrow to 0.16 (#6748)
ritchie46 Feb 9, 2023
aeb3a03
docs(python): redirect tz_localize (#6749)
MarcoGorelli Feb 9, 2023
b160f53
test(python): integrate `ignore_nulls` into EWM parametric tests (#6751)
alexander-beedie Feb 9, 2023
e103b34
fix(rust, python): respect skip_rows in glob parsing csv (#6754)
ritchie46 Feb 9, 2023
9de9316
feat(rust, python): formally support duration division (#6758)
ritchie46 Feb 9, 2023
7fbdb6c
chore(rust): propagate error in date_range with invalid time zone (#6…
MarcoGorelli Feb 9, 2023
11e4de2
build(python): Update `mypy` to version `1.0.0` (#6744)
stinodego Feb 9, 2023
0cf7d7f
feat(python): Add option to use PyArrow backed-extension arrays when …
ghuls Feb 10, 2023
aad4aa3
feat(rust, python): parse timezone from Datetime (#6766)
MarcoGorelli Feb 10, 2023
4607eb6
fix(rust,python): handle edge-case with string-literal replacement wh…
alexander-beedie Feb 10, 2023
2d7d728
feat(python): default to 1d interval in date_range (#6771)
MarcoGorelli Feb 10, 2023
1a45830
fix(rust, python): don't set sorted flag if we reverse sort the left …
ritchie46 Feb 10, 2023
f61fa38
fix(rust, python): use explicit drop function node (#6769)
ritchie46 Feb 10, 2023
b3a7374
feat(rust): implement series abstractions for `Int128Type` (#6679)
plaflamme Feb 10, 2023
afac817
Merge branch 'autodetect-aware' of github.com:MarcoGorelli/polars int…
ritchie46 Feb 10, 2023
7dbdc00
add timezones feature to polars-io
ritchie46 Feb 10, 2023
10 changes: 9 additions & 1 deletion polars/polars-io/Cargo.toml
@@ -24,7 +24,13 @@ decompress = ["flate2/miniz_oxide"]
decompress-fast = ["flate2/zlib-ng"]
dtype-categorical = ["polars-core/dtype-categorical"]
dtype-date = ["polars-core/dtype-date", "polars-time/dtype-date"]
dtype-datetime = ["polars-core/dtype-datetime", "polars-core/temporal", "polars-time/dtype-datetime"]
dtype-datetime = [
"polars-core/dtype-datetime",
"polars-core/temporal",
"polars-time/dtype-datetime",
"chrono-tz",
"chrono",
]
dtype-time = ["polars-core/dtype-time", "polars-core/temporal", "polars-time/dtype-time"]
dtype-struct = ["polars-core/dtype-struct"]
dtype-binary = ["polars-core/dtype-binary"]
@@ -46,6 +52,8 @@ anyhow.workspace = true
arrow.workspace = true
async-trait = { version = "0.1.59", optional = true }
bytes = "1.3.0"
chrono = { version = "0.4.23", optional = true }
chrono-tz = { version = "0.8.1", optional = true }
dirs = "4.0"
flate2 = { version = "1", optional = true, default-features = false }
futures = { version = "0.3.25", optional = true }
52 changes: 36 additions & 16 deletions polars/polars-io/src/csv/write_impl.rs
@@ -1,10 +1,13 @@
use std::io::Write;

use arrow::temporal_conversions;
#[cfg(feature = "dtype-datetime")]
use chrono::TimeZone;
#[cfg(feature = "dtype-datetime")]
use chrono_tz::Tz;
use lexical_core::{FormattedSize, ToLexical};
use memchr::{memchr, memchr2};
use polars_core::error::PolarsError::ComputeError;
use polars_core::fmt::PlTzAware;
use polars_core::prelude::*;
use polars_core::series::SeriesIter;
use polars_core::POOL;
@@ -95,12 +98,17 @@ fn write_anyvalue(
TimeUnit::Microseconds => temporal_conversions::timestamp_us_to_datetime(v),
TimeUnit::Milliseconds => temporal_conversions::timestamp_ms_to_datetime(v),
};
match tz {
None => write!(f, "{}", ndt.format(datetime_format)),
Some(tz) => {
write!(f, "{}", PlTzAware::new(ndt, tz))
}
}
let formatted = match tz {
Some(tz) => match tz.parse::<Tz>() {
Ok(parsed_tz) => parsed_tz.from_utc_datetime(&ndt).format(datetime_format),
Err(_) => match temporal_conversions::parse_offset(tz) {
Ok(parsed_tz) => parsed_tz.from_utc_datetime(&ndt).format(datetime_format),
Err(_) => unreachable!(),
},
},
_ => ndt.format(datetime_format),
};
write!(f, "{formatted}")
Collaborator (Author) commented:
Hi @alexander-beedie - do you have any thoughts on this? On the one hand, parsing tz for each element slows things down for tz-aware columns, and could be done beforehand in a per-column fashion.

On the other hand, in #4724 it looks like you intentionally tried to avoid per-column inference.

alexander-beedie (Collaborator) replied on Feb 10, 2023:
Yup - though it wasn't so much about not wanting per-column inference (which would actually be great) as about ensuring that any such inference was done outside the hot loop (per-element inference would be bad).

I took a minimal approach in the end as my initial attempt to reshuffle things on a per-column basis became unnecessarily convoluted - treat my earlier commit as a mere first step towards a more flexible/better per-column future... ;)
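
Editorial aside: a minimal sketch, not part of this PR, of what hoisting the time-zone parse out of the per-element loop could look like, assuming chrono-tz's Tz FromStr parsing; the hypothetical format_datetime_column helper is illustrative only and omits the fixed-offset fallback the PR also handles.

use chrono::{NaiveDateTime, TimeZone};
use chrono_tz::Tz;

// Hypothetical helper: parse the column's time zone string once, then reuse
// the parsed value for every element instead of re-parsing in the hot loop.
fn format_datetime_column(values: &[NaiveDateTime], tz: Option<&str>, fmt: &str) -> Vec<String> {
    let parsed: Option<Tz> = tz.and_then(|s| s.parse::<Tz>().ok());
    values
        .iter()
        .map(|ndt| match &parsed {
            Some(tz) => tz.from_utc_datetime(ndt).format(fmt).to_string(),
            None => ndt.format(fmt).to_string(),
        })
        .collect()
}
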

}
#[cfg(feature = "dtype-time")]
AnyValue::Time(v) => {
@@ -186,24 +194,36 @@ pub(crate) fn write<W: Write>(
if options.datetime_format.is_none() {
for col in df.get_columns() {
match col.dtype() {
DataType::Datetime(TimeUnit::Microseconds, _)
DataType::Datetime(TimeUnit::Milliseconds, tz)
// lowest precision; only set if it's not been inferred yet
if options.datetime_format.is_none() =>
{
options.datetime_format = Some("%FT%H:%M:%S.%6f".to_string());
options.datetime_format = match tz{
Some(_) => Some("%FT%H:%M:%S.%3f%z".to_string()),
None => Some("%FT%H:%M:%S.%3f".to_string()),
};
}
DataType::Datetime(TimeUnit::Nanoseconds, _) => {
options.datetime_format = Some("%FT%H:%M:%S.%9f".to_string());
DataType::Datetime(TimeUnit::Microseconds, tz) => {
options.datetime_format = match tz{
Some(_) => Some("%FT%H:%M:%S.%6f%z".to_string()),
None => Some("%FT%H:%M:%S.%6f".to_string()),
};
}
DataType::Datetime(TimeUnit::Nanoseconds, tz) => {
options.datetime_format = match tz {
Some(_) => Some("%FT%H:%M:%S.%9f%z".to_string()),
None => Some("%FT%H:%M:%S.%9f".to_string()),
};
break; // highest precision; no need to check further
}
_ => {}
}
}
// if still not set, no cols require higher precision than "ms" (or no datetime cols)
if options.datetime_format.is_none() {
options.datetime_format = Some("%FT%H:%M:%S.%3f".to_string());
}
}
let datetime_format: &str = options.datetime_format.as_ref().unwrap();
let datetime_format: &str = match &options.datetime_format {
Some(datetime_format) => datetime_format,
None => "%FT%H:%M:%S.%9f",
};

let len = df.height();
let n_threads = POOL.current_num_threads();
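
To make the inferred formats concrete, here is a minimal standalone sketch (not from the PR) of what the tz-aware microsecond default "%FT%H:%M:%S.%6f%z" renders to via chrono-tz; the value mirrors the updated test_csv_write_tz_aware expectation near the end of this diff.

use chrono::TimeZone;
use chrono_tz::Tz;

fn main() {
    let tz: Tz = "Europe/Zurich".parse().unwrap();
    // 2021-01-01 00:00 UTC stored in a tz-aware microsecond column.
    let ndt = chrono::NaiveDate::from_ymd_opt(2021, 1, 1)
        .unwrap()
        .and_hms_opt(0, 0, 0)
        .unwrap();
    let formatted = tz.from_utc_datetime(&ndt).format("%FT%H:%M:%S.%6f%z").to_string();
    assert_eq!(formatted, "2021-01-01T01:00:00.000000+0100");
}
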
2 changes: 2 additions & 0 deletions py-polars/Cargo.lock

Some generated files are not rendered by default.

22 changes: 20 additions & 2 deletions py-polars/tests/unit/io/test_csv.py
@@ -6,7 +6,7 @@
import tempfile
import textwrap
import zlib
from datetime import date, datetime, time
from datetime import date, datetime, time, timedelta, timezone
from pathlib import Path

import pytest
@@ -860,6 +860,24 @@ def test_datetime_format(fmt: str, expected: str) -> None:
assert csv == expected


@pytest.mark.parametrize(
("fmt", "expected"),
[
(None, "dt\n2022-01-02T00:00:00.000000+0000\n"),
("%F %T%.3f%z", "dt\n2022-01-02 00:00:00.000+0000\n"),
("%Y%z", "dt\n2022+0000\n"),
("%m%z", "dt\n01+0000\n"),
("%m$%d%z", "dt\n01$02+0000\n"),
("%R%z", "dt\n00:00+0000\n"),
],
)
@pytest.mark.parametrize("tzinfo", [timezone.utc, timezone(timedelta(hours=0))])
def test_datetime_format_tz_aware(fmt: str, expected: str, tzinfo: timezone) -> None:
df = pl.DataFrame({"dt": [datetime(2022, 1, 2, tzinfo=tzinfo)]})
csv = df.write_csv(datetime_format=fmt)
assert csv == expected


@pytest.mark.parametrize(
("tu1", "tu2", "expected"),
[
@@ -1094,7 +1112,7 @@ def test_csv_write_tz_aware() -> None:
df = pl.DataFrame({"times": datetime(2021, 1, 1)}).with_columns(
pl.col("times").dt.cast_time_zone("UTC").dt.with_time_zone("Europe/Zurich")
)
assert df.write_csv() == "times\n2021-01-01 01:00:00 CET\n"
assert df.write_csv() == "times\n2021-01-01T01:00:00.000000+0100\n"


def test_csv_statistics_offset() -> None:
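
As a side note, the timezone(timedelta(hours=0)) parametrization in the new test exercises the fixed-offset fallback in write_anyvalue, since a "+00:00" tz string is not a named IANA zone. A minimal sketch of that path, assuming the offset produced by arrow2's parse_offset behaves like the chrono FixedOffset constructed here:

use chrono::{FixedOffset, TimeZone};

fn main() {
    // A "+00:00"-style zone is handled as a fixed offset rather than a named tz.
    let offset = FixedOffset::east_opt(0).unwrap();
    let ndt = chrono::NaiveDate::from_ymd_opt(2022, 1, 2)
        .unwrap()
        .and_hms_opt(0, 0, 0)
        .unwrap();
    let formatted = offset.from_utc_datetime(&ndt).format("%FT%H:%M:%S.%6f%z").to_string();
    // Matches the expected default output in test_datetime_format_tz_aware.
    assert_eq!(formatted, "2022-01-02T00:00:00.000000+0000");
}
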