Skip to content

Commit

Permalink
feat(rust, python): support timezone in csv writer (#6722)
Browse files Browse the repository at this point in the history
Co-authored-by: MarcoGorelli <>
Co-authored-by: Ritchie Vink <ritchie46@gmail.com>
  • Loading branch information
MarcoGorelli and ritchie46 authored Feb 10, 2023
1 parent b3a7374 commit 9e298e2
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 19 deletions.
2 changes: 1 addition & 1 deletion polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ bigidx = ["polars-core/bigidx", "polars-lazy/bigidx", "polars-ops/big_idx"]
list_to_struct = ["polars-ops/list_to_struct", "polars-lazy/list_to_struct"]
list_take = ["polars-ops/list_take", "polars-lazy/list_take"]
describe = ["polars-core/describe"]
timezones = ["polars-core/timezones", "polars-lazy/timezones"]
timezones = ["polars-core/timezones", "polars-lazy/timezones", "polars-io/timezones"]
string_justify = ["polars-lazy/string_justify", "polars-ops/string_justify"]
string_from_radix = ["polars-lazy/string_from_radix", "polars-ops/string_from_radix"]
arg_where = ["polars-lazy/arg_where"]
Expand Down
13 changes: 12 additions & 1 deletion polars/polars-io/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,16 @@ decompress = ["flate2/miniz_oxide"]
decompress-fast = ["flate2/zlib-ng"]
dtype-categorical = ["polars-core/dtype-categorical"]
dtype-date = ["polars-core/dtype-date", "polars-time/dtype-date"]
dtype-datetime = ["polars-core/dtype-datetime", "polars-core/temporal", "polars-time/dtype-datetime"]
dtype-datetime = [
"polars-core/dtype-datetime",
"polars-core/temporal",
"polars-time/dtype-datetime",
"chrono",
]
timezones = [
"chrono-tz",
"dtype-datetime",
]
dtype-time = ["polars-core/dtype-time", "polars-core/temporal", "polars-time/dtype-time"]
dtype-struct = ["polars-core/dtype-struct"]
dtype-binary = ["polars-core/dtype-binary"]
Expand All @@ -46,6 +55,8 @@ anyhow.workspace = true
arrow.workspace = true
async-trait = { version = "0.1.59", optional = true }
bytes = "1.3.0"
chrono = { version = "0.4.23", optional = true }
chrono-tz = { version = "0.8.1", optional = true }
dirs = "4.0"
flate2 = { version = "1", optional = true, default-features = false }
futures = { version = "0.3.25", optional = true }
Expand Down
55 changes: 40 additions & 15 deletions polars/polars-io/src/csv/write_impl.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
use std::io::Write;

use arrow::temporal_conversions;
#[cfg(feature = "timezones")]
use chrono::TimeZone;
#[cfg(feature = "timezones")]
use chrono_tz::Tz;
use lexical_core::{FormattedSize, ToLexical};
use memchr::{memchr, memchr2};
use polars_core::error::PolarsError::ComputeError;
use polars_core::fmt::PlTzAware;
use polars_core::prelude::*;
use polars_core::series::SeriesIter;
use polars_core::POOL;
Expand Down Expand Up @@ -95,12 +98,22 @@ fn write_anyvalue(
TimeUnit::Microseconds => temporal_conversions::timestamp_us_to_datetime(v),
TimeUnit::Milliseconds => temporal_conversions::timestamp_ms_to_datetime(v),
};
match tz {
None => write!(f, "{}", ndt.format(datetime_format)),
Some(tz) => {
write!(f, "{}", PlTzAware::new(ndt, tz))
let formatted = match tz {
#[cfg(feature = "timezones")]
Some(tz) => match tz.parse::<Tz>() {
Ok(parsed_tz) => parsed_tz.from_utc_datetime(&ndt).format(datetime_format),
Err(_) => match temporal_conversions::parse_offset(tz) {
Ok(parsed_tz) => parsed_tz.from_utc_datetime(&ndt).format(datetime_format),
Err(_) => unreachable!(),
},
},
#[cfg(not(feature = "timezones"))]
Some(_) => {
panic!("activate 'timezones' feature");
}
}
_ => ndt.format(datetime_format),
};
write!(f, "{formatted}")
}
#[cfg(feature = "dtype-time")]
AnyValue::Time(v) => {
Expand Down Expand Up @@ -186,24 +199,36 @@ pub(crate) fn write<W: Write>(
if options.datetime_format.is_none() {
for col in df.get_columns() {
match col.dtype() {
DataType::Datetime(TimeUnit::Microseconds, _)
DataType::Datetime(TimeUnit::Milliseconds, tz)
// lowest precision; only set if it's not been inferred yet
if options.datetime_format.is_none() =>
{
options.datetime_format = Some("%FT%H:%M:%S.%6f".to_string());
options.datetime_format = match tz{
Some(_) => Some("%FT%H:%M:%S.%3f%z".to_string()),
None => Some("%FT%H:%M:%S.%3f".to_string()),
};
}
DataType::Datetime(TimeUnit::Nanoseconds, _) => {
options.datetime_format = Some("%FT%H:%M:%S.%9f".to_string());
DataType::Datetime(TimeUnit::Microseconds, tz) => {
options.datetime_format = match tz{
Some(_) => Some("%FT%H:%M:%S.%6f%z".to_string()),
None => Some("%FT%H:%M:%S.%6f".to_string()),
};
}
DataType::Datetime(TimeUnit::Nanoseconds, tz) => {
options.datetime_format = match tz {
Some(_) => Some("%FT%H:%M:%S.%9f%z".to_string()),
None => Some("%FT%H:%M:%S.%9f".to_string()),
};
break; // highest precision; no need to check further
}
_ => {}
}
}
// if still not set, no cols require higher precision than "ms" (or no datetime cols)
if options.datetime_format.is_none() {
options.datetime_format = Some("%FT%H:%M:%S.%3f".to_string());
}
}
let datetime_format: &str = options.datetime_format.as_ref().unwrap();
let datetime_format: &str = match &options.datetime_format {
Some(datetime_format) => datetime_format,
None => "%FT%H:%M:%S.%9f",
};

let len = df.height();
let n_threads = POOL.current_num_threads();
Expand Down
2 changes: 2 additions & 0 deletions py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 20 additions & 2 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import tempfile
import textwrap
import zlib
from datetime import date, datetime, time
from datetime import date, datetime, time, timedelta, timezone
from pathlib import Path

import pytest
Expand Down Expand Up @@ -860,6 +860,24 @@ def test_datetime_format(fmt: str, expected: str) -> None:
assert csv == expected


@pytest.mark.parametrize(
    ("fmt", "expected"),
    [
        # fmt=None: no explicit format is passed, so the writer picks one itself.
        (None, "dt\n2022-01-02T00:00:00.000000+0000\n"),
        ("%F %T%.3f%z", "dt\n2022-01-02 00:00:00.000+0000\n"),
        ("%Y%z", "dt\n2022+0000\n"),
        ("%m%z", "dt\n01+0000\n"),
        ("%m$%d%z", "dt\n01$02+0000\n"),
        ("%R%z", "dt\n00:00+0000\n"),
    ],
)
@pytest.mark.parametrize("tzinfo", [timezone.utc, timezone(timedelta(hours=0))])
def test_datetime_format_tz_aware(
    fmt: "str | None", expected: str, tzinfo: timezone
) -> None:
    """Check the CSV text produced for a time-zone-aware datetime column.

    Parameters
    ----------
    fmt
        Value forwarded to ``write_csv(datetime_format=...)``. ``None`` is one
        of the parametrized cases, hence the optional annotation (quoted so it
        is never evaluated at definition time).
    expected
        Exact CSV output (header line plus one row) the writer must return.
    tzinfo
        Zero-offset time zone attached to the input datetime; both variants
        are expected to render the offset as ``+0000``.
    """
    df = pl.DataFrame({"dt": [datetime(2022, 1, 2, tzinfo=tzinfo)]})
    csv = df.write_csv(datetime_format=fmt)
    assert csv == expected


@pytest.mark.parametrize(
("tu1", "tu2", "expected"),
[
Expand Down Expand Up @@ -1094,7 +1112,7 @@ def test_csv_write_tz_aware() -> None:
df = pl.DataFrame({"times": datetime(2021, 1, 1)}).with_columns(
pl.col("times").dt.cast_time_zone("UTC").dt.with_time_zone("Europe/Zurich")
)
assert df.write_csv() == "times\n2021-01-01 01:00:00 CET\n"
assert df.write_csv() == "times\n2021-01-01T01:00:00.000000+0100\n"


def test_csv_statistics_offset() -> None:
Expand Down

0 comments on commit 9e298e2

Please sign in to comment.