Skip to content

Commit

Permalink
ARROW-17425: [R] lubridate::as_datetime() in dplyr query should be …
Browse files Browse the repository at this point in the history
…able to handle time in sub seconds (#13890)

This change allows strings containing sub-seconds and double types to be used as input to `lubridate::as_datetime()`.

```r
1.5 |>
  arrow::arrow_table(x = _) |>
  dplyr::mutate(
    y = lubridate::as_datetime(x)
  ) |>
  dplyr::collect() |>
  dplyr::mutate(
    z = lubridate::as_datetime(x),
    is_equal = (y == z)
  )
#>     x                   y                   z is_equal
#> 1 1.5 1970-01-01 00:00:01 1970-01-01 00:00:01     TRUE
```

And, because the timestamp type generated by `as_datetime` is expected to be used in combination with other functions, fix the bug of ~~`as.Date` and~~ `lubridate::as_date` that could cause an error if a sub-seconds timestamp was entered.

Edit: as.Date fixed by #14935

As a breaking change, the return type of `as_datetime()` will be nanoseconds, but I hope this will not have a major impact, since originally `as_datetime() |> as.integer()` or `as_datetime() |> as.numeric()` could not be used because it would try to cast to int32 or double, resulting in an error.
(We have to cast timestamp to int64)

arrow 9.0.0

```r
1 |>
  arrow::arrow_table(x = _) |>
  dplyr::mutate(
    x = lubridate::as_datetime(x),
    y = cast(x, arrow::int64())
  ) |>
  dplyr::collect()
#>                     x y
#> 1 1970-01-01 00:00:01 1
```

This PR

``` r
1 |>
  arrow::arrow_table(x = _) |>
  dplyr::mutate(
    x = lubridate::as_datetime(x),
    y = cast(x, arrow::int64())
  ) |>
  dplyr::collect()
#>                     x          y
#> 1 1970-01-01 00:00:01 1000000000
```

Lead-authored-by: SHIMA Tatsuya <ts1s1andn@gmail.com>
Co-authored-by: Nic Crane <thisisnic@gmail.com>
Signed-off-by: Nic Crane <thisisnic@gmail.com>
  • Loading branch information
eitsupi and thisisnic authored Jan 11, 2023
1 parent 872bbda commit 157062d
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 14 deletions.
29 changes: 21 additions & 8 deletions r/R/dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,6 @@ register_bindings_datetime_conversion <- function() {
min = 0L,
sec = 0,
tz = "UTC") {

# ParseTimestampStrptime currently ignores the timezone information (ARROW-12820).
# Stop if tz other than 'UTC' is provided.
if (tz != "UTC") {
Expand All @@ -294,7 +293,6 @@ register_bindings_datetime_conversion <- function() {
min,
sec,
tz = "UTC") {

# NAs for seconds aren't propagated (but treated as 0) in the base version
sec <- call_binding(
"if_else",
Expand Down Expand Up @@ -367,7 +365,8 @@ register_bindings_datetime_conversion <- function() {
# => we only cast when we want the behaviour of the base version or when
# `tz` is set (i.e. not NULL)
if (call_binding("is.POSIXct", x) && !is.null(tz)) {
x <- cast(x, timestamp(timezone = tz))
unit <- if (inherits(x, "Expression")) x$type()$unit() else "s"
x <- cast(x, timestamp(unit = unit, timezone = tz))
}
binding_as_date(
x = x,
Expand All @@ -379,22 +378,36 @@ register_bindings_datetime_conversion <- function() {
register_binding("lubridate::as_datetime", function(x,
origin = "1970-01-01",
tz = "UTC",
format = NULL) {
format = NULL,
unit = "ns") {
# Arrow uses unit for time parsing, as_datetime() does not.
unit <- make_valid_time_unit(
unit,
c(valid_time64_units, valid_time32_units)
)

if (call_binding("is.integer", x)) {
x <- cast(x, int64())
}

if (call_binding("is.numeric", x)) {
delta <- cast(call_binding("difftime", origin, "1970-01-01"), int64())
multiple <- Expression$create("power_checked", 1000L, unit)
delta <- call_binding("difftime", origin, "1970-01-01")
delta <- cast(delta, int64())
delta <- Expression$create("multiply_checked", delta, multiple)
x <- Expression$create("multiply_checked", x, multiple)
x <- cast(x, int64())
x <- Expression$create("add_checked", x, delta)
}

if (call_binding("is.character", x) && !is.null(format)) {
# unit = 0L is the identifier for seconds in valid_time32_units
x <- Expression$create(
"strptime",
x,
options = list(format = format, unit = 0L, error_is_null = TRUE)
options = list(format = format, unit = unit, error_is_null = TRUE)
)
}
output <- cast(x, timestamp())
output <- cast(x, timestamp(unit = unit))
Expression$create("assume_timezone", output, options = list(timezone = tz))
})

Expand Down
61 changes: 55 additions & 6 deletions r/tests/testthat/test-dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -1989,6 +1989,7 @@ test_that("`as_datetime()`", {
test_df <- tibble(
date = as.Date(c("2022-03-22", "2021-07-30", NA)),
char_date = c("2022-03-22", "2021-07-30 14:32:47", NA),
char_date_subsec = c("1970-01-01T00:00:59.123456789", "2000-02-29T23:23:23.999999999", NA),
char_date_non_iso = c("2022-22-03 12:34:56", "2021-30-07 14:32:47", NA),
int_date = c(10L, 25L, NA),
integerish_date = c(10, 25, NA),
Expand All @@ -2002,24 +2003,72 @@ test_that("`as_datetime()`", {
ddate2 = lubridate::as_datetime(date),
dchar_date_no_tz = as_datetime(char_date),
dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas"),
dchar_date_subsec_no_tz = as_datetime(char_date_subsec),
dchar_date_subsec_with_tz = as_datetime(char_date_subsec, tz = "Pacific/Marquesas"),
dint_date = as_datetime(int_date, origin = "1970-01-02"),
dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"),
dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01")
dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01"),
ddouble_date = as_datetime(double_date)
) %>%
collect(),
test_df
)

# Arrow does not support conversion of double to date
# the below should error with an error message originating in the C++ code
expect_error(
expect_identical(
test_df %>%
arrow_table() %>%
mutate(
ddouble_date = as_datetime(double_date)
x = cast(as_datetime(double_date, unit = "ns"), int64()),
y = cast(as_datetime(double_date, unit = "us"), int64()),
z = cast(as_datetime(double_date, unit = "ms"), int64()),
.keep = "none"
) %>%
collect(),
regexp = "Float value 10.1 was truncated converting to int64"
tibble(
x = bit64::as.integer64(c(10100000000, 25200000000, NA)),
y = as.integer(c(10100000, 25200000, NA)),
z = as.integer(c(10100, 25200, NA))
)
)
})

test_that("as_datetime() works with other functions", {
test_df <- tibble(
char_date = c("2022-03-22", "2021-07-30 14:32:47", "1970-01-01 00:00:59.123456789", NA)
)

compare_dplyr_binding(
.input %>%
transmute(
ddchar_date = as_datetime(char_date),
ddchar_date_date32_1 = as.Date(ddchar_date),
ddchar_date_date32_2 = as_date(ddchar_date),
ddchar_date_floored = floor_date(ddchar_date, unit = "days")
) %>%
collect(),
test_df
)

# ARROW-17428 - Arrow does not support conversion of timestamp to int32
expect_error(
test_df %>%
arrow_table() %>%
mutate(
dchar_date = as_datetime(char_date),
dchar_date_int = as.integer(dchar_date)
) %>%
collect()
)

# ARROW-17428 - Arrow does not support conversion of timestamp to double
expect_error(
test_df %>%
arrow_table() %>%
mutate(
dchar_date = as_datetime(char_date),
dchar_date_num = as.numeric(dchar_date)
) %>%
collect()
)
})

Expand Down

0 comments on commit 157062d

Please sign in to comment.