Skip to content

Commit

Permalink
apacheGH-37411: [C++][Python] Add string -> date cast kernel (fix pyt…
Browse files Browse the repository at this point in the history
…hon scalar cast) (apache#38038)

### Rationale for this change

Adding `string -> date32/date64` cast kernels, which then also fixes the pyarrow scalar cast method (which was earlier refactored to rely on the general cast kernels)

* Closes: apache#37411

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
jorisvandenbossche authored and loicalleyne committed Nov 13, 2023
1 parent 3bfb290 commit 109f8d1
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 32 deletions.
3 changes: 2 additions & 1 deletion cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,8 @@ struct ParseString {
};

template <typename O, typename I>
struct CastFunctor<O, I, enable_if_base_binary<I>> {
struct CastFunctor<
O, I, enable_if_t<(is_number_type<O>::value && is_base_binary_type<I>::value)>> {
static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
return applicator::ScalarUnaryNotNull<O, I, ParseString<O>>::Exec(ctx, batch, out);
}
Expand Down
49 changes: 49 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
namespace arrow {

using internal::ParseTimestampISO8601;
using internal::ParseYYYY_MM_DD;

namespace compute {
namespace internal {
Expand Down Expand Up @@ -451,6 +452,44 @@ struct CastFunctor<TimestampType, I, enable_if_t<is_base_binary_type<I>::value>>
}
};

template <typename DateType>
struct ParseDate {
using value_type = typename DateType::c_type;

using duration_type =
typename std::conditional<std::is_same<DateType, Date32Type>::value,
arrow_vendored::date::days,
std::chrono::milliseconds>::type;

template <typename OutValue, typename Arg0Value>
OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
OutValue result = OutValue(0);

if (ARROW_PREDICT_FALSE(val.size() != 10)) {
*st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
TypeTraits<DateType>::type_singleton()->ToString());
return result;
}

duration_type since_epoch;
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(val.data(), &since_epoch))) {
*st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
TypeTraits<DateType>::type_singleton()->ToString());
} else {
result = static_cast<value_type>(since_epoch.count());
}
return result;
}
};

template <typename O, typename I>
struct CastFunctor<O, I,
enable_if_t<(is_date_type<O>::value && is_string_type<I>::value)>> {
static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
return applicator::ScalarUnaryNotNull<O, I, ParseDate<O>>::Exec(ctx, batch, out);
}
};

template <typename Type>
void AddCrossUnitCast(CastFunction* func) {
ScalarKernel kernel;
Expand Down Expand Up @@ -483,6 +522,11 @@ std::shared_ptr<CastFunction> GetDate32Cast() {
// timestamp -> date32
AddSimpleCast<TimestampType, Date32Type>(InputType(Type::TIMESTAMP), date32(),
func.get());

// string -> date32
AddSimpleCast<StringType, Date32Type>(utf8(), date32(), func.get());
AddSimpleCast<LargeStringType, Date32Type>(large_utf8(), date32(), func.get());

return func;
}

Expand All @@ -500,6 +544,11 @@ std::shared_ptr<CastFunction> GetDate64Cast() {
// timestamp -> date64
AddSimpleCast<TimestampType, Date64Type>(InputType(Type::TIMESTAMP), date64(),
func.get());

// string -> date64
AddSimpleCast<StringType, Date64Type>(utf8(), date64(), func.get());
AddSimpleCast<LargeStringType, Date64Type>(large_utf8(), date64(), func.get());

return func;
}

Expand Down
21 changes: 19 additions & 2 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@ TEST(Cast, CanCast) {
ExpectCannotCast(from_base_binary, {null()});
}

ExpectCanCast(utf8(), {timestamp(TimeUnit::MILLI)});
ExpectCanCast(large_utf8(), {timestamp(TimeUnit::NANO)});
ExpectCanCast(utf8(), {timestamp(TimeUnit::MILLI), date32(), date64()});
ExpectCanCast(large_utf8(), {timestamp(TimeUnit::NANO), date32(), date64()});
ExpectCannotCast(timestamp(TimeUnit::MICRO),
{binary(), large_binary()}); // no formatting supported

Expand Down Expand Up @@ -2016,6 +2016,23 @@ TEST(Cast, StringToTimestamp) {
}
}

TEST(Cast, StringToDate) {
for (auto string_type : {utf8(), large_utf8()}) {
auto strings = ArrayFromJSON(string_type, R"(["1970-01-01", null, "2000-02-29"])");

CheckCast(strings, ArrayFromJSON(date32(), "[0, null, 11016]"));
CheckCast(strings, ArrayFromJSON(date64(), "[0, null, 951782400000]"));

for (auto date_type : {date32(), date64()}) {
for (std::string not_ts : {"", "2012-01-xx", "2012-01-01 09:00:00"}) {
auto options = CastOptions::Safe(date_type);
CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_ts + "\"]"), options);
}
}
// NOTE: YYYY-MM-DD parsing is tested comprehensively in value_parsing_test.cc
}
}

static void AssertBinaryZeroCopy(std::shared_ptr<Array> lhs, std::shared_ptr<Array> rhs) {
// null bitmap and data buffers are always zero-copied
AssertBufferSame(*lhs, *rhs, 0);
Expand Down
58 changes: 29 additions & 29 deletions cpp/src/arrow/util/value_parsing.h
Original file line number Diff line number Diff line change
Expand Up @@ -443,33 +443,6 @@ namespace detail {

using ts_type = TimestampType::c_type;

template <typename Duration>
static inline bool ParseYYYY_MM_DD(const char* s, Duration* since_epoch) {
uint16_t year = 0;
uint8_t month = 0;
uint8_t day = 0;
if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
return false;
}
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 4, &year))) {
return false;
}
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 5, 2, &month))) {
return false;
}
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 8, 2, &day))) {
return false;
}
arrow_vendored::date::year_month_day ymd{arrow_vendored::date::year{year},
arrow_vendored::date::month{month},
arrow_vendored::date::day{day}};
if (ARROW_PREDICT_FALSE(!ymd.ok())) return false;

*since_epoch = std::chrono::duration_cast<Duration>(
arrow_vendored::date::sys_days{ymd}.time_since_epoch());
return true;
}

template <typename Duration>
static inline bool ParseHH(const char* s, Duration* out) {
uint8_t hours = 0;
Expand Down Expand Up @@ -641,6 +614,33 @@ static inline bool ParseSubSeconds(const char* s, size_t length, TimeUnit::type

} // namespace detail

template <typename Duration>
static inline bool ParseYYYY_MM_DD(const char* s, Duration* since_epoch) {
uint16_t year = 0;
uint8_t month = 0;
uint8_t day = 0;
if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
return false;
}
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 4, &year))) {
return false;
}
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 5, 2, &month))) {
return false;
}
if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 8, 2, &day))) {
return false;
}
arrow_vendored::date::year_month_day ymd{arrow_vendored::date::year{year},
arrow_vendored::date::month{month},
arrow_vendored::date::day{day}};
if (ARROW_PREDICT_FALSE(!ymd.ok())) return false;

*since_epoch = std::chrono::duration_cast<Duration>(
arrow_vendored::date::sys_days{ymd}.time_since_epoch());
return true;
}

static inline bool ParseTimestampISO8601(const char* s, size_t length,
TimeUnit::type unit, TimestampType::c_type* out,
bool* out_zone_offset_present = NULLPTR) {
Expand Down Expand Up @@ -672,7 +672,7 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length,
if (ARROW_PREDICT_FALSE(length < 10)) return false;

seconds_type seconds_since_epoch;
if (ARROW_PREDICT_FALSE(!detail::ParseYYYY_MM_DD(s, &seconds_since_epoch))) {
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &seconds_since_epoch))) {
return false;
}

Expand Down Expand Up @@ -843,7 +843,7 @@ struct StringConverter<DATE_TYPE, enable_if_date<DATE_TYPE>> {
}

duration_type since_epoch;
if (ARROW_PREDICT_FALSE(!detail::ParseYYYY_MM_DD(s, &since_epoch))) {
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &since_epoch))) {
return false;
}

Expand Down
7 changes: 7 additions & 0 deletions python/pyarrow/tests/test_scalars.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,13 @@ def test_cast_int_to_float():
int_scalar.cast(pa.float64()) # verify default is safe cast


@pytest.mark.parametrize("typ", [pa.date32(), pa.date64()])
def test_cast_string_to_date(typ):
scalar = pa.scalar('2021-01-01')
result = scalar.cast(typ)
assert result == pa.scalar(datetime.date(2021, 1, 1), type=typ)


@pytest.mark.pandas
def test_timestamp():
import pandas as pd
Expand Down

0 comments on commit 109f8d1

Please sign in to comment.