diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index b7287129cbc13..d992d073d518a 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -98,6 +98,44 @@ struct EnumTraits return ""; } }; +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "AssumeTimezoneOptions::Ambiguous"; } + static std::string value_name(compute::AssumeTimezoneOptions::Ambiguous value) { + switch (value) { + case compute::AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_RAISE: + return "AMBIGUOUS_RAISE"; + case compute::AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_EARLIEST: + return "AMBIGUOUS_EARLIEST"; + case compute::AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_LATEST: + return "AMBIGUOUS_LATEST"; + } + return ""; + } +}; +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "AssumeTimezoneOptions::Nonexistent"; } + static std::string value_name(compute::AssumeTimezoneOptions::Nonexistent value) { + switch (value) { + case compute::AssumeTimezoneOptions::Nonexistent::NONEXISTENT_RAISE: + return "NONEXISTENT_RAISE"; + case compute::AssumeTimezoneOptions::Nonexistent::NONEXISTENT_EARLIEST: + return "NONEXISTENT_EARLIEST"; + case compute::AssumeTimezoneOptions::Nonexistent::NONEXISTENT_LATEST: + return "NONEXISTENT_LATEST"; + } + return ""; + } +}; } // namespace internal namespace compute { @@ -147,6 +185,10 @@ static auto kStrptimeOptionsType = GetFunctionOptionsType( DataMember("unit", &StrptimeOptions::unit)); static auto kStrftimeOptionsType = GetFunctionOptionsType( DataMember("format", &StrftimeOptions::format)); +static auto kAssumeTimezoneOptionsType = GetFunctionOptionsType( + DataMember("timezone", &AssumeTimezoneOptions::timezone), + DataMember("ambiguous", &AssumeTimezoneOptions::ambiguous), + DataMember("nonexistent", &AssumeTimezoneOptions::nonexistent)); static auto kPadOptionsType = GetFunctionOptionsType( DataMember("width", 
&PadOptions::width), DataMember("padding", &PadOptions::padding)); static auto kTrimOptionsType = GetFunctionOptionsType( @@ -250,6 +292,15 @@ StrftimeOptions::StrftimeOptions() : StrftimeOptions(kDefaultFormat) {} constexpr char StrftimeOptions::kTypeName[]; constexpr const char* StrftimeOptions::kDefaultFormat; +AssumeTimezoneOptions::AssumeTimezoneOptions(std::string timezone, Ambiguous ambiguous, + Nonexistent nonexistent) + : FunctionOptions(internal::kAssumeTimezoneOptionsType), + timezone(std::move(timezone)), + ambiguous(ambiguous), + nonexistent(nonexistent) {} +AssumeTimezoneOptions::AssumeTimezoneOptions() : AssumeTimezoneOptions("UTC") {} +constexpr char AssumeTimezoneOptions::kTypeName[]; + PadOptions::PadOptions(int64_t width, std::string padding) : FunctionOptions(internal::kPadOptionsType), width(width), @@ -311,6 +362,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kStrftimeOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kAssumeTimezoneOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType)); @@ -512,6 +564,11 @@ Result DayOfWeek(const Datum& arg, DayOfWeekOptions options, ExecContext* return CallFunction("day_of_week", {arg}, &options, ctx); } +Result AssumeTimezone(const Datum& arg, AssumeTimezoneOptions options, + ExecContext* ctx) { + return CallFunction("assume_timezone", {arg}, &options, ctx); +} + Result Strftime(const Datum& arg, StrftimeOptions options, ExecContext* ctx) { return CallFunction("strftime", {arg}, &options, ctx); } diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index e959884b23320..c50803a045691 100644 --- a/cpp/src/arrow/compute/api_scalar.h 
+++ b/cpp/src/arrow/compute/api_scalar.h @@ -29,6 +29,7 @@ #include "arrow/result.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" +#include "arrow/vendored/datetime.h" namespace arrow { namespace compute { @@ -278,6 +279,40 @@ struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions { uint32_t week_start; }; +/// Used to control timestamp timezone conversion and handling ambiguous/nonexistent +/// times. +struct ARROW_EXPORT AssumeTimezoneOptions : public FunctionOptions { + public: + /// \brief How to interpret ambiguous local times that can be interpreted as + /// multiple instants (normally two) due to DST shifts. + /// + /// AMBIGUOUS_EARLIEST emits the earliest instant amongst possible interpretations. + /// AMBIGUOUS_LATEST emits the latest instant amongst possible interpretations. + enum Ambiguous { AMBIGUOUS_RAISE, AMBIGUOUS_EARLIEST, AMBIGUOUS_LATEST }; + + /// \brief How to handle local times that do not exist due to DST shifts. + /// + /// NONEXISTENT_EARLIEST emits the instant "just before" the DST shift instant + /// in the given timestamp precision (for example, for a nanoseconds precision + /// timestamp, this is one nanosecond before the DST shift instant). + /// NONEXISTENT_LATEST emits the DST shift instant. + enum Nonexistent { NONEXISTENT_RAISE, NONEXISTENT_EARLIEST, NONEXISTENT_LATEST }; + + explicit AssumeTimezoneOptions(std::string timezone, + Ambiguous ambiguous = AMBIGUOUS_RAISE, + Nonexistent nonexistent = NONEXISTENT_RAISE); + AssumeTimezoneOptions(); + constexpr static char const kTypeName[] = "AssumeTimezoneOptions"; + + /// Timezone to convert timestamps from + std::string timezone; + + /// How to interpret ambiguous local times (due to DST shifts) + Ambiguous ambiguous; + /// How to interpret non-existent local times (due to DST shifts) + Nonexistent nonexistent; +}; + /// @} /// \brief Get the absolute value of a value. 
@@ -1025,5 +1060,21 @@ ARROW_EXPORT Result Subsecond(const Datum& values, ExecContext* ctx = NUL ARROW_EXPORT Result Strftime(const Datum& values, StrftimeOptions options, ExecContext* ctx = NULLPTR); +/// \brief Converts timestamps from local timestamp without a timezone to a timestamp with +/// timezone, interpreting the local timestamp as being in the specified timezone for each +/// element of `values` +/// +/// \param[in] values input to convert +/// \param[in] options for setting source timezone, exception and ambiguous timestamp +/// handling. +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 6.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result AssumeTimezone(const Datum& values, + AssumeTimezoneOptions options, + ExecContext* ctx = NULLPTR); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index d7ebdf3de1d31..ab41887ca35fb 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -15,17 +15,18 @@ // specific language governing permissions and limitations // under the License. 
+#include "arrow/compute/function.h" + +#include + #include #include #include -#include - #include "arrow/compute/api_aggregate.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/cast.h" -#include "arrow/compute/function.h" #include "arrow/compute/kernel.h" #include "arrow/datum.h" #include "arrow/status.h" @@ -80,6 +81,11 @@ TEST(FunctionOptions, Equality) { options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI)); options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::NANO)); options.emplace_back(new StrftimeOptions("%Y-%m-%dT%H:%M:%SZ", "C")); +#ifndef _WIN32 + options.emplace_back(new AssumeTimezoneOptions( + "Europe/Amsterdam", AssumeTimezoneOptions::Ambiguous::AMBIGUOUS_RAISE, + AssumeTimezoneOptions::Nonexistent::NONEXISTENT_RAISE)); +#endif options.emplace_back(new PadOptions(5, " ")); options.emplace_back(new PadOptions(10, "A")); options.emplace_back(new TrimOptions(" ")); diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 099bd95bbf238..01750d1f35919 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -290,9 +290,11 @@ class ARROW_EXPORT OutputType { enum ResolveKind { FIXED, COMPUTED }; /// Type resolution function. Given input types and shapes, return output - /// type and shape. This function SHOULD _not_ be used to check for arity, - /// that is to be performed one or more layers above. May make use of kernel - /// state to know what type to output in some cases. + /// type and shape. This function MAY use the kernel state to decide + /// the output type based on the function options. + /// + /// This function SHOULD _not_ be used to check for arity, that is to be + /// performed one or more layers above. 
using Resolver = std::function(KernelContext*, const std::vector&)>; @@ -304,7 +306,8 @@ class ARROW_EXPORT OutputType { /// \brief Output the exact type and shape provided by a ValueDescr OutputType(ValueDescr descr); // NOLINT implicit construction - explicit OutputType(Resolver resolver) + /// \brief Output a computed type depending on actual input types + OutputType(Resolver resolver) // NOLINT implicit construction : kind_(COMPUTED), resolver_(std::move(resolver)) {} OutputType(const OutputType& other) { diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_temporal.cc index b60e4ef71f0c9..bb67489690ea8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal.cc @@ -16,6 +16,7 @@ // under the License. #include +#include #include #include "arrow/builder.h" @@ -49,6 +50,7 @@ using arrow_vendored::date::weekday; using arrow_vendored::date::weeks; using arrow_vendored::date::year_month_day; using arrow_vendored::date::years; +using arrow_vendored::date::zoned_time; using arrow_vendored::date::literals::dec; using arrow_vendored::date::literals::jan; using arrow_vendored::date::literals::last; @@ -59,6 +61,7 @@ using internal::applicator::SimpleUnary; using DayOfWeekState = OptionsWrapper; using StrftimeState = OptionsWrapper; +using AssumeTimezoneState = OptionsWrapper; const std::shared_ptr& IsoCalendarType() { static auto type = struct_({field("iso_year", int64()), field("iso_week", int64()), @@ -205,6 +208,28 @@ struct TemporalComponentExtractDayOfWeek } }; +template