diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index 8a18d6feab62..ffc45745d2ea 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -80,6 +80,21 @@ class QueryConfig { /// decimal part, otherwise rounds. static constexpr const char* kCastToIntByTruncate = "cast_to_int_by_truncate"; + /// If set, cast from string to date allows only ISO 8601 formatted strings: + /// [+-](YYYY-MM-DD). Otherwise, allows all patterns supported by Spark: + /// `[+-]yyyy*` + /// `[+-]yyyy*-[m]m` + /// `[+-]yyyy*-[m]m-[d]d` + /// `[+-]yyyy*-[m]m-[d]d *` + /// `[+-]yyyy*-[m]m-[d]dT*` + /// The asterisk `*` in `yyyy*` stands for any numbers. + /// For the last two patterns, the trailing `*` can represent none or any + /// sequence of characters, e.g: + /// "1970-01-01 123" + /// "1970-01-01 (BC)" + static constexpr const char* kCastStringToDateIsIso8601 = + "cast_string_to_date_is_iso_8601"; + /// Used for backpressure to block local exchange producers when the local /// exchange buffer reaches or exceeds this size. static constexpr const char* kMaxLocalExchangeBufferSize = @@ -336,6 +351,10 @@ class QueryConfig { return get(kCastToIntByTruncate, false); } + bool isIso8601() const { + return get(kCastStringToDateIsIso8601, true); + } + bool codegenEnabled() const { return get(kCodegenEnabled, false); } diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index 24af5eec3da5..b7519f9f9c6f 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -82,6 +82,8 @@ Generic Configuration - 1000 - The minimum number of table rows that can trigger the parallel hash join table build. +.. _expression-evaluation-conf: + Expression Evaluation Configuration ----------------------------------- .. list-table:: @@ -109,6 +111,21 @@ Expression Evaluation Configuration - bool - false - This flags forces the cast from float/double/decimal/string to integer to be performed by truncating the decimal part instead of rounding. 
+ * - cast_string_to_date_is_iso_8601 + - bool + - true + - If set, cast from string to date allows only ISO 8601 formatted strings: ``[+-](YYYY-MM-DD)``. + Otherwise, allows all patterns supported by Spark: + * ``[+-]yyyy*`` + * ``[+-]yyyy*-[m]m`` + * ``[+-]yyyy*-[m]m-[d]d`` + * ``[+-]yyyy*-[m]m-[d]d *`` + * ``[+-]yyyy*-[m]m-[d]dT*`` + The asterisk ``*`` in ``yyyy*`` stands for any numbers. + For the last two patterns, the trailing ``*`` can represent none or any sequence of characters, e.g.: + * "1970-01-01 123" + * "1970-01-01 (BC)" + Regardless of this setting's value, leading spaces will be trimmed. Memory Management ----------------- diff --git a/velox/docs/functions/presto/conversion.rst b/velox/docs/functions/presto/conversion.rst index f4e78aede27d..40f842481fe7 100644 --- a/velox/docs/functions/presto/conversion.rst +++ b/velox/docs/functions/presto/conversion.rst @@ -582,20 +582,55 @@ Cast to Date From strings ^^^^^^^^^^^^ -Casting from a string to date is allowed if the string represents a date in the -format `YYYY-MM-DD`. Casting from invalid input values throws. +By default, only ISO 8601 strings are supported: `[+-]YYYY-MM-DD`. -Valid example +If cast_string_to_date_is_iso_8601 is set to false, all Spark-supported patterns are allowed. +See the documentation for cast_string_to_date_is_iso_8601 in :ref:`Expression Evaluation Configuration <expression-evaluation-conf>` +for the full list of supported patterns. + +Casting from invalid input values throws. 
+ +Valid examples + +**cast_string_to_date_is_iso_8601=true** :: SELECT cast('1970-01-01' as date); -- 1970-01-01 -Invalid example +**cast_string_to_date_is_iso_8601=false** + +:: + + SELECT cast('1970' as date); -- 1970-01-01 + SELECT cast('1970-01' as date); -- 1970-01-01 + SELECT cast('1970-01-01' as date); -- 1970-01-01 + SELECT cast('1970-01-01T123' as date); -- 1970-01-01 + SELECT cast('1970-01-01 ' as date); -- 1970-01-01 + SELECT cast('1970-01-01 (BC)' as date); -- 1970-01-01 + +Invalid examples + +**cast_string_to_date_is_iso_8601=true** + +:: + + SELECT cast('2012' as date); -- Invalid argument + SELECT cast('2012-10' as date); -- Invalid argument + SELECT cast('2012-10-23T123' as date); -- Invalid argument + SELECT cast('2012-10-23 (BC)' as date); -- Invalid argument + SELECT cast('2012-Oct-23' as date); -- Invalid argument + SELECT cast('2012/10/23' as date); -- Invalid argument + SELECT cast('2012.10.23' as date); -- Invalid argument + SELECT cast('2012-10-23 ' as date); -- Invalid argument + +**cast_string_to_date_is_iso_8601=false** :: SELECT cast('2012-Oct-23' as date); -- Invalid argument + SELECT cast('2012/10/23' as date); -- Invalid argument + SELECT cast('2012.10.23' as date); -- Invalid argument From TIMESTAMP ^^^^^^^^^^^^^^ diff --git a/velox/expression/CastExpr.cpp b/velox/expression/CastExpr.cpp index 48c6cedf603f..ab5fd4a31aa9 100644 --- a/velox/expression/CastExpr.cpp +++ b/velox/expression/CastExpr.cpp @@ -103,14 +103,14 @@ VectorPtr CastExpr::castToDate( switch (fromType->kind()) { case TypeKind::VARCHAR: { auto* inputVector = input.as>(); + const auto& queryConfig = context.execCtx()->queryCtx()->queryConfig(); + auto isIso8601 = queryConfig.isIso8601(); applyToSelectedNoThrowLocal(context, rows, castResult, [&](int row) { try { auto inputString = inputVector->valueAt(row); - resultFlatVector->set(row, DATE()->toDays(inputString)); - } catch (const VeloxException& ue) { - if (!ue.isUserError()) { - throw; - } + resultFlatVector->set( + 
row, util::castFromDateString(inputString, isIso8601)); + } catch (const VeloxUserError& ue) { VELOX_USER_FAIL( makeErrorMessage(input, row, DATE()) + " " + ue.message()); } catch (const std::exception& e) { diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp index d68ec7fc431b..5486561add51 100644 --- a/velox/expression/tests/CastExprTest.cpp +++ b/velox/expression/tests/CastExprTest.cpp @@ -71,6 +71,12 @@ class CastExprTest : public functions::test::CastBaseTest { }); } + void setCastStringToDateIsIso8601(bool value) { + queryCtx_->testingOverrideConfigUnsafe({ + {core::QueryConfig::kCastStringToDateIsIso8601, std::to_string(value)}, + }); + } + std::shared_ptr makeConstantNullExpr(TypeKind kind) { return std::make_shared( createType(kind, {}), variant(kind)); @@ -672,49 +678,110 @@ TEST_F(CastExprTest, timestampAdjustToTimezoneInvalid) { } TEST_F(CastExprTest, date) { - std::vector> input{ - "1970-01-01", - "2020-01-01", - "2135-11-09", - "1969-12-27", - "1812-04-15", - "1920-01-02", - std::nullopt, - }; - std::vector> result{ - 0, - 18262, - 60577, - -5, - -57604, - -18262, - std::nullopt, - }; - - testCast( - "date", input, result, false, false, VARCHAR(), DATE()); + for (bool isIso8601 : {true, false}) { + setCastStringToDateIsIso8601(isIso8601); + testCast( + "date", + {"1970-01-01", + "2020-01-01", + "2135-11-09", + "1969-12-27", + "1812-04-15", + "1920-01-02", + "12345-12-18", + "1970-1-2", + "1970-01-2", + "1970-1-02", + "+1970-01-02", + "-1-1-1", + " 1970-01-01", + std::nullopt}, + {0, + 18262, + 60577, + -5, + -57604, + -18262, + 3789742, + 1, + 1, + 1, + 1, + -719893, + 0, + std::nullopt}, + false, + false, + VARCHAR(), + DATE()); + } - setCastIntByTruncate(true); + setCastStringToDateIsIso8601(false); testCast( - "date", input, result, false, false, VARCHAR(), DATE()); + "date", + {"12345", + "2015", + "2015-03", + "2015-03-18T", + "2015-03-18T123123", + "2015-03-18 123142", + "2015-03-18 (BC)"}, + {3789391, 
16436, 16495, 16512, 16512, 16512, 16512}, + false, + false, + VARCHAR(), + DATE()); } TEST_F(CastExprTest, invalidDate) { - testCast("date", {12}, {0}, true, false, TINYINT(), DATE()); - testCast( - "date", {1234}, {0}, true, false, SMALLINT(), DATE()); - testCast( - "date", {1234}, {0}, true, false, INTEGER(), DATE()); - testCast( - "date", {1234}, {0}, true, false, BIGINT(), DATE()); - - testCast("date", {12.99}, {0}, true, false, REAL(), DATE()); - testCast( - "date", {12.99}, {0}, true, false, DOUBLE(), DATE()); - - // Parsing an ill-formated date. + for (bool isIso8601 : {true, false}) { + setCastStringToDateIsIso8601(isIso8601); + + testCast( + "date", {12}, {0}, true, false, TINYINT(), DATE()); + testCast( + "date", {1234}, {0}, true, false, SMALLINT(), DATE()); + testCast( + "date", {1234}, {0}, true, false, INTEGER(), DATE()); + testCast( + "date", {1234}, {0}, true, false, BIGINT(), DATE()); + + testCast("date", {12.99}, {0}, true, false, REAL(), DATE()); + testCast( + "date", {12.99}, {0}, true, false, DOUBLE(), DATE()); + + // Parsing ill-formated dates. 
+ testCast( + "date", {"2012-Oct-23"}, {0}, true, false, VARCHAR(), DATE()); + testCast( + "date", {"2015-03-18X"}, {0}, true, false, VARCHAR(), DATE()); + testCast( + "date", {"2015/03/18"}, {0}, true, false, VARCHAR(), DATE()); + testCast( + "date", {"2015.03.18"}, {0}, true, false, VARCHAR(), DATE()); + testCast( + "date", {"20150318"}, {0}, true, false, VARCHAR(), DATE()); + testCast( + "date", {"2015-031-8"}, {0}, true, false, VARCHAR(), DATE()); + } + + setCastStringToDateIsIso8601(true); + testCast( + "date", {"12345"}, {0}, true, false, VARCHAR(), DATE()); + testCast( + "date", {"2015-03"}, {0}, true, false, VARCHAR(), DATE()); + testCast( + "date", {"2015-03-18 123412"}, {0}, true, false, VARCHAR(), DATE()); + testCast( + "date", {"2015-03-18T"}, {0}, true, false, VARCHAR(), DATE()); + testCast( + "date", {"2015-03-18T123412"}, {0}, true, false, VARCHAR(), DATE()); + testCast( + "date", {"2015-03-18 (BC)"}, {0}, true, false, VARCHAR(), DATE()); + testCast( + "date", {"1970-01-01 "}, {0}, true, false, VARCHAR(), DATE()); testCast( - "date", {"2012-Oct-23"}, {0}, true, false, VARCHAR(), DATE()); + "date", {" 1970-01-01 "}, {0}, true, false, VARCHAR(), DATE()); } TEST_F(CastExprTest, primitiveInvalidCornerCases) { diff --git a/velox/type/TimestampConversion.cpp b/velox/type/TimestampConversion.cpp index 6b110b6347b9..a867c20f2990 100644 --- a/velox/type/TimestampConversion.cpp +++ b/velox/type/TimestampConversion.cpp @@ -268,7 +268,6 @@ bool tryParseDateString( return false; } - // In standard-cast mode, no more trailing characters. if (mode == ParseMode::kStandardCast) { daysSinceEpoch = daysSinceEpochFromDate(year, month, day); @@ -278,7 +277,7 @@ bool tryParseDateString( return false; } - // In non-standard cast mode, any optional trailing 'T' or spaces followed + // In non-standard cast mode, an optional trailing 'T' or space followed // by any optional characters are valid patterns. 
if (mode == ParseMode::kNonStandardCast) { daysSinceEpoch = daysSinceEpochFromDate(year, month, day); @@ -586,26 +585,26 @@ int64_t fromDateString(const char* str, size_t len) { return daysSinceEpoch; } -int32_t -castFromDateString(const char* str, size_t len, bool isNonStandardCast) { +int32_t castFromDateString(const char* str, size_t len, bool isIso8601) { int64_t daysSinceEpoch; size_t pos = 0; - auto mode = isNonStandardCast ? ParseMode::kNonStandardCast - : ParseMode::kStandardCast; + auto mode = + isIso8601 ? ParseMode::kStandardCast : ParseMode::kNonStandardCast; if (!tryParseDateString(str, len, pos, daysSinceEpoch, mode)) { - if (isNonStandardCast) { + if (isIso8601) { VELOX_USER_FAIL( "Unable to parse date value: \"{}\"." - "Valid date string patterns include " - "(YYYY, YYYY-MM, YYYY-MM-DD), and any pattern prefixed with [+-]", + "Valid date string pattern is (YYYY-MM-DD), " + "and can be prefixed with [+-]", std::string(str, len)); - } else { VELOX_USER_FAIL( "Unable to parse date value: \"{}\"." - "Valid date string pattern is (YYYY-MM-DD), " - "and can be prefixed with [+-]", + "Valid date string patterns include " + "(yyyy*, yyyy*-[m]m, yyyy*-[m]m-[d]d, " + "yyyy*-[m]m-[d]d *, yyyy*-[m]m-[d]dT*), " + "and any pattern prefixed with [+-]", std::string(str, len)); } } diff --git a/velox/type/TimestampConversion.h b/velox/type/TimestampConversion.h index 3ed38a01756a..0a4c3372190c 100644 --- a/velox/type/TimestampConversion.h +++ b/velox/type/TimestampConversion.h @@ -85,8 +85,8 @@ inline int64_t fromDateString(const StringView& str) { } /// Cast string to date. -/// When isNonStandardCast = false, only support "[+-]YYYY-MM-DD" format. -/// When isNonStandardCast = true, supported date formats include: +/// When isIso8601 = true, only support "[+-]YYYY-MM-DD" format (ISO 8601). 
+/// When isIso8601 = false, supported date formats include: /// /// `[+-]YYYY*` /// `[+-]YYYY*-[M]M` @@ -96,12 +96,10 @@ inline int64_t fromDateString(const StringView& str) { /// `[+-]YYYY*-[M]M-[D]DT*` /// /// Throws VeloxUserError if the format or date is invalid. -int32_t castFromDateString(const char* buf, size_t len, bool isNonStandardCast); +int32_t castFromDateString(const char* buf, size_t len, bool isIso8601); -inline int32_t castFromDateString( - const StringView& str, - bool isNonStandardCast) { - return castFromDateString(str.data(), str.size(), isNonStandardCast); +inline int32_t castFromDateString(const StringView& str, bool isIso8601) { + return castFromDateString(str.data(), str.size(), isIso8601); } // Extracts the day of the week from the number of days since epoch diff --git a/velox/type/tests/TimestampConversionTest.cpp b/velox/type/tests/TimestampConversionTest.cpp index 4879024f29f1..4763eb9d882e 100644 --- a/velox/type/tests/TimestampConversionTest.cpp +++ b/velox/type/tests/TimestampConversionTest.cpp @@ -16,8 +16,8 @@ #include "velox/type/TimestampConversion.h" #include -#include #include "velox/common/base/VeloxException.h" +#include "velox/common/base/tests/GTestUtils.h" #include "velox/external/date/tz.h" #include "velox/type/Timestamp.h" #include "velox/type/tz/TimeZoneMap.h" @@ -110,42 +110,76 @@ TEST(DateTimeUtilTest, fromDateStrInvalid) { } TEST(DateTimeUtilTest, castFromDateString) { - for (bool nonStandard : {true, false}) { - EXPECT_EQ(0, castFromDateString("1970-01-01", nonStandard)); - EXPECT_EQ(3789742, castFromDateString("12345-12-18", nonStandard)); + for (bool isIso8601 : {true, false}) { + EXPECT_EQ(0, castFromDateString("1970-01-01", isIso8601)); + EXPECT_EQ(3789742, castFromDateString("12345-12-18", isIso8601)); - EXPECT_EQ(1, castFromDateString("+1970-1-2", nonStandard)); - EXPECT_EQ(1, castFromDateString("+1970-01-2", nonStandard)); - EXPECT_EQ(1, castFromDateString("+1970-1-02", nonStandard)); + EXPECT_EQ(1, 
castFromDateString("1970-1-2", isIso8601)); + EXPECT_EQ(1, castFromDateString("1970-01-2", isIso8601)); + EXPECT_EQ(1, castFromDateString("1970-1-02", isIso8601)); - EXPECT_EQ(1, castFromDateString("+1970-01-02", nonStandard)); - EXPECT_EQ(-719893, castFromDateString("-1-1-1", nonStandard)); + EXPECT_EQ(1, castFromDateString("+1970-01-02", isIso8601)); + EXPECT_EQ(-719893, castFromDateString("-1-1-1", isIso8601)); + + EXPECT_EQ(0, castFromDateString(" 1970-01-01", isIso8601)); } - EXPECT_EQ(3789391, castFromDateString("12345", true)); - EXPECT_EQ(16495, castFromDateString("2015-03", true)); - EXPECT_EQ(16512, castFromDateString("2015-03-18 ", true)); - EXPECT_EQ(16512, castFromDateString("2015-03-18 123412", true)); - EXPECT_EQ(16512, castFromDateString("2015-03-18T", true)); - EXPECT_EQ(16512, castFromDateString("2015-03-18T123412", true)); + EXPECT_EQ(3789391, castFromDateString("12345", false)); + EXPECT_EQ(16436, castFromDateString("2015", false)); + EXPECT_EQ(16495, castFromDateString("2015-03", false)); + EXPECT_EQ(16512, castFromDateString("2015-03-18T", false)); + EXPECT_EQ(16512, castFromDateString("2015-03-18T123123", false)); + EXPECT_EQ(16512, castFromDateString("2015-03-18 123142", false)); + EXPECT_EQ(16512, castFromDateString("2015-03-18 (BC)", false)); + + EXPECT_EQ(0, castFromDateString("1970-01-01 ", false)); + EXPECT_EQ(0, castFromDateString(" 1970-01-01 ", false)); } TEST(DateTimeUtilTest, castFromDateStringInvalid) { - for (bool nonStandard : {true, false}) { - EXPECT_THROW( - castFromDateString("2015-03-18X", nonStandard), VeloxUserError); - EXPECT_THROW(castFromDateString("2015/03/18", nonStandard), VeloxUserError); - EXPECT_THROW(castFromDateString("2015.03.18", nonStandard), VeloxUserError); - EXPECT_THROW(castFromDateString("20150318", nonStandard), VeloxUserError); - EXPECT_THROW(castFromDateString("2015-031-8", nonStandard), VeloxUserError); + auto testCastFromDateStringInvalid = [&](const StringView& str, + bool isIso8601) { + if 
(isIso8601) { + VELOX_ASSERT_THROW( + castFromDateString(str, isIso8601), + fmt::format( + "Unable to parse date value: \"{}\"." + "Valid date string pattern is (YYYY-MM-DD), " + "and can be prefixed with [+-]", + std::string(str.data(), str.size()))); + } else { + VELOX_ASSERT_THROW( + castFromDateString(str, isIso8601), + fmt::format( + "Unable to parse date value: \"{}\"." + "Valid date string patterns include " + "(yyyy*, yyyy*-[m]m, yyyy*-[m]m-[d]d, " + "yyyy*-[m]m-[d]d *, yyyy*-[m]m-[d]dT*), " + "and any pattern prefixed with [+-]", + std::string(str.data(), str.size()))); + } + }; + + for (bool isIso8601 : {true, false}) { + testCastFromDateStringInvalid("2012-Oct-23", isIso8601); + testCastFromDateStringInvalid("2012-Oct-23", isIso8601); + testCastFromDateStringInvalid("2015-03-18X", isIso8601); + testCastFromDateStringInvalid("2015/03/18", isIso8601); + testCastFromDateStringInvalid("2015.03.18", isIso8601); + testCastFromDateStringInvalid("20150318", isIso8601); + testCastFromDateStringInvalid("2015-031-8", isIso8601); } - EXPECT_THROW(castFromDateString("12345", false), VeloxUserError); - EXPECT_THROW(castFromDateString("2015-03", false), VeloxUserError); - EXPECT_THROW(castFromDateString("2015-03-18 ", false), VeloxUserError); - EXPECT_THROW(castFromDateString("2015-03-18 123412", false), VeloxUserError); - EXPECT_THROW(castFromDateString("2015-03-18T", false), VeloxUserError); - EXPECT_THROW(castFromDateString("2015-03-18T123412", false), VeloxUserError); + testCastFromDateStringInvalid("12345", true); + testCastFromDateStringInvalid("2015", true); + testCastFromDateStringInvalid("2015-03", true); + testCastFromDateStringInvalid("2015-03-18 123412", true); + testCastFromDateStringInvalid("2015-03-18T", true); + testCastFromDateStringInvalid("2015-03-18T123412", true); + testCastFromDateStringInvalid("2015-03-18 (BC)", true); + + testCastFromDateStringInvalid("1970-01-01 ", true); + testCastFromDateStringInvalid(" 1970-01-01 ", true); } 
TEST(DateTimeUtilTest, fromTimeString) {