Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Spark CAST(string as boolean) #10382

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions velox/docs/functions/spark/conversion.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,42 @@ Valid examples
SELECT cast(cast(2147483648.90 as DECIMAL(12, 2)) as integer); -- -2147483648
SELECT cast(cast(2147483648.90 as DECIMAL(12, 2)) as bigint); -- 2147483648

Cast to Boolean
---------------

From VARCHAR
^^^^^^^^^^^^

The strings `t, f, y, n, 1, 0, yes, no, true, false` and their upper case equivalents are allowed to be casted to boolean.
Casting from other strings to boolean throws.

Valid examples

::

SELECT cast('1' as boolean); -- true
SELECT cast('0' as boolean); -- false
SELECT cast('t' as boolean); -- true (case insensitive)
SELECT cast('true' as boolean); -- true (case insensitive)
SELECT cast('f' as boolean); -- false (case insensitive)
SELECT cast('false' as boolean); -- false (case insensitive)
SELECT cast('y' as boolean); -- true (case insensitive)
SELECT cast('yes' as boolean); -- true (case insensitive)
SELECT cast('n' as boolean); -- false (case insensitive)
SELECT cast('no' as boolean); -- false (case insensitive)

Invalid examples

::

SELECT cast('1.7E308' as boolean); -- Invalid argument
SELECT cast('nan' as boolean); -- Invalid argument
SELECT cast('infinity' as boolean); -- Invalid argument
SELECT cast('12' as boolean); -- Invalid argument
SELECT cast('-1' as boolean); -- Invalid argument
SELECT cast('tr' as boolean); -- Invalid argument
SELECT cast('tru' as boolean); -- Invalid argument

Cast to String
--------------

Expand Down
36 changes: 20 additions & 16 deletions velox/functions/sparksql/tests/SparkCastExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,26 +267,23 @@ TEST_F(SparkCastExprTest, primitiveInvalidCornerCases) {
// To boolean.
{
testInvalidCast<std::string>(
"boolean",
{"1.7E308"},
"Non-whitespace character found after end of conversion");
"boolean", {"1.7E308"}, "Cannot cast 1.7E308 to BOOLEAN");
testInvalidCast<std::string>(
"boolean",
{"nan"},
"Non-whitespace character found after end of conversion");
"boolean", {"nan"}, "Cannot cast nan to BOOLEAN");
testInvalidCast<std::string>(
"boolean", {"infinity"}, "Invalid value for bool");
"boolean", {"infinity"}, "Cannot cast infinity to BOOLEAN");
testInvalidCast<std::string>(
"boolean", {"12"}, "Integer overflow when parsing bool");
testInvalidCast<std::string>("boolean", {"-1"}, "Invalid value for bool");
"boolean", {"12"}, "Cannot cast 12 to BOOLEAN");
testInvalidCast<std::string>(
"boolean",
{"tr"},
"Non-whitespace character found after end of conversion");
"boolean", {"-1"}, "Cannot cast -1 to BOOLEAN");
testInvalidCast<std::string>(
"boolean",
{"tru"},
"Non-whitespace character found after end of conversion");
"boolean", {"tr"}, "Cannot cast tr to BOOLEAN");
testInvalidCast<std::string>(
"boolean", {"tru"}, "Cannot cast tru to BOOLEAN");
testInvalidCast<std::string>(
"boolean", {"on"}, "Cannot cast on to BOOLEAN");
testInvalidCast<std::string>(
"boolean", {"off"}, "Cannot cast off to BOOLEAN");
}
}

Expand Down Expand Up @@ -352,9 +349,16 @@ TEST_F(SparkCastExprTest, primitiveValidCornerCases) {
testCast<double, bool>("boolean", {0.0000000000001}, {true});

testCast<std::string, bool>("boolean", {"1"}, {true});
testCast<std::string, bool>("boolean", {"0"}, {false});
testCast<std::string, bool>("boolean", {"t"}, {true});
testCast<std::string, bool>("boolean", {"y"}, {true});
testCast<std::string, bool>("boolean", {"yes"}, {true});
testCast<std::string, bool>("boolean", {"true"}, {true});

testCast<std::string, bool>("boolean", {"0"}, {false});
testCast<std::string, bool>("boolean", {"f"}, {false});
testCast<std::string, bool>("boolean", {"n"}, {false});
testCast<std::string, bool>("boolean", {"no"}, {false});
testCast<std::string, bool>("boolean", {"false"}, {false});
}

// To string.
Expand Down
32 changes: 0 additions & 32 deletions velox/type/Conversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,35 +23,3 @@ DEFINE_bool(
" format of type conversions used for casting. This is a temporary solution"
" that aims to facilitate a seamless transition for users who rely on the"
" legacy behavior and hence can change in the future.");

namespace facebook::velox::util {

// This is based on Presto java's castToBoolean method.
Expected<bool> castToBoolean(const char* data, size_t len) {
const auto& TU = static_cast<int (*)(int)>(std::toupper);

if (len == 1) {
auto character = TU(data[0]);
if (character == 'T' || character == '1') {
return true;
}
if (character == 'F' || character == '0') {
return false;
}
}

if ((len == 4) && (TU(data[0]) == 'T') && (TU(data[1]) == 'R') &&
(TU(data[2]) == 'U') && (TU(data[3]) == 'E')) {
return true;
}

if ((len == 5) && (TU(data[0]) == 'F') && (TU(data[1]) == 'A') &&
(TU(data[2]) == 'L') && (TU(data[3]) == 'S') && (TU(data[4]) == 'E')) {
return false;
}

const std::string errorMessage =
fmt::format("Cannot cast {} to BOOLEAN", StringView(data, len));
return folly::makeUnexpected(Status::UserError(errorMessage));
}
} // namespace facebook::velox::util
78 changes: 58 additions & 20 deletions velox/type/Conversions.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,61 @@ struct Converter {
}
};

/// Presto compatible cast of strings to boolean. There is a set of strings
/// allowed to be casted to boolean. These strings are `t, f, 1, 0, true, false`
/// and their upper case equivalents. Casting from other strings to boolean
/// throws.
Expected<bool> castToBoolean(const char* data, size_t len);
/// Casts a string to a boolean. Allows a fixed set of strings. Casting from
/// other strings throws.
/// @tparam TPolicy PrestoCastPolicy and LegacyCastPolicy allow `t, f, 1, 0,
/// true, false` and their upper case equivalents. SparkCastPolicy additionally
/// allows 'y, n, yes, no' and their upper case equivalents.
template <typename TPolicy>
Expected<bool> castToBoolean(const char* data, size_t len) {
const auto& TU = static_cast<int (*)(int)>(std::toupper);

if (len == 1) {
auto character = TU(data[0]);
if (character == 'T' || character == '1') {
return true;
}
if (character == 'F' || character == '0') {
return false;
}
if constexpr (std::is_same_v<TPolicy, SparkCastPolicy>) {
if (character == 'Y') {
return true;
}
if (character == 'N') {
return false;
}
}
}

// Case-insensitive 'true'.
if ((len == 4) && (TU(data[0]) == 'T') && (TU(data[1]) == 'R') &&
rui-mo marked this conversation as resolved.
Show resolved Hide resolved
(TU(data[2]) == 'U') && (TU(data[3]) == 'E')) {
return true;
}

// Case-insensitive 'false'.
if ((len == 5) && (TU(data[0]) == 'F') && (TU(data[1]) == 'A') &&
(TU(data[2]) == 'L') && (TU(data[3]) == 'S') && (TU(data[4]) == 'E')) {
return false;
}

if constexpr (std::is_same_v<TPolicy, SparkCastPolicy>) {
// Case-insensitive 'yes'.
if ((len == 3) && (TU(data[0]) == 'Y') && (TU(data[1]) == 'E') &&
(TU(data[2]) == 'S')) {
return true;
}

// Case-insensitive 'no'.
if ((len == 2) && (TU(data[0]) == 'N') && (TU(data[1]) == 'O')) {
return false;
}
}

return folly::makeUnexpected(Status::UserError(
"Cannot cast {} to BOOLEAN", std::string_view(data, len)));
}

namespace detail {

Expand Down Expand Up @@ -99,27 +149,15 @@ struct Converter<TypeKind::BOOLEAN, void, TPolicy> {
}

static Expected<T> tryCast(folly::StringPiece v) {
if (std::is_same_v<TPolicy, PrestoCastPolicy>) {
return castToBoolean(v.data(), v.size());
}

return detail::callFollyTo<T>(v);
return castToBoolean<TPolicy>(v.data(), v.size());
}

static Expected<T> tryCast(const StringView& v) {
if (std::is_same_v<TPolicy, PrestoCastPolicy>) {
return castToBoolean(v.data(), v.size());
}

return detail::callFollyTo<T>(folly::StringPiece(v));
return castToBoolean<TPolicy>(v.data(), v.size());
}

static Expected<T> tryCast(const std::string& v) {
if (std::is_same_v<TPolicy, PrestoCastPolicy>) {
return castToBoolean(v.data(), v.length());
}

return detail::callFollyTo<T>(v);
return castToBoolean<TPolicy>(v.data(), v.length());
}

static Expected<T> tryCast(const bool& v) {
Expand Down
Loading