Skip to content

Commit

Permalink
Fix Spark CAST(string as boolean) (#10382)
Browse files Browse the repository at this point in the history
Summary:
Spark only allows the strings `t, f, y, n, 1, 0, yes, no, true, false` and
their uppercase equivalents. Folly allows more strings than spark does, e.g.
'on' and 'off'. This PR restricts the strings that can be converted to boolean
to these strings. Strings apart from these will throw.

Pull Request resolved: #10382

Reviewed By: kgpai

Differential Revision: D59475498

Pulled By: pedroerp

fbshipit-source-id: 25df40fc23e84e27bbd3f19b819764aca2e562df
  • Loading branch information
rui-mo authored and facebook-github-bot committed Jul 9, 2024
1 parent c3839e1 commit 5926750
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 68 deletions.
36 changes: 36 additions & 0 deletions velox/docs/functions/spark/conversion.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,42 @@ Valid examples
SELECT cast(cast(2147483648.90 as DECIMAL(12, 2)) as integer); -- -2147483648
SELECT cast(cast(2147483648.90 as DECIMAL(12, 2)) as bigint); -- 2147483648

Cast to Boolean
---------------

From VARCHAR
^^^^^^^^^^^^

The strings `t, f, y, n, 1, 0, yes, no, true, false` and their upper case equivalents are allowed to be casted to boolean.
Casting from other strings to boolean throws.

Valid examples

::

SELECT cast('1' as boolean); -- true
SELECT cast('0' as boolean); -- false
SELECT cast('t' as boolean); -- true (case insensitive)
SELECT cast('true' as boolean); -- true (case insensitive)
SELECT cast('f' as boolean); -- false (case insensitive)
SELECT cast('false' as boolean); -- false (case insensitive)
SELECT cast('y' as boolean); -- true (case insensitive)
SELECT cast('yes' as boolean); -- true (case insensitive)
SELECT cast('n' as boolean); -- false (case insensitive)
SELECT cast('no' as boolean); -- false (case insensitive)

Invalid examples

::

SELECT cast('1.7E308' as boolean); -- Invalid argument
SELECT cast('nan' as boolean); -- Invalid argument
SELECT cast('infinity' as boolean); -- Invalid argument
SELECT cast('12' as boolean); -- Invalid argument
SELECT cast('-1' as boolean); -- Invalid argument
SELECT cast('tr' as boolean); -- Invalid argument
SELECT cast('tru' as boolean); -- Invalid argument

Cast to String
--------------

Expand Down
36 changes: 20 additions & 16 deletions velox/functions/sparksql/tests/SparkCastExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,26 +267,23 @@ TEST_F(SparkCastExprTest, primitiveInvalidCornerCases) {
// To boolean.
{
testInvalidCast<std::string>(
"boolean",
{"1.7E308"},
"Non-whitespace character found after end of conversion");
"boolean", {"1.7E308"}, "Cannot cast 1.7E308 to BOOLEAN");
testInvalidCast<std::string>(
"boolean",
{"nan"},
"Non-whitespace character found after end of conversion");
"boolean", {"nan"}, "Cannot cast nan to BOOLEAN");
testInvalidCast<std::string>(
"boolean", {"infinity"}, "Invalid value for bool");
"boolean", {"infinity"}, "Cannot cast infinity to BOOLEAN");
testInvalidCast<std::string>(
"boolean", {"12"}, "Integer overflow when parsing bool");
testInvalidCast<std::string>("boolean", {"-1"}, "Invalid value for bool");
"boolean", {"12"}, "Cannot cast 12 to BOOLEAN");
testInvalidCast<std::string>(
"boolean",
{"tr"},
"Non-whitespace character found after end of conversion");
"boolean", {"-1"}, "Cannot cast -1 to BOOLEAN");
testInvalidCast<std::string>(
"boolean",
{"tru"},
"Non-whitespace character found after end of conversion");
"boolean", {"tr"}, "Cannot cast tr to BOOLEAN");
testInvalidCast<std::string>(
"boolean", {"tru"}, "Cannot cast tru to BOOLEAN");
testInvalidCast<std::string>(
"boolean", {"on"}, "Cannot cast on to BOOLEAN");
testInvalidCast<std::string>(
"boolean", {"off"}, "Cannot cast off to BOOLEAN");
}
}

Expand Down Expand Up @@ -352,9 +349,16 @@ TEST_F(SparkCastExprTest, primitiveValidCornerCases) {
testCast<double, bool>("boolean", {0.0000000000001}, {true});

testCast<std::string, bool>("boolean", {"1"}, {true});
testCast<std::string, bool>("boolean", {"0"}, {false});
testCast<std::string, bool>("boolean", {"t"}, {true});
testCast<std::string, bool>("boolean", {"y"}, {true});
testCast<std::string, bool>("boolean", {"yes"}, {true});
testCast<std::string, bool>("boolean", {"true"}, {true});

testCast<std::string, bool>("boolean", {"0"}, {false});
testCast<std::string, bool>("boolean", {"f"}, {false});
testCast<std::string, bool>("boolean", {"n"}, {false});
testCast<std::string, bool>("boolean", {"no"}, {false});
testCast<std::string, bool>("boolean", {"false"}, {false});
}

// To string.
Expand Down
32 changes: 0 additions & 32 deletions velox/type/Conversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,35 +23,3 @@ DEFINE_bool(
" format of type conversions used for casting. This is a temporary solution"
" that aims to facilitate a seamless transition for users who rely on the"
" legacy behavior and hence can change in the future.");

namespace facebook::velox::util {

// This is based on Presto java's castToBoolean method.
Expected<bool> castToBoolean(const char* data, size_t len) {
const auto& TU = static_cast<int (*)(int)>(std::toupper);

if (len == 1) {
auto character = TU(data[0]);
if (character == 'T' || character == '1') {
return true;
}
if (character == 'F' || character == '0') {
return false;
}
}

if ((len == 4) && (TU(data[0]) == 'T') && (TU(data[1]) == 'R') &&
(TU(data[2]) == 'U') && (TU(data[3]) == 'E')) {
return true;
}

if ((len == 5) && (TU(data[0]) == 'F') && (TU(data[1]) == 'A') &&
(TU(data[2]) == 'L') && (TU(data[3]) == 'S') && (TU(data[4]) == 'E')) {
return false;
}

const std::string errorMessage =
fmt::format("Cannot cast {} to BOOLEAN", StringView(data, len));
return folly::makeUnexpected(Status::UserError(errorMessage));
}
} // namespace facebook::velox::util
78 changes: 58 additions & 20 deletions velox/type/Conversions.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,61 @@ struct Converter {
}
};

/// Presto compatible cast of strings to boolean. There is a set of strings
/// allowed to be casted to boolean. These strings are `t, f, 1, 0, true, false`
/// and their upper case equivalents. Casting from other strings to boolean
/// throws.
Expected<bool> castToBoolean(const char* data, size_t len);
/// Casts a string to a boolean. Allows a fixed set of strings. Casting from
/// other strings throws.
/// @tparam TPolicy PrestoCastPolicy and LegacyCastPolicy allow `t, f, 1, 0,
/// true, false` and their upper case equivalents. SparkCastPolicy additionally
/// allows 'y, n, yes, no' and their upper case equivalents.
template <typename TPolicy>
Expected<bool> castToBoolean(const char* data, size_t len) {
const auto& TU = static_cast<int (*)(int)>(std::toupper);

if (len == 1) {
auto character = TU(data[0]);
if (character == 'T' || character == '1') {
return true;
}
if (character == 'F' || character == '0') {
return false;
}
if constexpr (std::is_same_v<TPolicy, SparkCastPolicy>) {
if (character == 'Y') {
return true;
}
if (character == 'N') {
return false;
}
}
}

// Case-insensitive 'true'.
if ((len == 4) && (TU(data[0]) == 'T') && (TU(data[1]) == 'R') &&
(TU(data[2]) == 'U') && (TU(data[3]) == 'E')) {
return true;
}

// Case-insensitive 'false'.
if ((len == 5) && (TU(data[0]) == 'F') && (TU(data[1]) == 'A') &&
(TU(data[2]) == 'L') && (TU(data[3]) == 'S') && (TU(data[4]) == 'E')) {
return false;
}

if constexpr (std::is_same_v<TPolicy, SparkCastPolicy>) {
// Case-insensitive 'yes'.
if ((len == 3) && (TU(data[0]) == 'Y') && (TU(data[1]) == 'E') &&
(TU(data[2]) == 'S')) {
return true;
}

// Case-insensitive 'no'.
if ((len == 2) && (TU(data[0]) == 'N') && (TU(data[1]) == 'O')) {
return false;
}
}

return folly::makeUnexpected(Status::UserError(
"Cannot cast {} to BOOLEAN", std::string_view(data, len)));
}

namespace detail {

Expand Down Expand Up @@ -99,27 +149,15 @@ struct Converter<TypeKind::BOOLEAN, void, TPolicy> {
}

static Expected<T> tryCast(folly::StringPiece v) {
if (std::is_same_v<TPolicy, PrestoCastPolicy>) {
return castToBoolean(v.data(), v.size());
}

return detail::callFollyTo<T>(v);
return castToBoolean<TPolicy>(v.data(), v.size());
}

static Expected<T> tryCast(const StringView& v) {
if (std::is_same_v<TPolicy, PrestoCastPolicy>) {
return castToBoolean(v.data(), v.size());
}

return detail::callFollyTo<T>(folly::StringPiece(v));
return castToBoolean<TPolicy>(v.data(), v.size());
}

static Expected<T> tryCast(const std::string& v) {
if (std::is_same_v<TPolicy, PrestoCastPolicy>) {
return castToBoolean(v.data(), v.length());
}

return detail::callFollyTo<T>(v);
return castToBoolean<TPolicy>(v.data(), v.length());
}

static Expected<T> tryCast(const bool& v) {
Expand Down

0 comments on commit 5926750

Please sign in to comment.