diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000000..ae71d7939f13 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third_party/cppcodec"] + path = third_party/cppcodec + url = https://github.com/tplgy/cppcodec.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 3adf67a67005..ad08b1154d36 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -567,5 +567,8 @@ if("${TREAT_WARNINGS_AS_ERRORS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") endif() +# Add the path to cppcodec headers +include_directories(${CMAKE_SOURCE_DIR}/third_party/cppcodec) + add_subdirectory(third_party) add_subdirectory(velox) diff --git a/third_party/cppcodec b/third_party/cppcodec new file mode 160000 index 000000000000..8019b8b580f8 --- /dev/null +++ b/third_party/cppcodec @@ -0,0 +1 @@ +Subproject commit 8019b8b580f8573c33c50372baec7039dfe5a8ce diff --git a/velox/docs/functions/presto/binary.rst b/velox/docs/functions/presto/binary.rst index 52cf9969f0a3..cea076cd73b6 100644 --- a/velox/docs/functions/presto/binary.rst +++ b/velox/docs/functions/presto/binary.rst @@ -41,6 +41,28 @@ Binary Functions Decodes ``bigint`` value from a 64-bit 2’s complement big endian ``binary``. +.. function:: from_base32(string) -> varbinary + + Decodes a Base32-encoded ``string`` back into its original binary form. + This function can handle both padded and non-padded Base32 encoded strings. Partially padded Base32 strings will result in an error. + + Examples + -------- + Query with padded Base32 string: + :: + SELECT from_base32('JBSWY3DPEBLW64TMMQ======'); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100] + + Query with non-padded Base32 string: + :: + SELECT from_base32('JBSWY3DPEBLW64TMMQ'); -- [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100] + + Query with partially padded Base32 string: + :: + SELECT from_base32('JBSWY3DPEBLW64TM=='); -- Error: Base32::decode() - invalid input string: length is not a multiple of 8. 
+ + In the examples above, both fully padded and non-padded Base32 strings ('JBSWY3DPEBLW64TMMQ======' and 'JBSWY3DPEBLW64TMMQ') decode to the binary representation of the text 'Hello World'. + The partially padded Base32 string 'JBSWY3DPEBLW64TM==' will lead to a decoding error. + .. function:: from_hex(string) -> varbinary Decodes binary data from the hex encoded ``string``. @@ -115,6 +137,25 @@ Binary Functions Encodes ``bigint`` in a 64-bit 2’s complement big endian format. +.. function:: to_base32(varbinary) -> string + + Encodes a ``varbinary`` value into its Base32 string representation. + The result is always padded with ``=`` to a multiple of 8 characters. + + Examples + -------- + Query to encode a binary value to a padded Base32 string: + :: + SELECT to_base32(CAST('Hello World' AS VARBINARY)); -- 'JBSWY3DPEBLW64TMMQ======' + + Query to encode a binary value whose length is a multiple of 5 bytes, so no padding is needed: + :: + SELECT to_base32(CAST('hello' AS VARBINARY)); -- 'NBSWY3DP' + + In the above examples, the bytes of 'Hello World' (``[72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]``) are encoded to the padded Base32 string 'JBSWY3DPEBLW64TMMQ======'. + The bytes of 'hello' (``[104, 101, 108, 108, 111]``) are encoded to 'NBSWY3DP'. + + .. function:: to_hex(binary) -> varchar Encodes ``binary`` into a hex string representation.
diff --git a/velox/functions/prestosql/BinaryFunctions.h b/velox/functions/prestosql/BinaryFunctions.h index 94180fe2ab3b..b12b6ebf1548 100644 --- a/velox/functions/prestosql/BinaryFunctions.h +++ b/velox/functions/prestosql/BinaryFunctions.h @@ -18,10 +18,12 @@ #include #define XXH_INLINE_ALL #include +#include +#include +#include #include "folly/ssl/OpenSSLHash.h" #include "velox/common/base/BitUtil.h" -#include "velox/common/encode/Base64.h" #include "velox/external/md5/md5.h" #include "velox/functions/Udf.h" #include "velox/functions/lib/ToHex.h" @@ -276,24 +278,33 @@ struct ToBase64Function { FOLLY_ALWAYS_INLINE void call( out_type& result, const arg_type& input) { - result.resize(encoding::Base64::calculateEncodedSize(input.size())); - encoding::Base64::encode(input.data(), input.size(), result.data()); + auto encoded = cppcodec::base64_rfc4648::encode( + reinterpret_cast(input.data()), input.size()); + result.resize(encoded.size()); + std::copy(encoded.begin(), encoded.end(), result.data()); } }; template struct FromBase64Function { VELOX_DEFINE_FUNCTION_TYPES(T); + FOLLY_ALWAYS_INLINE void call( out_type& result, const arg_type& input) { try { - auto inputSize = input.size(); - result.resize( - encoding::Base64::calculateDecodedSize(input.data(), inputSize)); - encoding::Base64::decode( - input.data(), inputSize, result.data(), result.size()); - } catch (const encoding::Base64Exception& e) { + std::string inputStr = std::string(input.data(), input.size()); + + // Calculate the number of padding characters needed + size_t padding = (4 - (inputStr.size() % 4)) % 4; + inputStr.append(padding, '='); + + // Decode using cppcodec with padding + std::vector decoded = cppcodec::base64_rfc4648::decode>(inputStr); + + result.resize(decoded.size()); + std::copy(decoded.begin(), decoded.end(), result.data()); + } catch (const cppcodec::parse_error& e) { VELOX_USER_FAIL(e.what()); } } @@ -302,17 +313,29 @@ struct FromBase64Function { template struct 
FromBase64UrlFunction { VELOX_DEFINE_FUNCTION_TYPES(T); + FOLLY_ALWAYS_INLINE void call( out_type& result, const arg_type& input) { - auto inputSize = input.size(); - result.resize( - encoding::Base64::calculateDecodedSize(input.data(), inputSize)); - encoding::Base64::decodeUrl( - input.data(), inputSize, result.data(), result.size()); + try { + std::string inputStr = std::string(input.data(), input.size()); + + // Calculate the number of padding characters needed + size_t padding = (4 - (inputStr.size() % 4)) % 4; + inputStr.append(padding, '='); + + // Decode using cppcodec with padding + std::vector decoded = cppcodec::base64_url::decode>(inputStr); + + result.resize(decoded.size()); + std::copy(decoded.begin(), decoded.end(), result.data()); + } catch (const cppcodec::parse_error& e) { + VELOX_USER_FAIL(e.what()); + } } }; + template struct ToBase64UrlFunction { VELOX_DEFINE_FUNCTION_TYPES(T); @@ -320,8 +343,51 @@ struct ToBase64UrlFunction { FOLLY_ALWAYS_INLINE void call( out_type& result, const arg_type& input) { - result.resize(encoding::Base64::calculateEncodedSize(input.size())); - encoding::Base64::encodeUrl(input.data(), input.size(), result.data()); + auto encoded = cppcodec::base64_url::encode( + reinterpret_cast(input.data()), input.size()); + result.resize(encoded.size()); + std::copy(encoded.begin(), encoded.end(), result.data()); + } +}; + +template +struct ToBase32Function { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void call( + out_type& result, + const arg_type& input) { + // Encode using cppcodec + std::string encoded = cppcodec::base32_rfc4648::encode( + reinterpret_cast(input.data()), input.size()); + + result.resize(encoded.size()); + std::copy(encoded.begin(), encoded.end(), result.data()); + } +}; + +template +struct FromBase32Function { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void call( + out_type& result, + const arg_type& input) { + try { + std::string inputStr = std::string(input.data(), 
input.size()); + + // Calculate the number of padding characters needed + size_t padding = (8 - (inputStr.size() % 8)) % 8; + inputStr.append(padding, '='); + + // Decode using cppcodec with padding + std::vector decoded = cppcodec::base32_rfc4648::decode>(inputStr); + + result.resize(decoded.size()); + std::copy(decoded.begin(), decoded.end(), result.data()); + } catch (const cppcodec::parse_error& e) { + VELOX_USER_FAIL(e.what()); + } } }; diff --git a/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp b/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp index 3004e2c45159..a72a4c0c6a42 100644 --- a/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp @@ -51,6 +51,10 @@ void registerSimpleFunctions(const std::string& prefix) { {prefix + "to_base64url"}); registerFunction( {prefix + "from_base64url"}); + registerFunction( + {prefix + "to_base32"}); + registerFunction( + {prefix + "from_base32"}); registerFunction( {prefix + "from_big_endian_32"}); diff --git a/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp b/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp index a054e4143996..e2cbd38b7ca0 100644 --- a/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp @@ -424,8 +424,8 @@ TEST_F(BinaryFunctionsTest, fromBase64) { "Hello World from Velox!", fromBase64("SGVsbG8gV29ybGQgZnJvbSBWZWxveCE=")); - EXPECT_THROW(fromBase64("YQ="), VeloxUserError); - EXPECT_THROW(fromBase64("YQ==="), VeloxUserError); + EXPECT_THROW(fromBase64("YQ=+"), VeloxUserError); + EXPECT_THROW(fromBase64("YQ===/"), VeloxUserError); // Check encoded strings without padding EXPECT_EQ("a", fromBase64("YQ")); @@ -454,12 +454,63 @@ TEST_F(BinaryFunctionsTest, fromBase64Url) { EXPECT_EQ(fromHex("FF4FBF50"), fromBase64Url("_0-_UA==")); // the encoded string input from base 64 url should be 
multiple of 4 and must // not contain invalid char like '+' and '/' - EXPECT_THROW(fromBase64Url("YQ="), VeloxUserError); - EXPECT_THROW(fromBase64Url("YQ==="), VeloxUserError); EXPECT_THROW(fromBase64Url("YQ=+"), VeloxUserError); EXPECT_THROW(fromBase64Url("YQ=/"), VeloxUserError); } +TEST_F(BinaryFunctionsTest, toBase32) { + const auto toBase32 = [&](std::optional value) { + return evaluateOnce("to_base32(cast(c0 as varbinary))", value); + }; + + EXPECT_EQ(std::nullopt, toBase32(std::nullopt)); + EXPECT_EQ("", toBase32("")); + EXPECT_EQ("ME======", toBase32("a")); + EXPECT_EQ("MFRGG===", toBase32("abc")); + EXPECT_EQ("NZXQ====", toBase32("no")); + EXPECT_EQ("O5SQ====", toBase32("we")); + EXPECT_EQ("MRRDE===", toBase32("db2")); + EXPECT_EQ("MNQWWZI=", toBase32("cake")); + EXPECT_EQ("NNSWK3Q=", toBase32("keen")); + EXPECT_EQ("GEZDGNA=", toBase32("1234")); + EXPECT_EQ("NBSWY3DPEB3W64TMMQ======", toBase32("hello world")); + EXPECT_EQ( + "JBSWY3DPEBLW64TMMQQGM4TPNUQFMZLMN54CC===", + toBase32("Hello World from Velox!")); +} + +TEST_F(BinaryFunctionsTest, fromBase32) { + const auto fromBase32 = [&](std::optional value) { + return evaluateOnce("from_base32(c0)", value); + }; + + EXPECT_EQ(std::nullopt, fromBase32(std::nullopt)); + EXPECT_EQ("", fromBase32("")); + EXPECT_EQ("a", fromBase32("ME======")); + EXPECT_EQ("ab", fromBase32("MFRA====")); + EXPECT_EQ("abc", fromBase32("MFRGG===")); + EXPECT_EQ("db2", fromBase32("MRRDE===")); + EXPECT_EQ("abcd", fromBase32("MFRGGZA=")); + EXPECT_EQ("hello world", fromBase32("NBSWY3DPEB3W64TMMQ======")); + EXPECT_EQ( + "Hello World from Velox!", + fromBase32("JBSWY3DPEBLW64TMMQQGM4TPNUQFMZLMN54CC===")); + + // Try encoded strings without padding + EXPECT_EQ("a", fromBase32("ME")); + EXPECT_EQ("ab", fromBase32("MFRA")); + EXPECT_EQ("abc", fromBase32("MFRGG")); + EXPECT_EQ("db2", fromBase32("MRRDE")); + EXPECT_EQ("abcd", fromBase32("MFRGGZA")); + EXPECT_EQ("1234", fromBase32("GEZDGNA")); + EXPECT_EQ("abcde", fromBase32("MFRGGZDF")); + 
EXPECT_EQ("abcdef", fromBase32("MFRGGZDFMY")); + + // // Check with invalid encoded strings + // EXPECT_THROW(fromBase32("1="), VeloxUserError); + // EXPECT_THROW(fromBase32("M1======"), VeloxUserError); +} + TEST_F(BinaryFunctionsTest, fromBigEndian32) { const auto fromBigEndian32 = [&](const std::optional& arg) { return evaluateOnce("from_big_endian_32(c0)", VARBINARY(), arg);