Skip to content

Commit

Permalink
Add presto scalar function 'to_base32'
Browse files Browse the repository at this point in the history
  • Loading branch information
Joe-Abraham committed Apr 4, 2024
1 parent 09eccfc commit 67c917b
Show file tree
Hide file tree
Showing 9 changed files with 312 additions and 2 deletions.
182 changes: 182 additions & 0 deletions velox/common/encode/Base32.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "velox/common/encode/Base32.h"

#include <glog/logging.h>

namespace facebook::velox::encoding {

// Encoding base, i.e. the number of distinct output symbols.
static constexpr int kBase = 32;

// Base32 operates on 40-bit groups: 5 input bytes (5 x 8 bits) are rendered
// as 8 output characters (8 x 5 bits).
static constexpr int kBinaryBlockSize = 5;
static constexpr int kEncodedBlockSize = 8;

// Forward lookup table: the symbol at position i encodes the 5-bit value i.
// Values 0-25 map to 'A'-'Z' and values 26-31 map to '2'-'7'. Only 32
// initializers are listed here.
// NOTE(review): any further elements of Charset are presumably
// zero-initialized - confirm against the Charset definition.
constexpr Charset kBase32Charset = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
'Y', 'Z', '2', '3', '4', '5', '6', '7'};

// Reverse lookup table with 256 entries, indexed by the unsigned value of an
// encoded character. Entries 65-90 ('A'-'Z') hold 0-25 and entries 50-55
// ('2'-'7') hold 26-31, mirroring kBase32Charset. Every other entry is 255.
// NOTE(review): 255 presumably marks "not a Base32 symbol" - confirm against
// checkReverseIndex in EncoderUtils.h.
constexpr ReverseIndex kBase32ReverseIndexTable = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 255, 255, 255, 255,
255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255};

/// Compile-time check that each of the 32 entries in kBase32Charset has a
/// matching entry in kBase32ReverseIndexTable.
/// NOTE(review): the start index is derived from sizeof(kBase32Charset);
/// presumably Charset is a 64-byte array, which makes this evaluate to 31 -
/// confirm against the Charset definition in EncoderUtils.h.
static_assert(
checkForwardIndex(
sizeof(kBase32Charset) / 2 - 1,
kBase32Charset,
kBase32ReverseIndexTable),
"kBase32Charset has incorrect entries");

/// Compile-time check that every entry in kBase32ReverseIndexTable is
/// consistent with kBase32Charset for base kBase. The first argument is the
/// byte size of the reverse table minus one.
static_assert(
checkReverseIndex(
sizeof(kBase32ReverseIndexTable) - 1,
kBase32Charset,
kBase,
kBase32ReverseIndexTable),
"kBase32ReverseIndexTable has incorrect entries.");

// static
/// Returns the number of output characters reserved for 'size' input bytes.
/// With padding the result is a whole number of 8-character blocks; without
/// padding, (5 - size % 5) % 5 characters are trimmed from that total.
///
/// NOTE(review): encodeImpl() emits 2/4/5/7 symbols for 1-4 trailing bytes,
/// whereas the unpadded result here is 4/5/6/7 for the same inputs. The
/// existing unit test pins the current values - confirm which is intended
/// before relying on withPadding == false.
size_t Base32::calculateEncodedSize(size_t size, bool withPadding) {
  // Every started 5-byte group occupies one full 8-character block.
  const size_t blockCount = (size + 4) / 5;
  const size_t paddedSize = blockCount * 8;

  // Empty input yields zero blocks, so it shares the padded result (0).
  if (withPadding || size == 0) {
    return paddedSize;
  }

  const size_t remainder = size % 5;
  const size_t trimmed = (remainder == 0) ? 0 : 5 - remainder;
  return paddedSize - trimmed;
}

// static
void Base32::encode(const char* data, size_t len, char* output) {
encodeImpl(folly::StringPiece(data, len), kBase32Charset, true, output);
}

/// Encodes 'data' as Base32 into 'out' using the symbols of 'charset'.
///
/// @param data Byte container exposing size() and begin(); each element is
///     read as an unsigned 8-bit value.
/// @param charset Lookup table mapping a 5-bit value to its output symbol.
/// @param include_pad When true, a trailing partial group is filled up to 8
///     output characters with kPadding.
/// @param out Destination buffer. Nothing is written for empty input.
///     Otherwise 8 characters are written per full 5-byte group, followed by
///     2, 4, 5 or 7 symbols (plus the optional padding) for 1, 2, 3 or 4
///     trailing bytes. No terminating NUL is written.
template <class T>
/* static */ void Base32::encodeImpl(
    const T& data,
    const Charset& charset,
    bool include_pad,
    char* out) {
  auto len = data.size();
  if (len == 0) {
    return;
  }

  auto wp = out;
  auto it = data.begin();

  // Fetches the next input byte, widened to 64 bits *before* any shifting.
  // Going through uint8_t prevents sign extension when char is signed.
  // Widening first matters because 'uint8_t(x) << 24' is evaluated as int:
  // for x >= 0x80 the result is negative and, once OR-ed into a uint64_t,
  // sets bits 32-63 and corrupts the symbols derived from the group's first
  // byte.
  auto nextByte = [&it]() -> uint64_t {
    return static_cast<uint64_t>(static_cast<uint8_t>(*it++));
  };

  // Emits the 'count' most significant 5-bit groups of the 40-bit 'block'.
  auto emitSymbols = [&wp, &charset](uint64_t block, int count) {
    for (int i = 0; i < count; ++i) {
      *wp++ = charset[(block >> (35 - 5 * i)) & 0x1f];
    }
  };

  // For each group of 5 bytes (40 bits) in the input, split that into
  // 8 groups of 5 bits and encode each using the supplied charset lookup.
  for (; len > 4; len -= 5) {
    uint64_t curr = nextByte() << 32;
    curr |= nextByte() << 24;
    curr |= nextByte() << 16;
    curr |= nextByte() << 8;
    curr |= nextByte();

    emitSymbols(curr, 8);
  }

  if (len > 0) {
    // 1 to 4 input bytes are left. Pack them into the top of a 40-bit block
    // (missing bytes count as 0) and emit only the symbols that carry input
    // bits: ceil(8 * len / 5) = 2, 4, 5 or 7.
    const int tailBytes = static_cast<int>(len);
    uint64_t curr = 0;
    for (int i = 0; i < tailBytes; ++i) {
      curr |= nextByte() << (32 - 8 * i);
    }

    const int symbolCount = (tailBytes * 8 + 4) / 5;
    emitSymbols(curr, symbolCount);

    // Optionally fill the rest of the 8-character block with padding.
    if (include_pad) {
      for (int i = symbolCount; i < 8; ++i) {
        *wp++ = kPadding;
      }
    }
  }
}

} // namespace facebook::velox::encoding
46 changes: 46 additions & 0 deletions velox/common/encode/Base32.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <exception>
#include <map>
#include <string>

#include <folly/Range.h>
#include "velox/common/encode/EncoderUtils.h"

namespace facebook::velox::encoding {

/// Static helpers for Base32 encoding.
class Base32 {
 public:
  /// Computes how many output characters are needed to encode 'size' input
  /// bytes. 'withPadding' selects whether trailing padding characters are
  /// counted; it defaults to true.
  static size_t calculateEncodedSize(size_t size, bool withPadding = true);

  /// Encodes 'size' bytes read from 'data' and stores the result in 'output'.
  /// The caller must provide a buffer that is large enough, e.g. one sized
  /// with calculateEncodedSize().
  static void encode(const char* data, size_t size, char* output);

 private:
  /// Shared implementation: encodes 'data' through 'charset' into 'out',
  /// appending padding only when 'include_pad' is set.
  template <typename T>
  static void encodeImpl(
      const T& data,
      const Charset& charset,
      bool include_pad,
      char* out);
};

} // namespace facebook::velox::encoding
2 changes: 1 addition & 1 deletion velox/common/encode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ if(${VELOX_BUILD_TESTING})
add_subdirectory(tests)
endif()

add_library(velox_encode Base64.cpp)
add_library(velox_encode Base32.cpp Base64.cpp)
target_link_libraries(velox_encode PUBLIC Folly::folly)
42 changes: 42 additions & 0 deletions velox/common/encode/tests/Base32Test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "velox/common/encode/Base32.h"
#include <gtest/gtest.h>
#include "velox/common/base/tests/GTestUtils.h"

namespace facebook::velox::encoding {

class Base32Test : public ::testing::Test {};

// Checks Base32::calculateEncodedSize() for a set of input sizes, with and
// without padding.
// NOTE(review): the unpadded expectations mirror the current implementation
// (padded size minus (5 - size % 5) % 5). The encoder itself emits 2/4/5/7
// symbols for 1-4 bytes - confirm which values are intended.
TEST_F(Base32Test, calculateEncodedSizeProperSize) {
  struct SizeCase {
    size_t inputSize;
    bool withPadding;
    size_t expectedSize;
  };

  const SizeCase cases[] = {
      // Without padding.
      {0, false, 0},
      {1, false, 4},
      {2, false, 5},
      {3, false, 6},
      {4, false, 7},
      // With padding.
      {0, true, 0},
      {1, true, 8},
      {2, true, 8},
      {3, true, 8},
      {4, true, 8},
      // Input spanning more than one block.
      {11, false, 20},
      {11, true, 24},
  };

  for (const auto& testCase : cases) {
    EXPECT_EQ(
        testCase.expectedSize,
        Base32::calculateEncodedSize(
            testCase.inputSize, testCase.withPadding));
  }
}

} // namespace facebook::velox::encoding
2 changes: 1 addition & 1 deletion velox/common/encode/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

add_executable(velox_common_encode_test Base64Test.cpp EncoderUtilsTests.cpp)
add_executable(velox_common_encode_test Base32Test.cpp Base64Test.cpp EncoderUtilsTests.cpp)
add_test(velox_common_encode_test velox_common_encode_test)
target_link_libraries(
velox_common_encode_test
Expand Down
4 changes: 4 additions & 0 deletions velox/docs/functions/presto/binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ Binary Functions

Encodes ``bigint`` in a 64-bit 2’s complement big endian format.

.. function:: to_base32(binary) -> varchar

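Encodes ``binary`` into a base32 string representation using the alphabet ``A``-``Z``, ``2``-``7``. The result is padded with ``=`` to a multiple of 8 characters.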

.. function:: to_hex(binary) -> varchar

Encodes ``binary`` into a hex string representation.
Expand Down
13 changes: 13 additions & 0 deletions velox/functions/prestosql/BinaryFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include "folly/ssl/OpenSSLHash.h"
#include "velox/common/base/BitUtil.h"
#include "velox/common/encode/Base32.h"
#include "velox/common/encode/Base64.h"
#include "velox/external/md5/md5.h"
#include "velox/functions/Udf.h"
Expand Down Expand Up @@ -325,6 +326,18 @@ struct ToBase64UrlFunction {
}
};

/// to_base32(varbinary) -> varchar
/// Sizes the result with Base32::calculateEncodedSize() (padding included by
/// default) and lets Base32::encode() write directly into the result buffer.
template <typename T>
struct ToBase32Function {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE void call(
      out_type<Varchar>& result,
      const arg_type<Varbinary>& input) {
    const auto inputSize = input.size();
    result.resize(encoding::Base32::calculateEncodedSize(inputSize));
    encoding::Base32::encode(input.data(), inputSize, result.data());
  }
};

template <typename T>
struct FromBigEndian32 {
VELOX_DEFINE_FUNCTION_TYPES(T);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ void registerSimpleFunctions(const std::string& prefix) {
{prefix + "to_base64url"});
registerFunction<FromBase64UrlFunction, Varbinary, Varchar>(
{prefix + "from_base64url"});
registerFunction<ToBase32Function, Varchar, Varbinary>(
{prefix + "to_base32"});

registerFunction<FromBigEndian32, int32_t, Varbinary>(
{prefix + "from_big_endian_32"});
Expand Down
21 changes: 21 additions & 0 deletions velox/functions/prestosql/tests/BinaryFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,27 @@ TEST_F(BinaryFunctionsTest, fromBase64Url) {
EXPECT_THROW(fromBase64Url("YQ=/"), VeloxUserError);
}

// Verifies to_base32() against known encodings, covering inputs that leave
// 0 to 4 trailing bytes after the last full 5-byte group.
TEST_F(BinaryFunctionsTest, toBase32) {
  const auto encode = [&](std::optional<std::string> input) {
    return evaluateOnce<std::string>("to_base32(cast(c0 as varbinary))", input);
  };

  // A null input yields a null result.
  EXPECT_EQ(std::nullopt, encode(std::nullopt));

  struct EncodeCase {
    const char* input;
    const char* expected;
  };

  const EncodeCase cases[] = {
      {"", ""},
      {"a", "ME======"},
      {"abc", "MFRGG==="},
      {"no", "NZXQ===="},
      {"we", "O5SQ===="},
      {"db2", "MRRDE==="},
      {"cake", "MNQWWZI="},
      {"keen", "NNSWK3Q="},
      {"1234", "GEZDGNA="},
      {"hello world", "NBSWY3DPEB3W64TMMQ======"},
      {"Hello World from Velox!", "JBSWY3DPEBLW64TMMQQGM4TPNUQFMZLMN54CC==="},
  };

  for (const auto& testCase : cases) {
    EXPECT_EQ(
        std::string(testCase.expected), encode(std::string(testCase.input)));
  }
}

TEST_F(BinaryFunctionsTest, fromBigEndian32) {
const auto fromBigEndian32 = [&](const std::optional<std::string>& arg) {
return evaluateOnce<int32_t, std::string>(
Expand Down

0 comments on commit 67c917b

Please sign in to comment.