Skip to content

Commit

Permalink
to_base32
Browse files Browse the repository at this point in the history
  • Loading branch information
Joe-Abraham committed Oct 5, 2024
1 parent baf6c1b commit 123e464
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 2 deletions.
105 changes: 105 additions & 0 deletions velox/common/encode/Base32.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,111 @@ static_assert(
kBase32ReverseIndexTable),
"kBase32ReverseIndexTable has incorrect entries.");

// static
// Encodes 'input' with the kBase32Charset alphabet and stores the result in
// 'output'. Padding is always requested from encodeImpl, whose Status is
// returned unchanged.
Status Base32::encode(std::string_view input, std::string& output) {
  constexpr bool kIncludePadding = true;
  return encodeImpl(input, kBase32Charset, kIncludePadding, output);
}

// static
// Encodes the bytes of 'input' as Base32 text into 'output'.
//
// Every 5 input bytes (40 bits) are split into 8 groups of 5 bits, and each
// group is mapped to a character through 'charset'. A trailing partial block
// of 1, 2, 3 or 4 bytes produces 2, 4, 5 or 7 characters respectively; when
// 'includePadding' is true the final group is filled up to 8 characters with
// kPadding.
//
// @param input Byte container exposing size() and begin().
// @param charset Lookup table indexed by a 5-bit value (0..31).
// @param includePadding Whether to pad the last group with kPadding.
// @param output Receives the encoded text; cleared when 'input' is empty,
// otherwise resized to the value returned by calculateEncodedSize().
// @return Status::OK().
template <class T>
Status Base32::encodeImpl(
    const T& input,
    const Charset& charset,
    bool includePadding,
    std::string& output) {
  auto inputSize = input.size();
  if (inputSize == 0) {
    output.clear();
    return Status::OK();
  }

  // Calculate the output size and resize the string beforehand so that the
  // encoded characters can be written through a raw pointer.
  size_t outputSize = calculateEncodedSize(
      inputSize, includePadding, kBinaryBlockByteSize, kEncodedBlockByteSize);
  output.resize(outputSize);

  auto outputPointer = output.data();
  auto inputIterator = input.begin();

  // Reads the next input byte as an unsigned value. Going through uint8_t is
  // essential: where 'char' is signed, widening a byte >= 0x80 directly to
  // uint64_t sign-extends it, and OR-ing that into the block overwrites the
  // bits contributed by the preceding bytes.
  auto nextByte = [&inputIterator]() -> uint64_t {
    return static_cast<uint8_t>(*inputIterator++);
  };

  // Emits the first 'count' 5-bit groups of a 40-bit block, most significant
  // group first.
  auto emitGroups = [&outputPointer, &charset](uint64_t block, size_t count) {
    for (size_t group = 0; group < count; ++group) {
      *outputPointer++ = charset[(block >> (35 - 5 * group)) & 0x1f];
    }
  };

  // Process 5-byte (40-bit) blocks, split into 8 groups of 5 bits.
  for (; inputSize > 4; inputSize -= 5) {
    uint64_t currentBlock = nextByte() << 32;
    currentBlock |= nextByte() << 24;
    currentBlock |= nextByte() << 16;
    currentBlock |= nextByte() << 8;
    currentBlock |= nextByte();

    emitGroups(currentBlock, 8);
  }

  // Handle remaining bytes (1 to 4 bytes).
  if (inputSize > 0) {
    // Number of encoded characters needed to cover 1, 2, 3 or 4 trailing
    // bytes (ceil(bits / 5)); index 0 is unused.
    static constexpr size_t kTailGroups[] = {0, 2, 4, 5, 7};

    // Left-align the trailing bytes inside the 40-bit block; the missing low
    // bits stay zero.
    uint64_t currentBlock = 0;
    for (size_t byteIndex = 0; byteIndex < inputSize; ++byteIndex) {
      currentBlock |= nextByte() << (32 - 8 * byteIndex);
    }

    const size_t encodedGroups = kTailGroups[inputSize];
    emitGroups(currentBlock, encodedGroups);

    if (includePadding) {
      // Fill the final group up to 8 characters.
      for (size_t group = encodedGroups; group < 8; ++group) {
        *outputPointer++ = kPadding;
      }
    }
  }

  return Status::OK();
}

// static
uint8_t Base32::base32ReverseLookup(
char encodedChar,
Expand Down
9 changes: 9 additions & 0 deletions velox/common/encode/Base32.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ class Base32 {
/// within the encoding base.
using ReverseIndex = std::array<uint8_t, kReverseIndexSize>;

/// Encodes the bytes of 'input' as Base32 text and writes the result to
/// 'output'. The output is always padded; an empty 'input' yields an empty
/// 'output'.
static Status encode(std::string_view input, std::string& output);

/// Decodes the specified number of characters from the 'input' and writes the
/// result to the 'output'.
static Status decode(std::string_view input, std::string& output);
Expand All @@ -48,6 +50,13 @@ class Base32 {
const ReverseIndex& reverseIndex,
Status& status);

// Encodes the specified input using the provided charset and writes the
// result to 'output'. 'T' must expose size() and begin(). When
// 'includePadding' is true, the last encoded group is filled with the padding
// character; an empty input clears 'output'.
template <class T>
static Status encodeImpl(
const T& input,
const Charset& charset,
bool includePadding,
std::string& output);

// Decodes the specified input using the provided reverse lookup table.
static Status decodeImpl(
std::string_view input,
Expand Down
18 changes: 18 additions & 0 deletions velox/docs/functions/presto/binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,24 @@ Binary Functions

Encodes ``binary`` into a base64 string representation.

.. function:: to_base32(binary) -> varchar

Encodes ``binary`` into its Base32 string representation.
The result is always padded with ``=`` to a multiple of 8 characters.

Examples
--------
Query to encode a binary value to a padded Base32 string:
::
SELECT to_base32(ARRAY[72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]); -- 'JBSWY3DPEBLW64TMMQ======'

Query to encode a binary value with fewer bytes:
::
SELECT to_base32(ARRAY[104, 101, 108, 108, 111]); -- 'NBSWY3DP'

In the above examples, the binary array ``[72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]`` is encoded to the padded Base32 string ``JBSWY3DPEBLW64TMMQ======``.
The binary array ``[104, 101, 108, 108, 111]`` is exactly 5 bytes long (one full Base32 block), so it is encoded to ``NBSWY3DP`` with no padding.

.. function:: to_base64url(binary) -> varchar

Encodes ``binary`` into a base64 string representation using the `URL safe alphabet <https://www.rfc-editor.org/rfc/rfc4648#section-5>`_.
Expand Down
18 changes: 18 additions & 0 deletions velox/functions/prestosql/BinaryFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,24 @@ struct ToBase64UrlFunction {
}
};

/// Simple function backing to_base32: Base32-encodes a VARBINARY argument
/// into a VARCHAR result.
template <typename T>
struct ToBase32Function {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  /// Encodes 'input' via encoding::Base32::encode and copies the encoded
  /// text into 'result'. If encoding fails, 'result' is left untouched and
  /// the failing Status is returned.
  FOLLY_ALWAYS_INLINE Status
  call(out_type<Varchar>& result, const arg_type<Varbinary>& input) {
    // Base32::encode writes into a std::string, so encode into a temporary
    // and then copy the bytes into the output writer.
    std::string encoded;
    auto status = encoding::Base32::encode(
        std::string_view(input.data(), input.size()), encoded);
    if (status.ok()) {
      const auto encodedSize = encoded.size();
      result.resize(encodedSize);
      std::memcpy(result.data(), encoded.data(), encodedSize);
    }
    return status;
  }
};

template <typename TExec>
struct FromBase32Function {
VELOX_DEFINE_FUNCTION_TYPES(TExec);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ void registerSimpleFunctions(const std::string& prefix) {
registerFunction<FromBase64UrlFunction, Varbinary, Varchar>(
{prefix + "from_base64url"});

registerFunction<ToBase32Function, Varchar, Varbinary>(
{prefix + "to_base32"});
registerFunction<FromBase32Function, Varbinary, Varchar>(
{prefix + "from_base32"});
registerFunction<FromBase32Function, Varbinary, Varbinary>(
Expand Down
24 changes: 22 additions & 2 deletions velox/functions/prestosql/tests/BinaryFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,27 @@ TEST_F(BinaryFunctionsTest, fromBase64Url) {
EXPECT_THROW(fromBase64Url("YQ=/"), VeloxUserError);
}

// Verifies to_base32 against known padded Base32 encodings, covering null
// input, empty input, and every possible trailing-block length (1-4 bytes).
TEST_F(BinaryFunctionsTest, toBase32) {
  // Evaluates to_base32 on 'input' after casting it to VARBINARY.
  const auto encode = [&](std::optional<std::string> input) {
    return evaluateOnce<std::string>("to_base32(cast(c0 as varbinary))", input);
  };

  // Null in, null out.
  EXPECT_EQ(std::nullopt, encode(std::nullopt));

  struct TestCase {
    const char* input;
    const char* expected;
  };
  const TestCase testCases[] = {
      {"", ""},
      {"a", "ME======"},
      {"abc", "MFRGG==="},
      {"no", "NZXQ===="},
      {"we", "O5SQ===="},
      {"db2", "MRRDE==="},
      {"cake", "MNQWWZI="},
      {"keen", "NNSWK3Q="},
      {"1234", "GEZDGNA="},
      {"hello world", "NBSWY3DPEB3W64TMMQ======"},
      {"Hello World from Velox!", "JBSWY3DPEBLW64TMMQQGM4TPNUQFMZLMN54CC==="},
  };

  for (const auto& testCase : testCases) {
    EXPECT_EQ(testCase.expected, encode(testCase.input));
  }
}

TEST_F(BinaryFunctionsTest, fromBase32) {
const auto fromBase32 = [&](std::optional<std::string> value) {
// from_base32 allows VARCHAR and VARBINARY inputs.
Expand Down Expand Up @@ -520,8 +541,7 @@ TEST_F(BinaryFunctionsTest, fromBase32) {
EXPECT_EQ("abcdef", fromBase32("MFRGGZDFMY"));

VELOX_ASSERT_USER_THROW(
fromBase32("1="),
"decode() - invalid input string length.");
fromBase32("1="), "decode() - invalid input string length.");
VELOX_ASSERT_USER_THROW(
fromBase32("M1======"),
"invalid input string: contains invalid characters.");
Expand Down

0 comments on commit 123e464

Please sign in to comment.