Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
Joe-Abraham committed Sep 18, 2024
1 parent e0dfc42 commit 71920a2
Show file tree
Hide file tree
Showing 8 changed files with 376 additions and 195 deletions.
232 changes: 134 additions & 98 deletions velox/common/encode/Base32.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,19 @@

namespace facebook::velox::encoding {

// Encoding base to be used.
constexpr static int kBase = 32;

// Constants defining the size in bytes of binary and encoded blocks for Base32
// encoding.
// Size of a binary block in bytes (5 bytes = 40 bits)
constexpr static int kBinaryBlockByteSize = 5;
// Size of an encoded block in bytes (8 bytes = 40 bits)
constexpr static int kEncodedBlockByteSize = 8;

constexpr Charset kBase32Charset = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
'Y', 'Z', '2', '3', '4', '5', '6', '7'};
constexpr Base32::Charset kBase32Charset = {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
'W', 'X', 'Y', 'Z', '2', '3', '4', '5', '6', '7'};

constexpr ReverseIndex kBase32ReverseIndexTable = {
constexpr Base32::ReverseIndex kBase32ReverseIndexTable = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
Expand All @@ -54,138 +51,177 @@ constexpr ReverseIndex kBase32ReverseIndexTable = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255};

/// Verify that for each 32 entries in kBase32Charset, the corresponding entry
/// in kBase32ReverseIndexTable is correct.
// Verify that for each of the 32 entries in kBase32Charset, the corresponding
// entry in kBase32ReverseIndexTable is correct.
// Compile-time consistency check between the forward alphabet
// (kBase32Charset) and the reverse lookup table (kBase32ReverseIndexTable);
// compilation fails with the message below if checkForwardIndex() is false.
// NOTE(review): kBase32Charset is a Base32::Charset, i.e. a 32-element
// std::array<char, ...>, so 'sizeof(kBase32Charset) / 2 - 1' evaluates to 15.
// If the first argument of checkForwardIndex() is the highest index to verify
// (presumably - its definition is not in this file), only entries 0..15 are
// checked and 'sizeof(kBase32Charset) - 1' would be needed to cover all 32.
// TODO confirm against checkForwardIndex() in EncoderUtils.h.
static_assert(
checkForwardIndex(
sizeof(kBase32Charset) / 2 - 1,
kBase32Charset,
kBase32ReverseIndexTable),
"kBase32Charset has incorrect entries");

/// Verify that for every entry in kBase32ReverseIndexTable, the corresponding
/// entry in kBase32Charset is correct.
// Verify that for every entry in kBase32ReverseIndexTable, the corresponding
// entry in kBase32Charset is correct.
static_assert(
checkReverseIndex(
sizeof(kBase32ReverseIndexTable) - 1,
kBase32Charset,
kBase,
kBase32ReverseIndexTable),
"kBase32ReverseIndexTable has incorrect entries.");

size_t Base32::calculateDecodedSize(const char* data, size_t& size) {
if (size == 0) {
return 0;
// static
Status Base32::calculateDecodedSize(
std::string_view input,
size_t& inputSize,
size_t& decodedSize) {
if (inputSize == 0) {
decodedSize = 0;
return Status::OK();
}

// Check if the input data is padded
if (isPadded(data, size)) {
/// If padded, ensure that the string length is a multiple of the encoded
/// block size.
if (size % kEncodedBlockByteSize != 0) {
VELOX_USER_FAIL(
"Base32::decode() - invalid input string: "
"string length is not a multiple of 8.");
if (isPadded(input.data(), inputSize)) {
// If padded, ensure that the string length is a multiple of the encoded
// block size
if (inputSize % kEncodedBlockByteSize != 0) {
return Status::UserError(
"Base32::decode() - invalid input string: string length is not a multiple of 8.");
}

auto needed = (size * kBinaryBlockByteSize) / kEncodedBlockByteSize;
auto padding = numPadding(data, size);
size -= padding;
decodedSize = (inputSize * kBinaryBlockByteSize) / kEncodedBlockByteSize;
auto padding = numPadding(input.data(), inputSize);
inputSize -= padding;

// Adjust the needed size by deducting the bytes corresponding to the
// padding from the calculated size.
return needed -
// padding
decodedSize -=
((padding * kBinaryBlockByteSize) + (kEncodedBlockByteSize - 1)) /
kEncodedBlockByteSize;
} else {
// If not padded, calculate extra bytes, if any.
auto extra = size % kEncodedBlockByteSize;
auto needed = (size / kEncodedBlockByteSize) * kBinaryBlockByteSize;

// Adjust the needed size for extra bytes, if present.
if (extra) {
if ((extra == 6) || (extra == 3) || (extra == 1)) {
VELOX_USER_FAIL(
"Base32::decode() - invalid input string: "
"string length cannot be 6, 3 or 1 more than a multiple of 8.");
}
needed += (extra * kBinaryBlockByteSize) / kEncodedBlockByteSize;
}
return Status::OK();
}

// If not padded, calculate extra bytes, if any
auto extraBytes = inputSize % kEncodedBlockByteSize;
decodedSize = (inputSize / kEncodedBlockByteSize) * kBinaryBlockByteSize;

return needed;
// Adjust the needed size for extra bytes, if present
if (extraBytes) {
if ((extraBytes == 6) || (extraBytes == 3) || (extraBytes == 1)) {
return Status::UserError(
"Base32::decode() - invalid input string: string length cannot be 6, 3, or 1 more than a multiple of 8.");
}
decodedSize += (extraBytes * kBinaryBlockByteSize) / kEncodedBlockByteSize;
}

return Status::OK();
}

// static
// Translates a single encoded character 'p' into its numeric Base32 value by
// forwarding to the shared reverseLookup() helper, passing
// Base32::kCharsetSize alongside the lookup table. Any failure is reported
// through 'status' by that helper (presumably for characters outside the
// alphabet - verify against reverseLookup()'s definition).
uint8_t Base32::base32ReverseLookup(
    char p,
    const Base32::ReverseIndex& reverseIndex,
    Status& status) {
  const uint8_t decodedValue =
      reverseLookup(p, reverseIndex, status, Base32::kCharsetSize);
  return decodedValue;
}

size_t
Base32::decode(const char* src, size_t src_len, char* dst, size_t dst_len) {
return decodeImpl(src, src_len, dst, dst_len, kBase32ReverseIndexTable);
// static
Status Base32::decode(
std::string_view input,
size_t inputSize,
char* output,
size_t outputSize) {
return decodeImpl(
input, inputSize, output, outputSize, kBase32ReverseIndexTable);
}

size_t Base32::decodeImpl(
const char* src,
size_t src_len,
char* dst,
size_t dst_len,
const ReverseIndex& reverse_lookup) {
if (!src_len) {
return 0;
// static
Status Base32::decodeImpl(
std::string_view input,
size_t inputSize,
char* output,
size_t outputSize,
const Base32::ReverseIndex& reverseIndex) {
// Check if input is empty
if (input.empty()) {
return Status::OK();
}

auto needed = calculateDecodedSize(src, src_len);
if (dst_len < needed) {
VELOX_USER_FAIL(
"Base32::decode() - invalid output string: "
"output string is too small.");
size_t decodedSize;
// Calculate decoded size and check for status
auto status = calculateDecodedSize(input, inputSize, decodedSize);
if (!status.ok()) {
return status;
}

if (outputSize < decodedSize) {
return Status::UserError("Base32::decode() - output buffer too small.");
}

Status lookupStatus;
// Handle full groups of 8 characters.
for (; src_len > 8; src_len -= 8, src += 8, dst += 5) {
/// Each character of the 8 bytes encode 5 bits of the original, grab each
/// with the appropriate shifts to rebuild the original and then split that
/// back into the original 8 bit bytes.
while (inputSize >= 8) {
// Each character of the 8 bytes encodes 5 bits of the original, grab each
// with the appropriate shifts to rebuild the original and then split that
// back into the original 8-bit bytes.
uint64_t last =
(uint64_t(baseReverseLookup(kBase, src[0], reverse_lookup)) << 35) |
(uint64_t(baseReverseLookup(kBase, src[1], reverse_lookup)) << 30) |
(baseReverseLookup(kBase, src[2], reverse_lookup) << 25) |
(baseReverseLookup(kBase, src[3], reverse_lookup) << 20) |
(baseReverseLookup(kBase, src[4], reverse_lookup) << 15) |
(baseReverseLookup(kBase, src[5], reverse_lookup) << 10) |
(baseReverseLookup(kBase, src[6], reverse_lookup) << 5) |
baseReverseLookup(kBase, src[7], reverse_lookup);
dst[0] = (last >> 32) & 0xff;
dst[1] = (last >> 24) & 0xff;
dst[2] = (last >> 16) & 0xff;
dst[3] = (last >> 8) & 0xff;
dst[4] = last & 0xff;
(uint64_t(base32ReverseLookup(input[0], reverseIndex, lookupStatus))
<< 35) |
(uint64_t(base32ReverseLookup(input[1], reverseIndex, lookupStatus))
<< 30) |
(base32ReverseLookup(input[2], reverseIndex, lookupStatus) << 25) |
(base32ReverseLookup(input[3], reverseIndex, lookupStatus) << 20) |
(base32ReverseLookup(input[4], reverseIndex, lookupStatus) << 15) |
(base32ReverseLookup(input[5], reverseIndex, lookupStatus) << 10) |
(base32ReverseLookup(input[6], reverseIndex, lookupStatus) << 5) |
base32ReverseLookup(input[7], reverseIndex, lookupStatus);

output[0] = (last >> 32) & 0xff;
output[1] = (last >> 24) & 0xff;
output[2] = (last >> 16) & 0xff;
output[3] = (last >> 8) & 0xff;
output[4] = last & 0xff;

// Move the input string_view forward
input.remove_prefix(8);
output += 5;
inputSize -= 8;
}

/// Handle the last 2, 4, 5, 7 or 8 characters. This is similar to the above,
/// but the last characters may or may not exist.
DCHECK(src_len >= 2);
uint64_t last =
(uint64_t(baseReverseLookup(kBase, src[0], reverse_lookup)) << 35) |
(uint64_t(baseReverseLookup(kBase, src[1], reverse_lookup)) << 30);
dst[0] = (last >> 32) & 0xff;
if (src_len > 2) {
last |= baseReverseLookup(kBase, src[2], reverse_lookup) << 25;
last |= baseReverseLookup(kBase, src[3], reverse_lookup) << 20;
dst[1] = (last >> 24) & 0xff;
if (src_len > 4) {
last |= baseReverseLookup(kBase, src[4], reverse_lookup) << 15;
dst[2] = (last >> 16) & 0xff;
if (src_len > 5) {
last |= baseReverseLookup(kBase, src[5], reverse_lookup) << 10;
last |= baseReverseLookup(kBase, src[6], reverse_lookup) << 5;
dst[3] = (last >> 8) & 0xff;
if (src_len > 7) {
last |= baseReverseLookup(kBase, src[7], reverse_lookup);
dst[4] = last & 0xff;
// Handle the last 2, 4, 5, 7, or 8 characters.
if (inputSize >= 2) {
uint64_t last =
(uint64_t(base32ReverseLookup(input[0], reverseIndex, lookupStatus))
<< 35) |
(uint64_t(base32ReverseLookup(input[1], reverseIndex, lookupStatus))
<< 30);
output[0] = (last >> 32) & 0xff;

if (inputSize > 2) {
last |= base32ReverseLookup(input[2], reverseIndex, lookupStatus) << 25;
last |= base32ReverseLookup(input[3], reverseIndex, lookupStatus) << 20;
output[1] = (last >> 24) & 0xff;

if (inputSize > 4) {
last |= base32ReverseLookup(input[4], reverseIndex, lookupStatus) << 15;
output[2] = (last >> 16) & 0xff;

if (inputSize > 5) {
last |= base32ReverseLookup(input[5], reverseIndex, lookupStatus)
<< 10;
last |= base32ReverseLookup(input[6], reverseIndex, lookupStatus)
<< 5;
output[3] = (last >> 8) & 0xff;

if (inputSize > 7) {
last |= base32ReverseLookup(input[7], reverseIndex, lookupStatus);
output[4] = last & 0xff;
}
}
}
}
}

return needed;
return lookupStatus.ok() ? Status::OK() : lookupStatus;
}

} // namespace facebook::velox::encoding
60 changes: 41 additions & 19 deletions velox/common/encode/Base32.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,34 +19,56 @@
#include <map>
#include <string>

#include <folly/Range.h>

#include "velox/common/base/Exceptions.h"
#include "velox/common/base/GTestMacros.h"
#include "velox/common/base/Status.h"
#include "velox/common/encode/EncoderUtils.h"

namespace facebook::velox::encoding {

class Base32 {
public:
/// Returns decoded size for the specified input. Adjusts the 'size' to
/// subtract the length of the padding, if exists.
static size_t calculateDecodedSize(const char* data, size_t& size);
static const size_t kCharsetSize = 32;
static const size_t kReverseIndexSize = 256;

/// Character set used for encoding purposes.
/// Contains specific characters that form the encoding scheme.
using Charset = std::array<char, kCharsetSize>;

/// Reverse lookup table for decoding purposes.
/// Maps each possible encoded character to its corresponding numeric value
/// within the encoding base.
using ReverseIndex = std::array<uint8_t, kReverseIndexSize>;

/// Decodes the specified number of characters from the 'src' and writes the
/// result to the 'dst'. The destination must have enough space, e.g. as
/// returned by the calculateDecodedSize().
static size_t
decode(const char* src, size_t src_len, char* dst, size_t dst_len);
/// Returns the actual size of the decoded data. Will also remove the padding
/// length from the 'inputSize'.
static Status calculateDecodedSize(
std::string_view input,
size_t& inputSize,
size_t& decodedSize);

/// Decodes the specified number of characters from the 'input' and writes the
/// result to the 'output'.
static Status decode(
std::string_view input,
size_t inputSize,
char* output,
size_t outputSize);

private:
/// Decodes the specified number of base 32 encoded characters from the 'src'
/// and writes to 'dst'.
static size_t decodeImpl(
const char* src,
size_t src_len,
char* dst,
size_t dst_len,
const ReverseIndex& table);
// Performs a reverse lookup in the reverse index to retrieve the original
// index of a character in the base.
static uint8_t base32ReverseLookup(
char p,
const Base32::ReverseIndex& reverseIndex,
Status& status);

// Decodes the specified input using the provided reverse lookup table.
static Status decodeImpl(
std::string_view input,
size_t inputSize,
char* output,
size_t outputSize,
const Base32::ReverseIndex& reverseIndex);
};

} // namespace facebook::velox::encoding
8 changes: 1 addition & 7 deletions velox/common/encode/Base64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,13 +297,7 @@ uint8_t Base64::base64ReverseLookup(
char p,
const Base64::ReverseIndex& reverseIndex,
Status& status) {
auto curr = reverseIndex[(uint8_t)p];
if (curr >= 0x40) {
status = Status::UserError(
"Base64::decode() - invalid input string: contains invalid characters.");
return 0; // Return 0 or any other error code indicating failure
}
return curr;
return reverseLookup(p, reverseIndex, status, Base64::kCharsetSize);
}

// static
Expand Down
Loading

0 comments on commit 71920a2

Please sign in to comment.