Add presto scalar function 'to_base32'

Joe-Abraham · Apr 4, 2024 · 67c917b · 67c917b
1 parent 09eccfc
commit 67c917b
Show file tree

Hide file tree

Showing 9 changed files with 312 additions and 2 deletions.
diff --git a/velox/common/encode/Base32.cpp b/velox/common/encode/Base32.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/common/encode/Base32.h"
+
+#include <glog/logging.h>
+
+namespace facebook::velox::encoding {
+
+// Number of distinct symbols in the Base32 alphabet; each encoded character
+// therefore carries 5 bits of payload.
+constexpr static int kBase = 32;
+
+// Base32 works on 40-bit groups: one group is 5 input bytes and is written as
+// 8 encoded characters.
+constexpr static int kBinaryBlockSize = 5; // 5 input bytes * 8 bits = 40 bits
+constexpr static int kEncodedBlockSize = 8; // 8 output chars * 5 bits = 40 bits
+
+// Forward lookup table: 5-bit value (0..31) -> encoded character. Values
+// 0..25 map to 'A'..'Z' and values 26..31 map to '2'..'7'.
+// NOTE(review): Charset is declared in EncoderUtils.h (not shown here). If it
+// holds more than the 32 elements listed, the remaining ones are
+// value-initialized by this initializer - confirm against the declaration.
+constexpr Charset kBase32Charset = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
+                                    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+                                    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
+                                    'Y', 'Z', '2', '3', '4', '5', '6', '7'};
+
+// Reverse lookup table indexed by byte value (256 entries): the entries at
+// '2'..'7' (50..55) hold 26..31 and the entries at 'A'..'Z' (65..90) hold
+// 0..25. Every other entry is 255.
+// NOTE(review): 255 presumably marks "not a Base32 character"; how it is
+// interpreted is decided by the helpers in EncoderUtils.h - confirm there.
+constexpr ReverseIndex kBase32ReverseIndexTable = {
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 26,  27,  28,  29,  30,  31,  255, 255, 255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
+
+/// Compile-time check that every one of the kBase (32) alphabet entries in
+/// kBase32Charset maps back to its own index through
+/// kBase32ReverseIndexTable. The highest index to check is taken from kBase
+/// rather than from the storage size of Charset, so the check always covers
+/// exactly the Base32 alphabet regardless of how large Charset is.
+static_assert(
+    checkForwardIndex(kBase - 1, kBase32Charset, kBase32ReverseIndexTable),
+    "kBase32Charset has incorrect entries");
+
+/// Compile-time check in the opposite direction: walks
+/// kBase32ReverseIndexTable starting from its last index and validates its
+/// entries against kBase32Charset for an alphabet of kBase symbols.
+static_assert(
+    checkReverseIndex(
+        sizeof(kBase32ReverseIndexTable) - 1,
+        kBase32Charset,
+        kBase,
+        kBase32ReverseIndexTable),
+    "kBase32ReverseIndexTable has incorrect entries.");
+
+// static
+/// Returns the number of characters produced when 'size' input bytes are
+/// Base32-encoded.
+///
+/// @param size Number of input bytes.
+/// @param withPadding When true, the result counts the trailing '=' characters
+/// that round the output up to a whole 8-character block. When false, only
+/// the data-carrying characters are counted, i.e. ceil(size * 8 / 5).
+/// @return Encoded length in characters; 0 for empty input.
+size_t Base32::calculateEncodedSize(size_t size, bool withPadding) {
+  if (size == 0) {
+    return 0;
+  }
+
+  // Split into whole 5-byte groups plus a 0..4 byte tail. Working with the
+  // quotient and remainder avoids the overflow that 'size + 4' could hit for
+  // sizes close to the maximum of size_t.
+  const size_t fullBlocks = size / kBinaryBlockSize;
+  const size_t tailBytes = size % kBinaryBlockSize;
+
+  if (withPadding) {
+    // A non-empty tail still occupies one complete 8-character block.
+    return (fullBlocks + (tailBytes != 0 ? 1 : 0)) * kEncodedBlockSize;
+  }
+
+  // Without padding the tail needs ceil(tailBytes * 8 / 5) characters:
+  // 1 -> 2, 2 -> 4, 3 -> 5, 4 -> 7. This is NOT 8 minus the number of missing
+  // input bytes; the padded block contains 6, 4, 3 or 1 '=' respectively.
+  const size_t tailChars = (tailBytes * 8 + 4) / 5;
+  return fullBlocks * kEncodedBlockSize + tailChars;
+}
+
+// static
+/// Encodes 'len' bytes starting at 'data' into 'output' using the standard
+/// Base32 alphabet. Padding is always requested from encodeImpl.
+void Base32::encode(const char* data, size_t len, char* output) {
+  constexpr bool kIncludePadding = true;
+  // Non-owning view over the caller's buffer; nothing is copied.
+  const folly::StringPiece input(data, len);
+  encodeImpl(input, kBase32Charset, kIncludePadding, output);
+}
+
+/// Encodes 'data' into 'out' using the 32-symbol 'charset'.
+///
+/// @tparam T Container exposing size() and begin() over char-like elements.
+/// @param data Bytes to encode; empty input writes nothing.
+/// @param charset Lookup table from a 5-bit value to its encoded character.
+/// @param include_pad When true, a trailing partial group is filled up to 8
+/// characters with kPadding.
+/// @param out Destination buffer. It must be large enough for the encoded
+/// text; no terminating character is written.
+template <class T>
+/* static */ void Base32::encodeImpl(
+    const T& data,
+    const Charset& charset,
+    bool include_pad,
+    char* out) {
+  auto len = data.size();
+  if (len == 0) {
+    return;
+  }
+
+  auto wp = out;
+  auto it = data.begin();
+
+  // Reads the next input byte as an unsigned 64-bit value. Going through
+  // uint8_t first and widening BEFORE any shift is essential: shifting a
+  // promoted 'int' left by 24 turns bytes >= 0x80 negative, and OR-ing that
+  // into a uint64_t sign-extends and overwrites the bits of the first byte of
+  // the group.
+  auto nextByte = [&it]() -> uint64_t {
+    return static_cast<uint64_t>(static_cast<uint8_t>(*it++));
+  };
+
+  // For each group of 5 bytes (40 bits) in the input, split that into
+  // 8 groups of 5 bits and encode that using the supplied charset lookup.
+  for (; len > 4; len -= 5) {
+    uint64_t curr = nextByte() << 32;
+    curr |= nextByte() << 24;
+    curr |= nextByte() << 16;
+    curr |= nextByte() << 8;
+    curr |= nextByte();
+
+    // Emit the 8 symbols from the most significant 5 bits downwards.
+    for (int shift = 35; shift >= 0; shift -= 5) {
+      *wp++ = charset[(curr >> shift) & 0x1f];
+    }
+  }
+
+  if (len > 0) {
+    // 1 to 4 input bytes are left. Place them in the high end of a 40-bit
+    // group (missing bytes count as 0) exactly like a full group.
+    const auto remaining = static_cast<size_t>(len);
+    uint64_t curr = 0;
+    int byteShift = 32;
+    for (size_t i = 0; i < remaining; ++i, byteShift -= 8) {
+      curr |= nextByte() << byteShift;
+    }
+
+    // Only symbols that carry at least one input bit are written:
+    // ceil(remaining * 8 / 5), i.e. 1 -> 2, 2 -> 4, 3 -> 5, 4 -> 7.
+    const int symbols = static_cast<int>((remaining * 8 + 4) / 5);
+    for (int i = 0; i < symbols; ++i) {
+      *wp++ = charset[(curr >> (35 - 5 * i)) & 0x1f];
+    }
+
+    // Optionally fill the rest of the 8-character block with padding.
+    if (include_pad) {
+      for (int i = symbols; i < 8; ++i) {
+        *wp++ = kPadding;
+      }
+    }
+  }
+}
+
+} // namespace facebook::velox::encoding
diff --git a/velox/common/encode/Base32.h b/velox/common/encode/Base32.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <exception>
+#include <map>
+#include <string>
+
+#include <folly/Range.h>
+#include "velox/common/encode/EncoderUtils.h"
+
+namespace facebook::velox::encoding {
+
+/// Collection of static helpers for producing Base32 text from binary data.
+/// Only the encoding direction is declared here.
+class Base32 {
+ public:
+  /// Returns encoded size for the input of the specified size.
+  ///
+  /// @param size Number of input bytes.
+  /// @param withPadding Whether padding characters are to be counted in the
+  /// result; defaults to true.
+  static size_t calculateEncodedSize(size_t size, bool withPadding = true);
+
+  /// Encodes the specified number of characters from the 'data' and writes the
+  /// result to the 'output'. The output must have enough space, e.g. as
+  /// returned by the calculateEncodedSize(). The implementation in Base32.cpp
+  /// always requests padding and does not write a terminating character.
+  ///
+  /// @param data Pointer to the first input byte.
+  /// @param size Number of bytes to read from 'data'.
+  /// @param output Destination buffer provided by the caller.
+  static void encode(const char* data, size_t size, char* output);
+
+ private:
+  /// Shared encoding routine behind encode().
+  ///
+  /// @tparam T Type of the input container.
+  /// @param data Input bytes.
+  /// @param charset Table mapping each 5-bit value to its output character.
+  /// @param include_pad Whether a trailing partial block is filled with
+  /// padding characters.
+  /// @param out Destination buffer provided by the caller.
+  template <class T>
+  static void encodeImpl(
+      const T& data,
+      const Charset& charset,
+      bool include_pad,
+      char* out);
+};
+
+} // namespace facebook::velox::encoding
diff --git a/velox/common/encode/CMakeLists.txt b/velox/common/encode/CMakeLists.txt
@@ -16,5 +16,5 @@ if(${VELOX_BUILD_TESTING})
   add_subdirectory(tests)
 endif()
 
-add_library(velox_encode Base64.cpp)
+add_library(velox_encode Base32.cpp Base64.cpp)
 target_link_libraries(velox_encode PUBLIC Folly::folly)
diff --git a/velox/common/encode/tests/Base32Test.cpp b/velox/common/encode/tests/Base32Test.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/common/encode/Base32.h"
+#include <gtest/gtest.h>
+#include "velox/common/base/tests/GTestUtils.h"
+
+namespace facebook::velox::encoding {
+
+/// Fixture for the Base32 helper tests; it carries no state of its own.
+class Base32Test : public ::testing::Test {};
+
+/// Pins the encoded length for every tail size (0..4 leftover bytes), both
+/// with and without '=' padding.
+TEST_F(Base32Test, calculateEncodedSizeProperSize) {
+  // Without padding only data-carrying characters are counted, which is
+  // ceil(8 * n / 5): each character holds 5 bits.
+  EXPECT_EQ(0, Base32::calculateEncodedSize(0, false));
+  EXPECT_EQ(2, Base32::calculateEncodedSize(1, false));
+  EXPECT_EQ(4, Base32::calculateEncodedSize(2, false));
+  EXPECT_EQ(5, Base32::calculateEncodedSize(3, false));
+  EXPECT_EQ(7, Base32::calculateEncodedSize(4, false));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(5, false));
+
+  // With padding the length is rounded up to a whole 8-character block.
+  EXPECT_EQ(0, Base32::calculateEncodedSize(0, true));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(1, true));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(2, true));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(3, true));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(4, true));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(5, true));
+
+  // 11 bytes = two full blocks (16 chars) + 1 tail byte (2 chars, or 8 padded).
+  EXPECT_EQ(18, Base32::calculateEncodedSize(11, false));
+  EXPECT_EQ(24, Base32::calculateEncodedSize(11, true));
+
+  // Padding is the declared default.
+  EXPECT_EQ(8, Base32::calculateEncodedSize(1));
+}
+
+} // namespace facebook::velox::encoding
diff --git a/velox/common/encode/tests/CMakeLists.txt b/velox/common/encode/tests/CMakeLists.txt
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_executable(velox_common_encode_test Base64Test.cpp EncoderUtilsTests.cpp)
+add_executable(velox_common_encode_test Base32Test.cpp Base64Test.cpp EncoderUtilsTests.cpp)
 add_test(velox_common_encode_test velox_common_encode_test)
 target_link_libraries(
   velox_common_encode_test

diff --git a/velox/docs/functions/presto/binary.rst b/velox/docs/functions/presto/binary.rst
@@ -96,6 +96,10 @@ Binary Functions
 
      Encodes ``bigint`` in a 64-bit 2’s complement big endian format.
 
+.. function:: to_base32(binary) -> varchar
+
+    Encodes ``binary`` into a base32 string representation.
+
 .. function:: to_hex(binary) -> varchar
 
     Encodes ``binary`` into a hex string representation.

diff --git a/velox/functions/prestosql/BinaryFunctions.h b/velox/functions/prestosql/BinaryFunctions.h
@@ -21,6 +21,7 @@
 
 #include "folly/ssl/OpenSSLHash.h"
 #include "velox/common/base/BitUtil.h"
+#include "velox/common/encode/Base32.h"
 #include "velox/common/encode/Base64.h"
 #include "velox/external/md5/md5.h"
 #include "velox/functions/Udf.h"
@@ -325,6 +326,18 @@ struct ToBase64UrlFunction {
   }
 };
 
+/// Scalar function body for to_base32: writes the padded Base32 text of the
+/// varbinary 'input' into the varchar 'result'.
+template <typename T>
+struct ToBase32Function {
+  VELOX_DEFINE_FUNCTION_TYPES(T);
+
+  FOLLY_ALWAYS_INLINE void call(
+      out_type<Varchar>& result,
+      const arg_type<Varbinary>& input) {
+    const auto inputSize = input.size();
+    // Size the output first so encode() can write straight into its buffer.
+    result.resize(encoding::Base32::calculateEncodedSize(inputSize));
+    encoding::Base32::encode(input.data(), inputSize, result.data());
+  }
+};
+
 template <typename T>
 struct FromBigEndian32 {
   VELOX_DEFINE_FUNCTION_TYPES(T);

diff --git a/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp b/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp
@@ -51,6 +51,8 @@ void registerSimpleFunctions(const std::string& prefix) {
       {prefix + "to_base64url"});
   registerFunction<FromBase64UrlFunction, Varbinary, Varchar>(
       {prefix + "from_base64url"});
+  registerFunction<ToBase32Function, Varchar, Varbinary>(
+      {prefix + "to_base32"});
 
   registerFunction<FromBigEndian32, int32_t, Varbinary>(
       {prefix + "from_big_endian_32"});

diff --git a/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp b/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp
@@ -468,6 +468,27 @@ TEST_F(BinaryFunctionsTest, fromBase64Url) {
   EXPECT_THROW(fromBase64Url("YQ=/"), VeloxUserError);
 }
 
+/// Checks to_base32 against known encodings covering every tail length
+/// (0..4 leftover bytes) as well as NULL and empty input.
+TEST_F(BinaryFunctionsTest, toBase32) {
+  const auto toBase32 = [&](std::optional<std::string> value) {
+    return evaluateOnce<std::string>("to_base32(cast(c0 as varbinary))", value);
+  };
+
+  // NULL input yields NULL output.
+  EXPECT_EQ(std::nullopt, toBase32(std::nullopt));
+
+  struct Base32Case {
+    const char* input;
+    const char* expected;
+  };
+  const Base32Case kCases[] = {
+      {"", ""},
+      {"a", "ME======"},
+      {"abc", "MFRGG==="},
+      {"no", "NZXQ===="},
+      {"we", "O5SQ===="},
+      {"db2", "MRRDE==="},
+      {"cake", "MNQWWZI="},
+      {"keen", "NNSWK3Q="},
+      {"1234", "GEZDGNA="},
+      {"hello world", "NBSWY3DPEB3W64TMMQ======"},
+      {"Hello World from Velox!", "JBSWY3DPEBLW64TMMQQGM4TPNUQFMZLMN54CC==="},
+  };
+
+  for (const auto& testCase : kCases) {
+    EXPECT_EQ(testCase.expected, toBase32(testCase.input));
+  }
+}
+
 TEST_F(BinaryFunctionsTest, fromBigEndian32) {
   const auto fromBigEndian32 = [&](const std::optional<std::string>& arg) {
     return evaluateOnce<int32_t, std::string>(