Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add transform_keys and transform_values Presto functions #2245

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions velox/docs/functions/map.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,26 @@ Map Functions
Corresponds to SQL subscript operator [].

SELECT name_to_age_map['Bob'] AS bob_age;

.. function:: transform_keys(map(K1,V), function(K1,V,K2)) -> map(K2,V)

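Returns a map whose keys are the results of applying ``function`` to each entry (key, value) of ``map``; the values are left unchanged. Fails if the transformation produces duplicate keys within a map::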

SELECT transform_keys(MAP(ARRAY[], ARRAY[]), (k, v) -> k + 1); -- {}
SELECT transform_keys(MAP(ARRAY [1, 2, 3], ARRAY ['a', 'b', 'c']), (k, v) -> k + 1); -- {2 -> a, 3 -> b, 4 -> c}
SELECT transform_keys(MAP(ARRAY ['a', 'b', 'c'], ARRAY [1, 2, 3]), (k, v) -> v * v); -- {1 -> 1, 4 -> 2, 9 -> 3}
SELECT transform_keys(MAP(ARRAY ['a', 'b'], ARRAY [1, 2]), (k, v) -> k || CAST(v as VARCHAR)); -- {a1 -> 1, b2 -> 2}
SELECT transform_keys(MAP(ARRAY [1, 2], ARRAY [1.0, 1.4]), -- {one -> 1.0, two -> 1.4}
(k, v) -> MAP(ARRAY[1, 2], ARRAY['one', 'two'])[k]);

.. function:: transform_values(map(K,V1), function(K,V1,V2)) -> map(K,V2)

Returns a map whose values are the results of applying ``function`` to each entry (key, value) of ``map``; the keys are left unchanged::

SELECT transform_values(MAP(ARRAY[], ARRAY[]), (k, v) -> v + 1); -- {}
SELECT transform_values(MAP(ARRAY [1, 2, 3], ARRAY [10, 20, 30]), (k, v) -> v + k); -- {1 -> 11, 2 -> 22, 3 -> 33}
SELECT transform_values(MAP(ARRAY [1, 2, 3], ARRAY ['a', 'b', 'c']), (k, v) -> k * k); -- {1 -> 1, 2 -> 4, 3 -> 9}
SELECT transform_values(MAP(ARRAY ['a', 'b'], ARRAY [1, 2]), (k, v) -> k || CAST(v as VARCHAR)); -- {a -> a1, b -> b2}
SELECT transform_values(MAP(ARRAY [1, 2], ARRAY [1.0, 1.4]), -- {1 -> one_1.0, 2 -> two_1.4}
(k, v) -> MAP(ARRAY[1, 2], ARRAY['one', 'two'])[k] || '_' || CAST(v AS VARCHAR));

2 changes: 2 additions & 0 deletions velox/functions/prestosql/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ add_library(
Subscript.cpp
ToUtf8.cpp
Transform.cpp
TransformKeys.cpp
TransformValues.cpp
URLFunctions.cpp
VectorArithmetic.cpp
WidthBucketArray.cpp
Expand Down
2 changes: 1 addition & 1 deletion velox/functions/prestosql/Transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class TransformFunction : public exec::VectorFunction {
public:
bool isDefaultNullBehavior() const override {
// transform is null preserving for the array. But since an
// expr tree witht a lambda depends on all named fields, including
// expr tree with a lambda depends on all named fields, including
// captures, a null in a capture does not automatically make a
// null result.
return false;
Expand Down
124 changes: 124 additions & 0 deletions velox/functions/prestosql/TransformKeys.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "velox/expression/Expr.h"
#include "velox/expression/VectorFunction.h"
#include "velox/functions/lib/LambdaFunctionUtil.h"
#include "velox/vector/FunctionVector.h"

namespace facebook::velox::functions {
namespace {

// See documentation at https://prestodb.io/docs/current/functions/map.html
class TransformKeysFunction : public exec::VectorFunction {
public:
bool isDefaultNullBehavior() const override {
// Opts out of default null behavior. transform_keys itself preserves nulls
// of the input map, but an expression tree that contains a lambda depends on
// every named field, captures included, so a null in a capture must not
// automatically turn the whole result into null.
return false;
}

// Evaluates transform_keys for the selected 'rows'.
//
// args[0] is the input map and args[1] is a FunctionVector holding the
// lambda(s). The input map is flattened, each lambda is applied to the
// (key, value) entries of the rows it covers, and the lambda output becomes
// the key vector of a new MapVector that reuses the flattened map's nulls,
// offsets, sizes and values. The new map is checked for duplicate keys
// before it is moved or copied into 'result'.
//
// NOTE(review): nothing in this method rejects null values in
// 'transformedKeys' — confirm whether null keys must be reported as an error.
void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
const TypePtr& outputType,
exec::EvalCtx* context,
VectorPtr* result) const override {
VELOX_CHECK_EQ(args.size(), 2);

// Flatten input map.
exec::LocalDecodedVector mapDecoder(context, *args[0], rows);
auto& decodedMap = *mapDecoder.get();

auto flatMap = flattenMap(rows, args[0], decodedMap);

// The lambda takes two arguments: the key and the value of each entry.
std::vector<VectorPtr> lambdaArgs = {
flatMap->mapKeys(), flatMap->mapValues()};
auto numKeys = flatMap->mapKeys()->size();

// Filled in by the lambda invocations below; shared across all lambdas.
VectorPtr transformedKeys;

// Loop over lambda functions and apply these to keys of the map.
// In most cases there will be only one function and the loop will run once.
auto it = args[1]->asUnchecked<FunctionVector>()->iterator(&rows);
while (auto entry = it.next()) {
auto keyRows =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do keyRows change over lambdas ? I would presume typically entry.rows would be similar .

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a general case, there is only one lambda function and entry.rows == rows. However, in case when there are multiple lambdas, each lambda applies to a unique subset of rows, hence, keyRows are non-overlapping between lambdas.

toElementRows<MapVector>(numKeys, *entry.rows, flatMap.get());
auto wrapCapture = toWrapCapture<MapVector>(
numKeys, entry.callable, *entry.rows, flatMap);

entry.callable->apply(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: One question: across multiple lambdas , how do we ensure the type of keys remains same ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the job of the query planner. We assume the types are correct similar to different "then" and "else" branches of a switch statement.

keyRows, wrapCapture, context, lambdaArgs, &transformedKeys);
}

// Assemble the result: same row layout and values as the flattened input,
// with the lambda output substituted for the keys.
auto localResult = std::make_shared<MapVector>(
flatMap->pool(),
outputType,
flatMap->nulls(),
flatMap->size(),
flatMap->offsets(),
flatMap->sizes(),
transformedKeys,
flatMap->mapValues());

// Raises a user error if any selected row ends up with two equal keys.
checkDuplicateKeys(localResult, rows);

context->moveOrCopyResult(localResult, rows, result);
}

static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
  // Single signature:
  //   transform_keys(map(K1,V), function(K1,V,K2)) -> map(K2,V)
  // i.e. the lambda takes a key of type K1 and a value of type V and
  // produces the new key of type K2.
  std::vector<std::shared_ptr<exec::FunctionSignature>> supported;
  supported.push_back(exec::FunctionSignatureBuilder()
                          .typeVariable("K1")
                          .typeVariable("K2")
                          .typeVariable("V")
                          .returnType("map(K2,V)")
                          .argumentType("map(K1,V)")
                          .argumentType("function(K1,V,K2)")
                          .build());
  return supported;
}

private:
// Fails with a user error ("Duplicate map keys are not allowed") if any of
// the selected 'rows' of 'mapVector' contains two equal keys.
//
// The map is canonicalized first and then, per selected row, every key is
// compared with the key stored immediately before it.
// NOTE(review): this relies on MapVector::canonicalize placing equal keys of
// a row next to each other (presumably by sorting keys within each row) —
// confirm against the MapVector implementation.
void checkDuplicateKeys(
const MapVectorPtr& mapVector,
const SelectivityVector& rows) const {
static const char* kDuplicateKey = "Duplicate map keys are not allowed";

MapVector::canonicalize(mapVector);

// Raw per-row layout: entries of 'row' occupy
// [offsets[row], offsets[row] + sizes[row]) in the key vector.
auto offsets = mapVector->rawOffsets();
auto sizes = mapVector->rawSizes();
auto mapKeys = mapVector->mapKeys();
rows.applyToSelected([&](auto row) {
auto offset = offsets[row];
auto size = sizes[row];
// Start at 1: each key is compared with its predecessor in the same row.
for (auto i = 1; i < size; i++) {
if (mapKeys->equalValueAt(mapKeys.get(), offset + i, offset + i - 1)) {
VELOX_USER_FAIL("{}", kDuplicateKey);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Maybe also print out the value of the duplicate key?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good idea. Let me add this in a follow-up PR as it should be added to the map() function as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kagamiori #2260 adds duplicate key to the message.

}
}
});
}
};
} // namespace

// Declares the vector function under the tag udf_transform_keys, pairing the
// signatures returned by TransformKeysFunction::signatures() with a
// TransformKeysFunction instance. The tag is what
// VELOX_REGISTER_VECTOR_FUNCTION refers to when binding a SQL function name.
VELOX_DECLARE_VECTOR_FUNCTION(
udf_transform_keys,
TransformKeysFunction::signatures(),
std::make_unique<TransformKeysFunction>());

} // namespace facebook::velox::functions
99 changes: 99 additions & 0 deletions velox/functions/prestosql/TransformValues.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "velox/expression/Expr.h"
#include "velox/expression/VectorFunction.h"
#include "velox/functions/lib/LambdaFunctionUtil.h"
#include "velox/vector/FunctionVector.h"

namespace facebook::velox::functions {
namespace {

// See documentation at https://prestodb.io/docs/current/functions/map.html
class TransformValuesFunction : public exec::VectorFunction {
Copy link
Contributor

@kgpai kgpai Aug 10, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: It seems values/key transforms can be templatized to one function, however this makes easier reading personally to me at expense of code duplication.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking about that and decided not to use templates for readability.

public:
bool isDefaultNullBehavior() const override {
// Opts out of default null behavior. transform_values itself preserves nulls
// of the input map, but an expression tree that contains a lambda depends on
// every named field, captures included, so a null in a capture must not
// automatically turn the whole result into null.
return false;
}

void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
const TypePtr& outputType,
exec::EvalCtx* context,
VectorPtr* result) const override {
VELOX_CHECK_EQ(args.size(), 2);

// Flatten input map.
exec::LocalDecodedVector mapDecoder(context, *args[0], rows);
auto& decodedMap = *mapDecoder.get();

auto flatMap = flattenMap(rows, args[0], decodedMap);

std::vector<VectorPtr> lambdaArgs = {
flatMap->mapKeys(), flatMap->mapValues()};
auto numValues = flatMap->mapValues()->size();

VectorPtr transformedValues;

// Loop over lambda functions and apply these to values of the map.
// In most cases there will be only one function and the loop will run once.
auto it = args[1]->asUnchecked<FunctionVector>()->iterator(&rows);
while (auto entry = it.next()) {
auto valueRows =
toElementRows<MapVector>(numValues, *entry.rows, flatMap.get());
auto wrapCapture = toWrapCapture<MapVector>(
numValues, entry.callable, *entry.rows, flatMap);

entry.callable->apply(
valueRows, wrapCapture, context, lambdaArgs, &transformedValues);
}

auto localResult = std::make_shared<MapVector>(
flatMap->pool(),
outputType,
flatMap->nulls(),
flatMap->size(),
flatMap->offsets(),
flatMap->sizes(),
flatMap->mapKeys(),
transformedValues);
context->moveOrCopyResult(localResult, rows, result);
}

static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
  // Single signature:
  //   transform_values(map(K,V1), function(K,V1,V2)) -> map(K,V2)
  // i.e. the lambda takes a key of type K and a value of type V1 and
  // produces the new value of type V2.
  std::vector<std::shared_ptr<exec::FunctionSignature>> supported;
  supported.push_back(exec::FunctionSignatureBuilder()
                          .typeVariable("K")
                          .typeVariable("V1")
                          .typeVariable("V2")
                          .returnType("map(K,V2)")
                          .argumentType("map(K,V1)")
                          .argumentType("function(K,V1,V2)")
                          .build());
  return supported;
}
};
} // namespace

// Declares the vector function under the tag udf_transform_values, pairing
// the signatures returned by TransformValuesFunction::signatures() with a
// TransformValuesFunction instance. The tag is what
// VELOX_REGISTER_VECTOR_FUNCTION refers to when binding a SQL function name.
VELOX_DECLARE_VECTOR_FUNCTION(
udf_transform_values,
TransformValuesFunction::signatures(),
std::make_unique<TransformValuesFunction>());

} // namespace facebook::velox::functions
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
namespace facebook::velox::functions {
void registerMapFunctions() {
VELOX_REGISTER_VECTOR_FUNCTION(udf_map_filter, "map_filter");
VELOX_REGISTER_VECTOR_FUNCTION(udf_transform_keys, "transform_keys");
VELOX_REGISTER_VECTOR_FUNCTION(udf_transform_values, "transform_values");
VELOX_REGISTER_VECTOR_FUNCTION(udf_map, "map");
VELOX_REGISTER_VECTOR_FUNCTION(udf_map_concat, "map_concat");
VELOX_REGISTER_VECTOR_FUNCTION(udf_map_entries, "map_entries");
Expand Down
2 changes: 2 additions & 0 deletions velox/functions/prestosql/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ add_executable(
SplitTest.cpp
StringFunctionsTest.cpp
TransformTest.cpp
TransformKeysTest.cpp
TransformValuesTest.cpp
URLFunctionsTest.cpp
WidthBucketArrayTest.cpp
GreatestLeastTest.cpp
Expand Down
Loading