Skip to content

Commit

Permalink
Fix json_extract for non-definite paths (paths with wildcards)
Browse files Browse the repository at this point in the history
Summary:
In Presto, paths with wildcards are handled by Jayway engine. It always returns an array for such paths, even if only one entry matches.

Non-definite paths matching zero or one entry:

```
presto:di> select json_extract('{"a": [{"b": 123}]}', '$.a[*].b');
 _col0
-------
 [123]

presto:di> select json_extract('{"a": [{"b": 123}]}', '$.a[*].c');
 _col0
-------
 []
```

"Equivalent" define paths:

```
presto:di> select json_extract('{"a": [{"b": 123}]}', '$.a[0].b');
 _col0
-------
 123

presto:di> select json_extract('{"a": [{"b": 123}]}', '$.a[0].c');
 _col0
-------
 NULL
```

Reviewed By: Yuhta

Differential Revision: D56529345

fbshipit-source-id: 13b5d3a9de852a7b33b28fcecf7b5cf175213317
  • Loading branch information
mbasmanova authored and facebook-github-bot committed Apr 25, 2024
1 parent abfcf12 commit 4e075a0
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 46 deletions.
24 changes: 16 additions & 8 deletions velox/functions/prestosql/SIMDJsonFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,8 @@ struct SIMDJsonExtractScalarFunction {
return simdjson::SUCCESS;
};

SIMDJSON_TRY(simdJsonExtract(json, jsonPath, consumer));
auto& extractor = SIMDJsonExtractor::getInstance(jsonPath);
SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer));

if (resultStr.has_value()) {
result.copy_from(*resultStr);
Expand Down Expand Up @@ -284,20 +285,26 @@ struct SIMDJsonExtractFunction {
return simdjson::SUCCESS;
};

SIMDJSON_TRY(simdJsonExtract(json, jsonPath, consumer));
auto& extractor = SIMDJsonExtractor::getInstance(jsonPath);
SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer));

if (resultSize == 0) {
// If the path didn't map to anything in the JSON object, return null.
return simdjson::NO_SUCH_FIELD;
}
if (extractor.isDefinitePath()) {
// If the path didn't map to anything in the JSON object, return null.
return simdjson::NO_SUCH_FIELD;
}

if (resultSize == 1) {
result.copy_from("[]");
} else if (resultSize == 1 && extractor.isDefinitePath()) {
// If there was only one value mapped to by the path, don't wrap it in an
// array.
result.copy_from(results);
} else {
// Add the square brackets to make it a valid JSON array.
result.copy_from("[" + results + "]");
result.reserve(2 + results.size());
result.append("[");
result.append(results);
result.append("]");
}
return simdjson::SUCCESS;
}
Expand Down Expand Up @@ -349,7 +356,8 @@ struct SIMDJsonSizeFunction {
return simdjson::SUCCESS;
};

SIMDJSON_TRY(simdJsonExtract(json, jsonPath, consumer));
auto& extractor = SIMDJsonExtractor::getInstance(jsonPath);
SIMDJSON_TRY(simdJsonExtract(json, extractor, consumer));

if (resultCount == 0) {
// If the path didn't map to anything in the JSON object, return null.
Expand Down
8 changes: 6 additions & 2 deletions velox/functions/prestosql/json/SIMDJsonExtractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@

#include "velox/functions/prestosql/json/SIMDJsonExtractor.h"

namespace facebook::velox::functions::detail {
namespace facebook::velox::functions {
namespace {
using JsonVector = std::vector<simdjson::ondemand::value>;
}

/* static */ SIMDJsonExtractor& SIMDJsonExtractor::getInstance(
folly::StringPiece path) {
// Cache tokenize operations in JsonExtractor across invocations in the same
Expand Down Expand Up @@ -94,4 +98,4 @@ simdjson::error_code extractArray(
}
return simdjson::SUCCESS;
}
} // namespace facebook::velox::functions::detail
} // namespace facebook::velox::functions
50 changes: 18 additions & 32 deletions velox/functions/prestosql/json/SIMDJsonExtractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,6 @@

namespace facebook::velox::functions {

template <typename TConsumer>
simdjson::error_code simdJsonExtract(
const velox::StringView& json,
const velox::StringView& path,
TConsumer&& consumer);

namespace detail {

using JsonVector = std::vector<simdjson::ondemand::value>;

class SIMDJsonExtractor {
public:
template <typename TConsumer>
Expand All @@ -45,14 +35,26 @@ class SIMDJsonExtractor {
TConsumer& consumer,
size_t tokenStartIndex = 0);

// Returns true if this extractor was initialized with the trivial path "$".
/// Returns true if this extractor was initialized with the trivial path "$".
bool isRootOnlyPath() {
return tokens_.empty();
}

// Use this method to get an instance of SIMDJsonExtractor given a JSON path.
// Given the nature of the cache, it's important this is only used by
// simdJsonExtract.
/// Returns true if this extractor was initialized with a path that's
/// guaranteed to match at most one entry.
bool isDefinitePath() {
for (const auto& token : tokens_) {
if (token == "*") {
return false;
}
}

return true;
}

/// Use this method to get an instance of SIMDJsonExtractor given a JSON path.
/// Given the nature of the cache, it's important this is only used by
/// the callers of simdJsonExtract.
static SIMDJsonExtractor& getInstance(folly::StringPiece path);

private:
Expand Down Expand Up @@ -123,9 +125,7 @@ simdjson::error_code SIMDJsonExtractor::extract(
}

return consumer(input);
}

} // namespace detail
};

/**
* Extract element(s) from a JSON object using the given path.
Expand All @@ -145,15 +145,11 @@ simdjson::error_code SIMDJsonExtractor::extract(
* @return Return simdjson::SUCCESS on success.
* If any errors are encountered parsing the JSON, returns the error.
*/

template <typename TConsumer>
simdjson::error_code simdJsonExtract(
const velox::StringView& json,
const velox::StringView& path,
SIMDJsonExtractor& extractor,
TConsumer&& consumer) {
// If extractor fails to parse the path, this will throw a VeloxUserError, and
// we want to let this exception bubble up to the client.
auto& extractor = detail::SIMDJsonExtractor::getInstance(path);
simdjson::padded_string paddedJson(json.data(), json.size());
SIMDJSON_ASSIGN_OR_RAISE(auto jsonDoc, simdjsonParse(paddedJson));

Expand All @@ -167,14 +163,4 @@ simdjson::error_code simdJsonExtract(
return extractor.extract(value, std::forward<TConsumer>(consumer));
}

template <typename TConsumer>
simdjson::error_code simdJsonExtract(
const std::string& json,
const std::string& path,
TConsumer&& consumer) {
return simdJsonExtract(
velox::StringView(json),
velox::StringView(path),
std::forward<TConsumer>(consumer));
}
} // namespace facebook::velox::functions
21 changes: 17 additions & 4 deletions velox/functions/prestosql/json/tests/SIMDJsonExtractorTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,18 @@
#include "gtest/gtest.h"
#include "velox/common/base/VeloxException.h"

namespace facebook::velox::functions {
namespace {
using facebook::velox::VeloxUserError;
using facebook::velox::functions::simdJsonExtract;

template <typename TConsumer>
simdjson::error_code simdJsonExtract(
const std::string& json,
const std::string& path,
TConsumer&& consumer) {
auto& extractor = SIMDJsonExtractor::getInstance(path);
return simdJsonExtract(
velox::StringView(json), extractor, std::forward<TConsumer>(consumer));
}

class SIMDJsonExtractorTest : public testing::Test {
public:
Expand Down Expand Up @@ -52,8 +61,10 @@ class SIMDJsonExtractorTest : public testing::Test {
return simdjson::SUCCESS;
};

EXPECT_EQ(simdJsonExtract(json, path, consumer), simdjson::SUCCESS)
<< "with json " << json << " and path " << path;
SCOPED_TRACE(json);
SCOPED_TRACE(path);

EXPECT_EQ(simdJsonExtract(json, path, consumer), simdjson::SUCCESS);

if (!expected) {
EXPECT_EQ(0, res.size());
Expand Down Expand Up @@ -536,4 +547,6 @@ TEST_F(SIMDJsonExtractorTest, invalidJson) {
json = "{\"foo\": [\"bar\", \"baz]}";
EXPECT_NE(simdJsonExtract(json, "$.foo[0]", consumer), simdjson::SUCCESS);
}

} // namespace
} // namespace facebook::velox::functions
7 changes: 7 additions & 0 deletions velox/functions/prestosql/tests/JsonFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,13 @@ TEST_F(JsonFunctionsTest, jsonExtract) {
EXPECT_EQ("3", jsonExtract(json, "$[0][0][0].b.[2]"));
EXPECT_EQ("3", jsonExtract(json, "$.[0].[0][0].b.[2]"));

// Definite vs. non-definite paths.
EXPECT_EQ("[123]", jsonExtract(R"({"a": [{"b": 123}]})", "$.a[*].b"));
EXPECT_EQ("123", jsonExtract(R"({"a": [{"b": 123}]})", "$.a[0].b"));

EXPECT_EQ("[]", jsonExtract(R"({"a": [{"b": 123}]})", "$.a[*].c"));
EXPECT_EQ(std::nullopt, jsonExtract(R"({"a": [{"b": 123}]})", "$.a[0].c"));

// TODO The following paths are supported by Presto via Jayway, but do not
// work in Velox yet. Figure out how to add support for these.
VELOX_ASSERT_THROW(jsonExtract(kJson, "$..price"), "Invalid JSON path");
Expand Down

0 comments on commit 4e075a0

Please sign in to comment.