-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Presto Type Parser based on Flex and Bison #7568
Changes from all commits
260f66a
35de81e
6c348dd
9612498
26ecd6b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Copyright (c) Facebook, Inc. and its affiliates. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Build the parser's unit tests only when Velox testing is enabled.
if(${VELOX_BUILD_TESTING}) | ||
add_subdirectory(tests) | ||
endif() | ||
|
||
# Generate the C++ parser (TypeParser.yy.cc) and its header (TypeParser.yy.h)
# from the Bison grammar into the current build directory.
bison_target( | ||
TypeParser TypeParser.yy ${CMAKE_CURRENT_BINARY_DIR}/TypeParser.yy.cc | ||
DEFINES_FILE ${CMAKE_CURRENT_BINARY_DIR}/TypeParser.yy.h) | ||
|
||
# Generate the scanner (Scanner.cpp) from the Flex specification. "-Cf" asks
# Flex for full, uncompressed tables and "--prefix=veloxtp" replaces the
# default "yy" symbol prefix (presumably to avoid clashes with other Flex
# scanners linked into the same binary - TODO confirm).
flex_target( | ||
TypeParserScanner TypeParser.ll ${CMAKE_CURRENT_BINARY_DIR}/Scanner.cpp | ||
COMPILE_FLAGS "-Cf --prefix=veloxtp") | ||
|
||
# The scanner includes the Bison-generated header, so it must be generated
# after the parser.
add_flex_bison_dependency(TypeParserScanner TypeParser) | ||
|
||
# PROJECT_BINARY_DIR lets the generated header be included by its "velox/..."
# path (assumes the build tree mirrors the source layout - TODO confirm);
# FLEX_INCLUDE_DIRS provides FlexLexer.h.
include_directories(${PROJECT_BINARY_DIR}) | ||
include_directories(${FLEX_INCLUDE_DIRS}) | ||
# Library made of the generated parser and scanner sources plus the
# hand-written headers.
add_library( | ||
velox_type_parser ${BISON_TypeParser_OUTPUTS} | ||
${FLEX_TypeParserScanner_OUTPUTS} Scanner.h TypeParser.h) | ||
target_link_libraries(velox_type_parser velox_common_base) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include <cmath> | ||
#include <iostream> | ||
#include <sstream> | ||
#include <string> | ||
#include <unordered_map> | ||
|
||
#include "velox/common/base/Exceptions.h" | ||
#include "velox/type/Type.h" | ||
|
||
namespace facebook::velox::type { | ||
|
||
class Scanner : public yyFlexLexer { | ||
public: | ||
Scanner( | ||
std::istream& arg_yyin, | ||
std::ostream& arg_yyout, | ||
TypePtr& outputType, | ||
const std::string_view input) | ||
: yyFlexLexer(&arg_yyin, &arg_yyout), | ||
outputType_(outputType), | ||
input_(input){}; | ||
int lex(Parser::semantic_type* yylval); | ||
|
||
void setType(TypePtr type) { | ||
outputType_ = std::move(type); | ||
} | ||
|
||
// Store input to print it as part of the error message. | ||
std::string_view input() { | ||
return input_; | ||
} | ||
|
||
private: | ||
TypePtr& outputType_; | ||
const std::string_view input_; | ||
}; | ||
|
||
} // namespace facebook::velox::type |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <string> | ||
#include "velox/type/Type.h" | ||
|
||
namespace facebook::velox { | ||
|
||
/// Parses a type string in Presto format into a Velox type.
///
/// Example type strings:
///   row(col0 bigint, varchar)
///   array(bigint)
///   map(bigint, array(bigint))
///   function(bigint,bigint,bigint)
///
/// Parsing is case-insensitive, i.e. 'Row' and 'row' are equivalent.
/// Field names for rows are optional, and quoted field names are supported.
/// All custom types need to be registered; an error is thrown otherwise.
/// Types with spaces must be explicitly handled in the parser.
/// Uses the Type::getType API to convert a string to a Velox type.
///
/// @param typeText Type string to parse.
/// @return The parsed type; never null.
/// @throws A Velox check failure if the text cannot be parsed or names a
/// type that is not registered.
TypePtr parseType(const std::string& typeText); | ||
|
||
} // namespace facebook::velox |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
%{ | ||
#include <vector> | ||
#include <memory> | ||
|
||
#include "velox/type/parser/TypeParser.yy.h" // @manual | ||
#include "velox/type/parser/Scanner.h" | ||
#define YY_DECL int facebook::velox::type::Scanner::lex(facebook::velox::type::Parser::semantic_type *yylval) | ||
%} | ||
|
||
%option c++ noyywrap noyylineno nodefault caseless | ||
|
||
/* Single-letter, case-insensitive helper definitions.
 * Inside a bracket expression '|' is an ordinary character, so the former
 * "[A|a]" spelling also matched a literal '|'; "[Aa]" matches only the two
 * letter cases. None of the patterns in this file reference these
 * definitions, because "%option caseless" already makes every pattern
 * case-insensitive. */
A [Aa]
B [Bb]
C [Cc]
D [Dd]
E [Ee]
F [Ff]
G [Gg]
H [Hh]
I [Ii]
J [Jj]
K [Kk]
L [Ll]
M [Mm]
O [Oo]
P [Pp]
R [Rr]
S [Ss]
T [Tt]
U [Uu]
W [Ww]
X [Xx]
Y [Yy]
Z [Zz]
|
||
/* Named patterns used by the rules section.
 *   WORD             - bare identifier or type name built from alphanumerics
 *                      and underscore.
 *   QUOTED_ID        - identifier wrapped in quote characters; may contain
 *                      whitespace between the quotes.
 *   NUMBER           - one or more decimal digits.
 *   ROW              - ROW or its alias STRUCT.
 *   VARIABLE         - VARCHAR or VARBINARY.
 *   TYPE_WITH_SPACES - the built-in multi-word type names, with exactly one
 *                      blank between consecutive words.
 * NOTE(review): WORD is written with a star, so it can match the empty
 * string; confirm whether "one or more" was intended.
 * NOTE(review): QUOTED_ID does not require the opening and closing quote
 * characters to be the same kind - confirm this is acceptable. */
WORD ([[:alpha:][:alnum:]_]*) | ||
QUOTED_ID (['"'][[:alnum:][:space:]_]*['"']) | ||
NUMBER ([[:digit:]]+) | ||
ROW (ROW|STRUCT) | ||
VARIABLE (VARCHAR|VARBINARY) | ||
TYPE_WITH_SPACES ((DOUBLE[ ]PRECISION)|(TIME[ ]WITH[ ]TIME[ ]ZONE)|(TIMESTAMP[ ]WITH[ ]TIME[ ]ZONE)|(INTERVAL[ ]YEAR[ ]TO[ ]MONTH)|(INTERVAL[ ]DAY[ ]TO[ ]SECOND)) | ||
|
||
%% | ||
|
||
  /* Token rules.
   * Keywords are written in upper case but match in any case because of
   * "%option caseless". Flex picks the longest match and, on a tie, the rule
   * listed first, so the keyword and NUMBER rules must stay above WORD.
   * Rules that carry text (VARIABLE, NUMBER, WORD, TYPE_WITH_SPACES,
   * QUOTED_ID) store it in yylval before returning the token.
   * The final catch-all rule has no action, so any character that no other
   * rule matches is dropped; this is how blanks between tokens are skipped.
   * NOTE(review): that also silently drops unexpected punctuation - confirm
   * this is intended.
   * NOTE(review): NUMBER is converted with folly::to<int> but stored as
   * long long - confirm the intended range. */
"(" return Parser::token::LPAREN; | ||
")" return Parser::token::RPAREN; | ||
"," return Parser::token::COMMA; | ||
(ARRAY) return Parser::token::ARRAY; | ||
(MAP) return Parser::token::MAP; | ||
(FUNCTION) return Parser::token::FUNCTION; | ||
(DECIMAL) return Parser::token::DECIMAL; | ||
{ROW} return Parser::token::ROW; | ||
{VARIABLE} yylval->build<std::string>(YYText()); return Parser::token::VARIABLE; | ||
{NUMBER} yylval->build<long long>(folly::to<int>(YYText())); return Parser::token::NUMBER; | ||
{WORD} yylval->build<std::string>(YYText()); return Parser::token::WORD; | ||
{TYPE_WITH_SPACES} yylval->build<std::string>(YYText()); return Parser::token::TYPE_WITH_SPACES; | ||
{QUOTED_ID} yylval->build<std::string>(YYText()); return Parser::token::QUOTED_ID; | ||
<<EOF>> return Parser::token::YYEOF; | ||
. /* no action on unmatched input */ | ||
|
||
%% | ||
|
||
// The generated scanning routine is emitted as Scanner::lex (see YY_DECL at
// the top of this file), so the base-class yylex() is never expected to run.
// It is defined here only so the symbol exists (presumably because
// FlexLexer.h declares it - confirm) and it fails loudly if called.
int yyFlexLexer::yylex() { | ||
throw std::runtime_error("Bad call to yyFlexLexer::yylex()"); | ||
} | ||
|
||
#include "velox/type/parser/TypeParser.h" | ||
|
||
// Parses `typeText` by running the Bison parser over a Flex scanner that
// reads from an in-memory stream. The grammar actions deliver the result
// through `type`; a null result after parsing is reported as a failure.
facebook::velox::TypePtr facebook::velox::parseType(
    const std::string& typeText) {
  namespace parser_ns = facebook::velox::type;

  // Filled in by the scanner's setType() when a grammar action fires.
  facebook::velox::TypePtr type;
  std::istringstream textStream(typeText);

  // std::cerr is the output stream handed to the underlying lexer.
  parser_ns::Scanner lexer{textStream, std::cerr, type, typeText};
  parser_ns::Parser grammar{&lexer};
  grammar.parse();

  VELOX_CHECK(type, "Failed to parse type [{}]", typeText);
  return type;
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
%{ | ||
#include <FlexLexer.h> | ||
#include "velox/common/base/Exceptions.h" | ||
#include "velox/type/Type.h" | ||
%} | ||
%require "3.0.4" | ||
%language "C++" | ||
|
||
%define parser_class_name {Parser} | ||
%define api.namespace {facebook::velox::type} | ||
%define api.value.type variant | ||
%parse-param {Scanner* scanner} | ||
%define parse.error verbose | ||
|
||
%code requires | ||
{ | ||
namespace facebook::velox::type { | ||
class Scanner; | ||
} // namespace facebook::velox::type | ||
namespace facebook::velox { | ||
class Type; | ||
} // namespace facebook::velox | ||
// Accumulates the fields of a ROW type while its argument list is parsed.
// `names` and `types` are parallel vectors: entry i of each describes field i.
// The grammar stores an empty name for a field written without one.
struct RowArguments { | ||
std::vector<std::string> names; | ||
std::vector<std::shared_ptr<const facebook::velox::Type>> types; | ||
}; | ||
} // %code requires | ||
|
||
%code | ||
{ | ||
#include <velox/type/parser/Scanner.h> | ||
#define yylex(x) scanner->lex(x) | ||
using namespace facebook::velox; | ||
// Resolves a type name to a Velox type.
//
// The name is upper-cased, the aliases INT and DOUBLE PRECISION are mapped
// to INTEGER and DOUBLE, and the result is looked up with getType() using no
// type parameters. Fails with a Velox check error when the lookup returns
// null, i.e. the type is not registered.
TypePtr typeFromString(const std::string& type) {
  auto upper = type;
  // Route every character through unsigned char: calling toupper with a
  // negative char value is undefined behavior.
  // Review note: folly only offers an in-place lower-casing helper, so the
  // upper-casing is done with std::transform.
  std::transform(
      upper.begin(), upper.end(), upper.begin(), [](unsigned char c) {
        return static_cast<char>(::toupper(c));
      });

  if (upper == "INT") {
    upper = "INTEGER";
  } else if (upper == "DOUBLE PRECISION") {
    upper = "DOUBLE";
  }
  auto inferredType = getType(upper, {});
  VELOX_CHECK(inferredType, "Failed to parse type [{}]. Type not registered.", type);
  return inferredType;
}
} | ||
|
||
%token LPAREN RPAREN COMMA ARRAY MAP ROW FUNCTION DECIMAL | ||
%token <std::string> WORD VARIABLE QUOTED_ID TYPE_WITH_SPACES | ||
%token <long long> NUMBER | ||
%token YYEOF 0 | ||
|
||
%nterm <std::shared_ptr<const Type>> type array_type map_type variable_type | ||
%nterm <std::pair<std::string, std::shared_ptr<const Type>>> named_type | ||
%nterm <std::shared_ptr<const Type>> row_type function_type decimal_type simple_type | ||
%nterm <std::string> identifier | ||
%nterm <std::vector<std::shared_ptr<const Type>>> type_list | ||
%nterm <RowArguments> type_list_opt_names | ||
|
||
%% | ||
|
||
// Start symbol. Accepts either a bare type or a "name type" pair (the name is
// discarded) and publishes the result through the scanner. Parser::error()
// throws, so the `error` alternative is presumably never reduced in practice.
type_spec : named_type           { scanner->setType($1.second); }
          | type                 { scanner->setType($1); }
          | error                { yyerrok; }
          ;

// A field name followed by its type, e.g. `col0 bigint`.
named_type : identifier type     { $$ = std::make_pair($1, $2); }
           ;

// Any supported type.
type : array_type                { $$ = $1; }
     | map_type                  { $$ = $1; }
     | row_type                  { $$ = $1; }
     | simple_type               { $$ = $1; }
     | function_type             { $$ = $1; }
     | variable_type             { $$ = $1; }
     | decimal_type              { $$ = $1; }
     ;

// A type identified by name only; resolved through typeFromString().
simple_type : WORD               { $$ = typeFromString($1); }
            | TYPE_WITH_SPACES   { $$ = typeFromString($1); }
            ;

// VARCHAR / VARBINARY. An optional length argument is accepted but ignored.
variable_type : VARIABLE LPAREN NUMBER RPAREN  { $$ = typeFromString($1); }
              | VARIABLE                       { $$ = typeFromString($1); }
              ;

array_type : ARRAY LPAREN type RPAREN  { $$ = ARRAY($3); }
           ;

// decimal(precision, scale)
decimal_type : DECIMAL LPAREN NUMBER COMMA NUMBER RPAREN  { $$ = DECIMAL($3, $5); }
             ;

// Non-empty, comma-separated list of types.
type_list : type                 { $$.push_back($1); }
          | type_list COMMA type { $1.push_back($3); $$ = std::move($1); }
          ;

// Non-empty, comma-separated list of row fields. Each field is either a bare
// type, recorded with an empty name, or a named type.
type_list_opt_names : type                                 { $$.names.push_back(""); $$.types.push_back($1); }
                    | named_type                           { $$.names.push_back($1.first); $$.types.push_back($1.second); }
                    | type_list_opt_names COMMA type       { $1.names.push_back(""); $1.types.push_back($3);
                                                             $$.names = std::move($1.names); $$.types = std::move($1.types); }
                    | type_list_opt_names COMMA named_type { $1.names.push_back($3.first); $1.types.push_back($3.second);
                                                             $$.names = std::move($1.names); $$.types = std::move($1.types); }
                    ;

row_type : ROW LPAREN type_list_opt_names RPAREN  { $$ = ROW(std::move($3.names), std::move($3.types)); }
         ;

// map(key, value)
map_type : MAP LPAREN type COMMA type RPAREN  { $$ = MAP($3, $5); }
         ;

// function(arg0, ..., argN, returnType): the last type in the list is the
// return type and the preceding ones are the argument types.
function_type : FUNCTION LPAREN type_list RPAREN  { auto returnType = $3.back(); $3.pop_back();
                                                    $$ = FUNCTION(std::move($3), returnType); }
              ;

identifier : QUOTED_ID  { $1.erase(0, 1); $1.pop_back(); $$ = $1; }  // Remove the quotes.
           | WORD       { $$ = $1; }
           ;
|
||
%% | ||
|
||
// Error callback of the generated parser. Reports the failure through
// VELOX_FAIL, quoting the full input text kept by the scanner.
// NOTE(review): `msg` (the parser's own diagnostic; "%define parse.error
// verbose" is enabled above) is not part of the reported message - confirm
// that dropping it is intentional.
void facebook::velox::type::Parser::error(const std::string& msg) { | ||
VELOX_FAIL("Failed to parse type [{}]", scanner->input()); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Copyright (c) Facebook, Inc. and its affiliates. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Unit-test executable for the type parser.
add_executable(velox_type_parser_test TypeParserTest.cpp) | ||
|
||
# Register the executable with CTest under the same name.
add_test(NAME velox_type_parser_test COMMAND velox_type_parser_test) | ||
|
||
# Link against the parser library under test, the Velox type library and the
# GoogleTest / GoogleMock libraries.
target_link_libraries(velox_type_parser_test velox_type_parser velox_type gtest | ||
gtest_main gmock) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It feels like WORD, since it is used for type names, will need to support whitespace as well. This will make the grammar ambiguous in some cases, but I remember there being a way to specify how these should be handled.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
QUOTED_ID will support names with spaces. There is a test as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think my point is that since WORD cannot have spaces, things like TIMESTAMP WITH TIME ZONE will not be parsed as a single WORD, and thus will need special flex/grammar support to understand that these words together form a type name. And I guess we can't update the grammar as more types are registered/unregistered. For example, if I register a new type named "PEDROs DATA TYPE", the parser won't be able to understand it.
This is ok for now, but we might need to think through it eventually.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Even Postgres has tokens for types with spaces. https://github.com/postgres/postgres/blob/master/src/backend/parser/gram.y#L14336
The issue with your proposal is how do we decide if
TIMESTAMP WITH TIME ZONE VELOX
is a new type vs. an error parsing TIMESTAMP WITH TIME ZONE?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The other ambiguity is if
ROW(PEDROs DATA TYPE)
is a Row with field name PEDROs and type name DATA TYPE, or a type PEDROs DATA TYPE without a field name.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What Presto Java does is to first check if there's a type that matches the full name ("PEDROs DATA TYPE"), and if not, assume the first token is the name and proceed to try to match the remaining as the type ("DATA TYPE" as the type name and "PEDROs" the field name). Could we mimic that logic here somehow?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see. Yes we can. We can give precedence between two rules and achieve this. I will implement this in a follow up PR.