Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Presto Type Parser based on Flex and Bison #7568

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions velox/type/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ if(${VELOX_BUILD_TESTING})
add_subdirectory(tests)
endif()

add_subdirectory(parser)
add_subdirectory(tz)
add_subdirectory(fbhive)

Expand Down
34 changes: 34 additions & 0 deletions velox/type/parser/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Build the parser unit tests only when testing is enabled.
if(${VELOX_BUILD_TESTING})
add_subdirectory(tests)
endif()

# Generate the C++ parser (TypeParser.yy.cc) and its token/semantic-value
# header (TypeParser.yy.h) from the Bison grammar into the build directory.
bison_target(
TypeParser TypeParser.yy ${CMAKE_CURRENT_BINARY_DIR}/TypeParser.yy.cc
DEFINES_FILE ${CMAKE_CURRENT_BINARY_DIR}/TypeParser.yy.h)

# Generate the scanner from the Flex specification. "-Cf" asks flex for full
# (uncompressed) tables; "--prefix=veloxtp" replaces the default "yy" prefix of
# the generated symbols.
flex_target(
TypeParserScanner TypeParser.ll ${CMAKE_CURRENT_BINARY_DIR}/Scanner.cpp
COMPILE_FLAGS "-Cf --prefix=veloxtp")

# The scanner includes the Bison-generated header, so it has to be generated
# after the parser.
add_flex_bison_dependency(TypeParserScanner TypeParser)

# NOTE(review): PROJECT_BINARY_DIR is presumably added so that the generated
# header can be included as "velox/type/parser/TypeParser.yy.h" - confirm.
include_directories(${PROJECT_BINARY_DIR})
# Location of FlexLexer.h.
include_directories(${FLEX_INCLUDE_DIRS})
# The library consists of the two generated sources plus the hand-written
# headers.
add_library(
velox_type_parser ${BISON_TypeParser_OUTPUTS}
${FLEX_TypeParserScanner_OUTPUTS} Scanner.h TypeParser.h)
target_link_libraries(velox_type_parser velox_common_base)
55 changes: 55 additions & 0 deletions velox/type/parser/Scanner.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cmath>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>

#include "velox/common/base/Exceptions.h"
#include "velox/type/Type.h"

namespace facebook::velox::type {

/// Lexer for Presto type strings, built on top of the flex-generated
/// yyFlexLexer. Besides tokenizing, it carries the two pieces of state the
/// Bison parser needs: the slot that receives the parsed type and the original
/// input text used in error messages.
///
/// The scanner keeps a reference to 'outputType' and a non-owning view of
/// 'input'; it owns neither, so both must stay alive for as long as the
/// scanner is used.
class Scanner : public yyFlexLexer {
 public:
  /// @param arg_yyin Stream the tokens are read from.
  /// @param arg_yyout Stream handed to yyFlexLexer as its output stream.
  /// @param outputType Receives the type passed to setType().
  /// @param input Original type text, returned by input().
  Scanner(
      std::istream& arg_yyin,
      std::ostream& arg_yyout,
      TypePtr& outputType,
      const std::string_view input)
      : yyFlexLexer(&arg_yyin, &arg_yyout),
        outputType_(outputType),
        input_(input) {}

  /// Returns the next token and stores its semantic value, if any, in
  /// 'yylval'. The body is generated by flex (see YY_DECL in TypeParser.ll).
  int lex(Parser::semantic_type* yylval);

  /// Stores the parsed type in the caller-provided output slot.
  void setType(TypePtr type) {
    outputType_ = std::move(type);
  }

  /// Returns the original input so that it can be printed as part of an error
  /// message.
  std::string_view input() const {
    return input_;
  }

 private:
  TypePtr& outputType_;
  const std::string_view input_;
};

} // namespace facebook::velox::type
38 changes: 38 additions & 0 deletions velox/type/parser/TypeParser.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <string>
#include "velox/type/Type.h"

namespace facebook::velox {

/// Parses a type string in Presto format into a Velox type.
///
/// Example type strings:
///   row(col0 bigint, varchar)
///   array(bigint)
///   map(bigint, array(bigint))
///   function(bigint,bigint,bigint)   -- the last type is the return type
///
/// Notes:
///  - Parsing is case-insensitive, i.e. 'Row' and 'row' are equal.
///  - Field names for rows are optional; quoted field names are supported.
///  - All custom types need to be registered. An error is thrown otherwise.
///  - Type names that contain spaces (e.g. 'double precision',
///    'timestamp with time zone') must be explicitly handled in the parser.
///  - Simple type names are converted to Velox types through the getType API.
///
/// @param typeText Type string to parse.
/// @return The parsed type; never null.
/// An error is raised through the VELOX_CHECK / VELOX_FAIL macros when the
/// text cannot be parsed.
TypePtr parseType(const std::string& typeText);

} // namespace facebook::velox
78 changes: 78 additions & 0 deletions velox/type/parser/TypeParser.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
%{
#include <vector>
#include <memory>

#include "velox/type/parser/TypeParser.yy.h" // @manual
#include "velox/type/parser/Scanner.h"
#define YY_DECL int facebook::velox::type::Scanner::lex(facebook::velox::type::Parser::semantic_type *yylval)
%}

%option c++ noyywrap noyylineno nodefault caseless

/* Single-letter helper classes. Inside a bracket expression '|' is a literal
 * character, not alternation, so the previous form [A|a] also matched '|'.
 * The scanner is generated with the 'caseless' option, which already makes
 * every pattern case-insensitive; these names are kept for compatibility. */
A [Aa]
B [Bb]
C [Cc]
D [Dd]
E [Ee]
F [Ff]
G [Gg]
H [Hh]
I [Ii]
J [Jj]
K [Kk]
L [Ll]
M [Mm]
O [Oo]
P [Pp]
R [Rr]
S [Ss]
T [Tt]
U [Uu]
W [Ww]
X [Xx]
Y [Yy]
Z [Zz]

/* Unquoted identifier or type name: letters, digits and underscores.
 * NOTE(review): the '*' quantifier lets WORD match the empty string. It is
 * left unchanged here because with 'nodefault' the empty match is what
 * currently absorbs input that no rule accepts; tighten it to '+' only
 * together with the catch-all rules. */
WORD ([[:alpha:][:alnum:]_]*)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It feels like WORD, since it's used for type names, will need to support whitespace as well. That will make the grammar ambiguous in some cases, but I remember there being a way to specify how such cases should be handled.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

QUOTED_ID will support names with spaces. There is a test as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think my point is that since WORD cannot have spaces, things like TIMESTAMP WITH TIMEZONE will not be parsed as a single WORD, and thus will need special flex/grammar support to understand that these three WORDs together form a type name. And I guess we can't update the grammar as more types are registered/unregistered. For example, if I register a new type named "PEDROs DATA TYPE", the parser won't be able to understand it.

This is ok for now, but we might need to think through it eventually.

Copy link
Collaborator Author

@majetideepak majetideepak Nov 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even Postgres has tokens for types with spaces. https://github.com/postgres/postgres/blob/master/src/backend/parser/gram.y#L14336
The issue with your proposal is how do we decide if TIMESTAMP WITH TIME ZONE VELOX is a new type vs. an error parsing TIMESTAMP WITH TIME ZONE?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The other ambiguity is if ROW(PEDROs DATA TYPE) is Row with field name PEDROs with type name DATA TYPE or a type PEDROs DATA TYPE without a field name.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What Presto Java does is to first check if there's a type that matches the full name ("PEDROs DATA TYPE"), and if not, assume the first token is the name and proceed to try to match the remaining as the type ("DATA TYPE" as the type name and "PEDROs" the field name). Could we mimic that logic here somehow?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Yes we can. We can give precedence between two rules and achieve this. I will implement this in a follow up PR.

/* Field name wrapped in quotes; may contain letters, digits, underscores and
 * whitespace. The opening and the closing delimiter are matched independently
 * against the same class (single or double quote), so they are not required
 * to be the same character. */
QUOTED_ID (['"'][[:alnum:][:space:]_]*['"'])
/* Unsigned integer literal, e.g. a decimal precision/scale or varchar length. */
NUMBER ([[:digit:]]+)
/* ROW and STRUCT are accepted as synonyms. */
ROW (ROW|STRUCT)
/* Types that may carry an optional length argument, e.g. varchar(10). */
VARIABLE (VARCHAR|VARBINARY)
/* Built-in type names made of several words. Words must be separated by
 * exactly one space character. */
TYPE_WITH_SPACES ((DOUBLE[ ]PRECISION)|(TIME[ ]WITH[ ]TIME[ ]ZONE)|(TIMESTAMP[ ]WITH[ ]TIME[ ]ZONE)|(INTERVAL[ ]YEAR[ ]TO[ ]MONTH)|(INTERVAL[ ]DAY[ ]TO[ ]SECOND))

%%

"("                 return Parser::token::LPAREN;
")"                 return Parser::token::RPAREN;
","                 return Parser::token::COMMA;
(ARRAY)             return Parser::token::ARRAY;
(MAP)               return Parser::token::MAP;
(FUNCTION)          return Parser::token::FUNCTION;
(DECIMAL)           return Parser::token::DECIMAL;
{ROW}               return Parser::token::ROW;
{VARIABLE}          { /* Keyword rules come before WORD so that they win ties of equal length. */ yylval->build<std::string>(YYText()); return Parser::token::VARIABLE; }
{NUMBER}            { yylval->build<long long>(folly::to<int>(YYText())); return Parser::token::NUMBER; }
{WORD}              { yylval->build<std::string>(YYText()); return Parser::token::WORD; }
{TYPE_WITH_SPACES}  { /* Longer than any single WORD match, so it wins by longest match. */ yylval->build<std::string>(YYText()); return Parser::token::TYPE_WITH_SPACES; }
{QUOTED_ID}         { /* The quotes are still part of the text; the grammar strips them. */ yylval->build<std::string>(YYText()); return Parser::token::QUOTED_ID; }
<<EOF>>             return Parser::token::YYEOF;
.                   { /* No action on unmatched input: spaces, tabs and any character no rule above accepts are skipped. */ }
\n                  { /* '.' never matches a newline; without this rule a newline is only "matched" by an empty WORD, which consumes no input. */ }

%%

// Definition required for the yyFlexLexer base class. Tokens are produced by
// Scanner::lex() (see YY_DECL above), so reaching this function is a
// programming error and is reported by throwing.
int yyFlexLexer::yylex() {
  const std::string message{"Bad call to yyFlexLexer::yylex()"};
  throw std::runtime_error(message);
}

#include "velox/type/parser/TypeParser.h"

facebook::velox::TypePtr facebook::velox::parseType(const std::string& typeText)
{
std::istringstream is(typeText);
facebook::velox::TypePtr type;
facebook::velox::type::Scanner scanner{is, std::cerr, type, typeText};
facebook::velox::type::Parser parser{ &scanner };
parser.parse();
VELOX_CHECK(type, "Failed to parse type [{}]", typeText);
return type;
}
122 changes: 122 additions & 0 deletions velox/type/parser/TypeParser.yy
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
%{
#include <FlexLexer.h>
#include "velox/common/base/Exceptions.h"
#include "velox/type/Type.h"
%}
// Minimum Bison version and output language of the generated parser.
%require "3.0.4"
%language "C++"

// Generate the parser as class facebook::velox::type::Parser.
%define parser_class_name {Parser}
%define api.namespace {facebook::velox::type}
// Semantic values are stored in a type-safe variant, which allows non-trivial
// C++ types such as std::string and std::shared_ptr as token/rule values.
%define api.value.type variant
// The parser constructor takes the scanner; actions reach it as 'scanner'.
%parse-param {Scanner* scanner}
// Ask Bison for detailed syntax error messages. Note that Parser::error() at
// the end of this file does not include the message it receives in its output.
%define parse.error verbose

%code requires
{
  // This section is emitted into the generated parser header. Include the
  // standard headers whose types are used below and in the semantic value
  // types (%token / %nterm), so that the header does not depend on what its
  // includers happened to include first.
  #include <memory>
  #include <string>
  #include <utility>
  #include <vector>

  // Forward declarations: the full definitions are only needed by the
  // generated implementation file.
  namespace facebook::velox::type {
    class Scanner;
  } // namespace facebook::velox::type
  namespace facebook::velox {
    class Type;
  } // namespace facebook::velox

  // Accumulates the children of a ROW type while its field list is parsed.
  // 'names' and 'types' are parallel vectors; an unnamed field is recorded
  // with an empty name.
  struct RowArguments {
    std::vector<std::string> names;
    std::vector<std::shared_ptr<const facebook::velox::Type>> types;
  };
} // %code requires

%code
{
  // std::transform and std::toupper are used below; include their headers
  // explicitly instead of relying on transitive includes.
  #include <algorithm>
  #include <cctype>
  #include <string>

  #include <velox/type/parser/Scanner.h>
  // Route the parser's token requests to the Scanner passed as parse-param.
  #define yylex(x) scanner->lex(x)
  using namespace facebook::velox;

  // Converts a type name to a Velox type. The name is upper-cased, the
  // aliases INT and DOUBLE PRECISION are mapped to INTEGER and DOUBLE, and
  // the result is looked up through getType(). Fails via VELOX_CHECK when the
  // lookup yields no type.
  TypePtr typeFromString(const std::string& type) {
    auto upper = type;
    // std::toupper requires a value representable as unsigned char, so go
    // through unsigned char instead of passing a possibly negative char.
    std::transform(
        upper.begin(), upper.end(), upper.begin(), [](unsigned char c) {
          return static_cast<char>(std::toupper(c));
        });
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think folly has an in-place upper case method as well

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could not find one. Velox uses boost in other places.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It actually only supports lower case:

https://github.com/facebook/folly/blob/main/folly/String.h#L742

never mind

// Map the accepted aliases to the names used for the lookup below.
if (upper == "INT") {
upper = "INTEGER";
} else if (upper == "DOUBLE PRECISION") {
upper = "DOUBLE";
}
// Look the name up with an empty parameter list.
// NOTE(review): presumably getType() returns null for a name that is not
// registered, which is what the check below reports - confirm.
auto inferredType = getType(upper, {});
VELOX_CHECK(inferredType, "Failed to parse type [{}]. Type not registered.", type);
return inferredType;
}
}

// Punctuation and keywords; these tokens carry no semantic value.
%token LPAREN RPAREN COMMA ARRAY MAP ROW FUNCTION DECIMAL
// Tokens that carry the matched text.
%token <std::string> WORD VARIABLE QUOTED_ID TYPE_WITH_SPACES
// Integer literal.
%token <long long> NUMBER
// End of input.
%token YYEOF 0

// Nonterminals that produce a type.
%nterm <std::shared_ptr<const Type>> type array_type map_type variable_type
// A (field name, type) pair.
%nterm <std::pair<std::string, std::shared_ptr<const Type>>> named_type
%nterm <std::shared_ptr<const Type>> row_type function_type decimal_type simple_type
// Field name with any surrounding quotes removed.
%nterm <std::string> identifier
// Argument types of FUNCTION; the last element is the return type.
%nterm <std::vector<std::shared_ptr<const Type>>> type_list
// Field names and types of ROW.
%nterm <RowArguments> type_list_opt_names

%%

// Start symbol: the whole input is a single type, optionally preceded by a
// name. The result is handed to the scanner via setType(); for a named type
// only the type part is kept and the name is dropped.
type_spec : named_type { scanner->setType($1.second); }
| type { scanner->setType($1); }
| error { yyerrok; }
;

// A type preceded by a field name, e.g. "col0 bigint".
named_type : identifier type { $$ = std::make_pair($1, $2); }
;

// Any supported type; simply forwards the value of the matching alternative.
type : array_type { $$ = $1; }
| map_type { $$ = $1; }
| row_type { $$ = $1; }
| simple_type { $$ = $1; }
| function_type { $$ = $1; }
| variable_type { $$ = $1; }
| decimal_type { $$ = $1; }
;

// A type identified by its name only; the name is converted to a type by
// typeFromString().
simple_type : WORD { $$ = typeFromString($1); }
| TYPE_WITH_SPACES { $$ = typeFromString($1); }
;

// VARCHAR / VARBINARY with an optional length. The length is accepted by the
// grammar but not used: both forms yield typeFromString() of the name alone.
variable_type : VARIABLE LPAREN NUMBER RPAREN { $$ = typeFromString($1); }
| VARIABLE { $$ = typeFromString($1); }
;

// array(<element type>)
array_type : ARRAY LPAREN type RPAREN { $$ = ARRAY($3); }
;

// decimal(<number>, <number>)
// NOTE(review): the two numbers are presumably precision and scale, in that
// order - confirm against the DECIMAL() factory.
decimal_type : DECIMAL LPAREN NUMBER COMMA NUMBER RPAREN { $$ = DECIMAL($3, $5); }
;

// Comma-separated, non-empty list of types.
type_list
    : type
        { $$ = {$1}; }
    | type_list COMMA type
        { $$ = std::move($1); $$.push_back($3); }
    ;

// Comma-separated, non-empty list of row fields. Each field is either a bare
// type, recorded with an empty name, or a named type.
type_list_opt_names
    : type
        { $$.names.emplace_back(); $$.types.push_back($1); }
    | named_type
        { $$.names.push_back($1.first); $$.types.push_back($1.second); }
    | type_list_opt_names COMMA type
        { $$ = std::move($1); $$.names.emplace_back(); $$.types.push_back($3); }
    | type_list_opt_names COMMA named_type
        { $$ = std::move($1); $$.names.push_back($3.first); $$.types.push_back($3.second); }
    ;

// row(<field>, ...) where each field is a type with an optional name. The
// collected names and types are moved into the ROW() call.
row_type : ROW LPAREN type_list_opt_names RPAREN { $$ = ROW(std::move($3.names), std::move($3.types)); }
;

// map(<key type>, <value type>)
map_type : MAP LPAREN type COMMA type RPAREN { $$ = MAP($3, $5); }
;

// function(<arg type>, ..., <return type>): the last type of the list is the
// return type and the preceding ones are the argument types. The grammar
// guarantees that the list has at least one element, so back() is safe.
function_type : FUNCTION LPAREN type_list RPAREN { auto returnType = $3.back(); $3.pop_back();
$$ = FUNCTION(std::move($3), returnType); }
;

// Field name. A quoted identifier arrives with its delimiters (the scanner
// pattern guarantees one character on each side), which are stripped here; an
// unquoted WORD is used as is.
identifier : QUOTED_ID { $1.erase(0, 1); $1.pop_back(); $$ = $1; } // Remove the quotes.
| WORD { $$ = $1; }
;

%%

void facebook::velox::type::Parser::error(const std::string& msg) {
VELOX_FAIL("Failed to parse type [{}]", scanner->input());
}
20 changes: 20 additions & 0 deletions velox/type/parser/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Unit tests for the type parser.
add_executable(velox_type_parser_test TypeParserTest.cpp)

# Register the executable with CTest.
add_test(NAME velox_type_parser_test COMMAND velox_type_parser_test)

# Link the parser under test, the Velox type library and GoogleTest/GoogleMock;
# gtest_main is linked as well.
target_link_libraries(velox_type_parser_test velox_type_parser velox_type gtest
gtest_main gmock)
Loading