facebookincubator · majetideepak · Nov 9, 2023 · Nov 15, 2023 · Nov 15, 2023 · Nov 16, 2023
@@ -15,6 +15,7 @@ if(${VELOX_BUILD_TESTING})
   add_subdirectory(tests)
 endif()
 
+add_subdirectory(parser)
 add_subdirectory(tz)
 add_subdirectory(fbhive)
 

@@ -0,0 +1,34 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(${VELOX_BUILD_TESTING})
+  add_subdirectory(tests)
+endif()
+
+bison_target(
+  TypeParser TypeParser.yy ${CMAKE_CURRENT_BINARY_DIR}/TypeParser.yy.cc
+  DEFINES_FILE ${CMAKE_CURRENT_BINARY_DIR}/TypeParser.yy.h)
+
+flex_target(
+  TypeParserScanner TypeParser.ll ${CMAKE_CURRENT_BINARY_DIR}/Scanner.cpp
+  COMPILE_FLAGS "-Cf --prefix=veloxtp")
+
+add_flex_bison_dependency(TypeParserScanner TypeParser)
+
+include_directories(${PROJECT_BINARY_DIR})
+include_directories(${FLEX_INCLUDE_DIRS})
+add_library(
+  velox_type_parser ${BISON_TypeParser_OUTPUTS}
+                    ${FLEX_TypeParserScanner_OUTPUTS} Scanner.h TypeParser.h)
+target_link_libraries(velox_type_parser velox_common_base)
diff --git a/velox/type/parser/Scanner.h b/velox/type/parser/Scanner.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cmath>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+
+#include "velox/common/base/Exceptions.h"
+#include "velox/type/Type.h"
+
+namespace facebook::velox::type {
+
+class Scanner : public yyFlexLexer {
+ public:
+  Scanner(
+      std::istream& arg_yyin,
+      std::ostream& arg_yyout,
+      TypePtr& outputType,
+      const std::string_view input)
+      : yyFlexLexer(&arg_yyin, &arg_yyout),
+        outputType_(outputType),
+        input_(input){};
+  int lex(Parser::semantic_type* yylval);
+
+  void setType(TypePtr type) {
+    outputType_ = std::move(type);
+  }
+
+  // Store input to print it as part of the error message.
+  std::string_view input() {
+    return input_;
+  }
+
+ private:
+  TypePtr& outputType_;
+  const std::string_view input_;
+};
+
+} // namespace facebook::velox::type
diff --git a/velox/type/parser/TypeParser.h b/velox/type/parser/TypeParser.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <string>
+#include "velox/type/Type.h"
+
+namespace facebook::velox {
+
+/// Parses a type string in Presto format to Velox type.
+/// Example type strings:
+///    row(col0 bigint, varchar)
+///    array(bigint)
+///    map(bigint, array(bigint))
+///    function(bigint,bigint,bigint)
+/// The parsing is case-insensitive. i.e. 'Row' and 'row' are equal.
+/// Field names for rows are optional.
+/// Quoted field names are supported.
+/// All custom types need to be registered. An error is thrown otherwise.
+/// Types with spaces must be explicitly handled in the parser.
+/// Uses the Type::getType API to convert a string to Velox type.
+TypePtr parseType(const std::string& typeText);
+
+} // namespace facebook::velox
diff --git a/velox/type/parser/TypeParser.ll b/velox/type/parser/TypeParser.ll
@@ -0,0 +1,78 @@
+%{
+#include <vector>
+#include <memory>
+
+#include "velox/type/parser/TypeParser.yy.h"  // @manual
+#include "velox/type/parser/Scanner.h"
+#define YY_DECL int facebook::velox::type::Scanner::lex(facebook::velox::type::Parser::semantic_type *yylval)
+%}
+
+%option c++ noyywrap noyylineno nodefault caseless
+
+A   [A|a]
+B   [B|b]
+C   [C|c]
+D   [D|d]
+E   [E|e]
+F   [F|f]
+G   [G|g]
+H   [H|h]
+I   [I|i]
+J   [J|j]
+K   [K|k]
+L   [L|l]
+M   [M|m]
+O   [O|o]
+P   [P|p]
+R   [R|r]
+S   [S|s]
+T   [T|t]
+U   [U|u]
+W   [W|w]
+X   [X|x]
+Y   [Y|y]
+Z   [Z|z]
+
+WORD              ([[:alpha:][:alnum:]_]*)
+QUOTED_ID         (['"'][[:alnum:][:space:]_]*['"'])
+NUMBER            ([[:digit:]]+)
+ROW               (ROW|STRUCT)
+VARIABLE          (VARCHAR|VARBINARY)
+TYPE_WITH_SPACES  ((DOUBLE[ ]PRECISION)|(TIME[ ]WITH[ ]TIME[ ]ZONE)|(TIMESTAMP[ ]WITH[ ]TIME[ ]ZONE)|(INTERVAL[ ]YEAR[ ]TO[ ]MONTH)|(INTERVAL[ ]DAY[ ]TO[ ]SECOND))
+
+%%
+
+"("                return Parser::token::LPAREN;
+")"                return Parser::token::RPAREN;
+","                return Parser::token::COMMA;
+(ARRAY)            return Parser::token::ARRAY;
+(MAP)              return Parser::token::MAP;
+(FUNCTION)         return Parser::token::FUNCTION;
+(DECIMAL)          return Parser::token::DECIMAL;
+{ROW}              return Parser::token::ROW;
+{VARIABLE}         yylval->build<std::string>(YYText()); return Parser::token::VARIABLE;
+{NUMBER}           yylval->build<long long>(folly::to<int>(YYText())); return Parser::token::NUMBER;
+{WORD}             yylval->build<std::string>(YYText()); return Parser::token::WORD;
+{TYPE_WITH_SPACES} yylval->build<std::string>(YYText()); return Parser::token::TYPE_WITH_SPACES;
+{QUOTED_ID}        yylval->build<std::string>(YYText()); return Parser::token::QUOTED_ID;
+<<EOF>>            return Parser::token::YYEOF;
+.               /* no action on unmatched input */
+
+%%
+
+int yyFlexLexer::yylex() {
+    throw std::runtime_error("Bad call to yyFlexLexer::yylex()");
+}
+
+#include "velox/type/parser/TypeParser.h"
+
+facebook::velox::TypePtr facebook::velox::parseType(const std::string& typeText)
+ {
+    std::istringstream is(typeText);
+    facebook::velox::TypePtr type;
+    facebook::velox::type::Scanner scanner{is, std::cerr, type, typeText};
+    facebook::velox::type::Parser parser{ &scanner };
+    parser.parse();
+    VELOX_CHECK(type, "Failed to parse type [{}]", typeText);
+    return type;
+}
diff --git a/velox/type/parser/TypeParser.yy b/velox/type/parser/TypeParser.yy
@@ -0,0 +1,122 @@
+%{
+#include <FlexLexer.h>
+#include "velox/common/base/Exceptions.h"
+#include "velox/type/Type.h"
+%}
+%require "3.0.4"
+%language "C++"
+
+%define parser_class_name {Parser}
+%define api.namespace {facebook::velox::type}
+%define api.value.type variant
+%parse-param {Scanner* scanner}
+%define parse.error verbose
+
+%code requires
+{
+    namespace facebook::velox::type {
+        class Scanner;
+    } // namespace facebook::velox::type
+    namespace facebook::velox {
+        class Type;
+    } // namespace facebook::velox
+    struct RowArguments {
+       std::vector<std::string> names;
+       std::vector<std::shared_ptr<const facebook::velox::Type>> types;
+    };
+} // %code requires
+
+%code
+{
+    #include <velox/type/parser/Scanner.h>
+    #define yylex(x) scanner->lex(x)
+    using namespace facebook::velox;
+    TypePtr typeFromString(const std::string& type) {
+        auto upper = type;
+        std::transform(upper.begin(), upper.end(), upper.begin(), ::toupper);
+        if (upper == "INT") {
+            upper = "INTEGER";
+        } else if (upper == "DOUBLE PRECISION") {
+            upper = "DOUBLE";
+        }
+        auto inferredType = getType(upper, {});
+        VELOX_CHECK(inferredType, "Failed to parse type [{}]. Type not registered.", type);
+        return inferredType;
+    }
+}
+
+%token               LPAREN RPAREN COMMA ARRAY MAP ROW FUNCTION DECIMAL
+%token <std::string> WORD VARIABLE QUOTED_ID TYPE_WITH_SPACES
+%token <long long>   NUMBER
+%token YYEOF         0
+
+%nterm <std::shared_ptr<const Type>> type array_type map_type variable_type
+%nterm <std::pair<std::string, std::shared_ptr<const Type>>> named_type
+%nterm <std::shared_ptr<const Type>> row_type function_type decimal_type simple_type
+%nterm <std::string> identifier
+%nterm <std::vector<std::shared_ptr<const Type>>> type_list
+%nterm <RowArguments> type_list_opt_names
+
+%%
+
+type_spec : named_type           { scanner->setType($1.second); }
+          | type                 { scanner->setType($1); }
+          | error                { yyerrok; }
+          ;
+
+named_type :  identifier type       { $$ = std::make_pair($1, $2); }
+           ;
+
+type : array_type                  { $$ = $1; }
+     | map_type                    { $$ = $1; }
+     | row_type                    { $$ = $1; }
+     | simple_type                 { $$ = $1; }
+     | function_type               { $$ = $1; }
+     | variable_type               { $$ = $1; }
+     | decimal_type                { $$ = $1; }
+     ;
+
+simple_type : WORD                { $$ = typeFromString($1); }
+            | TYPE_WITH_SPACES    { $$ = typeFromString($1); }
+            ;
+
+variable_type : VARIABLE LPAREN NUMBER RPAREN  { $$ = typeFromString($1); }
+              | VARIABLE                       { $$ = typeFromString($1); }
+              ;
+
+array_type : ARRAY LPAREN type RPAREN { $$ = ARRAY($3); }
+           ;
+
+decimal_type : DECIMAL LPAREN NUMBER COMMA NUMBER RPAREN { $$ = DECIMAL($3, $5); }
+             ;
+
+type_list : type                   { $$.push_back($1); }
+          | type_list COMMA type   { $1.push_back($3); $$ = std::move($1); }
+          ;
+
+type_list_opt_names : type                                 { $$.names.push_back(""); $$.types.push_back($1); }
+                    | named_type                           { $$.names.push_back($1.first); $$.types.push_back($1.second); }
+                    | type_list_opt_names COMMA type       { $1.names.push_back(""); $1.types.push_back($3);
+                                                             $$.names = std::move($1.names); $$.types = std::move($1.types); }
+                    | type_list_opt_names COMMA named_type { $1.names.push_back($3.first); $1.types.push_back($3.second);
+                                                             $$.names = std::move($1.names); $$.types = std::move($1.types); }
+                    ;
+
+row_type : ROW LPAREN type_list_opt_names RPAREN  { $$ = ROW(std::move($3.names), std::move($3.types)); }
+         ;
+
+map_type : MAP LPAREN type COMMA type RPAREN { $$ = MAP($3, $5); }
+         ;
+
+function_type : FUNCTION LPAREN type_list RPAREN { auto returnType = $3.back(); $3.pop_back();
+                                                   $$ = FUNCTION(std::move($3), returnType); }
+
+identifier : QUOTED_ID { $1.erase(0, 1); $1.pop_back(); $$ = $1; } // Remove the quotes.
+           | WORD     { $$ = $1; }
+           ;
+
+%%
+
+void facebook::velox::type::Parser::error(const std::string& msg) {
+    VELOX_FAIL("Failed to parse type [{}]", scanner->input());
+}
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_executable(velox_type_parser_test TypeParserTest.cpp)
+
+add_test(NAME velox_type_parser_test COMMAND velox_type_parser_test)
+
+target_link_libraries(velox_type_parser_test velox_type_parser velox_type gtest
+                      gtest_main gmock)