diff --git a/core/BUILD b/core/BUILD
index aea6b5f6e..6e2e13215 100644
--- a/core/BUILD
+++ b/core/BUILD
@@ -1,12 +1,39 @@
 package(default_visibility = ["//visibility:public"])
 
+cc_library(
+    name = "common",
+    hdrs = [
+        "unicode.h",
+        "static_error.h",
+    ],
+    includes = ["."],
+)
+
+cc_library(
+    name = "lexer",
+    srcs = ["lexer.cpp"],
+    hdrs = ["lexer.h"],
+    deps = [
+        ":common",
+    ],
+    includes = ["."],
+)
+
+cc_test(
+    name = "lexer_test",
+    srcs = ["lexer_test.cc"],
+    deps = [
+        ":lexer",
+        "//external:gtest_main",
+    ],
+)
+
 cc_library(
     name = "jsonnet-common",
     srcs = [
         "desugarer.cpp",
         "formatter.cpp",
         "libjsonnet.cpp",
-        "lexer.cpp",
         "parser.cpp",
         "static_analysis.cpp",
         "string_utils.cpp",
@@ -17,16 +44,15 @@ cc_library(
         "ast.h",
         "desugarer.h",
         "formatter.h",
-        "lexer.h",
         "parser.h",
         "state.h",
         "static_analysis.h",
-        "static_error.h",
         "string_utils.h",
-        "unicode.h",
         "vm.h",
     ],
     deps = [
+        ":common",
+        ":lexer",
         "//include:libjsonnet",
     ],
     linkopts = ["-lm"],
diff --git a/core/lexer.h b/core/lexer.h
index 5ee2ccdd9..a3246d187 100644
--- a/core/lexer.h
+++ b/core/lexer.h
@@ -136,7 +136,7 @@ struct Token {
     std::string stringBlockIndent;
 
     /** If kind == STRING_BLOCK then stores the sequence of whitespace that indented the end of
-     * the block. 
+     * the block.
      *
      * This is always fewer whitespace characters than in stringBlockIndent.
      */
@@ -153,7 +153,8 @@ struct Token {
           stringBlockTermIndent(string_block_term_indent), location(location)
     { }
 
-    //Token(Kind kind, const std::string &data="") : kind(kind), data(data) { }
+    Token(Kind kind, const std::string &data="")
+        : kind(kind), data(data) { }
 
     static const char *toString(Kind v)
     {
@@ -204,7 +205,7 @@ struct Token {
 };
 
 /** The result of lexing.
- * 
+ *
  * Because of the EOF token, this will always contain at least one token. So element 0 can be used
  * to get the filename.
  */
diff --git a/core/lexer_test.cc b/core/lexer_test.cc
new file mode 100644
index 000000000..c490bf43a
--- /dev/null
+++ b/core/lexer_test.cc
@@ -0,0 +1,284 @@
+// Copyright 2016 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lexer.h"
+
+#include <list>
+#include "gtest/gtest.h"
+
+namespace {
+
+void TestLex(const char* name,
+             const char* input,
+             const std::list<Token>& tokens,
+             const std::string& error) {
+  std::list<Token> test_tokens(tokens);
+  test_tokens.push_back(Token(Token::Kind::END_OF_FILE, ""));
+
+  try {
+    std::list<Token> lexed_tokens = jsonnet_lex(name, input);
+    ASSERT_EQ(test_tokens, lexed_tokens)
+        << "Test failed: " << name << std::endl;
+  } catch (StaticError& e) {
+    ASSERT_EQ(error, e.ToString());
+  }
+}
+
+TEST(Lexer, TestWhitespace) {
+  TestLex("empty", "", {}, "");
+  TestLex("whitespace", " \t\n\r\r\n", {}, "");
+}
+
+TEST(Lexer, TestOperators) {
+  TestLex("brace L", "{", {Token(Token::Kind::BRACE_L, "")}, "");
+  TestLex("brace R", "}", {Token(Token::Kind::BRACE_R, "")}, "");
+  TestLex("bracket L", "[", {Token(Token::Kind::BRACKET_L, "")}, "");
+  TestLex("bracket R", "]", {Token(Token::Kind::BRACKET_R, "")}, "");
+  TestLex("colon ", ":", {Token(Token::Kind::OPERATOR, ":")}, "");
+  TestLex("colon 2", "::", {Token(Token::Kind::OPERATOR, "::")}, "");
+  TestLex("colon 3", ":::", {Token(Token::Kind::OPERATOR, ":::")}, "");
+  TestLex("arrow right", "->", {Token(Token::Kind::OPERATOR, "->")}, "");
+  TestLex("less than minus", "<-",
+          {Token(Token::Kind::OPERATOR, "<"),
+           Token(Token::Kind::OPERATOR, "-")}, "");
+  TestLex("comma", ",", {Token(Token::Kind::COMMA, "")}, "");
+  TestLex("dollar", "$", {Token(Token::Kind::DOLLAR, "")}, "");
+  TestLex("dot", ".", {Token(Token::Kind::DOT, "")}, "");
+  TestLex("paren L", "(", {Token(Token::Kind::PAREN_L, "")}, "");
+  TestLex("paren R", ")", {Token(Token::Kind::PAREN_R, "")}, "");
+  TestLex("semicolon", ";", {Token(Token::Kind::SEMICOLON, "")}, "");
+
+  TestLex("not 1", "!", {Token(Token::Kind::OPERATOR, "!")}, "");
", {Token(Token::Kind::OPERATOR, "!")}, ""); + TestLex("not equal", "!=", {Token(Token::Kind::OPERATOR, "!=")}, ""); + TestLex("tilde", "~", {Token(Token::Kind::OPERATOR, "~")}, ""); + TestLex("plus", "+", {Token(Token::Kind::OPERATOR, "+")}, ""); + TestLex("minus", "-", {Token(Token::Kind::OPERATOR, "-")}, ""); +} + +TEST(Lexer, TestMiscOperators) { + TestLex("op *", "*", {Token(Token::Kind::OPERATOR, "*")}, ""); + TestLex("op /", "/", {Token(Token::Kind::OPERATOR, "/")}, ""); + TestLex("op %", "%", {Token(Token::Kind::OPERATOR, "%")}, ""); + TestLex("op &", "&", {Token(Token::Kind::OPERATOR, "&")}, ""); + TestLex("op |", "|", {Token(Token::Kind::OPERATOR, "|")}, ""); + TestLex("op ^", "^", {Token(Token::Kind::OPERATOR, "^")}, ""); + TestLex("op =", "=", {Token(Token::Kind::OPERATOR, "=")}, ""); + TestLex("op <", "<", {Token(Token::Kind::OPERATOR, "<")}, ""); + TestLex("op >", ">", {Token(Token::Kind::OPERATOR, ">")}, ""); + TestLex("op >==|", ">==|", {Token(Token::Kind::OPERATOR, ">==|")}, ""); +} + +TEST(Lexer, TestNumbers) { + TestLex("number 0", "0", {Token(Token::Kind::NUMBER, "0")}, ""); + TestLex("number 1", "1", {Token(Token::Kind::NUMBER, "1")}, ""); + TestLex("number 1.0", "1.0", {Token(Token::Kind::NUMBER, "1.0")}, ""); + TestLex("number 0.10", "0.10", {Token(Token::Kind::NUMBER, "0.10")}, ""); + TestLex("number 0e100", "0e100", {Token(Token::Kind::NUMBER, "0e100")}, ""); + TestLex("number 1e100", "1e100", {Token(Token::Kind::NUMBER, "1e100")}, ""); + TestLex("number 1.1e100", "1.1e100", + {Token(Token::Kind::NUMBER, "1.1e100")}, ""); + TestLex("number 1.1e-100", "1.1e-100", + {Token(Token::Kind::NUMBER, "1.1e-100")}, ""); + TestLex("number 1.1e+100", "1.1e+100", + {Token(Token::Kind::NUMBER, "1.1e+100")}, ""); + TestLex("number 0100", "0100", + {Token(Token::Kind::NUMBER, "0"), Token(Token::Kind::NUMBER, "100")}, + ""); + TestLex("number 10+10", "10+10", + {Token(Token::Kind::NUMBER, "10"), + Token(Token::Kind::OPERATOR, "+"), + Token(Token::Kind::NUMBER, "10")}, ""); + TestLex("number 1.+3", "1.+3", {}, + "number 1.+3:1:1: Couldn't lex number, junk after decimal point: +"); + TestLex("number 1e!", "1e!", {}, + "number 1e!:1:1: Couldn't lex number, junk after 'E': !"); + TestLex("number 1e+!", "1e+!", {}, + "number 1e+!:1:1: Couldn't lex number, junk after exponent sign: !"); +} + +TEST(Lexer, TestDoubleStrings) { + TestLex("double string \"hi\"", + "\"hi\"", {Token(Token::Kind::STRING_DOUBLE, "hi")}, ""); + TestLex("double string \"hi nl\"", + "\"hi\n\"", {Token(Token::Kind::STRING_DOUBLE, "hi\n")}, ""); + TestLex("double string \"hi\\\"\"", + "\"hi\\\"\"", {Token(Token::Kind::STRING_DOUBLE, "hi\\\"")}, ""); + TestLex("double string \"hi\\nl\"", + "\"hi\\\n\"", {Token(Token::Kind::STRING_DOUBLE, "hi\\\n")}, ""); + TestLex("double string \"hi", + "\"hi", {}, "double string \"hi:1:1: Unterminated string"); +} + +TEST(Lexer, TestSingleStrings) { + TestLex("single string 'hi'", + "'hi'", {Token(Token::Kind::STRING_SINGLE, "hi")}, ""); + TestLex("single string 'hi nl'", + "'hi\n'", {Token(Token::Kind::STRING_SINGLE, "hi\n")}, ""); + TestLex("single string 'hi\\''", + "'hi\\''", {Token(Token::Kind::STRING_SINGLE, "hi\\'")}, ""); + TestLex("single string 'hi\\nl'", + "'hi\\\n'", {Token(Token::Kind::STRING_SINGLE, "hi\\\n")}, ""); + TestLex("single string 'hi", + "'hi", {}, "single string 'hi:1:1: Unterminated string"); +} + +TEST(Lexer, TestBlockStringSpaces) { + const char str[] = + "|||\n" + " test\n" + " more\n" + " |||\n" + " foo\n" + "|||"; + const Token token = Token( + 
+      Token::Kind::STRING_BLOCK,
+      {},
+      "test\n  more\n|||\n  foo\n",
+      "  ",
+      "",
+      {});
+  TestLex("block string spaces", str, {token}, "");
+}
+
+TEST(Lexer, TestBlockStringTabs) {
+  const char str[] =
+      "|||\n"
+      "\ttest\n"
+      "\t  more\n"
+      "\t|||\n"
+      "\t  foo\n"
+      "|||";
+  const Token token = Token(
+      Token::Kind::STRING_BLOCK,
+      {},
+      "test\n  more\n|||\n  foo\n",
+      "\t",
+      "",
+      {});
+  TestLex("block string tabs", str, {token}, "");
+}
+
+TEST(Lexer, TestBlockStringsMixed) {
+  const char str[] =
+      "|||\n"
+      "\t \ttest\n"
+      "\t \t  more\n"
+      "\t \t|||\n"
+      "\t \t  foo\n"
+      "|||";
+  const Token token = Token(
+      Token::Kind::STRING_BLOCK,
+      {},
+      "test\n  more\n|||\n  foo\n",
+      "\t \t",
+      "",
+      {});
+  TestLex("block string mixed", str, {token}, "");
+}
+
+TEST(Lexer, TestBlockStringBlanks) {
+  const char str[] =
+      "|||\n\n"
+      "  test\n\n\n"
+      "    more\n"
+      "  |||\n"
+      "    foo\n"
+      "|||";
+  const Token token = Token(
+      Token::Kind::STRING_BLOCK,
+      {},
+      "\ntest\n\n\n  more\n|||\n  foo\n",
+      "  ",
+      "",
+      {});
+  TestLex("block string blanks", str, {token}, "");
+}
+
+TEST(Lexer, TestBlockStringBadIndent) {
+  const char str[] =
+      "|||\n"
+      "  test\n"
+      " foo\n"
+      "|||";
+  TestLex("block string bad indent", str, {},
+          "block string bad indent:1:1: Text block not terminated with |||");
+}
+
+TEST(Lexer, TestBlockStringEof) {
+  const char str[] =
+      "|||\n"
+      "  test";
+  TestLex("block string eof", str, {}, "block string eof:1:1: Unexpected EOF");
+}
+
+TEST(Lexer, TestBlockStringNotTerm) {
+  const char str[] =
+      "|||\n"
+      "  test\n";
+  TestLex("block string not term", str, {},
+          "block string not term:1:1: Text block not terminated with |||");
+}
+
+TEST(Lexer, TestBlockStringNoWs) {
+  const char str[] =
+      "|||\n"
+      "test\n"
+      "|||";
+  TestLex("block string no ws", str, {},
+          "block string no ws:1:1: Text block's first line must start with"
+          " whitespace.");
+}
+
+TEST(Lexer, TestKeywords) {
+  TestLex("assert", "assert", {Token(Token::Kind::ASSERT, "assert")}, "");
+  TestLex("else", "else", {Token(Token::Kind::ELSE, "else")}, "");
+  TestLex("error", "error", {Token(Token::Kind::ERROR, "error")}, "");
+  TestLex("false", "false", {Token(Token::Kind::FALSE, "false")}, "");
+  TestLex("for", "for", {Token(Token::Kind::FOR, "for")}, "");
+  TestLex("function", "function",
+          {Token(Token::Kind::FUNCTION, "function")}, "");
+  TestLex("if", "if", {Token(Token::Kind::IF, "if")}, "");
+  TestLex("import", "import", {Token(Token::Kind::IMPORT, "import")}, "");
+  TestLex("importstr", "importstr",
+          {Token(Token::Kind::IMPORTSTR, "importstr")}, "");
+  TestLex("in", "in", {Token(Token::Kind::IN, "in")}, "");
+  TestLex("local", "local", {Token(Token::Kind::LOCAL, "local")}, "");
+  TestLex("null", "null", {Token(Token::Kind::NULL_LIT, "null")}, "");
+  TestLex("self", "self", {Token(Token::Kind::SELF, "self")}, "");
+  TestLex("super", "super", {Token(Token::Kind::SUPER, "super")}, "");
+  TestLex("tailstrict", "tailstrict",
+          {Token(Token::Kind::TAILSTRICT, "tailstrict")}, "");
+  TestLex("then", "then", {Token(Token::Kind::THEN, "then")}, "");
+  TestLex("true", "true", {Token(Token::Kind::TRUE, "true")}, "");
+}
+
+TEST(Lexer, TestIdentifier) {
+  TestLex("identifier", "foobar123",
+          {Token(Token::Kind::IDENTIFIER, "foobar123")}, "");
+  TestLex("identifier", "foo bar123",
+          {Token(Token::Kind::IDENTIFIER, "foo"),
+           Token(Token::Kind::IDENTIFIER, "bar123")}, "");
+}
+
+TEST(Lexer, TestComments) {
+  // TODO(dzc): Test does not look at fodder yet.
+ TestLex("c++ comment", "// hi", {}, ""); + TestLex("hash comment", "# hi", {}, ""); + TestLex("c comment", "/* hi */", {}, ""); + TestLex("c comment no term", "/* hi", {}, + "c comment no term:1:1: Multi-line comment has no terminating */."); +} + +} // namespace diff --git a/core/static_error.h b/core/static_error.h index 72a28832d..7da7b880e 100644 --- a/core/static_error.h +++ b/core/static_error.h @@ -17,6 +17,9 @@ limitations under the License. #ifndef JSONNET_STATIC_ERROR_H #define JSONNET_STATIC_ERROR_H +#include +#include + struct Location { unsigned long line; unsigned long column; @@ -91,14 +94,21 @@ struct StaticError { : location(location), msg(msg) { } + + std::string ToString() const + { + std::stringstream ss; + if (location.isSet()) { + ss << location << ":"; + } + ss << " " << msg; + return ss.str(); + } }; static inline std::ostream &operator<<(std::ostream &o, const StaticError &err) { - if (err.location.isSet()) { - o << err.location << ":"; - } - o << " " << err.msg; + o << err.ToString(); return o; }