diff --git a/CHANGES.md b/CHANGES.md index bef1adab..e39183e0 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -47,6 +47,8 @@ Unreleased - `SchemaVisitor.(on_field_definition|on_argument_definition|on_input_field_definition)` have become `SchemaVisitor.(on_field|on_argument|on_input_field)` to maintain consistency with other methods. - Do not expose `GraphQLExtension` on `py_gql`. +- Fix Lexer greediness. Some edge cases were not handled as expected. This commit adds test cases from the 2 RFCs clarifying the expected behaviour ([graphql/graphql-spec#601](https://github.com/graphql/graphql-spec/pull/601), [graphql/graphql-spec#599](https://github.com/graphql/graphql-spec/pull/599)) and updates the Lexer to match. This is _technically_ a breaking change but most cases were likely to lead to validation errors (e.g. "0xF1" being parsed as [0, xF1] when expecting a list of integers). + [0.4.0](https://github.com/lirsacc/py-gql/releases/tag/0.4.0) - 2019-10-10 -------------------------------------------------------------------------- diff --git a/py_gql/lang/lexer.py b/py_gql/lang/lexer.py index 6cc4d817..4a4b145f 100755 --- a/py_gql/lang/lexer.py +++ b/py_gql/lang/lexer.py @@ -15,7 +15,6 @@ from .._string_utils import ensure_unicode, parse_block_string from ..exc import ( - GraphQLSyntaxError, InvalidCharacter, InvalidEscapeSequence, NonTerminatedString, @@ -47,12 +46,14 @@ Token, ) -EOL_CHARS = frozenset([0x000A, 0x000D]) # "\n" # "\r" +EOL_CHARS = [0x000A, 0x000D] # "\n" # "\r" -IGNORED_CHARS = ( - frozenset([0xFEFF, 0x0009, 0x0020, 0x002C]) - | EOL_CHARS # BOM # \t # SPACE # , -) +IGNORED_CHARS = [ # BOM # \t # SPACE # , + 0xFEFF, + 0x0009, + 0x0020, + 0x002C, +] + EOL_CHARS SYMBOLS = { cls.value: cls @@ -86,14 +87,27 @@ def _unexpected( - expected: str, char: str, position: int, source: str -) -> GraphQLSyntaxError: + char: Optional[str], + position: int, + source: str, + expected: Optional[str] = None, +) -> Union[UnexpectedEOF, UnexpectedCharacter]: if char is None: return UnexpectedEOF(position - 1, source) - else: + elif expected is not None: return UnexpectedCharacter( 'Expected "%s" but found "%s"' % (expected, char), position, source ) + else: + return UnexpectedCharacter( + 'Unexpected character "%s"' % char, position, source + ) + + +def _is_name_start(code): + return ( + code == 0x005F or 0x0041 <= code <= 0x005A or 0x0061 <= code <= 0x007A + ) class Lexer: @@ -165,9 +179,7 @@ def _read_ellipsis(self) -> Ellip: char = self._peek() self._position += 1 if char != ".": - raise _unexpected( - ".", cast(str, char), self._position, self._source - ) + raise _unexpected(char, self._position, self._source, ".") return Ellip(start, self._position) def _read_string(self) -> String: @@ -296,6 +308,17 @@ def _read_number(self) -> Union[Integer, Float]: self._read_over_integer() + # Explicit lookahead restrictions. + next_char = self._peek() + if next_char is not None: + next_code = ord(next_char) + if _is_name_start(next_code): + raise UnexpectedCharacter( + 'Unexpected character "%s"' % char, + self._position, + self._source, + ) + end = self._position value = self._source[start:end] return ( @@ -312,7 +335,7 @@ def _read_over_integer(self): if code == 0x0030: # "0" self._position += 1 char = self._peek() - if char is not None and ord(char) == 0x0030: + if char is not None and (0x0030 <= ord(char) <= 0x0039): raise UnexpectedCharacter( 'Unexpected character "%s"' % char, self._position, @@ -406,11 +429,7 @@ def __next__(self) -> Token: return self._read_string() elif code == 0x002D or 0x0030 <= code <= 0x0039: return self._read_number() - elif ( - code == 0x005F - or 0x0041 <= code <= 0x005A - or 0x0061 <= code <= 0x007A - ): + elif _is_name_start(code): return self._read_name() else: raise UnexpectedCharacter( diff --git a/tests/test_lang/test_lexer.py b/tests/test_lang/test_lexer.py index fc1ba37d..ad430c56 100644 --- a/tests/test_lang/test_lexer.py +++ b/tests/test_lang/test_lexer.py @@ -19,7 +19,9 @@ def lex_one(source): lexer = Lexer(source) assert type(next(lexer)) == token.SOF - return next(lexer) + val = next(lexer) + assert type(next(lexer)) == token.EOF + return val def test_it_disallows_uncommon_control_characters(): @@ -101,6 +103,20 @@ def test_errors_respect_whitespace(): ) +@pytest.mark.parametrize( + "value,expected", + [ + ("abc", token.Name(0, 3, "abc")), + ("_abc", token.Name(0, 4, "_abc")), + ("abc_", token.Name(0, 4, "abc_")), + ("abc123", token.Name(0, 6, "abc123")), + ("abc_123", token.Name(0, 7, "abc_123")), + ], +) +def test_it_lexes_name(value, expected): + assert lex_one(value) == expected + + @pytest.mark.parametrize( "value,expected", [ @@ -116,6 +132,7 @@ def test_errors_respect_whitespace(): '"unicode \\u1234\\u5678\\u90AB\\uCDEF"', token.String(0, 34, "unicode \u1234\u5678\u90AB\uCDEF"), ), + ('""', token.String(0, 2, "")), ], ) def test_it_lexes_strings(value, expected): @@ -126,6 +143,8 @@ def test_it_lexes_strings(value, expected): "value, err_cls, expected_positon", [ ('"', NonTerminatedString, 1), + ('"""', NonTerminatedString, 3), + ('""""', NonTerminatedString, 4), ('"no end quote', NonTerminatedString, 13), ("'single quotes'", UnexpectedCharacter, 0), ('"contains unescaped \u0007 control char"', InvalidCharacter, 20), @@ -154,6 +173,7 @@ def test_it_lex_reports_useful_string_errors(value, err_cls, expected_positon): "value, expected", [ ('"""simple"""', token.BlockString(0, 12, "simple")), + ('""""""', token.BlockString(0, 6, "")), ('""" white space """', token.BlockString(0, 19, " white space ")), ( '"""contains " quote"""', @@ -223,6 +243,7 @@ def test_it_lex_reports_useful_block_string_errors( ("123E4", token.Float(0, 5, "123E4")), ("123e-4", token.Float(0, 6, "123e-4")), ("123e+4", token.Float(0, 6, "123e+4")), + ("1.2e3", token.Float(0, 5, "1.2e3")), ("-123e4", token.Float(0, 6, "-123e4")), ("-123E4", token.Float(0, 6, "-123E4")), ("-123e-4", token.Float(0, 7, "-123e-4")), @@ -238,6 +259,7 @@ def test_it_lexes_numbers(string, expected): "value, err_cls, expected_positon", [ ("00", UnexpectedCharacter, 1), + ("01", UnexpectedCharacter, 1), ("+1", UnexpectedCharacter, 0), ("1.", UnexpectedEOF, 2), ("1.e1", UnexpectedCharacter, 2), @@ -246,6 +268,21 @@ def test_it_lexes_numbers(string, expected): ("-A", UnexpectedCharacter, 1), ("1.0e", UnexpectedEOF, 4), ("1.0eA", UnexpectedCharacter, 4), + ("123.", UnexpectedEOF, 4), + ("123e", UnexpectedEOF, 4), + ("123E", UnexpectedEOF, 4), + ("01.23", UnexpectedCharacter, 1), + ("1.2e3.4", UnexpectedCharacter, 7), + ("1.23.4", UnexpectedCharacter, 6), + ("1.2e3e", UnexpectedCharacter, 5), + ("0xF1", UnexpectedCharacter, 1), + ("0b10", UnexpectedCharacter, 1), + ("123abc", UnexpectedCharacter, 3), + ("1_234", UnexpectedCharacter, 1), + ("1ß", UnexpectedCharacter, 1), + ("1.23f", UnexpectedCharacter, 4), + ("1.234_5", UnexpectedCharacter, 5), + ("1.2ß", UnexpectedCharacter, 3), ], ) def test_it_lex_reports_useful_number_errors(value, err_cls, expected_positon):