-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: use a stateful scanner to keep track of quote count to avoid inf…
…inite loop bug
- Loading branch information
Showing
1 changed file
with
172 additions
and
145 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,174 +1,201 @@ | ||
#include <assert.h> | ||
#include <string.h> | ||
#include <tree_sitter/parser.h> | ||
#include <wctype.h> | ||
|
||
enum TokenType { | ||
TYPE_ARGS_START, | ||
BLOCK_COMMENT, | ||
TYPE_ARGS_START, | ||
BLOCK_COMMENT, | ||
|
||
MULTILINE_STRING_CONTENT, | ||
MULTILINE_STRING_CONTENT, | ||
}; | ||
|
||
void *tree_sitter_pony_external_scanner_create() { return NULL; } | ||
typedef struct { | ||
int quote_count; | ||
} Scanner; | ||
|
||
void tree_sitter_pony_external_scanner_destroy(void *payload) {} | ||
static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } | ||
|
||
void tree_sitter_pony_external_scanner_reset(void *payload) {} | ||
static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } | ||
|
||
void *tree_sitter_pony_external_scanner_create() { | ||
return calloc(1, sizeof(Scanner)); | ||
} | ||
|
||
void tree_sitter_pony_external_scanner_destroy(void *payload) { | ||
Scanner *scanner = (Scanner *)payload; | ||
free(scanner); | ||
} | ||
|
||
unsigned tree_sitter_pony_external_scanner_serialize(void *payload, | ||
char *buffer) { | ||
return 0; | ||
Scanner *scanner = (Scanner *)payload; | ||
memcpy(buffer, scanner, sizeof(Scanner)); | ||
return sizeof(Scanner); | ||
} | ||
|
||
void tree_sitter_pony_external_scanner_deserialize(void *payload, | ||
const char *buffer, | ||
unsigned length) {} | ||
|
||
int quote_count = 0; | ||
unsigned length) { | ||
if (length > 0) { | ||
assert(length == sizeof(Scanner)); | ||
Scanner *scanner = (Scanner *)payload; | ||
memcpy(scanner, buffer, sizeof(Scanner)); | ||
} | ||
} | ||
|
||
bool tree_sitter_pony_external_scanner_scan(void *payload, TSLexer *lexer, | ||
const bool *valid_symbols) { | ||
Scanner *scanner = (Scanner *)payload; | ||
|
||
// We ACCEPT [ if there's whitespace but NO newline, otherwise ignore | ||
// and let the internal grammar handle it as an array literal | ||
if (valid_symbols[TYPE_ARGS_START]) { | ||
while (iswspace(lexer->lookahead) && lexer->lookahead != '\0' && | ||
lexer->lookahead != '\n') { | ||
advance(lexer); | ||
} | ||
|
||
// We ACCEPT [ if there's whitespace but NO newline, otherwise ignore | ||
// and let the internal grammar handle it as an array literal | ||
if (valid_symbols[TYPE_ARGS_START]) { | ||
while (iswspace(lexer->lookahead) && lexer->lookahead != '\0' && | ||
lexer->lookahead != '\n') { | ||
lexer->advance(lexer, false); | ||
if (lexer->lookahead == '[') { | ||
advance(lexer); | ||
lexer->result_symbol = TYPE_ARGS_START; | ||
return true; | ||
} | ||
} | ||
|
||
if (lexer->lookahead == '[') { | ||
lexer->advance(lexer, false); | ||
lexer->result_symbol = TYPE_ARGS_START; | ||
return true; | ||
while (iswspace(lexer->lookahead)) { | ||
skip(lexer); | ||
} | ||
} | ||
|
||
while (iswspace(lexer->lookahead)) | ||
lexer->advance(lexer, true); | ||
|
||
// Multiline string content can have anything, | ||
// escape sequences are just parsed as two characters | ||
if (valid_symbols[MULTILINE_STRING_CONTENT]) { | ||
bool has_content = false; | ||
lexer->result_symbol = MULTILINE_STRING_CONTENT; | ||
|
||
for (;;) { | ||
switch (lexer->lookahead) { | ||
case '"': | ||
lexer->mark_end(lexer); | ||
// This outer if statement is to handle a fresh state without prior | ||
// knowledge of quotes | ||
if (quote_count == 0) { | ||
while (lexer->lookahead == '"') { | ||
lexer->advance(lexer, false); | ||
quote_count++; | ||
} | ||
|
||
if (quote_count > 3) { | ||
// Trigger the external scanner again for a state | ||
// where we know the quote count from before | ||
return true; | ||
} else if (quote_count == 3) { | ||
// We have a triple quote aka the end, so we can return | ||
quote_count = 0; | ||
return has_content; | ||
} else { | ||
// we have a single or double quote, so we need to keep going | ||
// this is just content in the multiline string, so extend the | ||
// current token with `mark_end` | ||
lexer->mark_end(lexer); | ||
quote_count = 0; | ||
has_content = true; | ||
} | ||
// This else if is to handle the case where we have a quote count > 0 | ||
// beforehand, and need to see if some of this just might be content, | ||
// that is, only if there's more than 3 quotes in a row | ||
} else if (quote_count > 3) { | ||
// We returned from the last iteration with a quote count > 3, so we | ||
// must mark n-3 quotes as content | ||
for (int i = 0; i < quote_count - 3; i++) { | ||
lexer->advance(lexer, false); | ||
} | ||
// Extend the current token with `mark_end` | ||
lexer->mark_end(lexer); | ||
quote_count = 0; | ||
has_content = true; | ||
return true; | ||
// This else if is to handle the case where we have a quote count == 3 | ||
// from the last iteration | ||
} else if (quote_count == 3) { | ||
// We have a triple quote aka the end, so we can return | ||
quote_count = 0; | ||
return has_content; | ||
// This else is to handle the case where we have a | ||
// quote count == 1 or 2, just extend the current token | ||
} else { | ||
lexer->mark_end(lexer); | ||
quote_count = 0; | ||
// We know we have a single or double quote, so we need to keep going, | ||
// mark has_content as true | ||
has_content = true; | ||
} | ||
break; | ||
case '\0': | ||
if (lexer->eof(lexer)) { | ||
return false; | ||
|
||
// Multiline string content can have anything, | ||
// escape sequences are just parsed as two characters | ||
if (valid_symbols[MULTILINE_STRING_CONTENT]) { | ||
bool has_content = false; | ||
lexer->result_symbol = MULTILINE_STRING_CONTENT; | ||
|
||
for (;;) { | ||
switch (lexer->lookahead) { | ||
case '"': | ||
lexer->mark_end(lexer); | ||
// This outer if statement is to handle a fresh state | ||
// without prior knowledge of quotes | ||
if (scanner->quote_count == 0) { | ||
while (lexer->lookahead == '"') { | ||
advance(lexer); | ||
scanner->quote_count++; | ||
} | ||
|
||
if (scanner->quote_count > 3) { | ||
// Trigger the external scanner again for a state | ||
// where we know the quote count from before | ||
return true; | ||
} | ||
if (scanner->quote_count == 3) { | ||
// We have a triple quote aka the end, so we can | ||
// return | ||
scanner->quote_count = 0; | ||
return has_content; | ||
} | ||
// we have a single or double quote, so we need to | ||
// keep going since this is just content in the | ||
// multiline string, so we extend the current token with | ||
// `mark_end` | ||
lexer->mark_end(lexer); | ||
scanner->quote_count = 0; | ||
has_content = true; | ||
|
||
// This else if is to handle the case where we have a | ||
// quote count > 0 beforehand, and need to see if some | ||
// of this just might be content, that is, only if | ||
// there's more than 3 quotes in a row | ||
} else if (scanner->quote_count > 3) { | ||
// We returned from the last iteration with a quote | ||
// count > 3, so we must mark n-3 quotes as content | ||
for (int i = 0; i < scanner->quote_count - 3; i++) { | ||
advance(lexer); | ||
} | ||
// Extend the current token with `mark_end` | ||
lexer->mark_end(lexer); | ||
scanner->quote_count = 0; | ||
has_content = true; | ||
return true; | ||
// This else if is to handle the case where we have a | ||
// quote count == 3 from the last iteration | ||
} else if (scanner->quote_count == 3) { | ||
// We have a triple quote aka the end, so we can return | ||
scanner->quote_count = 0; | ||
return has_content; | ||
// This else is to handle the case where we have a | ||
// quote count == 1 or 2, just extend the current token | ||
} else { | ||
lexer->mark_end(lexer); | ||
scanner->quote_count = 0; | ||
// We know we have a single or double quote, so we need | ||
// to keep going, mark has_content as true | ||
has_content = true; | ||
} | ||
break; | ||
case '\0': | ||
if (lexer->eof(lexer)) { | ||
return false; | ||
} | ||
advance(lexer); | ||
has_content = true; | ||
break; | ||
default: | ||
advance(lexer); | ||
has_content = true; | ||
break; | ||
} | ||
} | ||
lexer->advance(lexer, false); | ||
has_content = true; | ||
break; | ||
default: | ||
lexer->advance(lexer, false); | ||
has_content = true; | ||
break; | ||
} | ||
} | ||
} | ||
|
||
while (iswspace(lexer->lookahead)) | ||
lexer->advance(lexer, true); | ||
|
||
if (lexer->lookahead == '/') { | ||
lexer->advance(lexer, false); | ||
if (lexer->lookahead != '*') | ||
return false; | ||
lexer->advance(lexer, false); | ||
|
||
bool after_star = false; | ||
unsigned nesting_depth = 1; | ||
for (;;) { | ||
switch (lexer->lookahead) { | ||
case '\0': | ||
return false; | ||
case '*': | ||
lexer->advance(lexer, false); | ||
after_star = true; | ||
break; | ||
case '/': | ||
if (after_star) { | ||
lexer->advance(lexer, false); | ||
after_star = false; | ||
nesting_depth--; | ||
if (nesting_depth == 0) { | ||
lexer->result_symbol = BLOCK_COMMENT; | ||
return true; | ||
} | ||
} else { | ||
lexer->advance(lexer, false); | ||
after_star = false; | ||
if (lexer->lookahead == '*') { | ||
nesting_depth++; | ||
lexer->advance(lexer, false); | ||
} | ||
|
||
while (iswspace(lexer->lookahead)) { | ||
skip(lexer); | ||
} | ||
|
||
if (lexer->lookahead == '/') { | ||
advance(lexer); | ||
if (lexer->lookahead != '*') { | ||
return false; | ||
} | ||
advance(lexer); | ||
|
||
bool after_star = false; | ||
unsigned nesting_depth = 1; | ||
for (;;) { | ||
switch (lexer->lookahead) { | ||
case '\0': | ||
return false; | ||
case '*': | ||
advance(lexer); | ||
after_star = true; | ||
break; | ||
case '/': | ||
if (after_star) { | ||
advance(lexer); | ||
after_star = false; | ||
nesting_depth--; | ||
if (nesting_depth == 0) { | ||
lexer->result_symbol = BLOCK_COMMENT; | ||
return true; | ||
} | ||
} else { | ||
advance(lexer); | ||
after_star = false; | ||
if (lexer->lookahead == '*') { | ||
nesting_depth++; | ||
advance(lexer); | ||
} | ||
} | ||
break; | ||
default: | ||
advance(lexer); | ||
after_star = false; | ||
break; | ||
} | ||
} | ||
break; | ||
default: | ||
lexer->advance(lexer, false); | ||
after_star = false; | ||
break; | ||
} | ||
} | ||
} | ||
|
||
return false; | ||
return false; | ||
} |