Skip to content

Commit

Permalink
fix: use a stateful scanner to keep track of quote count to avoid infinite loop bug
Browse files Browse the repository at this point in the history
  • Loading branch information
amaanq committed Jun 23, 2023
1 parent 88cb155 commit 49733fd
Showing 1 changed file with 172 additions and 145 deletions.
317 changes: 172 additions & 145 deletions src/scanner.c
Original file line number Diff line number Diff line change
@@ -1,174 +1,201 @@
#include <assert.h>
#include <string.h>
#include <tree_sitter/parser.h>
#include <wctype.h>

// Symbols this external scanner can produce. The values are used as indices
// into the `valid_symbols` array passed to the scan function and are written
// to `lexer->result_symbol`.
// NOTE(review): tree-sitter requires this order to match the `externals`
// list in the grammar — confirm against grammar.js before reordering.
enum TokenType {
    TYPE_ARGS_START,
    BLOCK_COMMENT,

    MULTILINE_STRING_CONTENT,
};

// Persistent state of the external scanner.
//
// quote_count: number of consecutive '"' characters counted by a previous
// scan call inside a multiline string; 0 means "no pending quote run".
typedef struct {
    int quote_count;
} Scanner;

// Consume the lookahead character (second argument `false`, unlike skip()).
static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }

// Accepts the scanner payload and deliberately does nothing with it.
// NOTE(review): presumably a leftover hook; nothing visible in this file
// calls it — confirm before removing.
void tree_sitter_pony_external_scanner_reset(void *payload) {
    (void)payload;
}
// Move past the lookahead character, passing `true` as the lexer's second
// argument (the counterpart of advance(), which passes `false`).
static inline void skip(TSLexer *lexer) {
    lexer->advance(lexer, true);
}

// Allocates the scanner state.
//
// calloc zero-fills the allocation, so `quote_count` starts at 0.
// Returns the new Scanner as an opaque payload, or NULL if allocation fails.
// The payload is released by tree_sitter_pony_external_scanner_destroy.
void *tree_sitter_pony_external_scanner_create(void) {
    return calloc(1, sizeof(Scanner));
}

// Releases the scanner state handed out by the create function.
// `payload` may be NULL; free() treats that as a no-op.
void tree_sitter_pony_external_scanner_destroy(void *payload) {
    free(payload);
}

// Writes the scanner state into `buffer` so it can be restored later by the
// deserialize function.
//
// payload: the Scanner allocated by the create function.
// buffer:  destination; assumed to hold at least sizeof(Scanner) bytes
//          (tree-sitter supplies a much larger serialization buffer —
//          TODO confirm against TREE_SITTER_SERIALIZATION_BUFFER_SIZE).
// Returns the number of bytes written, always sizeof(Scanner).
unsigned tree_sitter_pony_external_scanner_serialize(void *payload,
                                                     char *buffer) {
    Scanner *scanner = (Scanner *)payload;
    memcpy(buffer, scanner, sizeof(Scanner));
    return sizeof(Scanner);
}

void tree_sitter_pony_external_scanner_deserialize(void *payload,
const char *buffer,
unsigned length) {}

int quote_count = 0;
unsigned length) {
if (length > 0) {
assert(length == sizeof(Scanner));
Scanner *scanner = (Scanner *)payload;
memcpy(scanner, buffer, sizeof(Scanner));
}
}

bool tree_sitter_pony_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *scanner = (Scanner *)payload;

// We ACCEPT [ if there's whitespace but NO newline, otherwise ignore
// and let the internal grammar handle it as an array literal
if (valid_symbols[TYPE_ARGS_START]) {
while (iswspace(lexer->lookahead) && lexer->lookahead != '\0' &&
lexer->lookahead != '\n') {
advance(lexer);
}

// We ACCEPT [ if there's whitespace but NO newline, otherwise ignore
// and let the internal grammar handle it as an array literal
if (valid_symbols[TYPE_ARGS_START]) {
while (iswspace(lexer->lookahead) && lexer->lookahead != '\0' &&
lexer->lookahead != '\n') {
lexer->advance(lexer, false);
if (lexer->lookahead == '[') {
advance(lexer);
lexer->result_symbol = TYPE_ARGS_START;
return true;
}
}

if (lexer->lookahead == '[') {
lexer->advance(lexer, false);
lexer->result_symbol = TYPE_ARGS_START;
return true;
while (iswspace(lexer->lookahead)) {
skip(lexer);
}
}

while (iswspace(lexer->lookahead))
lexer->advance(lexer, true);

// Multiline string content can have anything,
// escape sequences are just parsed as two characters
if (valid_symbols[MULTILINE_STRING_CONTENT]) {
bool has_content = false;
lexer->result_symbol = MULTILINE_STRING_CONTENT;

for (;;) {
switch (lexer->lookahead) {
case '"':
lexer->mark_end(lexer);
// This outer if statement is to handle a fresh state without prior
// knowledge of quotes
if (quote_count == 0) {
while (lexer->lookahead == '"') {
lexer->advance(lexer, false);
quote_count++;
}

if (quote_count > 3) {
// Trigger the external scanner again for a state
// where we know the quote count from before
return true;
} else if (quote_count == 3) {
// We have a triple quote aka the end, so we can return
quote_count = 0;
return has_content;
} else {
// we have a single or double quote, so we need to keep going
// this is just content in the multiline string, so extend the
// current token with `mark_end`
lexer->mark_end(lexer);
quote_count = 0;
has_content = true;
}
// This else if is to handle the case where we have a quote count > 0
// beforehand, and need to see if some of this just might be content,
// that is, only if there's more than 3 quotes in a row
} else if (quote_count > 3) {
// We returned from the last iteration with a quote count > 3, so we
// must mark n-3 quotes as content
for (int i = 0; i < quote_count - 3; i++) {
lexer->advance(lexer, false);
}
// Extend the current token with `mark_end`
lexer->mark_end(lexer);
quote_count = 0;
has_content = true;
return true;
// This else if is to handle the case where we have a quote count == 3
// from the last iteration
} else if (quote_count == 3) {
// We have a triple quote aka the end, so we can return
quote_count = 0;
return has_content;
// This else is to handle the case where we have a
// quote count == 1 or 2, just extend the current token
} else {
lexer->mark_end(lexer);
quote_count = 0;
// We know we have a single or double quote, so we need to keep going,
// mark has_content as true
has_content = true;
}
break;
case '\0':
if (lexer->eof(lexer)) {
return false;

// Multiline string content can have anything,
// escape sequences are just parsed as two characters
if (valid_symbols[MULTILINE_STRING_CONTENT]) {
bool has_content = false;
lexer->result_symbol = MULTILINE_STRING_CONTENT;

for (;;) {
switch (lexer->lookahead) {
case '"':
lexer->mark_end(lexer);
// This outer if statement is to handle a fresh state
// without prior knowledge of quotes
if (scanner->quote_count == 0) {
while (lexer->lookahead == '"') {
advance(lexer);
scanner->quote_count++;
}

if (scanner->quote_count > 3) {
// Trigger the external scanner again for a state
// where we know the quote count from before
return true;
}
if (scanner->quote_count == 3) {
// We have a triple quote aka the end, so we can
// return
scanner->quote_count = 0;
return has_content;
}
// we have a single or double quote, so we need to
// keep going since this is just content in the
// multiline string, so we extend the current token with
// `mark_end`
lexer->mark_end(lexer);
scanner->quote_count = 0;
has_content = true;

// This else if is to handle the case where we have a
// quote count > 0 beforehand, and need to see if some
// of this just might be content, that is, only if
// there's more than 3 quotes in a row
} else if (scanner->quote_count > 3) {
// We returned from the last iteration with a quote
// count > 3, so we must mark n-3 quotes as content
for (int i = 0; i < scanner->quote_count - 3; i++) {
advance(lexer);
}
// Extend the current token with `mark_end`
lexer->mark_end(lexer);
scanner->quote_count = 0;
has_content = true;
return true;
// This else if is to handle the case where we have a
// quote count == 3 from the last iteration
} else if (scanner->quote_count == 3) {
// We have a triple quote aka the end, so we can return
scanner->quote_count = 0;
return has_content;
// This else is to handle the case where we have a
// quote count == 1 or 2, just extend the current token
} else {
lexer->mark_end(lexer);
scanner->quote_count = 0;
// We know we have a single or double quote, so we need
// to keep going, mark has_content as true
has_content = true;
}
break;
case '\0':
if (lexer->eof(lexer)) {
return false;
}
advance(lexer);
has_content = true;
break;
default:
advance(lexer);
has_content = true;
break;
}
}
lexer->advance(lexer, false);
has_content = true;
break;
default:
lexer->advance(lexer, false);
has_content = true;
break;
}
}
}

while (iswspace(lexer->lookahead))
lexer->advance(lexer, true);

if (lexer->lookahead == '/') {
lexer->advance(lexer, false);
if (lexer->lookahead != '*')
return false;
lexer->advance(lexer, false);

bool after_star = false;
unsigned nesting_depth = 1;
for (;;) {
switch (lexer->lookahead) {
case '\0':
return false;
case '*':
lexer->advance(lexer, false);
after_star = true;
break;
case '/':
if (after_star) {
lexer->advance(lexer, false);
after_star = false;
nesting_depth--;
if (nesting_depth == 0) {
lexer->result_symbol = BLOCK_COMMENT;
return true;
}
} else {
lexer->advance(lexer, false);
after_star = false;
if (lexer->lookahead == '*') {
nesting_depth++;
lexer->advance(lexer, false);
}

while (iswspace(lexer->lookahead)) {
skip(lexer);
}

if (lexer->lookahead == '/') {
advance(lexer);
if (lexer->lookahead != '*') {
return false;
}
advance(lexer);

bool after_star = false;
unsigned nesting_depth = 1;
for (;;) {
switch (lexer->lookahead) {
case '\0':
return false;
case '*':
advance(lexer);
after_star = true;
break;
case '/':
if (after_star) {
advance(lexer);
after_star = false;
nesting_depth--;
if (nesting_depth == 0) {
lexer->result_symbol = BLOCK_COMMENT;
return true;
}
} else {
advance(lexer);
after_star = false;
if (lexer->lookahead == '*') {
nesting_depth++;
advance(lexer);
}
}
break;
default:
advance(lexer);
after_star = false;
break;
}
}
break;
default:
lexer->advance(lexer, false);
after_star = false;
break;
}
}
}

return false;
return false;
}

0 comments on commit 49733fd

Please sign in to comment.