Skip to content

Commit

Permalink
HTML: introduce a specialized tokenizer for script areas
Browse files Browse the repository at this point in the history
The original code used an HTML-aware tokenizer for reading
tokens in <script>...</script> areas.

As reported in #3581 and #3597, the original code failed to
recognize <script>...</script> areas in some cases.

This change introduces a tokenizer specialized for script
areas in addition to the original HTML-aware tokenizer.

Close #3581.
Close #3597.

Signed-off-by: Masatake YAMATO <yamato@redhat.com>
  • Loading branch information
masatake committed Dec 17, 2022
1 parent 3af4135 commit ddf6766
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 1 deletion.
3 changes: 3 additions & 0 deletions Units/parser-html.r/comment-starter-in-script.d/args.ctags
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
--sort=no
--extras=+g
--fields=+Kl
3 changes: 3 additions & 0 deletions Units/parser-html.r/comment-starter-in-script.d/expected.tags
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Foo input.html /^<h1>Foo<\/h1>$/;" heading1 language:HTML
BAR input.html /^<h1>BAR<\/h1>$/;" heading1 language:HTML
x input.html /^ var x/;" variable language:JavaScript
6 changes: 6 additions & 0 deletions Units/parser-html.r/comment-starter-in-script.d/input.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<h1>Foo</h1>
<script>
// <!--
var x
</script>
<h1>BAR</h1>
3 changes: 3 additions & 0 deletions Units/parser-html.r/string-in-script.d/args.ctags
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
--sort=no
--extras=+g
--fields=+Kl
6 changes: 6 additions & 0 deletions Units/parser-html.r/string-in-script.d/expected.tags
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Foo input.html /^<h1>Foo<\/h1>$/;" heading1 language:HTML
BAR input.html /^<h1>BAR<\/h1>$/;" heading1 language:HTML
bar input.html /^ const bar = 123$/;" constant language:JavaScript
baz input.html /^ function baz () {$/;" function language:JavaScript
bar2 input.html /^ const bar2 = 123$/;" constant language:JavaScript
baz2 input.html /^ function baz2 () {$/;" function language:JavaScript
24 changes: 24 additions & 0 deletions Units/parser-html.r/string-in-script.d/input.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<!-- Taken from #3581 submitted by @polyscone -->
<h1>Foo</h1>

<script>
const bar = 123

// I don't know why, but an apostrophe breaks
// the JavaScript guest language
function baz () {
return 'abc'
}
</script>

<script>
const bar2 = 123

// I don"t know why, but an apostrophe breaks
// the JavaScript guest language
function baz2 () {
return 'abc'
}
</script>

<h1>BAR</h1>
58 changes: 57 additions & 1 deletion parsers/html.c
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,62 @@ static void readTokenText (tokenInfo *const token, bool collectText)
}
}

/*
 * Read one token from the input while inside a <script> area.
 *
 * Leading whitespace is skipped, then token->type is set as follows:
 *
 *   - TOKEN_EOF              the input is exhausted.
 *   - TOKEN_CLOSE_TAG_START  the two characters "</" were read.
 *   - TOKEN_OTHER            either a '<' not followed by '/' (the
 *                            following character is pushed back and
 *                            token->string stays empty), or a single
 *                            delimiter character ('>', '/', '=', '\''
 *                            or '"'), which is stored in token->string.
 *   - TOKEN_NAME             a run of non-delimiter characters, stored
 *                            lower-cased in token->string.
 *
 * Quotes are treated as ordinary one-character tokens here; no attempt
 * is made to read a quoted string or a comment as a unit.
 */
static void readTokenInScript (tokenInfo *const token)
{
	int ch;

	vStringClear (token->string);

	/* Skip any whitespace in front of the token. */
	do
		ch = getcFromInputFile ();
	while (isspace (ch));

	if (ch == EOF)
		token->type = TOKEN_EOF;
	else if (ch == '<')
	{
		/* Only "</" is interesting inside a script area. */
		int next = getcFromInputFile ();

		if (next != '/')
		{
			ungetcToInputFile (next);
			token->type = TOKEN_OTHER;
		}
		else
			token->type = TOKEN_CLOSE_TAG_START;
	}
	else
	{
		/* Collect characters up to the next delimiter, whitespace or EOF. */
		for (;;)
		{
			if (isspace (ch) || ch == '<' || ch == '>' || ch == '/' ||
				ch == '=' || ch == '\'' || ch == '"' || ch == EOF)
				break;

			vStringPut (token->string, tolower (ch));
			ch = getcFromInputFile ();
		}

		if (vStringLength (token->string) > 0)
		{
			token->type = TOKEN_NAME;
			/* Leave the terminating character for the next read. */
			if (ch != EOF)
				ungetcToInputFile (ch);
		}
		else
		{
			/* Nothing was collected: ch itself is a delimiter; emit it alone. */
			vStringPut (token->string, ch);
			token->type = TOKEN_OTHER;
		}
	}

	TRACE_PRINT("token (in script): %s (%s)", tokenTypes[token->type], vStringValue (token->string));
}

static void readToken (tokenInfo *const token, bool skipComments)
{
int c;
Expand Down Expand Up @@ -414,7 +470,7 @@ static bool skipScriptContent (tokenInfo *token, long *line, long *lineOffset)
line_tmp[0] = getInputLineNumber ();
lineOffset_tmp[0] = getInputLineOffset ();

readToken (token, false);
readTokenInScript (token);
type = token->type;

if (type == TOKEN_CLOSE_TAG_START)
Expand Down

0 comments on commit ddf6766

Please sign in to comment.