HTML: introduce a specialized tokenizer for script areas

The original code used an HTML-aware tokenizer for reading tokens in <script>...</script> areas. As reported in #3581 and #3597, the original code could not recognize <script>...</script> areas in some cases. This change introduces a tokenizer specialized for script areas in addition to the original HTML-aware tokenizer. Close #3581. Close #3597. Signed-off-by: Masatake YAMATO <yamato@redhat.com>
universal-ctags · Dec 17, 2022 · ddf6766 · ddf6766
1 parent 3af4135
commit ddf6766
Show file tree

Hide file tree

Showing 7 changed files with 102 additions and 1 deletion.
diff --git a/Units/parser-html.r/comment-starter-in-script.d/args.ctags b/Units/parser-html.r/comment-starter-in-script.d/args.ctags
@@ -0,0 +1,3 @@
+--sort=no
+--extras=+g
+--fields=+Kl
diff --git a/Units/parser-html.r/comment-starter-in-script.d/expected.tags b/Units/parser-html.r/comment-starter-in-script.d/expected.tags
@@ -0,0 +1,3 @@
+Foo	input.html	/^<h1>Foo<\/h1>$/;"	heading1	language:HTML
+BAR	input.html	/^<h1>BAR<\/h1>$/;"	heading1	language:HTML
+x	input.html	/^  var x/;"	variable	language:JavaScript
diff --git a/Units/parser-html.r/comment-starter-in-script.d/input.html b/Units/parser-html.r/comment-starter-in-script.d/input.html
@@ -0,0 +1,6 @@
+<h1>Foo</h1>
+<script>
+  // <!--
+  var x
+</script>
+<h1>BAR</h1>
diff --git a/Units/parser-html.r/string-in-script.d/args.ctags b/Units/parser-html.r/string-in-script.d/args.ctags
@@ -0,0 +1,3 @@
+--sort=no
+--extras=+g
+--fields=+Kl
diff --git a/Units/parser-html.r/string-in-script.d/expected.tags b/Units/parser-html.r/string-in-script.d/expected.tags
@@ -0,0 +1,6 @@
+Foo	input.html	/^<h1>Foo<\/h1>$/;"	heading1	language:HTML
+BAR	input.html	/^<h1>BAR<\/h1>$/;"	heading1	language:HTML
+bar	input.html	/^	const bar = 123$/;"	constant	language:JavaScript
+baz	input.html	/^	function baz () {$/;"	function	language:JavaScript
+bar2	input.html	/^	const bar2 = 123$/;"	constant	language:JavaScript
+baz2	input.html	/^	function baz2 () {$/;"	function	language:JavaScript
diff --git a/Units/parser-html.r/string-in-script.d/input.html b/Units/parser-html.r/string-in-script.d/input.html
@@ -0,0 +1,24 @@
+<!-- Taken from #3581 submitted by @polyscone -->
+<h1>Foo</h1>
+
+<script>
+	const bar = 123
+
+	// I don't know why, but an apostrophe breaks
+	// the JavaScript guest language
+	function baz () {
+		return 'abc'
+	}
+</script>
+
+<script>
+	const bar2 = 123
+
+	// I don"t know why, but an apostrophe breaks
+	// the JavaScript guest language
+	function baz2 () {
+		return 'abc'
+	}
+</script>
+
+<h1>BAR</h1>
diff --git a/parsers/html.c b/parsers/html.c
@@ -236,6 +236,62 @@ static void readTokenText (tokenInfo *const token, bool collectText)
 	}
 }
 
+static void readTokenInScript (tokenInfo *const token) /* Read the next token from the input file using the script-area rules below; the token type and text are stored in TOKEN. */
+{
+	int c;
+
+	vStringClear (token->string); /* start from an empty token text */
+
+	c = getcFromInputFile ();
+	while (isspace (c)) /* skip leading whitespace */
+		c = getcFromInputFile ();
+
+	switch (c)
+	{
+		case EOF:
+			token->type = TOKEN_EOF;
+			break;
+
+		case '<':
+		{
+			int d = getcFromInputFile (); /* one character of lookahead */
+			if (d == '/') /* a slash right after the angle bracket is reported as a close-tag start */
+				token->type = TOKEN_CLOSE_TAG_START;
+			else
+			{
+				ungetcToInputFile (d); /* give the lookahead back; the angle bracket alone is an ordinary token */
+				token->type = TOKEN_OTHER;
+			}
+			break;
+		}
+		default:
+		{
+			while (!isspace (c) && c != '<' && c != '>' && c != '/' &&
+				   c != '=' && c != '\'' && c != '"' && c != EOF) /* collect a run of characters up to the next delimiter */
+			{
+				vStringPut (token->string, tolower (c)); /* the collected text is stored lower-cased */
+				c = getcFromInputFile ();
+			}
+
+			if (vStringLength (token->string) == 0) /* nothing collected: C is itself one of the delimiters excluded by the loop */
+			{
+				vStringPut (token->string, c); /* keep the delimiter as a one-character token, without lower-casing */
+				token->type = TOKEN_OTHER;
+			}
+			else if (c == EOF) /* the run ended at end of input, so there is nothing to push back */
+				token->type = TOKEN_NAME;
+			else
+			{
+				ungetcToInputFile (c); /* the terminating delimiter belongs to the next token */
+				token->type = TOKEN_NAME;
+			}
+			break;
+		}
+	}
+
+	TRACE_PRINT("token (in script): %s (%s)", tokenTypes[token->type], vStringValue (token->string));
+}
+
 static void readToken (tokenInfo *const token, bool skipComments)
 {
 	int c;
@@ -414,7 +470,7 @@ static bool skipScriptContent (tokenInfo *token, long *line, long *lineOffset)
 		line_tmp[0] = getInputLineNumber ();
 		lineOffset_tmp[0] = getInputLineOffset ();
 
-		readToken (token, false);
+		readTokenInScript (token);
 		type = token->type;
 
 		if (type == TOKEN_CLOSE_TAG_START)