Skip to content

Commit

Permalink
Fix: enable Python 3.12 to parse templates (webpy#785)
Browse files Browse the repository at this point in the history
* Fix: enable Python 3.12 to parse templates

Closes webpy#784.

This PR fixes the inability of `web.py` to run on Python 3.12, and it
does so by matching any unmatched `"` in a string.

In Python 3.12, changes to `tokenize.generate_tokens()` require all
quotes to be matched, and the `web.py` template parser hands
unterminated strings to `tokenize.generate_tokens()` quite frequently.

This PR makes a tacit assumption that we should avoid rewriting the
template parser if possible while also allowing `web.py` to run on
Python 3.12.

To that end, it handles every `TokenError` that is because of unmatched
string literals by adding a `"` to the line. Although this appears to
work, I am aware that appending a `"` to each line that has an unmatched
`"` is less than ideal. However, it seemed a very easy way to avoid
rewriting the template parser. I am absolutely open to other approaches.

*IF* this approach looks as if it has legs, I can look more into
where that "extra" `"` goes, why it seems not to matter, and write some
unit tests.

With regard to testing, I tested this with as many pages on Open Library
that I could (via the local development environment), and it seems to
work.

* noqa: complexity and excess statements for read_expr()

* Add suggestions from @tfmorris.

* Switch to more-itertools.peekable

 and set linter limits back down again

* Fix off-by-1 error and add test coverage

* Linting fixes

* Add `more_itertools` as a dependency

`more_itertools.peekable()` is used now.

`cheroot` already requires `more_itertools`, so this dependency is
already installed anyway, but adding it as a dependency for `webpy`
itself will ensure that if `cheroot` drops the `more_itertools`
dependency, `webpy` will still require it.

---------

Co-authored-by: Tom Morris <tfmorris@gmail.com>
  • Loading branch information
scottbarnes and tfmorris authored Feb 21, 2024
1 parent 5709b1f commit d364932
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 48 deletions.
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,13 @@ show-source = true
target-version = "py38"

[tool.ruff.mccabe]
max-complexity = 26
max-complexity = 20

[tool.ruff.pylint]
allow-magic-value-types = ["int", "str"]
max-args = 9 # default is 5
max-branches = 17 # default is 12
max-returns = 8 # default is 6
max-statements = 51 # default is 50

[tool.codespell]
ignore-words-list = "asend,gae"
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
cheroot>=6.0.0
more_itertools>=2.6
multipart>=0.2.4
64 changes: 63 additions & 1 deletion tests/test_template.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import unittest

import web
from web.template import SecurityError, Template
from web.template import ExpressionNode, Parser, SecurityError, Template


class TestItem:
    """Minimal stand-in object exposing a fixed ``id`` attribute for template tests."""

    # Keep the test framework from collecting this helper as a test class.
    __test__ = False

    def __init__(self):
        # Fixed, predictable value so templates rendering $item.id are deterministic.
        self.id = 12345

class _TestResult:
Expand Down Expand Up @@ -43,6 +50,61 @@ def test_overridden(self):
f = t(tpl, globals={"print": lambda x: x})
assert repr(f()) == "'blah\\n'"

def test_quotes(self):
    """A $-expression interpolated inside a double-quoted attribute renders its value."""
    source = 'a="$foo" <p>'
    rendered = t(source, globals={"foo": "bar"})
    assert repr(rendered()) == "'a=\"bar\" <p>\\n'"

def test_accessor(self):
    """Dotted attribute access ($foo.id) is expanded inside a quoted value."""
    source = 'a="$foo.id"<p>'
    rendered = t(source, globals={"foo": TestItem()})
    assert repr(rendered()) == "'a=\"12345\"<p>\\n'"

def test_href(self):
    """An expression embedded in an href URL path renders its value."""
    source = '<a href="/del/$item.id">Delete</a>'
    rendered = t(source, globals={"item": TestItem()})
    assert repr(rendered()) == "'<a href=\"/del/12345\">Delete</a>\\n'"


class TestParser(unittest.TestCase):
    """Exercise Parser.read_expr() directly.

    The inputs below are the ``text`` values that ``Parser.read_node()``
    would hand to ``read_expr()`` while processing this template:

        $def with (back, docs)
        $var title: Index
        <p><a href="$back">&larr; Back to Index</a></p>
        <ul>
        $for path, title in docs:
            <li><a href="$path">$title</a></li>
        </ul>
    """

    def test_read_expr(self) -> None:
        """read_expr() yields an ExpressionNode plus the unconsumed remainder."""
        # (input text, expected repr of the node, expected leftover text)
        cases = [
            (
                'back">&larr; Back to Index</a></p>\n',
                "$back",
                '">&larr; Back to Index</a></p>\n',
            ),
            ('path">$title</a></li>\n', "$path", '">$title</a></li>\n'),
            ("title</a></li>\n", "$title", "</a></li>\n"),
        ]
        for text, expected_repr, remainder in cases:
            node, rest = Parser().read_expr(text)
            assert isinstance(node, ExpressionNode)
            assert repr(node) == expected_repr
            assert rest == remainder


class TestRender:
def test_template_without_ext(self, tmpdir):
Expand Down
3 changes: 2 additions & 1 deletion web/browser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Browser to test web applications.
(from web.py)
"""

import os
import webbrowser
from http.cookiejar import CookieJar
Expand Down Expand Up @@ -46,7 +47,7 @@ def reset(self):

def build_opener(self):
"""Builds the opener using (urllib2/urllib.request).build_opener.
Subclasses can override this function to prodive custom openers.
Subclasses can override this function to provide custom openers.
"""
return urllib_build_opener()

Expand Down
1 change: 1 addition & 0 deletions web/contrib/template.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Interface to various templating engines.
"""

import os.path

__all__ = ["render_cheetah", "render_genshi", "render_mako", "cache"]
Expand Down
1 change: 1 addition & 0 deletions web/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Database API
(part of web.py)
"""

import ast
import datetime
import os
Expand Down
1 change: 0 additions & 1 deletion web/net.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
(from web.py)
"""


import datetime
import re
import socket
Expand Down
94 changes: 51 additions & 43 deletions web/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,15 @@
import ast
import builtins
import glob
import itertools
import os
import sys
import token
import tokenize
from functools import partial

from more_itertools import peekable

from .net import websafe
from .utils import re_compile, safestr, safeunicode, storage
from .webapi import config
Expand Down Expand Up @@ -242,7 +246,7 @@ def read_keyword(self, text):
line, text = splitline(text)
return StatementNode(line.strip() + "\n"), text

def read_expr(self, text, escape=True):
def read_expr(self, text, escape=True): # noqa: C901, PLR0915
"""Reads a python expression from the text and returns the expression and remaining text.
expr -> simple_expr | paren_expr
Expand Down Expand Up @@ -271,10 +275,10 @@ def simple_expr():
extended_expr()

def identifier():
next(tokens)
return next(tokens)

def extended_expr():
lookahead = tokens.lookahead()
lookahead = tokens.peek()
if lookahead is None:
return
elif lookahead.value == ".":
Expand All @@ -288,7 +292,7 @@ def extended_expr():
def attr_access():
from token import NAME # python token constants

if tokens.lookahead2().type == NAME:
if tokens[1].type == NAME:
next(tokens) # consume dot
identifier()
extended_expr()
Expand All @@ -297,7 +301,7 @@ def paren_expr():
begin = next(tokens).value
end = parens[begin]
while True:
if tokens.lookahead().value in parens:
if tokens.peek().value in parens:
paren_expr()
else:
t = next(tokens)
Expand All @@ -306,57 +310,61 @@ def paren_expr():

parens = {"(": ")", "[": "]", "{": "}"}

def get_tokens(text):
def get_tokens(text: str):
"""tokenize text using python tokenizer.
Python tokenizer ignores spaces, but they might be important in some cases.
This function introduces dummy space tokens when it identifies any ignored space.
Each token is a storage object containing type, value, begin and end.
"""
i = iter([text])
readline = lambda: next(i)
end = None
for t in tokenize.generate_tokens(readline):
t = storage(type=t[0], value=t[1], begin=t[2], end=t[3])
if end is not None and end != t.begin:
_, x1 = end
_, x2 = t.begin
yield storage(type=-1, value=text[x1:x2], begin=end, end=t.begin)
end = t.end
yield t

class BetterIter:
"""Iterator like object with 2 support for 2 look aheads."""

def __init__(self, items):
self.iteritems = iter(items)
self.items = []
self.position = 0
self.current_item = None

def lookahead(self):
if len(self.items) <= self.position:
self.items.append(self._next())
return self.items[self.position]
def tokenize_text(input_text):
i = iter([input_text])
readline = lambda: next(i)
end = None
for t in tokenize.generate_tokens(readline):
t = storage(type=t[0], value=t[1], begin=t[2], end=t[3])
if end is not None and end != t.begin:
_, x1 = end
_, x2 = t.begin
yield storage(
type=-1, value=input_text[x1:x2], begin=end, end=t.begin
)
end = t.end
yield t

try:
yield from tokenize_text(text)
except tokenize.TokenError as e:
# Things like unterminated string literals or EOF in multi-line literals will raise exceptions
# tokenize the error free portion, then return an error token with the rest of the text
error_pos = e.args[1][1] - 1
fixed_text = text[0:error_pos]
yield from itertools.chain(
tokenize_text(fixed_text),
error_token_generator(text, error_pos + 1, len(text)),
)

def error_token_generator(text, start, end):
yield storage(
type=token.ERRORTOKEN, value=text[start:], begin=start, end=end
)

def _next(self):
try:
return next(self.iteritems)
except StopIteration:
return None
class peekable2(peekable):
"""
A peekable class which caches the last item returned by next()
"""

def lookahead2(self):
if len(self.items) <= self.position + 1:
self.items.append(self._next())
return self.items[self.position + 1]
def __init__(self, iterable):
super().__init__(iterable)
self.current_item = None

def __next__(self):
self.current_item = self.lookahead()
self.position += 1
self.current_item = super().__next__()
return self.current_item

tokens = BetterIter(get_tokens(text))
tokens = peekable2(get_tokens(text))

if tokens.lookahead().value in parens:
if tokens.peek().value in parens:
paren_expr()
else:
simple_expr()
Expand Down
1 change: 1 addition & 0 deletions web/test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""test utilities
(part of web.py)
"""

import doctest
import sys
import unittest
Expand Down

0 comments on commit d364932

Please sign in to comment.