Universal newline support

nineteendo · Jul 31, 2024 · 756f778 · 756f778
1 parent ba1c32f
commit 756f778
Show file tree

Hide file tree

Showing 5 changed files with 88 additions and 34 deletions.
diff --git a/src/jsonyx/__init__.py b/src/jsonyx/__init__.py
@@ -80,8 +80,6 @@ def loads(
 
         if not isinstance(s, str):
             s = s.decode(detect_encoding(s), self._errors)
-            # Normalize newlines
-            s = s.replace("\r\n", "\n").replace("\r", "\n")
         elif s.startswith("\ufeff"):
             msg: str = "Unexpected UTF-8 BOM"
             raise JSONSyntaxError(msg, filename, s, 0)

diff --git a/src/jsonyx/_decoder.py b/src/jsonyx/_decoder.py
@@ -31,6 +31,9 @@
 _match_chunk: Callable[[str, int], Match[str] | None] = re.compile(
     r'[^"\\\x00-\x1f]+', _FLAGS,
 ).match
+_match_line_end: Callable[[str, int], Match[str] | None] = re.compile(
+    r"[^\n\r]+", _FLAGS,
+).match
 _match_number: Callable[[str, int], Match[str] | None] = re.compile(
     r"(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?", _FLAGS,
 ).match
@@ -40,9 +43,13 @@
 
 
 def _get_err_context(doc: str, start: int, end: int) -> tuple[int, str, int]:
-    line_start: int = doc.rfind("\n", 0, start) + 1
-    if (line_end := doc.find("\n", start)) == -1:
-        line_end = len(doc)
+    line_start: int = max(
+        doc.rfind("\n", 0, start), doc.rfind("\r", 0, start),
+    ) + 1
+    if (match := _match_line_end(doc, start)):
+        line_end: int = match.end()
+    else:
+        line_end = start
 
     end = min(max(start + 1, line_end), end)
     max_chars: int = get_terminal_size().columns - 4  # leading spaces
@@ -91,16 +98,30 @@ def __init__(  # pylint: disable=R0913
         self, msg: str, filename: str, doc: str, start: int, end: int = 0,
     ) -> None:
         """Create new JSON syntax error."""
-        lineno: int = doc.count("\n", 0, start) + 1
-        colno: int = start - doc.rfind("\n", 0, start)
+        lineno: int = (
+            doc.count("\n", 0, start)
+            + doc.count("\r", 0, start)
+            - doc.count("\r\n", 0, start)
+            + 1
+        )
+        colno: int = start - max(
+            doc.rfind("\n", 0, start), doc.rfind("\r", 0, start),
+        )
         if end <= 0:  # offset
-            if (line_end := doc.find("\n", start)) == -1:
-                line_end = len(doc)
-
-            end = min(line_end, start - end)
+            if (match := _match_line_end(doc, start)):
+                end = min(match.end(), start - end)
+            else:
+                end = start
 
-        end_lineno: int = doc.count("\n", 0, end) + 1
-        end_colno: int = end - doc.rfind("\n", 0, end)
+        end_lineno: int = (
+            doc.count("\n", 0, end)
+            + doc.count("\r", 0, end)
+            - doc.count("\r\n", 0, end)
+            + 1
+        )
+        end_colno: int = end - max(
+            doc.rfind("\n", 0, end), doc.rfind("\r", 0, end),
+        )
         offset, text, end_offset = _get_err_context(doc, start, end)
         super().__init__(
             msg, (filename, lineno, offset, text, end_lineno, end_offset),
@@ -156,10 +177,9 @@ def skip_comments(filename: str, s: str, end: int) -> int:
 
                 comment_idx: int = end
                 if (comment_prefix := s[end:end + 2]) == "//":
-                    if (end := find("\n", end + 2)) != -1:
-                        end += 1
-                    else:
-                        end = len(s)
+                    end += 2
+                    if (match := _match_line_end(s, end)):
+                        end = match.end()
                 elif comment_prefix == "/*":
                     if (end := find("*/", end + 2)) == -1:
                         if allow_comments:
@@ -213,7 +233,7 @@ def scan_string(  # noqa: C901, PLR0912, PLR0915
                     return "".join(chunks), end + 1
 
                 if terminator != "\\":
-                    if terminator == "\n":
+                    if terminator in {"\n", "\r"}:
                         msg = "Unterminated string"
                         raise _errmsg(msg, filename, s, str_idx, end)
 
@@ -232,7 +252,7 @@ def scan_string(  # noqa: C901, PLR0912, PLR0915
                     try:
                         char = _UNESCAPE[esc]
                     except KeyError:
-                        if esc == "\n":
+                        if esc in {"\n", "\r"}:
                             msg = "Expecting escaped character"
                             raise _errmsg(msg, filename, s, end) from None
 

diff --git a/src/jsonyx/_speedups.c b/src/jsonyx/_speedups.c
@@ -112,7 +112,7 @@ _skip_comments(PyScannerObject *s, PyObject *pyfilename, PyObject *pystr, Py_ssi
             PyUnicode_READ(kind, str, idx + 1) == '/')
         {
             idx += 2;
-            while (idx < len && PyUnicode_READ(kind,str, idx) != '\n') {
+            while (idx < len && PyUnicode_READ(kind,str, idx) != '\n' && PyUnicode_READ(kind,str, idx) != '\r') {  /* line comment ends at LF or CR */
                 idx++;
             }
         }
@@ -417,7 +417,7 @@ scanstring_unicode(PyObject *pyfilename, PyObject *pystr, Py_ssize_t end, int al
                     break;
                 }
                 if (d <= 0x1f) {
-                    if (d == '\n') {
+                    if (d == '\n' || d == '\r') {
                         raise_errmsg("Unterminated string", pyfilename, pystr, begin, next);
                     }
                     else {
@@ -474,7 +474,7 @@ scanstring_unicode(PyObject *pyfilename, PyObject *pystr, Py_ssize_t end, int al
                 case 'r': c = '\r'; break;
                 case 't': c = '\t'; break;
                 default:
-                    if (c == '\n') {
+                    if (c == '\n' || c == '\r') {
                         raise_errmsg("Expecting escaped character", pyfilename, pystr, end - 1, 0);
                     }
                     else {

diff --git a/src/jsonyx/test/test_loads.py b/src/jsonyx/test/test_loads.py
@@ -231,9 +231,13 @@ def test_surrogate_escapes(json: ModuleType, s: str, expected: Any) -> None:
 @pytest.mark.parametrize(("s", "msg", "colno", "end_colno"), [
     ('"foo', "Unterminated string", 1, 5),
     ('"foo\n', "Unterminated string", 1, 5),
+    ('"foo\r', "Unterminated string", 1, 5),
+    ('"foo\r\n', "Unterminated string", 1, 5),
     ('"\b"', "Unescaped control character", 2, 3),
     ('"\\', "Expecting escaped character", 3, 0),
     ('"\\\n', "Expecting escaped character", 3, 0),
+    ('"\\\r', "Expecting escaped character", 3, 0),
+    ('"\\\r\n', "Expecting escaped character", 3, 0),
     (r'"\a"', "Invalid backslash escape", 2, 4),
     (r'"\u"', "Expecting 4 hex digits", 4, 5),
     (r'"\u0xff"', "Expecting 4 hex digits", 4, 8),
@@ -501,14 +505,18 @@ def test_whitespace(json: ModuleType, s: str) -> None:
 
 @pytest.mark.parametrize("s", [
     # One comment
-    "0//line comment", "0/*block comment*/",
+    "0//", "0//line comment", "0/*block comment*/",
 
     # Multiple comments
     "0//comment 1\n//comment 2\n//comment 3",
+    "0//comment 1\r//comment 2\r//comment 3",
+    "0//comment 1\r\n//comment 2\r\n//comment 3",
     "0/*comment 1*//*comment 2*//*comment 3*/",
 
     # Whitespace
     "0 //comment 1\n //comment 2\n //comment 3\n ",
+    "0 //comment 1\r //comment 2\r //comment 3\r ",
+    "0 //comment 1\r\n //comment 2\r\n //comment 3\r\n ",
     "0 /*comment 1*/ /*comment 2*/ /*comment 3*/ ",
 ])
 def test_comments(json: ModuleType, s: str) -> None:
@@ -524,9 +532,9 @@ def test_invalid_comment(json: ModuleType) -> None:
     _check_syntax_err(exc_info, "Unterminated comment", 2, 24)
 
 
-@pytest.mark.parametrize(
-    "s", ["0//line comment", "0/*block comment*/", "0/*unterminated comment"],
-)
+@pytest.mark.parametrize("s", [
+    "0//", "0//line comment", "0/*block comment*/", "0/*unterminated comment",
+])
 def test_comments_not_allowed(json: ModuleType, s: str) -> None:
     """Test comments if not allowed."""
     with pytest.raises(json.JSONSyntaxError) as exc_info:

diff --git a/src/jsonyx/test/test_syntax_error.py b/src/jsonyx/test/test_syntax_error.py
@@ -12,24 +12,44 @@
 @pytest.mark.parametrize(
     ("doc", "start", "end", "lineno", "end_lineno", "colno", "end_colno"), [
         # Offset
-        ("line ", 5, -1, 1, 1, 6, 6),  # line 1, column 6
+        ("line ", 5, -1, 1, 1, 6, 6),  # ln 1, col 6
         #      ^
-        ("line \nline 2", 5, -1, 1, 1, 6, 6),  # line 1, column 6
+        ("line \nline 2", 5, -1, 1, 1, 6, 6),  # ln 1, col 6
         #      ^
-        ("line ?", 5, -1, 1, 1, 6, 7),  # line 1, column 6-7
+        ("line \rline 2", 5, -1, 1, 1, 6, 6),  # ln 1, col 6
         #      ^
-        ("line ?\nline 2", 5, -1, 1, 1, 6, 7),  # line 1, column 6-7
+        ("line \r\nline 2", 5, -1, 1, 1, 6, 6),  # ln 1, col 6
+        #      ^
+        ("line ?", 5, -1, 1, 1, 6, 7),  # ln 1, col 6-7
+        #      ^
+        ("line ?\nline 2", 5, -1, 1, 1, 6, 7),  # ln 1, col 6-7
+        #      ^
+        ("line ?\rline 2", 5, -1, 1, 1, 6, 7),  # ln 1, col 6-7
+        #      ^
+        ("line ?\r\nline 2", 5, -1, 1, 1, 6, 7),  # ln 1, col 6-7
         #      ^
 
         # Range
-        ("line 1", 0, 1, 1, 1, 1, 2),  # line 1, column 1-2
+        ("line 1", 0, 1, 1, 1, 1, 2),  # ln 1, col 1-2
         # ^
-        ("line 1\nline 2", 12, 13, 2, 2, 6, 7),  # line 2, column 6-7
+        ("line 1\nline 2", 12, 13, 2, 2, 6, 7),  # ln 2, col 6-7
+        #              ^
+        ("line 1\rline 2", 12, 13, 2, 2, 6, 7),  # ln 2, col 6-7
         #              ^
-        ("line 1\nline 2\nline 3", 12, 19, 2, 3, 6, 6),  # line 2-3, column 6
+        ("line 1\r\nline 2", 13, 14, 2, 2, 6, 7),  # ln 2, col 6-7
+        #                ^
+        ("line 1\nline 2\nline 3", 12, 19, 2, 3, 6, 6),  # ln 2-3, col 6
         #              ^^^^^^^^
-        ("line 1\nline 2\nline 3", 12, 20, 2, 3, 6, 7),  # line 2-3, column 6-7
+        ("line 1\rline 2\rline 3", 12, 19, 2, 3, 6, 6),  # ln 2-3, col 6
+        #              ^^^^^^^^
+        ("line 1\r\nline 2\r\nline 3", 13, 21, 2, 3, 6, 6),  # ln 2-3, col 6
+        #                ^^^^^^^^^^
+        ("line 1\nline 2\nline 3", 12, 20, 2, 3, 6, 7),  # ln 2-3, col 6-7
+        #              ^^^^^^^^^
+        ("line 1\rline 2\rline 3", 12, 20, 2, 3, 6, 7),  # ln 2-3, col 6-7
         #              ^^^^^^^^^
+        ("line 1\r\nline 2\r\nline 3", 13, 22, 2, 3, 6, 7),  # ln 2-3, col 6-7
+        #                ^^^^^^^^^^^
     ],
 )
 # pylint: disable-next=R0913
@@ -52,8 +72,16 @@ def test_start_and_end_position(  # noqa: PLR0913, PLR0917
         #    ^^^^^^^             ^^^^^^^
         (12, "current\nnext", 0, 7, 1, "current", 8),
         #     ^^^^^^^                   ^^^^^^^
+        (12, "current\rnext", 0, 7, 1, "current", 8),
+        #     ^^^^^^^                   ^^^^^^^
+        (12, "current\r\nnext", 0, 7, 1, "current", 8),
+        #     ^^^^^^^                     ^^^^^^^
         (16, "previous\ncurrent", 9, 16, 1, "current", 8),
         #               ^^^^^^^              ^^^^^^^
+        (16, "previous\rcurrent", 9, 16, 1, "current", 8),
+        #               ^^^^^^^              ^^^^^^^
+        (16, "previous\r\ncurrent", 10, 17, 1, "current", 8),
+        #                 ^^^^^^^               ^^^^^^^
 
         # No newline
         (17, "start-middle-end", 0, 5, 1, "start-middle-end", 6),