From 86b7f251331c71a029b78431587713c57f1f40ce Mon Sep 17 00:00:00 2001 From: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com> Date: Thu, 14 Nov 2024 07:16:01 +0100 Subject: [PATCH] Fix #81, manage an edge case in which an LLM will never use delimiters in objects --- pyproject.toml | 2 +- src/json_repair/json_parser.py | 49 ++++++++++++++++++++-------------- tests/test_json_repair.py | 1 + 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2d29068..c528a0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] name = "json_repair" -version = "0.30.1" +version = "0.30.2" license = {file = "LICENSE"} authors = [ { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" }, diff --git a/src/json_repair/json_parser.py b/src/json_repair/json_parser.py index da59c2a..fa5cd61 100644 --- a/src/json_repair/json_parser.py +++ b/src/json_repair/json_parser.py @@ -328,26 +328,35 @@ def parse_string(self) -> Union[str, bool, None]: if not next_c: rstring_delimiter_missing = False else: - # skip any whitespace first - i = self.skip_whitespaces_at(idx=1, move_main_index=False) - # We couldn't find any rstring_delimeter before the end of the string - # check if this is the last string of an object and therefore we can keep going - # make an exception if this is the last char before the closing brace - j = self.skip_to_character(character="}", idx=i) - if j - i > 1: - # Ok it's not right after the comma - # Let's ignore - rstring_delimiter_missing = False - # Check that j was not out of bound - elif self.get_char_at(j): - # Check for an unmatched opening brace in string_acc - for c in reversed(string_acc): - if c == "{": - # Ok then this is part of the string - rstring_delimiter_missing = False - break - elif c == "}": - break + # There could be a case in which even the next key:value is missing delimeters + # because it might be a systemic issue with the output + # So let's check if we can find a : in the string instead + i = self.skip_to_character(character=":", idx=1) + next_c = self.get_char_at(i) + if next_c: + # OK then this is a systemic issue with the output + break + else: + # skip any whitespace first + i = self.skip_whitespaces_at(idx=1, move_main_index=False) + # We couldn't find any rstring_delimeter before the end of the string + # check if this is the last string of an object and therefore we can keep going + # make an exception if this is the last char before the closing brace + j = self.skip_to_character(character="}", idx=i) + if j - i > 1: + # Ok it's not right after the comma + # Let's ignore + rstring_delimiter_missing = False + # Check that j was not out of bound + elif self.get_char_at(j): + # Check for an unmatched opening brace in string_acc + for c in reversed(string_acc): + if c == "{": + # Ok then this is part of the string + rstring_delimiter_missing = False + break + elif c == "}": + break if rstring_delimiter_missing: self.log( "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here", diff --git a/tests/test_json_repair.py b/tests/test_json_repair.py index 30ce47c..a1f379d 100644 --- a/tests/test_json_repair.py +++ b/tests/test_json_repair.py @@ -149,6 +149,7 @@ def test_object_edge_cases(): assert repair_json('{text:words{words in brackets}}') == '{"text": "words{words in brackets}"}' assert repair_json('{text:words{words in brackets}m}') == '{"text": "words{words in brackets}m"}' assert repair_json('{"key": "value, value2"```') == '{"key": "value, value2"}' + assert repair_json('{key:value,key2:value2}') == '{"key": "value", "key2": "value2"}' def test_number_edge_cases(): assert repair_json(' - { "test_key": ["test_value", "test_value2"] }') == '{"test_key": ["test_value", "test_value2"]}'