Skip to content

Commit

Permalink
Fix #81, manage an edge case in which an LLM will never use delimiters in objects
Browse files Browse the repository at this point in the history
  • Loading branch information
mangiucugna committed Nov 14, 2024
1 parent eeb6b3f commit 86b7f25
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 21 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "json_repair"
version = "0.30.1"
version = "0.30.2"
license = {file = "LICENSE"}
authors = [
{ name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
Expand Down
49 changes: 29 additions & 20 deletions src/json_repair/json_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,26 +328,35 @@ def parse_string(self) -> Union[str, bool, None]:
if not next_c:
rstring_delimiter_missing = False
else:
# skip any whitespace first
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
# We couldn't find any rstring_delimiter before the end of the string
# check if this is the last string of an object and therefore we can keep going
# make an exception if this is the last char before the closing brace
j = self.skip_to_character(character="}", idx=i)
if j - i > 1:
# Ok it's not right after the comma
# Let's ignore
rstring_delimiter_missing = False
# Check that j was not out of bound
elif self.get_char_at(j):
# Check for an unmatched opening brace in string_acc
for c in reversed(string_acc):
if c == "{":
# Ok then this is part of the string
rstring_delimiter_missing = False
break
elif c == "}":
break
# There could be a case in which even the next key:value is missing delimiters
# because it might be a systemic issue with the output
# So let's check if we can find a : in the string instead
i = self.skip_to_character(character=":", idx=1)
next_c = self.get_char_at(i)
if next_c:
# OK then this is a systemic issue with the output
break
else:
# skip any whitespace first
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
# We couldn't find any rstring_delimiter before the end of the string
# check if this is the last string of an object and therefore we can keep going
# make an exception if this is the last char before the closing brace
j = self.skip_to_character(character="}", idx=i)
if j - i > 1:
# Ok it's not right after the comma
# Let's ignore
rstring_delimiter_missing = False
# Check that j was not out of bound
elif self.get_char_at(j):
# Check for an unmatched opening brace in string_acc
for c in reversed(string_acc):
if c == "{":
# Ok then this is part of the string
rstring_delimiter_missing = False
break
elif c == "}":
break
if rstring_delimiter_missing:
self.log(
"While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
Expand Down
1 change: 1 addition & 0 deletions tests/test_json_repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def test_object_edge_cases():
assert repair_json('{text:words{words in brackets}}') == '{"text": "words{words in brackets}"}'
assert repair_json('{text:words{words in brackets}m}') == '{"text": "words{words in brackets}m"}'
assert repair_json('{"key": "value, value2"```') == '{"key": "value, value2"}'
assert repair_json('{key:value,key2:value2}') == '{"key": "value", "key2": "value2"}'

def test_number_edge_cases():
assert repair_json(' - { "test_key": ["test_value", "test_value2"] }') == '{"test_key": ["test_value", "test_value2"]}'
Expand Down

0 comments on commit 86b7f25

Please sign in to comment.