Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

json: $ref + object overhaul (https & recursive $refs, mix properties & allOf) #8199

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
318 changes: 188 additions & 130 deletions common/json-schema-to-grammar.cpp

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions examples/json_schema_to_grammar.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
JSON Schema to Grammar converter (JavaScript version)

There are C++ and Python converters w/ the same features.
(More flags are currently exposed by the Python version)

Usage:
node examples/json_schema_to_grammar.mjs schema.json
node examples/json_schema_to_grammar.mjs https://json.schemastore.org/tsconfig.json
echo '{"type": "object"}' | node examples/json_schema_to_grammar.mjs -
*/
import { readFileSync } from "fs"
import { SchemaConverter } from "./server/public/json-schema-to-grammar.mjs"
import fs from 'fs'

// CLI entry point: load a JSON schema from a local file, an https:// URL or
// stdin ("-"), run it through SchemaConverter and print the grammar to stdout.
const [, , file] = process.argv
if (!file) {
  // Without this guard `file.startsWith` below throws an opaque TypeError.
  console.error('Usage: node examples/json_schema_to_grammar.mjs <schema.json | https://... | ->')
  process.exit(1)
}

let schema
if (file === '-') {
  // File descriptor 0 is stdin.
  schema = JSON.parse(readFileSync(0, 'utf8'))
} else if (file.startsWith('https://')) {
  const response = await fetch(file)
  if (!response.ok) {
    // fetch() only rejects on network failure; an HTTP error status must be
    // checked explicitly, otherwise the error body is parsed as the schema.
    throw new Error(`Failed to fetch ${file}: ${response.status} ${response.statusText}`)
  }
  schema = await response.json()
} else {
  schema = JSON.parse(readFileSync(file, 'utf8'))
}

const converter = new SchemaConverter({})
converter.visit(schema, '')
console.log(converter.formatGrammar())
200 changes: 102 additions & 98 deletions examples/json_schema_to_grammar.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
#!/usr/bin/env python3
'''
JSON Schema to Grammar conversion

There are C++ and JavaScript converters w/ the same features.

Usage:
python examples/json_schema_to_grammar.py schema.json
python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json
echo '{"type": "object"}' | python examples/json_schema_to_grammar.py -

Also see https://github.com/ggerganov/llama.cpp/tree/master/grammars
'''
from __future__ import annotations

import argparse
Expand Down Expand Up @@ -237,16 +249,15 @@


class SchemaConverter:
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
def __init__(self, *, prop_order, dotall, raw_pattern):
self._prop_order = prop_order
self._allow_fetch = allow_fetch
self._dotall = dotall
self._raw_pattern = raw_pattern
self._rules = {
'space': SPACE_RULE,
}
self._refs = {}
self._refs_being_resolved = set()
self._external_refs = {}
self._ref_context = []

def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
Expand Down Expand Up @@ -334,51 +345,6 @@
self._rules[key] = rule
return key

def resolve_refs(self, schema: dict, url: str):
'''
Resolves the $ref fields found while walking the given schema.

Refs starting with 'https://' are fetched (only when self._allow_fetch is set),
passed through resolve_refs themselves and cached in self._refs under their
base URL. Refs starting with '#/' are rewritten in place to f'{url}{ref}' and
looked up in `schema`. Any other ref raises ValueError. Each newly resolved ref
is recorded in self._refs under its absolute form.

Returns the result of calling the inner visit() on `schema`.
'''
# NOTE(review): indentation was lost in this view, so the nesting described in
# the comments below is inferred from the statement order -- confirm against
# the real file.
def visit(n: dict):
if isinstance(n, list):
return [visit(x) for x in n]
elif isinstance(n, dict):
ref = n.get('$ref')
# Refs already present in self._refs are not resolved a second time.
if ref is not None and ref not in self._refs:
if ref.startswith('https://'):
# NOTE: assert statements are stripped under `python -O`, which would skip this check.
assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch for force)'
import requests

# Split the document URL from the optional '#...' fragment.
frag_split = ref.split('#')
base_url = frag_split[0]

target = self._refs.get(base_url)
if target is None:
# NOTE(review): the request uses the full ref (fragment included) rather than
# base_url -- confirm this is intended.
target = self.resolve_refs(requests.get(ref).json(), base_url)
self._refs[base_url] = target

# No fragment (or an empty one): the ref designates the whole document.
if len(frag_split) == 1 or frag_split[-1] == '':
return target
elif ref.startswith('#/'):
# Local ref: resolve against the schema passed to resolve_refs and store the
# absolute form back into the node.
target = schema
ref = f'{url}{ref}'
n['$ref'] = ref
else:
raise ValueError(f'Unsupported ref {ref}')

# Walk the '/'-separated fragment path down from target, one key at a time.
for sel in ref.split('#')[-1].split('/')[1:]:
assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
target = target[sel]

self._refs[ref] = target
else:
for v in n.values():
visit(v)

return n
return visit(schema)

def _generate_union_rule(self, name, alt_schemas):
return ' | '.join((
self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}')
Expand Down Expand Up @@ -543,25 +509,64 @@
else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")


def _resolve_ref(self, ref):
ref_name = ref.split('/')[-1]
if ref_name not in self._rules and ref not in self._refs_being_resolved:
self._refs_being_resolved.add(ref)
resolved = self._refs[ref]
ref_name = self.visit(resolved, ref_name)
self._refs_being_resolved.remove(ref)
return ref_name

def _generate_constant_rule(self, value):
    '''Serializes value to JSON and returns it formatted by self._format_literal.'''
    serialized = json.dumps(value)
    return self._format_literal(serialized)

class ResolvedRef:
    '''Plain record describing the outcome of resolving a $ref.'''

    def __init__(self, target: Any, name: str, is_local: bool):
        # target:   the value the ref points to
        # name:     name associated with the ref ('' when there is none)
        # is_local: whether the ref had no URL part
        self.target, self.name, self.is_local = target, name, is_local

def _resolve_ref(self, ref: str):
parts = ref.split('#')
assert len(parts) <= 2, f'Unsupported ref: {ref}'
url = parts[0]
target = None
is_local = not url
if is_local:
assert self._ref_context, f'Error resolving ref {ref}: no context'
target = self._ref_context[-1]
else:
target = self._external_refs.get(url)
if target is None:
# Fetch the referenced schema and resolve its refs
assert url.startswith("https://"), f"Error resolving ref {ref}: unsupported url scheme"
import requests
target = requests.get(url).json()
self._external_refs[url] = target

if len(parts) == 1:
return self.ResolvedRef(target, '', is_local)
else:
tokens = parts[1].split('/')
for sel in tokens[1:]:
assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
target = target[sel]

return self.ResolvedRef(target, tokens[-1] if tokens else '', is_local)

def visit(self, schema, name):
schema_type = schema.get('type')
schema_format = schema.get('format')
rule_name = name + '-' if name in RESERVED_NAMES else name or 'root'

if not self._ref_context:
self._ref_context.append(schema)
try:
return self.visit(schema, name)
finally:
self._ref_context.pop()

if (ref := schema.get('$ref')) is not None:
return self._add_rule(rule_name, self._resolve_ref(ref))
resolved = self._resolve_ref(ref)
if not resolved.is_local:
self._ref_context.append(resolved.target)
try:
return self.visit(resolved.target, name if name == '' or resolved.name == '' else resolved.name)
finally:
if not resolved.is_local:
self._ref_context.pop()

elif 'oneOf' in schema or 'anyOf' in schema:
return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
Expand All @@ -576,36 +581,6 @@
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
return self._add_rule(rule_name, rule)

elif schema_type in (None, 'object') and \
('properties' in schema or \
('additionalProperties' in schema and schema['additionalProperties'] is not True)):
required = set(schema.get('required', []))
properties = list(schema.get('properties', {}).items())
return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties')))

elif schema_type in (None, 'object') and 'allOf' in schema:
required = set()
properties = []
hybrid_name = name
def add_component(comp_schema, is_required):
if (ref := comp_schema.get('$ref')) is not None:
comp_schema = self._refs[ref]

if 'properties' in comp_schema:
for prop_name, prop_schema in comp_schema['properties'].items():
properties.append((prop_name, prop_schema))
if is_required:
required.add(prop_name)

for t in schema['allOf']:
if 'anyOf' in t:
for tt in t['anyOf']:
add_component(tt, is_required=False)
else:
add_component(t, is_required=True)

return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))

elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
items = schema.get('items') or schema['prefixItems']
if isinstance(items, list):
Expand Down Expand Up @@ -660,8 +635,44 @@
out.append(") space")
return self._add_rule(rule_name, ''.join(out))

elif (schema_type == 'object') or (len(schema) == 0):
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
elif (schema_type == 'object') or (schema_type is None):
required = set(schema.get('required', []))
properties = list(schema.get('properties', {}).items())
is_explicit_object = schema_type == 'object' or 'properties' in schema or 'additionalProperties' in schema
additional_properties = schema.get('additionalProperties')

def add_component(comp_schema, is_required):
if (ref := comp_schema.get('$ref')) is not None:
resolved = self._resolve_ref(ref)
comp_schema = resolved.target

if 'properties' in comp_schema:
for prop_name, prop_schema in comp_schema['properties'].items():
properties.append((prop_name, prop_schema))
if is_required:
required.add(prop_name)
if 'additionalProperties' in comp_schema:
if additional_properties is None:

Check failure on line 655 in examples/json_schema_to_grammar.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"additional_properties" is unbound (reportUnboundVariable)
additional_properties = comp_schema['additionalProperties']
elif additional_properties != comp_schema['additionalProperties']:

Check failure on line 657 in examples/json_schema_to_grammar.py

View workflow job for this annotation

GitHub Actions / pyright type-check

"additional_properties" is unbound (reportUnboundVariable)
raise ValueError('Inconsistent additionalProperties in allOf')

for t in schema.get('allOf', []):
if 'anyOf' in t:
for tt in t['anyOf']:
add_component(tt, is_required=False)
else:
add_component(t, is_required=True)

if not properties and (additional_properties == True or additional_properties is None):
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))

default_additional_properties = None if is_explicit_object else False
return self._add_rule(
rule_name,
self._build_object_rule(
properties, required, name,
additional_properties if additional_properties is not None else default_additional_properties))

else:
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
Expand Down Expand Up @@ -767,11 +778,6 @@
given precedence over optional properties.
'''
)
parser.add_argument(
'--allow-fetch',
action='store_true',
default=False,
help='Whether to allow fetching referenced schemas over HTTPS')
parser.add_argument(
'--dotall',
action='store_true',
Expand Down Expand Up @@ -799,10 +805,8 @@
schema = json.load(f)
converter = SchemaConverter(
prop_order={name: idx for idx, name in enumerate(args.prop_order)},
allow_fetch=args.allow_fetch,
dotall=args.dotall,
raw_pattern=args.raw_pattern)
schema = converter.resolve_refs(schema, url)
converter.visit(schema, '')
print(converter.format_grammar())

Expand Down
3 changes: 1 addition & 2 deletions examples/server/chat.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,8 @@ const propOrder = grammarJsonSchemaPropOrder

let grammar = null
if (grammarJsonSchemaFile) {
let schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
const converter = new SchemaConverter({prop_order: propOrder, allow_fetch: true})
schema = await converter.resolveRefs(schema, grammarJsonSchemaFile)
converter.visit(schema, '')
grammar = converter.formatGrammar()
}
Expand Down
3 changes: 1 addition & 2 deletions examples/server/public/index-new.html
Original file line number Diff line number Diff line change
Expand Up @@ -558,14 +558,13 @@
const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
const convertJSONSchemaGrammar = async () => {
try {
let schema = JSON.parse(params.value.grammar)
const schema = JSON.parse(params.value.grammar)
const converter = new SchemaConverter({
prop_order: grammarJsonSchemaPropOrder.value
.split(',')
.reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {}),
allow_fetch: true,
})
schema = await converter.resolveRefs(schema, 'input')
converter.visit(schema, '')
params.value = {
...params.value,
Expand Down
3 changes: 1 addition & 2 deletions examples/server/public/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -855,14 +855,13 @@
const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
const convertJSONSchemaGrammar = async () => {
try {
let schema = JSON.parse(params.value.grammar)
const schema = JSON.parse(params.value.grammar)
const converter = new SchemaConverter({
prop_order: grammarJsonSchemaPropOrder.value
.split(',')
.reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {}),
allow_fetch: true,
})
schema = await converter.resolveRefs(schema, 'input')
converter.visit(schema, '')
params.value = {
...params.value,
Expand Down
Loading
Loading