From f6c17dc3d596e8f7300241e241314cbfa2bbff41 Mon Sep 17 00:00:00 2001 From: Aru Sahni Date: Fri, 28 Jun 2024 16:33:44 -0400 Subject: [PATCH] Introduce Chroma syntax-generation script. Hugo uses Chroma for syntax highlighting. We upstreamed a Materialize-specific syntax to ensure our dialect's keywords are recognized as such. Now that the end-to-end change is in prod, let's automate the process of generating updates for the dialect lexer. --- bin/gen-chroma-syntax | 15 ++++ doc/developer/chroma-syntax-generation.md | 12 +++ .../materialize/cli/gen-chroma-syntax.py | 81 +++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100755 bin/gen-chroma-syntax create mode 100644 doc/developer/chroma-syntax-generation.md create mode 100755 misc/python/materialize/cli/gen-chroma-syntax.py diff --git a/bin/gen-chroma-syntax b/bin/gen-chroma-syntax new file mode 100755 index 000000000000..76f92ad7cdc9 --- /dev/null +++ b/bin/gen-chroma-syntax @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. +# +# gen-chroma-syntax -- regenerates a Materialize-dialect Chroma syntax file +# using the currently-checked out Materialize keywords + +exec "$(dirname "$0")"/pyactivate -m materialize.cli.gen-chroma-syntax "$@" diff --git a/doc/developer/chroma-syntax-generation.md b/doc/developer/chroma-syntax-generation.md new file mode 100644 index 000000000000..22824578851b --- /dev/null +++ b/doc/developer/chroma-syntax-generation.md @@ -0,0 +1,12 @@ +# Generating new Chroma syntax highlights + +Chroma is the syntax highlighter used by Hugo, the static site generator that powers Materialize's docs. 
We have upstreamed a Materialize lexer (which is a slightly-modified version of their Postgres lexer). When new keywords are added we should upstream an update. + +## Generating a new lexer definition + +1. Fork the Chroma repo and clone it locally as a sibling of the `materialize` repo. +2. From the root directory of the `materialize` repo, run the generate script: + ```shell + ./bin/gen-chroma-syntax + ``` +3. In the Chroma repo, commit the changes to the Materialize dialect file and submit them as a PR. diff --git a/misc/python/materialize/cli/gen-chroma-syntax.py b/misc/python/materialize/cli/gen-chroma-syntax.py new file mode 100755 index 000000000000..e2e2ac10d946 --- /dev/null +++ b/misc/python/materialize/cli/gen-chroma-syntax.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. 
"""Regenerates a Materialize-dialect Chroma syntax file using the local Materialize keywords"""

import argparse
import xml.etree.ElementTree as ET
from pathlib import Path

from materialize import MZ_ROOT

# Values written into the lexer's <config> element by `set_config`.
# A list value replaces every existing child of that name with one child per
# item; a string value overwrites the text of the single existing child.
CONFIG_FIELDS = {
    "name": "Materialize SQL dialect",
    "alias": ["materialize", "mzsql"],
    "mime_type": "text/x-materializesql",
}


def keyword_pattern() -> str:
    """Return the keyword regex built from the repository's keyword list.

    Reads `src/sql-lexer/src/keywords.txt` under `MZ_ROOT`, ignoring blank
    lines and `#` comment lines, and joins the upper-cased keywords into a
    single alternation group followed by a word boundary.

    Raises:
        RuntimeError: if the file contains no keywords. An empty alternation
            would match the empty string at every word boundary.
    """
    keywords_file = MZ_ROOT / "src/sql-lexer/src/keywords.txt"
    keywords = []
    for raw_line in keywords_file.read_text(encoding="utf-8").splitlines():
        # Strip before filtering so that indented comments are skipped and
        # stray whitespace never ends up inside the generated pattern.
        keyword = raw_line.strip()
        if not keyword or keyword.startswith("#"):
            continue
        keywords.append(keyword.upper())
    if not keywords:
        raise RuntimeError(f"No keywords found in {keywords_file}")
    return f"({'|'.join(keywords)})\\b"


def set_keywords(root: ET.Element) -> None:
    """Overwrite the pattern of the first rule that emits a `Keyword` token.

    Raises:
        RuntimeError: if no `<rule>` with a `<token type="Keyword">` child
            exists anywhere below `root`.
    """
    rule = root.find(".//rule/token[@type='Keyword']/..")
    # Compare against None explicitly: the truth value of an Element reflects
    # whether it has children, not whether `find` located it.
    if rule is None:
        raise RuntimeError("No keyword rule found")
    rule.set("pattern", keyword_pattern())


def set_config(root: ET.Element) -> None:
    """Apply `CONFIG_FIELDS` to the `<config>` child of `root`.

    List-valued fields drop all existing children with that name and append
    one new child per item. String-valued fields must already exist and have
    their text replaced.

    Raises:
        RuntimeError: if `<config>` is missing, or a string-valued field has
            no matching child element.
    """
    config = root.find("config")
    if config is None:
        raise RuntimeError("No config found")
    for field_name, field_value in CONFIG_FIELDS.items():
        if isinstance(field_value, list):
            # `findall` returns a list, so removing while looping is safe.
            for element in config.findall(field_name):
                config.remove(element)
            for item in field_value:
                ET.SubElement(config, field_name).text = item
        else:
            field = config.find(field_name)
            if field is None:
                raise RuntimeError(f"No such config field: '{field_name}'")
            field.text = field_value


def main() -> None:
    """Derive `materialize_sql_dialect.xml` from `postgresql_sql_dialect.xml`.

    Both files live in `lexers/embedded/` inside the Chroma checkout given by
    `--chroma-dir`. The PostgreSQL lexer is parsed, its keyword rule and config
    are rewritten, and the result is written next to it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--chroma-dir",
        default="../chroma",
        help="path to a Chroma checkout; a relative path is resolved against "
        "the current working directory (default: %(default)s)",
    )
    args = parser.parse_args()
    lexer_dir = Path(args.chroma_dir) / "lexers" / "embedded"
    tree = ET.parse(lexer_dir / "postgresql_sql_dialect.xml")
    root = tree.getroot()
    if root is None:
        raise RuntimeError("Could not find root element")
    set_keywords(root)
    set_config(root)
    ET.indent(root, " ")
    tree.write(lexer_dir / "materialize_sql_dialect.xml", encoding="unicode")


if __name__ == "__main__":
    main()