Skip to content

Commit

Permalink
Update the Materialize lexer (#978)
Browse files Browse the repository at this point in the history
This introduces some additional keywords. I've also scripted this on our
end, hence the changes in formatting and in the encoding of certain
characters in attribute values.

This also includes:

- Some tests
- Updates to the README to call out the `--csrf-key` argument for
  chromad. Without it, securecookie throws an error.
  • Loading branch information
arusahni committed Jun 28, 2024
1 parent 2d94bda commit d1034f8
Show file tree
Hide file tree
Showing 4 changed files with 407 additions and 78 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ for that setup the `chroma` executable can be just symlinked to `~/.lessfilter`.
If you edit some lexers and want to try it, open a shell in `cmd/chromad` and run:

```shell
go run .
go run . --csrf-key=securekey
```

A link will be printed. Open it in your browser. Now you can test on the Playground with your local changes.
Expand Down
154 changes: 77 additions & 77 deletions lexers/embedded/materialize_sql_dialect.xml
Original file line number Diff line number Diff line change
@@ -1,154 +1,154 @@
<lexer>
<config>
<name>Materialize SQL dialect</name>
<alias>materialize</alias>
<alias>mzsql</alias>
<mime_type>text/x-materializesql</mime_type>
<case_insensitive>true</case_insensitive>
<not_multiline>true</not_multiline>
<alias>materialize</alias>
<alias>mzsql</alias>
</config>
<rules>
<state name="root">
<rule pattern="\s+">
<token type="Text"/>
<token type="Text" />
</rule>
<rule pattern="--.*\n?">
<token type="CommentSingle"/>
<token type="CommentSingle" />
</rule>
<rule pattern="/\*">
<token type="CommentMultiline"/>
<push state="multiline-comments"/>
<token type="CommentMultiline" />
<push state="multiline-comments" />
</rule>
<rule pattern="(bigint|bigserial|bit|bit\s+varying|bool|boolean|box|bytea|char|character|character\s+varying|cidr|circle|date|decimal|double\s+precision|float4|float8|inet|int|int2|int4|int8|integer|interval|json|jsonb|line|lseg|macaddr|money|numeric|path|pg_lsn|point|polygon|real|serial|serial2|serial4|serial8|smallint|smallserial|text|time|timestamp|timestamptz|timetz|tsquery|tsvector|txid_snapshot|uuid|varbit|varchar|with\s+time\s+zone|without\s+time\s+zone|xml|anyarray|anyelement|anyenum|anynonarray|anyrange|cstring|fdw_handler|internal|language_handler|opaque|record|void)\b">
<token type="NameBuiltin"/>
<token type="NameBuiltin" />
</rule>
<rule pattern="(?s)(DO)(\s+)(?:(LANGUAGE)?(\s+)(&#39;?)(\w+)?(&#39;?)(\s+))?(\$)([^$]*)(\$)(.*?)(\$)(\10)(\$)">
<rule pattern="(?s)(DO)(\s+)(?:(LANGUAGE)?(\s+)('?)(\w+)?('?)(\s+))?(\$)([^$]*)(\$)(.*?)(\$)(\10)(\$)">
<usingbygroup>
<sublexer_name_group>6</sublexer_name_group>
<code_group>12</code_group>
<emitters>
<token type="Keyword"/>
<token type="Text"/>
<token type="Keyword"/>
<token type="Text"/>
<token type="LiteralStringSingle"/>
<token type="LiteralStringSingle"/>
<token type="LiteralStringSingle"/>
<token type="Text"/>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="Keyword" />
<token type="Text" />
<token type="Keyword" />
<token type="Text" />
<token type="LiteralStringSingle" />
<token type="LiteralStringSingle" />
<token type="LiteralStringSingle" />
<token type="Text" />
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
</emitters>
</usingbygroup>
</rule>
<rule pattern="(ACCESS|ACKS|ADD|ADDRESSES|AGGREGATE|ALL|ALTER|AND|ANY|ARN|ARRANGEMENT|ARRAY|AS|ASC|ASSERT|AT|AUCTION|AUTHORITY|AVAILABILITY|AVRO|AWS|BEGIN|BETWEEN|BIGINT|BILLED|BODY|BOOLEAN|BOTH|BPCHAR|BROKEN|BROKER|BROKERS|BY|BYTES|CARDINALITY|CASCADE|CASE|CAST|CERTIFICATE|CHAIN|CHAR|CHARACTER|CHARACTERISTICS|CHECK|CLIENT|CLOSE|CLUSTER|CLUSTERS|COALESCE|COLLATE|COLUMN|COLUMNS|COMMENT|COMMIT|COMMITTED|COMPACTION|COMPRESSION|COMPUTE|COMPUTECTL|CONFLUENT|CONNECTION|CONNECTIONS|CONSTRAINT|COPY|COUNT|COUNTER|CREATE|CREATECLUSTER|CREATEDB|CREATEROLE|CROSS|CSV|CURRENT|CURSOR|DATABASE|DATABASES|DATUMS|DAY|DAYS|DEALLOCATE|DEBEZIUM|DEBUG|DEBUGGING|DEC|DECIMAL|DECLARE|DECORRELATED|DEFAULT|DEFAULTS|DELETE|DELIMITED|DELIMITER|DESC|DETAILS|DISCARD|DISK|DISTINCT|DOC|DOT|DOUBLE|DROP|EFFORT|ELEMENT|ELSE|ENABLE|END|ENDPOINT|ENFORCED|ENVELOPE|ERROR|ESCAPE|EXCEPT|EXECUTE|EXISTS|EXPECTED|EXPLAIN|EXPOSE|EXTRACT|FACTOR|FALSE|FETCH|FIELDS|FILTER|FIRST|FLOAT|FOLLOWING|FOR|FOREIGN|FORMAT|FORWARD|FROM|FULL|FULLNAME|FUNCTION|GENERATOR|GRANT|GREATEST|GROUP|GROUPS|HAVING|HEADER|HEADERS|HOLD|HOST|HOUR|HOURS|ID|IDEMPOTENCE|IDLE|IF|IGNORE|ILIKE|IN|INCLUDE|INDEX|INDEXES|INFO|INHERIT|INLINE|INNER|INPUT|INSERT|INSPECT|INT|INTEGER|INTERNAL|INTERSECT|INTERVAL|INTO|INTROSPECTION|IS|ISNULL|ISOLATION|JOIN|JSON|KAFKA|KEY|KEYS|LAST|LATERAL|LATEST|LEADING|LEAST|LEFT|LEVEL|LIKE|LIMIT|LIST|LOAD|LOCAL|LOG|LOGICAL|LOGIN|MANAGED|MAP|MARKETING|MATERIALIZE|MATERIALIZED|MAX|MECHANISMS|MEMBERSHIP|MERGE|MESSAGE|METADATA|MINUTE|MINUTES|MODE|MONTH|MONTHS|MS|MUTUALLY|NAME|NAMES|NATURAL|NEXT|NO|NOCREATECLUSTER|NOCREATEDB|NOCREATEROLE|NOINHERIT|NOLOGIN|NONE|NOSUPERUSER|NOT|NOTICE|NULL|NULLIF|NULLS|OBJECTS|OF|OFFSET|ON|ONLY|OPERATOR|OPTIMIZED|OPTIMIZER|OPTIONS|OR|ORDER|ORDINALITY|OUTER|OVER|OWNED|OWNER|PARTITION|PASSWORD|PHYSICAL|PLAN|PLANS|PORT|POSITION|POSTGRES|PRECEDING|PRECISION|PREFIX|PREPARE|PRIMARY|PRIVATELINK|PRIVILEGES|PROGRESS|PROTOBUF|PROTOCOL|PUBLICATION|QUERY|QUOTE|RAISE|RANGE|RAW|READ|REAL|REASSIGN|RECURSION|R
ECURSIVE|REFERENCES|REFRESH|REGEX|REGION|REGISTRY|RENAME|REPEATABLE|REPLACE|REPLICA|REPLICAS|REPLICATION|RESET|RESPECT|RESTRICT|RETENTION|RETURN|RETURNING|REVOKE|RIGHT|ROLE|ROLES|ROLLBACK|ROTATE|ROW|ROWS|SASL|SCALE|SCHEMA|SCHEMAS|SCRIPT|SECOND|SECONDS|SECRET|SECRETS|SECURITY|SEED|SELECT|SEQUENCES|SERIALIZABLE|SERVICE|SESSION|SET|SHARD|SHOW|SINK|SINKS|SIZE|SMALLINT|SNAPSHOT|SOME|SOURCE|SOURCES|SSH|SSL|START|STDIN|STDOUT|STORAGE|STORAGECTL|STRATEGY|STRICT|STRING|SUBSCRIBE|SUBSOURCE|SUBSOURCES|SUBSTRING|SUPERUSER|SWAP|SYSTEM|TABLE|TABLES|TAIL|TEMP|TEMPORARY|TEST|TEXT|THEN|TICK|TIES|TIME|TIMELINE|TIMEOUT|TIMESTAMP|TIMESTAMPTZ|TO|TOKEN|TOPIC|TPCH|TRACE|TRAILING|TRANSACTION|TRIM|TRUE|TUNNEL|TYPE|TYPES|UNBOUNDED|UNCOMMITTED|UNION|UNIQUE|UNKNOWN|UP|UPDATE|UPSERT|URL|USAGE|USER|USERNAME|USERS|USING|VALIDATE|VALUE|VALUES|VARCHAR|VARYING|VIEW|VIEWS|WARNING|WEBHOOK|WHEN|WHERE|WINDOW|WIRE|WITH|WITHIN|WITHOUT|WORK|WORKERS|WRITE|YEAR|YEARS|ZONE|ZONES)\b">
<token type="Keyword"/>
<rule pattern="(ACCESS|ADD|ADDRESSES|AGGREGATE|ALIGNED|ALL|ALTER|ANALYSIS|AND|ANY|ARITY|ARN|ARRANGEMENT|ARRAY|AS|ASC|ASSERT|ASSUME|AT|AUCTION|AUTHORITY|AVAILABILITY|AVRO|AWS|BATCH|BEGIN|BETWEEN|BIGINT|BILLED|BODY|BOOLEAN|BOTH|BPCHAR|BROKEN|BROKER|BROKERS|BY|BYTES|CARDINALITY|CASCADE|CASE|CAST|CERTIFICATE|CHAIN|CHAINS|CHAR|CHARACTER|CHARACTERISTICS|CHECK|CLIENT|CLOSE|CLUSTER|CLUSTERS|COALESCE|COLLATE|COLUMN|COLUMNS|COMMENT|COMMIT|COMMITTED|COMPACTION|COMPATIBILITY|COMPRESSION|COMPUTE|COMPUTECTL|CONFIG|CONFLUENT|CONNECTION|CONNECTIONS|CONSTRAINT|COPY|COUNT|COUNTER|CREATE|CREATECLUSTER|CREATEDB|CREATEROLE|CREATION|CROSS|CSV|CURRENT|CURSOR|DATABASE|DATABASES|DATUMS|DAY|DAYS|DEALLOCATE|DEBEZIUM|DEBUG|DEBUGGING|DEC|DECIMAL|DECLARE|DECODING|DECORRELATED|DEFAULT|DEFAULTS|DELETE|DELIMITED|DELIMITER|DELTA|DESC|DETAILS|DISCARD|DISK|DISTINCT|DOC|DOT|DOUBLE|DROP|EAGER|ELEMENT|ELSE|ENABLE|END|ENDPOINT|ENFORCED|ENVELOPE|ERROR|ERRORS|ESCAPE|ESTIMATE|EVERY|EXCEPT|EXECUTE|EXISTS|EXPECTED|EXPLAIN|EXPOSE|EXPRESSIONS|EXTERNAL|EXTRACT|FACTOR|FALSE|FAST|FEATURES|FETCH|FIELDS|FILE|FILTER|FIRST|FIXPOINT|FLOAT|FOLLOWING|FOR|FOREIGN|FORMAT|FORWARD|FROM|FULL|FULLNAME|FUNCTION|GENERATOR|GRANT|GREATEST|GROUP|GROUPS|HAVING|HEADER|HEADERS|HISTORY|HOLD|HOST|HOUR|HOURS|HUMANIZED|ID|IDENTIFIERS|IDS|IF|IGNORE|ILIKE|IMPLEMENTATIONS|IMPORTED|IN|INCLUDE|INDEX|INDEXES|INFO|INHERIT|INLINE|INNER|INPUT|INSERT|INSIGHTS|INSPECT|INT|INTEGER|INTERNAL|INTERSECT|INTERVAL|INTO|INTROSPECTION|IS|ISNULL|ISOLATION|JOIN|JOINS|JSON|KAFKA|KEY|KEYS|LAST|LATERAL|LATEST|LEADING|LEAST|LEFT|LEGACY|LETREC|LEVEL|LIKE|LIMIT|LINEAR|LIST|LOAD|LOCAL|LOCALLY|LOG|LOGICAL|LOGIN|LOWERING|MANAGED|MANUAL|MAP|MARKETING|MATERIALIZE|MATERIALIZED|MAX|MECHANISMS|MEMBERSHIP|MESSAGE|METADATA|MINUTE|MINUTES|MODE|MONTH|MONTHS|MUTUALLY|MYSQL|NAME|NAMES|NATURAL|NEGATIVE|NEW|NEXT|NO|NOCREATECLUSTER|NOCREATEDB|NOCREATEROLE|NODE|NOINHERIT|NOLOGIN|NON|NONE|NOSUPERUSER|NOT|NOTICE|NOTICES|NULL|NULLIF|NULLS|OBJECTS|OF|OFFSET|ON|ONLY|OPERATOR|OPTIMIZED|OPTI
MIZER|OPTIONS|OR|ORDER|ORDINALITY|OUTER|OVER|OWNED|OWNER|PARTITION|PARTITIONS|PASSWORD|PATH|PHYSICAL|PLAN|PLANS|PORT|POSITION|POSTGRES|PRECEDING|PRECISION|PREFIX|PREPARE|PRIMARY|PRIVATELINK|PRIVILEGES|PROGRESS|PROTOBUF|PROTOCOL|PUBLICATION|PUSHDOWN|QUERY|QUOTE|RAISE|RANGE|RATE|RAW|READ|REAL|REASSIGN|RECURSION|RECURSIVE|REDACTED|REFERENCE|REFERENCES|REFRESH|REGEX|REGION|REGISTRY|REHYDRATION|RENAME|REOPTIMIZE|REPEATABLE|REPLACE|REPLAN|REPLICA|REPLICAS|REPLICATION|RESET|RESPECT|RESTRICT|RETAIN|RETURN|RETURNING|REVOKE|RIGHT|ROLE|ROLES|ROLLBACK|ROTATE|ROUNDS|ROW|ROWS|SASL|SCALE|SCHEDULE|SCHEMA|SCHEMAS|SECOND|SECONDS|SECRET|SECRETS|SECURITY|SEED|SELECT|SEQUENCES|SERIALIZABLE|SERVICE|SESSION|SET|SHARD|SHOW|SINK|SINKS|SIZE|SMALLINT|SNAPSHOT|SOME|SOURCE|SOURCES|SSH|SSL|START|STDIN|STDOUT|STORAGE|STORAGECTL|STRATEGY|STRICT|STRING|STRONG|SUBSCRIBE|SUBSOURCE|SUBSOURCES|SUBSTRING|SUBTREE|SUPERUSER|SWAP|SYNTAX|SYSTEM|TABLE|TABLES|TAIL|TEMP|TEMPORARY|TEXT|THEN|TICK|TIES|TIME|TIMELINE|TIMEOUT|TIMESTAMP|TIMESTAMPTZ|TIMING|TO|TOKEN|TOPIC|TPCH|TRACE|TRAILING|TRANSACTION|TRANSACTIONAL|TRIM|TRUE|TUNNEL|TYPE|TYPES|UNBOUNDED|UNCOMMITTED|UNION|UNIQUE|UNKNOWN|UP|UPDATE|UPSERT|URL|USAGE|USER|USERNAME|USERS|USING|VALIDATE|VALUE|VALUES|VARCHAR|VARIADIC|VARYING|VERSION|VIEW|VIEWS|WARNING|WEBHOOK|WHEN|WHERE|WINDOW|WIRE|WITH|WITHIN|WITHOUT|WORK|WORKERS|WRITE|YEAR|YEARS|ZONE|ZONES)\b">
<token type="Keyword" />
</rule>
<rule pattern="[+*/&lt;&gt;=~!@#%^&amp;|`?-]+">
<token type="Operator"/>
<token type="Operator" />
</rule>
<rule pattern="::">
<token type="Operator"/>
<token type="Operator" />
</rule>
<rule pattern="\$\d+">
<token type="NameVariable"/>
<token type="NameVariable" />
</rule>
<rule pattern="([0-9]*\.[0-9]*|[0-9]+)(e[+-]?[0-9]+)?">
<token type="LiteralNumberFloat"/>
<token type="LiteralNumberFloat" />
</rule>
<rule pattern="[0-9]+">
<token type="LiteralNumberInteger"/>
<token type="LiteralNumberInteger" />
</rule>
<rule pattern="((?:E|U&amp;)?)(&#39;)">
<rule pattern="((?:E|U&amp;)?)(')">
<bygroups>
<token type="LiteralStringAffix"/>
<token type="LiteralStringSingle"/>
<token type="LiteralStringAffix" />
<token type="LiteralStringSingle" />
</bygroups>
<push state="string"/>
<push state="string" />
</rule>
<rule pattern="((?:U&amp;)?)(&#34;)">
<rule pattern="((?:U&amp;)?)(&quot;)">
<bygroups>
<token type="LiteralStringAffix"/>
<token type="LiteralStringName"/>
<token type="LiteralStringAffix" />
<token type="LiteralStringName" />
</bygroups>
<push state="quoted-ident"/>
<push state="quoted-ident" />
</rule>
<rule pattern="(?s)(\$)([^$]*)(\$)(.*?)(\$)(\2)(\$)(\s+)(LANGUAGE)?(\s+)(&#39;?)(\w+)?(&#39;?)">
<rule pattern="(?s)(\$)([^$]*)(\$)(.*?)(\$)(\2)(\$)(\s+)(LANGUAGE)?(\s+)('?)(\w+)?('?)">
<usingbygroup>
<sublexer_name_group>12</sublexer_name_group>
<code_group>4</code_group>
<emitters>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc"/>
<token type="Text"/>
<token type="Keyword"/>
<token type="Text"/>
<token type="LiteralStringSingle"/>
<token type="LiteralStringSingle"/>
<token type="LiteralStringSingle"/>
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
<token type="LiteralStringHeredoc" />
<token type="Text" />
<token type="Keyword" />
<token type="Text" />
<token type="LiteralStringSingle" />
<token type="LiteralStringSingle" />
<token type="LiteralStringSingle" />
</emitters>
</usingbygroup>
</rule>
<rule pattern="(?s)(\$)([^$]*)(\$)(.*?)(\$)(\2)(\$)">
<token type="LiteralStringHeredoc"/>
<token type="LiteralStringHeredoc" />
</rule>
<rule pattern="[a-z_]\w*">
<token type="Name"/>
<token type="Name" />
</rule>
<rule pattern=":([&#39;&#34;]?)[a-z]\w*\b\1">
<token type="NameVariable"/>
<rule pattern=":(['&quot;]?)[a-z]\w*\b\1">
<token type="NameVariable" />
</rule>
<rule pattern="[;:()\[\]{},.]">
<token type="Punctuation"/>
<token type="Punctuation" />
</rule>
</state>
<state name="multiline-comments">
<rule pattern="/\*">
<token type="CommentMultiline"/>
<push state="multiline-comments"/>
<token type="CommentMultiline" />
<push state="multiline-comments" />
</rule>
<rule pattern="\*/">
<token type="CommentMultiline"/>
<pop depth="1"/>
<token type="CommentMultiline" />
<pop depth="1" />
</rule>
<rule pattern="[^/*]+">
<token type="CommentMultiline"/>
<token type="CommentMultiline" />
</rule>
<rule pattern="[/*]">
<token type="CommentMultiline"/>
<token type="CommentMultiline" />
</rule>
</state>
<state name="string">
<rule pattern="[^&#39;]+">
<token type="LiteralStringSingle"/>
<rule pattern="[^']+">
<token type="LiteralStringSingle" />
</rule>
<rule pattern="&#39;&#39;">
<token type="LiteralStringSingle"/>
<rule pattern="''">
<token type="LiteralStringSingle" />
</rule>
<rule pattern="&#39;">
<token type="LiteralStringSingle"/>
<pop depth="1"/>
<rule pattern="'">
<token type="LiteralStringSingle" />
<pop depth="1" />
</rule>
</state>
<state name="quoted-ident">
<rule pattern="[^&#34;]+">
<token type="LiteralStringName"/>
<rule pattern="[^&quot;]+">
<token type="LiteralStringName" />
</rule>
<rule pattern="&#34;&#34;">
<token type="LiteralStringName"/>
<rule pattern="&quot;&quot;">
<token type="LiteralStringName" />
</rule>
<rule pattern="&#34;">
<token type="LiteralStringName"/>
<pop depth="1"/>
<rule pattern="&quot;">
<token type="LiteralStringName" />
<pop depth="1" />
</rule>
</state>
</rules>
Expand Down
49 changes: 49 additions & 0 deletions lexers/testdata/materialize.actual
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
-- basic statements

-- View over my_jsonb_source: pulls 'field1', 'field2' and 'field3' out of the
-- `data` column with the ->> operator and casts them to boolean, int and
-- float, exposing them as field_1, field_2 and field_3.
CREATE VIEW my_typed_source AS
SELECT
(data->>'field1')::boolean AS field_1,
(data->>'field2')::int AS field_2,
(data->>'field3')::float AS field_3
FROM my_jsonb_source;

-- Per-region, per-product unit and sales totals from orders, restricted to
-- the regions selected by top_regions.
WITH
-- Total order amount for each region.
regional_sales (region, total_sales) AS (
SELECT region, sum(amount)
FROM orders
GROUP BY region
),
-- The five regions with the largest total_sales. No tie-breaker is given in
-- the ORDER BY, so which regions win a tie at the cutoff is not pinned down.
top_regions AS (
SELECT region
FROM regional_sales
ORDER BY total_sales DESC
LIMIT 5
)
SELECT region,
product,
SUM(quantity) AS product_units,
SUM(amount) AS product_sales
FROM orders
WHERE region IN (SELECT region FROM top_regions)
GROUP BY region, product;

-- sources

-- Webhook source whose request body is declared as JSON. The CHECK clause
-- brings the request headers, the body (aliased request_body) and the secret
-- basic_hook_auth (aliased validation_secret) into scope, then compares the
-- 'authorization' header against that secret with constant_time_eq.
CREATE SOURCE webhook_with_basic_auth
FROM WEBHOOK
BODY FORMAT JSON
CHECK (
WITH (
HEADERS,
BODY AS request_body,
SECRET basic_hook_auth AS validation_secret
)
-- The constant_time_eq validation function **does not support** fully
-- qualified secret names. We recommend always aliasing the secret name
-- for ease of use.
constant_time_eq(headers->'authorization', validation_secret)
);

-- MySQL source read through mysql_connection. schema1 and schema2 each
-- contribute a table named table_1, so each is given a distinct alias
-- (s1_table_1, s2_table_1).
CREATE SOURCE mz_source
FROM MYSQL CONNECTION mysql_connection
FOR TABLES (schema1.table_1 AS s1_table_1, schema2.table_1 AS s2_table_1);
Loading

0 comments on commit d1034f8

Please sign in to comment.