From ae13e2ae41a3450cb7430daa3f82d271954ec6ef Mon Sep 17 00:00:00 2001 From: Denis Bezrukov <6227442+denbezrukov@users.noreply.github.com> Date: Mon, 10 Jul 2023 23:06:22 +0300 Subject: [PATCH] feat(rome_css_parser): CSS lexer #4682 --- Cargo.lock | 19 +- Cargo.toml | 4 +- crates/rome_css_parser/Cargo.toml | 30 ++ crates/rome_css_parser/LICENSE | 21 + crates/rome_css_parser/README.md | 36 ++ crates/rome_css_parser/src/lexer/mod.rs | 499 +++++++++++++++++++ crates/rome_css_parser/src/lexer/tests.rs | 233 +++++++++ crates/rome_css_parser/src/lib.rs | 4 + crates/rome_css_parser/src/prelude.rs | 2 + crates/rome_css_syntax/Cargo.toml | 5 +- crates/rome_css_syntax/src/generated/kind.rs | 1 + crates/rome_json_parser/src/lexer/mod.rs | 2 +- xtask/codegen/src/css_kinds_src.rs | 9 +- 13 files changed, 859 insertions(+), 6 deletions(-) create mode 100644 crates/rome_css_parser/Cargo.toml create mode 100644 crates/rome_css_parser/LICENSE create mode 100644 crates/rome_css_parser/README.md create mode 100644 crates/rome_css_parser/src/lexer/mod.rs create mode 100644 crates/rome_css_parser/src/lexer/tests.rs create mode 100644 crates/rome_css_parser/src/lib.rs create mode 100644 crates/rome_css_parser/src/prelude.rs diff --git a/Cargo.lock b/Cargo.lock index c905ade6102..f35bfa0d43f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1746,9 +1746,26 @@ dependencies = [ "rome_rowan", ] +[[package]] +name = "rome_css_parser" +version = "0.0.1" +dependencies = [ + "insta", + "quickcheck", + "quickcheck_macros", + "rome_console", + "rome_css_syntax", + "rome_diagnostics", + "rome_js_unicode_table", + "rome_parser", + "rome_rowan", + "tests_macros", + "tracing", +] + [[package]] name = "rome_css_syntax" -version = "0.0.0" +version = "0.0.1" dependencies = [ "rome_rowan", ] diff --git a/Cargo.toml b/Cargo.toml index 8c24f0b470b..e510bd3962f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,6 @@ rome_aria_metadata = { path = "./crates/rome_aria_metadata" } rome_cli = { path = 
"./crates/rome_cli" } rome_console = { version = "0.0.1", path = "./crates/rome_console" } rome_control_flow = { path = "./crates/rome_control_flow" } -rome_css_syntax = { path = "./crates/rome_css_syntax" } rome_deserialize = { version = "0.0.0", path = "./crates/rome_deserialize" } rome_diagnostics = { version = "0.0.1", path = "./crates/rome_diagnostics" } rome_diagnostics_categories = { version = "0.0.1", path = "./crates/rome_diagnostics_categories" } @@ -52,6 +51,9 @@ rome_json_factory = { version = "0.0.1", path = "./crates/rome_json_fa rome_json_formatter = { path = "./crates/rome_json_formatter" } rome_json_parser = { path = "./crates/rome_json_parser" } rome_json_syntax = { version = "0.0.1", path = "./crates/rome_json_syntax" } +rome_css_factory = { path = "./crates/rome_css_factory" } +rome_css_parser = { path = "./crates/rome_css_parser" } +rome_css_syntax = { path = "./crates/rome_css_syntax" } rome_lsp = { path = "./crates/rome_lsp" } rome_markup = { version = "0.0.1", path = "./crates/rome_markup" } rome_migrate = { path = "./crates/rome_migrate" } diff --git a/crates/rome_css_parser/Cargo.toml b/crates/rome_css_parser/Cargo.toml new file mode 100644 index 00000000000..3db3a96d5c8 --- /dev/null +++ b/crates/rome_css_parser/Cargo.toml @@ -0,0 +1,30 @@ +[package] +authors.workspace = true +categories = ["parser-implementations", "development-tools"] +description = "An extremely fast CSS parser" +documentation = "https://rustdocs.rome.tools/rome_css_parser/index.html" +edition.workspace = true +homepage.workspace = true +license.workspace = true +name = "rome_css_parser" +repository.workspace = true +version = "0.0.1" + +[dependencies] +rome_console = { workspace = true } +rome_diagnostics = { workspace = true } +rome_js_unicode_table = { workspace = true } +rome_css_syntax = { workspace = true } +rome_parser = { workspace = true } +rome_rowan = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +insta = { workspace = true } 
+quickcheck = "1.0.3" +quickcheck_macros = "1.0.0" +tests_macros = { workspace = true } + +# cargo-workspaces metadata +[package.metadata.workspaces] +independent = true diff --git a/crates/rome_css_parser/LICENSE b/crates/rome_css_parser/LICENSE new file mode 100644 index 00000000000..0c74aa6a873 --- /dev/null +++ b/crates/rome_css_parser/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Rome Tools, Inc. and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/rome_css_parser/README.md b/crates/rome_css_parser/README.md new file mode 100644 index 00000000000..26047a3b77e --- /dev/null +++ b/crates/rome_css_parser/README.md @@ -0,0 +1,36 @@ +

+ + + Rome's logo depicting an ancient Roman arch with the word Rome to its side + +

+ +
+ +[![MIT licensed][mit-badge]][mit-url] +[![Discord chat][discord-badge]][discord-url] +[![CI on main][ci-badge]][ci-url] +[![npm version][npm-badge]][npm-url] +[![VSCode version][vscode-badge]][vscode-url] +[![cargo version][cargo-badge]][cargo-url] + + +[mit-badge]: https://img.shields.io/badge/license-MIT-blue.svg?color=brightgreen +[mit-url]: LICENSE +[discord-badge]: https://img.shields.io/discord/678763474494423051?logo=discord&label=discord&color=brightgreen +[discord-url]: https://discord.gg/rome +[ci-badge]: https://github.com/rome/tools/actions/workflows/main.yml/badge.svg +[ci-url]: https://github.com/rome/tools/actions/workflows/main.yml +[npm-badge]: https://img.shields.io/npm/v/rome/latest?color=brightgreen +[npm-url]: https://www.npmjs.com/package/rome/v/latest +[vscode-badge]: https://img.shields.io/visual-studio-marketplace/v/rome.rome?color=brightgreen&label=vscode +[vscode-url]: https://marketplace.visualstudio.com/items?itemName=rome.rome +[cargo-badge]: https://img.shields.io/crates/v/rome_css_parser?&color=brightgreen +[cargo-url]: https://crates.io/crates/rome_css_parser + +
+
+# `rome_css_parser`
+
+Rome's CSS parser implementation. Follow the [documentation](https://rustdocs.rome.tools/rome_css_parser/index.html).
+
diff --git a/crates/rome_css_parser/src/lexer/mod.rs b/crates/rome_css_parser/src/lexer/mod.rs
new file mode 100644
index 00000000000..1ad4efe6c8f
--- /dev/null
+++ b/crates/rome_css_parser/src/lexer/mod.rs
@@ -0,0 +1,499 @@
+//! An extremely fast, lookup table based, CSS lexer which yields SyntaxKind tokens used by the rome-css parser.
+#![allow(dead_code)]
+
+#[rustfmt::skip]
+mod tests;
+
+use rome_css_syntax::{CssSyntaxKind, CssSyntaxKind::*, TextLen, TextRange, TextSize, T};
+use rome_js_unicode_table::{lookup_byte, Dispatch::*};
+use rome_parser::diagnostic::ParseDiagnostic;
+use std::char::REPLACEMENT_CHARACTER;
+use std::iter::FusedIterator;
+
+pub struct Token {
+    kind: CssSyntaxKind,
+    range: TextRange,
+}
+
+impl Token {
+    pub fn kind(&self) -> CssSyntaxKind {
+        self.kind
+    }
+
+    pub fn range(&self) -> TextRange {
+        self.range
+    }
+}
+
+/// An extremely fast, lookup table based, lossless CSS lexer
+#[derive(Debug)]
+pub(crate) struct Lexer<'src> {
+    /// Source text
+    source: &'src str,
+
+    /// The start byte position in the source text of the next token.
+    position: usize,
+
+    diagnostics: Vec<ParseDiagnostic>,
+}
+
+impl<'src> Lexer<'src> {
+    /// Creates a lexer positioned at the start of `source` with no diagnostics recorded.
+    pub fn from_str(source: &'src str) -> Self {
+        Self {
+            source,
+            position: 0,
+            diagnostics: vec![],
+        }
+    }
+
+    /// Returns the source code
+    pub fn source(&self) -> &'src str {
+        self.source
+    }
+
+    pub fn finish(self) -> Vec<ParseDiagnostic> {
+        self.diagnostics
+    }
+
+    /// Lexes the next token.
+    ///
+    /// ## Return
+    /// Returns the lexed token, or `None` once the `EOF` token has already been returned.
+    pub(crate) fn next_token(&mut self) -> Option<Token> {
+        let start = self.text_position();
+
+        match self.current_byte() {
+            Some(current) => {
+                let kind = self.lex_token(current);
+
+                debug_assert!(start < self.text_position(), "Lexer did not progress");
+                Some(Token {
+                    kind,
+                    range: TextRange::new(start, self.text_position()),
+                })
+            }
+            None if self.position == self.source.len() => {
+                self.advance(1);
+                Some(Token {
+                    kind: EOF,
+                    range: TextRange::new(start, start),
+                })
+            }
+            None => None,
+        }
+    }
+
+    fn text_position(&self) -> TextSize {
+        TextSize::try_from(self.position).expect("Input to be smaller than 4 GB")
+    }
+
+    /// Advances past the current byte and returns the passed in kind unchanged
+    fn eat_byte(&mut self, tok: CssSyntaxKind) -> CssSyntaxKind {
+        self.advance(1);
+        tok
+    }
+
+    /// Consumes exactly one line break (`\n`, `\r` or `\r\n`); returns whether one was consumed.
+    ///
+    /// ## Safety
+    /// Must be called at a valid UTF-8 char boundary
+    fn consume_newline(&mut self) -> bool {
+        self.assert_at_char_boundary();
+
+        match self.current_byte() {
+            Some(b'\n') => {
+                self.advance(1);
+                true
+            }
+            Some(b'\r') => {
+                if self.peek_byte() == Some(b'\n') {
+                    self.advance(2)
+                } else {
+                    self.advance(1)
+                }
+                true
+            }
+
+            _ => false,
+        }
+    }
+
+    /// Consumes all whitespace until a non-whitespace or a newline is found.
+    ///
+    /// ## Safety
+    /// Must be called at a valid UTF-8 char boundary
+    fn consume_whitespaces(&mut self) {
+        self.assert_at_char_boundary();
+
+        while let Some(byte) = self.current_byte() {
+            let dispatch = lookup_byte(byte);
+
+            match dispatch {
+                WHS => match byte {
+                    b'\t' | b' ' => self.advance(1),
+                    b'\r' | b'\n' => {
+                        break;
+                    }
+                    _ => {
+                        let start = self.text_position();
+                        self.advance(1);
+
+                        self.diagnostics.push(
+                            ParseDiagnostic::new(
+                                "The CSS standard only allows tabs, whitespace, carriage return and line feed whitespace.",
+                                start..self.text_position(),
+                            )
+                            .hint("Use a regular whitespace character instead."),
+                        )
+                    }
+                },
+
+                _ => break,
+            }
+        }
+    }
+
+    /// Consumes one newline, or otherwise a run of whitespace up to the next non-whitespace or newline.
+    ///
+    /// ## Safety
+    /// Must be called at a valid UTF-8 char boundary
+    fn consume_newline_or_whitespaces(&mut self) -> CssSyntaxKind {
+        if self.consume_newline() {
+            NEWLINE
+        } else {
+            self.consume_whitespaces();
+            WHITESPACE
+        }
+    }
+
+    /// Get the UTF-8 char which starts at the current byte
+    ///
+    /// ## Safety
+    /// Must be called at a valid UTF-8 char boundary
+    fn current_char_unchecked(&self) -> char {
+        // Precautionary measure for making sure the unsafe code below does not read over memory boundary
+        debug_assert!(!self.is_eof());
+        self.assert_at_char_boundary();
+
+        // Safety: We know this is safe because we require the input to the lexer to be valid utf8 and we always call this when we are at a char
+        let string = unsafe {
+            std::str::from_utf8_unchecked(self.source.as_bytes().get_unchecked(self.position..))
+        };
+        let chr = if let Some(chr) = string.chars().next() {
+            chr
+        } else {
+            // Safety: we always call this when we are at a valid char, so this branch is completely unreachable
+            unsafe {
+                core::hint::unreachable_unchecked();
+            }
+        };
+
+        chr
+    }
+
+    /// Gets the current byte.
+    ///
+    /// ## Returns
+    /// The current byte if the lexer isn't at the end of the file.
+    #[inline]
+    fn current_byte(&self) -> Option<u8> {
+        if self.is_eof() {
+            None
+        } else {
+            Some(self.source.as_bytes()[self.position])
+        }
+    }
+
+    /// Asserts (debug builds only) that the lexer is at a UTF-8 char boundary
+    #[inline]
+    fn assert_at_char_boundary(&self) {
+        debug_assert!(self.source.is_char_boundary(self.position));
+    }
+
+    /// Peeks at the byte after the current one without advancing
+    #[inline]
+    fn peek_byte(&self) -> Option<u8> {
+        self.byte_at(1)
+    }
+
+    /// Returns the byte at position `self.position + offset` or `None` if it is out of bounds.
+    #[inline]
+    fn byte_at(&self, offset: usize) -> Option<u8> {
+        self.source.as_bytes().get(self.position + offset).copied()
+    }
+
+    /// Advances the current position by `n` bytes.
+    #[inline]
+    fn advance(&mut self, n: usize) {
+        self.position += n;
+    }
+
+    #[inline]
+    fn advance_byte_or_char(&mut self, chr: u8) {
+        if chr.is_ascii() {
+            self.advance(1);
+        } else {
+            self.advance_char_unchecked();
+        }
+    }
+
+    /// Advances the current position by the UTF-8 length of the current char
+    ///
+    /// ## Safety
+    /// Must be called at a valid UTF-8 char boundary
+    #[inline]
+    fn advance_char_unchecked(&mut self) {
+        let c = self.current_char_unchecked();
+        self.position += c.len_utf8();
+    }
+
+    /// Returns `true` if the lexer is at or past the end of the file.
+    #[inline]
+    fn is_eof(&self) -> bool {
+        self.position >= self.source.len()
+    }
+
+    /// Lexes the next token
+    ///
+    /// Guaranteed to not be at the end of the file
+    // A lookup table of `byte -> fn(l: &mut Lexer) -> Token` is exponentially slower than this approach
+    fn lex_token(&mut self, current: u8) -> CssSyntaxKind {
+        // The speed difference comes from the difference in table size, a 2kb table is easily fit into cpu cache
+        // While a 16kb table will be ejected from cache very often leading to slowdowns, this also allows LLVM
+        // to do more aggressive optimizations on the match regarding how to map it to instructions
+        let dispatched = lookup_byte(current);
+
+        match dispatched {
+            WHS => self.consume_newline_or_whitespaces(),
+            QOT => self.lex_string_literal(current),
+            SLH => self.lex_slash(),
+
+            PRD => self.eat_byte(T![.]),
+            MUL => self.eat_byte(T![*]),
+            COL => self.eat_byte(T![:]),
+            AT_ => self.eat_byte(T![@]),
+            HAS => self.eat_byte(T![#]),
+            PNO => self.eat_byte(T!['(']),
+            PNC => self.eat_byte(T![')']),
+            BEO => self.eat_byte(T!['{']),
+            BEC => self.eat_byte(T!['}']),
+            BTO => self.eat_byte(T!('[')),
+            BTC => self.eat_byte(T![']']),
+
+            _ => self.eat_unexpected_character(),
+        }
+    }
+
+    fn lex_string_literal(&mut self, quote: u8) -> CssSyntaxKind {
+        self.assert_at_char_boundary();
+        let start = self.text_position();
+
+        self.advance(1); // Skip over the quote
+        let mut state = LexStringState::InString;
+
+        while let Some(chr) = self.current_byte() {
+            let dispatch = lookup_byte(chr);
+
+            match dispatch {
+                QOT if quote == chr => {
+                    self.advance(1);
+                    state = match state {
+                        LexStringState::InString => LexStringState::Terminated,
+                        state => state,
+                    };
+                    break;
+                }
+                // '\t' etc
+                BSL => {
+                    let escape_start = self.text_position();
+                    self.advance(1);
+
+                    match self.current_byte() {
+                        Some(b'\n' | b'\r') => { self.consume_newline(); } // escaped line break; `\r\n` is eaten as one unit
+
+                        // Handle escaped `'` but only if the string is delimited by `'`.
+                        Some(b'\'') if quote == b'\'' => {
+                            self.advance(1);
+                        }
+
+                        // Handle escaped `"` but only if the string is delimited by `"`.
+                        Some(b'"') if quote == b'"' => {
+                            self.advance(1);
+                        }
+
+                        Some(c) if c.is_ascii_hexdigit() => {
+                            // SAFETY: We know that the current byte is a hex digit.
+                            let mut hex = (c as char).to_digit(16).unwrap();
+                            self.advance(1);
+
+                            // Consume as many hex digits as possible, but no more than 5.
+                            // Note that this means 1-6 hex digits have been consumed in total.
+                            for _ in 0..5 {
+                                let Some(digit) = self.current_byte()
+                                    .and_then(|c| (c as char).to_digit(16)) else { break; };
+                                self.advance(1);
+
+                                hex = hex * 16 + digit;
+                            }
+
+                            // Interpret the hex digits as a hexadecimal number. If this number is zero, or
+                            // is for a surrogate, or is greater than the maximum allowed code point, return
+                            // U+FFFD REPLACEMENT CHARACTER (�).
+                            let hex = match hex {
+                                // If this number is zero
+                                0 => REPLACEMENT_CHARACTER,
+                                // or is for a surrogate
+                                55296..=57343 => REPLACEMENT_CHARACTER,
+                                // or is greater than the maximum allowed code point
+                                1114112.. => REPLACEMENT_CHARACTER,
+                                _ => char::from_u32(hex).unwrap_or(REPLACEMENT_CHARACTER),
+                            };
+
+                            if hex == REPLACEMENT_CHARACTER {
+                                state = LexStringState::InvalidEscapeSequence;
+
+                                let diagnostic = ParseDiagnostic::new(
+                                    "Invalid escape sequence",
+                                    escape_start..self.text_position(),
+                                );
+                                self.diagnostics.push(diagnostic);
+                            }
+                        }
+
+                        Some(chr) => {
+                            self.advance_byte_or_char(chr);
+                        }
+
+                        None => {}
+                    }
+                }
+                WHS if matches!(chr, b'\n' | b'\r') => {
+                    let unterminated =
+                        ParseDiagnostic::new("Missing closing quote", start..self.text_position())
+                            .detail(self.position..self.position + 1, "line breaks here");
+
+                    self.diagnostics.push(unterminated);
+
+                    return ERROR_TOKEN;
+                }
+                UNI => self.advance_char_unchecked(),
+
+                _ => self.advance(1),
+            }
+        }
+
+        match state {
+            LexStringState::Terminated => CSS_STRING_LITERAL,
+            LexStringState::InString => {
+                let unterminated =
+                    ParseDiagnostic::new("Missing closing quote", start..self.text_position())
+                        .detail(
+                            self.source.text_len()..self.source.text_len(),
+                            "file ends here",
+                        );
+                self.diagnostics.push(unterminated);
+
+                ERROR_TOKEN
+            }
+            LexStringState::InvalidEscapeSequence => ERROR_TOKEN,
+        }
+    }
+
+    /// Lexes a `/* */` or `//` comment; a lone `/` is reported as an unexpected character.
+    fn lex_slash(&mut self) -> CssSyntaxKind {
+        let start = self.text_position();
+        match self.peek_byte() {
+            Some(b'*') => {
+                // eat `/*`
+                self.advance(2);
+
+                let mut has_newline = false;
+
+                while let Some(chr) = self.current_byte() {
+                    match chr {
+                        b'*' if self.peek_byte() == Some(b'/') => {
+                            self.advance(2);
+
+                            if has_newline {
+                                return MULTILINE_COMMENT;
+                            } else {
+                                return COMMENT;
+                            }
+                        }
+                        b'\n' | b'\r' => {
+                            has_newline = true;
+                            self.advance(1)
+                        }
+                        chr => self.advance_byte_or_char(chr),
+                    }
+                }
+
+                let err =
+                    ParseDiagnostic::new("Unterminated block comment", start..self.text_position())
+                        .detail(
+                            self.position..self.position + 1,
+                            "... but the file ends here",
+                        );
+
+                self.diagnostics.push(err);
+
+                if has_newline {
+                    MULTILINE_COMMENT
+                } else {
+                    COMMENT
+                }
+            }
+            Some(b'/') => {
+                self.advance(2);
+
+                while let Some(chr) = self.current_byte() {
+                    match chr {
+                        b'\n' | b'\r' => return COMMENT,
+                        chr => self.advance_byte_or_char(chr),
+                    }
+                }
+
+                COMMENT
+            }
+            _ => self.eat_unexpected_character(),
+        }
+    }
+
+    #[inline]
+    fn eat_unexpected_character(&mut self) -> CssSyntaxKind {
+        self.assert_at_char_boundary();
+
+        let char = self.current_char_unchecked();
+        let err = ParseDiagnostic::new(
+            format!("unexpected character `{}`", char),
+            self.text_position()..self.text_position() + char.text_len(),
+        );
+        self.diagnostics.push(err);
+        self.advance(char.len_utf8());
+
+        ERROR_TOKEN
+    }
+}
+
+impl Iterator for Lexer<'_> {
+    type Item = Token;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.next_token()
+    }
+}
+
+impl FusedIterator for Lexer<'_> {}
+
+#[derive(Copy, Clone, Debug)]
+enum LexStringState {
+    /// String that contains an invalid escape sequence
+    InvalidEscapeSequence,
+
+    /// Between the opening `"` and closing `"` quotes.
+    InString,
+
+    /// Properly terminated string
+    Terminated,
+}
diff --git a/crates/rome_css_parser/src/lexer/tests.rs b/crates/rome_css_parser/src/lexer/tests.rs
new file mode 100644
index 00000000000..4bc08842f78
--- /dev/null
+++ b/crates/rome_css_parser/src/lexer/tests.rs
@@ -0,0 +1,233 @@
+#![cfg(test)]
+#![allow(unused_mut, unused_variables, unused_assignments)]
+
+use super::{Lexer, TextSize};
+use quickcheck_macros::quickcheck;
+use std::sync::mpsc::channel;
+use std::thread;
+use std::time::Duration;
+
+// Assert the result of lexing a piece of source code,
+// and make sure the tokens yielded are fully lossless and the source can be reconstructed from only the tokens
+macro_rules!
assert_lex { + ($src:expr, $($kind:ident:$len:expr $(,)?)*) => {{ + let mut lexer = Lexer::from_str($src); + let mut idx = 0; + let mut tok_idx = TextSize::default(); + + let mut new_str = String::with_capacity($src.len()); + let tokens: Vec<_> = lexer.collect(); + + $( + assert_eq!( + tokens[idx].kind, + rome_css_syntax::CssSyntaxKind::$kind, + "expected token kind {}, but found {:?}", + stringify!($kind), + tokens[idx].kind, + ); + + assert_eq!( + tokens[idx].range.len(), + TextSize::from($len), + "expected token length of {}, but found {:?} for token {:?}", + $len, + tokens[idx].range.len(), + tokens[idx].kind, + ); + + new_str.push_str(&$src[tokens[idx].range]); + tok_idx += tokens[idx].range.len(); + + idx += 1; + )* + + if idx < tokens.len() { + panic!( + "expected {} tokens but lexer returned {}, first unexpected token is '{:?}'", + idx, + tokens.len(), + tokens[idx].kind + ); + } else { + assert_eq!(idx, tokens.len()); + } + + assert_eq!($src, new_str, "Failed to reconstruct input"); + }}; +} + +// This is for testing if the lexer is truly lossless +// It parses random strings and puts them back together with the produced tokens and compares +#[quickcheck] +fn losslessness(string: String) -> bool { + // using an mpsc channel allows us to spawn a thread and spawn the lexer there, then if + // it takes more than 2 seconds we panic because it is 100% infinite recursion + let cloned = string.clone(); + let (sender, receiver) = channel(); + thread::spawn(move || { + let mut lexer = Lexer::from_str(&cloned); + let tokens: Vec<_> = lexer.map(|token| token.range).collect(); + + sender + .send(tokens) + .expect("Could not send tokens to receiver"); + }); + let token_ranges = receiver + .recv_timeout(Duration::from_secs(2)) + .unwrap_or_else(|_| { + panic!( + "Lexer is infinitely recursing with this code: ->{}<-", + string + ) + }); + + let mut new_str = String::with_capacity(string.len()); + let mut idx = TextSize::from(0); + + for range in token_ranges { + 
new_str.push_str(&string[range]); + idx += range.len(); + } + + string == new_str +} + +#[test] +fn empty() { + assert_lex! { + "", + EOF:0 + } +} + +#[test] +fn string() { + assert_lex! { + "'5098382'", + CSS_STRING_LITERAL:9, + EOF:0 + } + + // double quote + assert_lex! { + r#"'hel"lo"'"#, + CSS_STRING_LITERAL:9, + EOF:0 + } + + // escaped quote + assert_lex! { + r#"'hel\'lo\''"#, + CSS_STRING_LITERAL:11, + EOF:0 + } + + // escaped quote + assert_lex! { + r#""hel\"lo\"""#, + CSS_STRING_LITERAL:11, + EOF:0 + } + + // unicode + assert_lex! { + "'юникод'", + CSS_STRING_LITERAL:14, + EOF:0 + } + + // missing single closing quote + assert_lex! { + "'he", + ERROR_TOKEN:3, + EOF:0 + } + + // missing double closing quote + assert_lex! { + r#""he"#, + ERROR_TOKEN:3, + EOF:0 + } + + // line break + assert_lex! { + r#"'he + "#, + ERROR_TOKEN:3, + NEWLINE:1, + WHITESPACE:4, + EOF:0 + } + + // line break + assert_lex! { + r#"'he + '"#, + ERROR_TOKEN:3, + NEWLINE:1, + WHITESPACE:4, + ERROR_TOKEN:1, + EOF:0 + } + + assert_lex! { + r#""Escaped \n""#, + CSS_STRING_LITERAL:12, + EOF:0 + } + + assert_lex! { + r#""Escaped \r""#, + CSS_STRING_LITERAL:12, + EOF:0 + } + + // invalid escape sequence + assert_lex! { + r#"'\0'"#, + ERROR_TOKEN:4, + EOF:0 + } +} + +#[test] +fn single_line_comments() { + assert_lex! { + "//abc + ", + COMMENT:5, + NEWLINE:1, + WHITESPACE:4, + EOF:0 + } + + assert_lex! { + "//a", + COMMENT:3, + EOF:0 + } +} + +#[test] +fn block_comment() { + assert_lex! { + "/* + */", + MULTILINE_COMMENT:13, + EOF:0 + } + + assert_lex! { + "/* */", + COMMENT:5, + EOF:0 + } + + assert_lex! { + "/* *", + COMMENT:4, + EOF:0 + } +} diff --git a/crates/rome_css_parser/src/lib.rs b/crates/rome_css_parser/src/lib.rs new file mode 100644 index 00000000000..027ef38d7e4 --- /dev/null +++ b/crates/rome_css_parser/src/lib.rs @@ -0,0 +1,4 @@ +//! Extremely fast, lossless, and error tolerant CSS Parser. 
+ +mod lexer; +mod prelude; diff --git a/crates/rome_css_parser/src/prelude.rs b/crates/rome_css_parser/src/prelude.rs new file mode 100644 index 00000000000..bd22b87c894 --- /dev/null +++ b/crates/rome_css_parser/src/prelude.rs @@ -0,0 +1,2 @@ +pub use rome_css_syntax::T; +pub use rome_parser::prelude::*; diff --git a/crates/rome_css_syntax/Cargo.toml b/crates/rome_css_syntax/Cargo.toml index 8f6d869dab7..770a2a9e4b7 100644 --- a/crates/rome_css_syntax/Cargo.toml +++ b/crates/rome_css_syntax/Cargo.toml @@ -1,11 +1,12 @@ [package] authors.workspace = true +description = "SyntaxKind and common rowan definitions for rome_css_parser" +documentation = "https://rustdocs.rome.tools/rome_css_parser/index.html" edition.workspace = true license.workspace = true name = "rome_css_syntax" repository.workspace = true -version = "0.0.0" - +version = "0.0.1" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/crates/rome_css_syntax/src/generated/kind.rs b/crates/rome_css_syntax/src/generated/kind.rs index 7acafe73281..ca29377b187 100644 --- a/crates/rome_css_syntax/src/generated/kind.rs +++ b/crates/rome_css_syntax/src/generated/kind.rs @@ -208,6 +208,7 @@ pub enum CssSyntaxKind { NEWLINE, WHITESPACE, COMMENT, + MULTILINE_COMMENT, CSS_ROOT, CSS_ID_SELECTOR_PATTERN, CSS_RULE, diff --git a/crates/rome_json_parser/src/lexer/mod.rs b/crates/rome_json_parser/src/lexer/mod.rs index 25fbbaaead3..95cc27c5362 100644 --- a/crates/rome_json_parser/src/lexer/mod.rs +++ b/crates/rome_json_parser/src/lexer/mod.rs @@ -24,7 +24,7 @@ impl Token { } } -/// An extremely fast, lookup table based, lossless ECMAScript lexer +/// An extremely fast, lookup table based, lossless JSON lexer #[derive(Debug)] pub(crate) struct Lexer<'src> { /// Source text diff --git a/xtask/codegen/src/css_kinds_src.rs b/xtask/codegen/src/css_kinds_src.rs index 7c2d3e2962d..43003f298c6 100644 --- a/xtask/codegen/src/css_kinds_src.rs +++ 
b/xtask/codegen/src/css_kinds_src.rs @@ -200,7 +200,14 @@ pub const CSS_KINDS_SRC: KindsSrc = KindsSrc { "CSS_CUSTOM_PROPERTY", "CSS_SPACE_LITERAL", ], - tokens: &["ERROR_TOKEN", "IDENT", "NEWLINE", "WHITESPACE", "COMMENT"], + tokens: &[ + "ERROR_TOKEN", + "IDENT", + "NEWLINE", + "WHITESPACE", + "COMMENT", + "MULTILINE_COMMENT", + ], nodes: &[ "CSS_ROOT", "CSS_ID_SELECTOR_PATTERN",