From 566e9ccdacd790fa491c41e2162b73234e811bb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 7 May 2024 18:29:41 +0200 Subject: [PATCH 001/135] First prototype of lexer with parcours. --- Cargo.lock | 5 ++ jaq-parse/Cargo.toml | 1 + jaq-parse/src/lex.rs | 137 +++++++++++++++++++++++++++++++++++++++++++ jaq-parse/src/lib.rs | 28 ++++++++- 4 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 jaq-parse/src/lex.rs diff --git a/Cargo.lock b/Cargo.lock index 06caaf260..28ee4bc4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -317,6 +317,7 @@ version = "1.0.2" dependencies = [ "chumsky", "jaq-syn", + "parcours", ] [[package]] @@ -432,6 +433,10 @@ version = "6.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" +[[package]] +name = "parcours" +version = "0.3.0" + [[package]] name = "proc-macro-error" version = "1.0.4" diff --git a/jaq-parse/Cargo.toml b/jaq-parse/Cargo.toml index 9c2160266..bc2defa82 100644 --- a/jaq-parse/Cargo.toml +++ b/jaq-parse/Cargo.toml @@ -14,3 +14,4 @@ rust-version = "1.64" [dependencies] chumsky = { version = "0.9.0", default-features = false } jaq-syn = { version = "1.0.0", path = "../jaq-syn" } +parcours = { version = "0.3.0", path = "../../parcours" } diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs new file mode 100644 index 000000000..c610160cf --- /dev/null +++ b/jaq-parse/src/lex.rs @@ -0,0 +1,137 @@ +use crate::token::{Delim, Token, Tree}; +use alloc::string::{String, ToString}; +use alloc::vec::Vec; +use parcours::{all, any, consumed, lazy, select, str, Combinator, Parser}; + +/// Decimal with optional exponent. +fn num<'a>() -> impl Parser<&'a str, O = &'a str> { + let digits = str::take_while1(|c, _| c.is_numeric()); + let comma = str::matches(".").then(digits.opt()); + let exp = all(( + str::next().filter(|c| "eE".contains(*c)), + str::next().filter(|c| "+-".contains(*c)).opt(), + digits, + )); + consumed(all((digits, comma.opt(), exp.opt()))) +} + +/// Hexadecimal number with `len` digits. +fn hex<'a>(len: usize) -> impl Parser<&'a str, O = &'a str> + Clone { + let mut n = 0; + str::take_while(move |c, _| { + n += 1; + n <= len && c.is_ascii_hexdigit() + }) + .filter(move |digits| digits.len() == len) +} + +/// JSON string character. 
+fn char_<'a>() -> impl Parser<&'a str, O = char> + Clone { + let unicode = str::matches("u").ignore_then(hex(4).map(|digits| { + let num = u32::from_str_radix(&digits, 16).unwrap(); + char::from_u32(num).unwrap_or_else(|| { + //emit(Simple::custom(span, "invalid unicode character")); + '\u{FFFD}' // unicode replacement character + }) + })); + + let bla = str::next().filter_map(select!( + '\\' => '\\', + '/' => '/', + '"' => '"', + 'b' => '\x08', + 'f' => '\x0C', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + )); + let escape = str::matches("\\").ignore_then(bla.or(unicode)); + + str::next().filter(|c| *c != '\\' && *c != '"').or(escape) +} + +fn ident<'a>() -> impl Parser<&'a str, O = &'a str> { + consumed(all(( + str::matches("@").opt(), + str::next().filter(|c| c.is_ascii_alphabetic() || *c == '_'), + str::take_while(|c, _s| c.is_ascii_alphanumeric() || *c == '_'), + ))) +} + +fn token<'a>() -> impl Parser<&'a str, O = Token> { + let op = str::take_while1(|c, _| "|=!<>+-*/%".contains(*c)); + + let var = str::matches("$").ignore_then(ident()); + + let ident = ident().map(|ident| match ident { + "def" => Token::Def, + "if" => Token::If, + "then" => Token::Then, + "elif" => Token::Elif, + "else" => Token::Else, + "end" => Token::End, + "or" => Token::Or, + "and" => Token::And, + "as" => Token::As, + "reduce" => Token::Reduce, + "for" => Token::For, + "foreach" => Token::Foreach, + "try" => Token::Try, + "catch" => Token::Catch, + _ => Token::Ident(ident.to_string()), + }); + + any(( + ident, + str::matches("..").map(|_| Token::DotDot), + str::next().filter_map(select!( + '.' => Token::Dot, + ':' => Token::Colon, + ';' => Token::Semicolon, + ',' => Token::Comma, + '?' => Token::Question, + )), + op.map(|op| op.to_string()).map(Token::Op), + var.map(|s| s.to_string()).map(Token::Var), + num().map(|n| n.to_string()).map(Token::Num), + )) +} + +/// Whitespace and comments. +fn space<'a>() -> impl Parser<&'a str, O = ()> + Clone { + let space = str::take_while(|c, _| c.is_ascii_whitespace()); + let comment = str::matches("#").then(str::take_while(|c, _| *c != '\n')); + let comments = space.then(comment).map(|_| ()).repeated::<()>(); + comments.then(space).map(|_| ()) +} + +fn tree<'a>() -> impl Parser<&'a str, O = Tree> { + // TODO: span! + let trees = lazy!(tree).map(|t| (t, 0..42)).repeated(); + let close = |s| space().ignore_then(str::matches(s)); + let paren = trees.delimited_by(str::matches("("), close(")")); + let brack = trees.delimited_by(str::matches("["), close("]")); + let brace = trees.delimited_by(str::matches("{"), close("}")); + + let chars = char_().repeated::().map(|s| (s, 0..42)); + + let pair = |p| (Tree::Delim(Delim::Paren, p), 0..42); + let interpol = str::matches("\\").ignore_then(paren.clone().map(pair)); + + let string = chars + .clone() + .then(interpol.then(chars).repeated()) + .delimited_by(str::matches("\""), str::matches("\"")); + + space().ignore_then(any(( + paren.map(|t| Tree::Delim(Delim::Paren, t)), + brack.map(|t| Tree::Delim(Delim::Brack, t)), + brace.map(|t| Tree::Delim(Delim::Brace, t)), + string.map(|(s, interpol)| Tree::String(s, interpol)), + token().map(Tree::Token), + ))) +} + +pub fn lex<'a>() -> impl Parser<&'a str, O = Vec> { + lazy!(tree).repeated().then_ignore(space()) +} diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 9971cbc16..6bc9bfc38 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -1,5 +1,5 @@ //! JSON query language parser. 
-#![no_std] +//#![no_std] #![forbid(unsafe_code)] #![warn(missing_docs)] @@ -7,6 +7,7 @@ extern crate alloc; mod def; mod filter; +mod lex; mod path; mod prec_climb; mod string; @@ -39,10 +40,35 @@ pub fn parse(src: &str, parser: P) -> (Option, Vec) where P: Parser> + Clone, { + /* + for i in 0..500 { + + let (tokens, lex_errs) = lex() + .then_ignore(end()) + .recover_with(skip_then_retry_until([])) + .parse_recovery(src); + if let Some((tokens2, rest)) = crate::lex::lex().parse(src, &mut ()) { + let v: Vec<_> = tokens2.into_iter().map(|tree| tree.tokens(0..42)).flatten().collect(); + } + } + */ + use parcours::Parser; + if let Some((tokens2, rest)) = crate::lex::lex().parse(src, &mut ()) { + let v: Vec<_> = tokens2 + .into_iter() + .map(|tree| tree.tokens(0..42)) + .flatten() + .collect(); + std::println!("{v:?}"); + std::println!("finished: {}", rest.is_empty()); + } else { + std::println!("parse error"); + } let (tokens, lex_errs) = lex() .then_ignore(end()) .recover_with(skip_then_retry_until([])) .parse_recovery(src); + let lex_errs: Vec> = lex_errs; let (parsed, parse_errs) = if let Some(tokens) = tokens { let len = src.chars().count(); From cd5d7358c2d47cbed187b13255eaf284d3db52a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 8 May 2024 11:14:56 +0200 Subject: [PATCH 002/135] Rewrite tokeniser without library support. --- jaq-parse/src/lex.rs | 143 +++++++++++++++++++++++++------------------ 1 file changed, 85 insertions(+), 58 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index c610160cf..313d9c42f 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -1,18 +1,92 @@ use crate::token::{Delim, Token, Tree}; use alloc::string::{String, ToString}; use alloc::vec::Vec; -use parcours::{all, any, consumed, lazy, select, str, Combinator, Parser}; +use parcours::{any, lazy, select, str, Combinator, Parser}; + +fn strip_digits(i: &str) -> Option<&str> { + i.strip_prefix(|c: char| c.is_numeric()) + .map(|i| i.trim_start_matches(|c: char| c.is_numeric())) +} /// Decimal with optional exponent. -fn num<'a>() -> impl Parser<&'a str, O = &'a str> { - let digits = str::take_while1(|c, _| c.is_numeric()); - let comma = str::matches(".").then(digits.opt()); - let exp = all(( - str::next().filter(|c| "eE".contains(*c)), - str::next().filter(|c| "+-".contains(*c)).opt(), - digits, - )); - consumed(all((digits, comma.opt(), exp.opt()))) +fn trim_num(i: &str) -> &str { + let i = i.trim_start_matches(|c: char| c.is_numeric()); + let i = i.strip_prefix('.').map_or(i, |i| { + strip_digits(i).unwrap_or_else(|| { + // TODO: register error + todo!(); + i + }) + }); + let i = i.strip_prefix(['e', 'E']).map_or(i, |i| { + let i = i.strip_prefix(['+', '-']).unwrap_or(i); + strip_digits(i).unwrap_or_else(|| { + // TODO: register error + todo!(); + i + }) + }); + i +} + +fn trim_ident(i: &str) -> &str { + i.trim_start_matches(|c: char| c.is_ascii_alphanumeric() || c == '_') +} + +fn strip_ident(i: &str) -> Option<&str> { + i.strip_prefix(|c: char| c.is_ascii_alphabetic() || c == '_') + .map(trim_ident) +} + +fn token(i: &str) -> Option<(Token, &str)> { + let is_op = |c| "|=!<>+-*/%".contains(c); + let prefix = |rest: &str| &i[..i.len() - rest.len()]; + let single = |tk: Token| (tk, &i[1..]); + + let mut chars = i.chars(); + Some(match chars.next()? 
{ + 'a'..='z' | 'A'..='Z' | '@' | '_' => { + let rest = trim_ident(chars.as_str()); + let tk = match prefix(rest) { + "def" => Token::Def, + "if" => Token::If, + "then" => Token::Then, + "elif" => Token::Elif, + "else" => Token::Else, + "end" => Token::End, + "or" => Token::Or, + "and" => Token::And, + "as" => Token::As, + "reduce" => Token::Reduce, + "for" => Token::For, + "foreach" => Token::Foreach, + "try" => Token::Try, + "catch" => Token::Catch, + ident => Token::Ident(ident.to_string()), + }; + (tk, rest) + } + '$' => { + // TODO: handle error + let rest = strip_ident(chars.as_str()).unwrap(); + (Token::Var(i[1..i.len() - rest.len()].to_string()), rest) + } + '0'..='9' => { + let rest = trim_num(chars.as_str()); + (Token::Num(prefix(rest).to_string()), rest) + } + '.' if chars.next() == Some('.') => (Token::DotDot, &i[2..]), + '.' => single(Token::Dot), + ':' => single(Token::Colon), + ';' => single(Token::Semicolon), + ',' => single(Token::Comma), + '?' => single(Token::Question), + c if is_op(c) => { + let rest = chars.as_str().trim_start_matches(is_op); + (Token::Op(prefix(rest).to_string()), rest) + } + _ => return None, + }) } /// Hexadecimal number with `len` digits. @@ -50,53 +124,6 @@ fn char_<'a>() -> impl Parser<&'a str, O = char> + Clone { str::next().filter(|c| *c != '\\' && *c != '"').or(escape) } -fn ident<'a>() -> impl Parser<&'a str, O = &'a str> { - consumed(all(( - str::matches("@").opt(), - str::next().filter(|c| c.is_ascii_alphabetic() || *c == '_'), - str::take_while(|c, _s| c.is_ascii_alphanumeric() || *c == '_'), - ))) -} - -fn token<'a>() -> impl Parser<&'a str, O = Token> { - let op = str::take_while1(|c, _| "|=!<>+-*/%".contains(*c)); - - let var = str::matches("$").ignore_then(ident()); - - let ident = ident().map(|ident| match ident { - "def" => Token::Def, - "if" => Token::If, - "then" => Token::Then, - "elif" => Token::Elif, - "else" => Token::Else, - "end" => Token::End, - "or" => Token::Or, - "and" => Token::And, - "as" => Token::As, - "reduce" => Token::Reduce, - "for" => Token::For, - "foreach" => Token::Foreach, - "try" => Token::Try, - "catch" => Token::Catch, - _ => Token::Ident(ident.to_string()), - }); - - any(( - ident, - str::matches("..").map(|_| Token::DotDot), - str::next().filter_map(select!( - '.' => Token::Dot, - ':' => Token::Colon, - ';' => Token::Semicolon, - ',' => Token::Comma, - '?' => Token::Question, - )), - op.map(|op| op.to_string()).map(Token::Op), - var.map(|s| s.to_string()).map(Token::Var), - num().map(|n| n.to_string()).map(Token::Num), - )) -} - /// Whitespace and comments. fn space<'a>() -> impl Parser<&'a str, O = ()> + Clone { let space = str::take_while(|c, _| c.is_ascii_whitespace()); @@ -128,7 +155,7 @@ fn tree<'a>() -> impl Parser<&'a str, O = Tree> { brack.map(|t| Tree::Delim(Delim::Brack, t)), brace.map(|t| Tree::Delim(Delim::Brace, t)), string.map(|(s, interpol)| Tree::String(s, interpol)), - token().map(Tree::Token), + parcours::from_fn(|i, _| token(i)).map(Tree::Token), ))) } From 32d0fcd4078989b818e1f31be61d35d1474217ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 8 May 2024 13:35:32 +0200 Subject: [PATCH 003/135] Bare metal string parser. 
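The escapes are now decoded by hand rather than through combinators; a
\uXXXX sequence that does not name a valid scalar value falls back to the
Unicode replacement character. A minimal standalone sketch of that
decoding step (the helper name and its exact error handling are
illustrative only, they are not part of this diff):

    // Decode four hex digits from a \uXXXX escape; invalid code points
    // such as lone surrogates become U+FFFD.
    fn decode_u_escape(hex: &str) -> char {
        u32::from_str_radix(hex, 16)
            .ok()
            .and_then(char::from_u32)
            .unwrap_or('\u{FFFD}')
    }

    fn main() {
        assert_eq!(decode_u_escape("0041"), 'A');
        assert_eq!(decode_u_escape("d800"), '\u{FFFD}'); // lone surrogate
    }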
--- jaq-parse/src/lex.rs | 117 +++++++++++++++++++++++++++---------------- 1 file changed, 73 insertions(+), 44 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 313d9c42f..c28e43caf 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -1,7 +1,7 @@ use crate::token::{Delim, Token, Tree}; use alloc::string::{String, ToString}; use alloc::vec::Vec; -use parcours::{any, lazy, select, str, Combinator, Parser}; +use parcours::{any, lazy, str, Combinator, Parser}; fn strip_digits(i: &str) -> Option<&str> { i.strip_prefix(|c: char| c.is_numeric()) @@ -75,7 +75,7 @@ fn token(i: &str) -> Option<(Token, &str)> { let rest = trim_num(chars.as_str()); (Token::Num(prefix(rest).to_string()), rest) } - '.' if chars.next() == Some('.') => (Token::DotDot, &i[2..]), + '.' if chars.next()? == '.' => (Token::DotDot, &i[2..]), '.' => single(Token::Dot), ':' => single(Token::Colon), ';' => single(Token::Semicolon), @@ -89,47 +89,79 @@ fn token(i: &str) -> Option<(Token, &str)> { }) } -/// Hexadecimal number with `len` digits. -fn hex<'a>(len: usize) -> impl Parser<&'a str, O = &'a str> + Clone { - let mut n = 0; - str::take_while(move |c, _| { - n += 1; - n <= len && c.is_ascii_hexdigit() - }) - .filter(move |digits| digits.len() == len) +use jaq_syn::string::Part; + +/// Returns `None` when an unexpected EOF was encountered. +fn string(mut i: &str) -> Option<(Vec>, &str)> { + let mut parts = Vec::new(); + + loop { + let rest = i.trim_start_matches(|c| c != '\\' && c != '"'); + parts.push(Part::Str(i[..i.len() - rest.len()].to_string())); + let mut chars = rest.chars(); + let c = match chars.next()? { + '\\' => match chars.next()? { + c @ ('\\' | '/' | '"') => c, + 'b' => '\x08', + 'f' => '\x0C', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + 'u' => { + let mut hex = String::with_capacity(4); + (0..4).try_for_each(|_| Some(hex.push(chars.next()?)))?; + let num = u32::from_str_radix(&hex, 16).unwrap(); + char::from_u32(num).unwrap_or_else(|| { + //emit(Simple::custom(span, "invalid unicode character")); + '\u{FFFD}' // unicode replacement character + }) + }, + '(' => todo!(), + _ => todo!("add error"), + }, + '"' => return Some((parts, chars.as_str())), + _ => unreachable!(), + }; + parts.push(Part::Str(c.into())); + i = chars.as_str(); + } } -/// JSON string character. -fn char_<'a>() -> impl Parser<&'a str, O = char> + Clone { - let unicode = str::matches("u").ignore_then(hex(4).map(|digits| { - let num = u32::from_str_radix(&digits, 16).unwrap(); - char::from_u32(num).unwrap_or_else(|| { - //emit(Simple::custom(span, "invalid unicode character")); - '\u{FFFD}' // unicode replacement character - }) - })); - - let bla = str::next().filter_map(select!( - '\\' => '\\', - '/' => '/', - '"' => '"', - 'b' => '\x08', - 'f' => '\x0C', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - )); - let escape = str::matches("\\").ignore_then(bla.or(unicode)); - - str::next().filter(|c| *c != '\\' && *c != '"').or(escape) +/// Whitespace and comments. +fn space_(i: &str) -> &str { + let mut i = i.trim_start(); + while let Some(comment) = i.strip_prefix('#') { + i = comment.trim_start_matches(|c| c != '\n').trim_start(); + } + i } -/// Whitespace and comments. 
fn space<'a>() -> impl Parser<&'a str, O = ()> + Clone { - let space = str::take_while(|c, _| c.is_ascii_whitespace()); - let comment = str::matches("#").then(str::take_while(|c, _| *c != '\n')); - let comments = space.then(comment).map(|_| ()).repeated::<()>(); - comments.then(space).map(|_| ()) + parcours::from_fn(|i, _| Some(((), space_(i)))) +} + +use jaq_syn::Spanned; +fn parts_to_interpol( + parts: Vec>, +) -> (Spanned, Vec<(Spanned, Spanned)>) { + let mut init = (String::new(), 0..42); + let mut tail = Vec::new(); + let mut parts = parts.into_iter(); + while let Some(part) = parts.next() { + match part { + Part::Str(s) => init.0.extend(s.chars()), + Part::Fun(f) => { + tail.push(((f, 0..42), (String::new(), 0..42))); + while let Some(part) = parts.next() { + match part { + Part::Str(s) => tail.last_mut().unwrap().1.0.extend(s.chars()), + Part::Fun(f) => tail.push(((f, 0..42), (String::new(), 0..42))), + } + } + } + } + } + (init, tail) } fn tree<'a>() -> impl Parser<&'a str, O = Tree> { @@ -140,15 +172,12 @@ fn tree<'a>() -> impl Parser<&'a str, O = Tree> { let brack = trees.delimited_by(str::matches("["), close("]")); let brace = trees.delimited_by(str::matches("{"), close("}")); - let chars = char_().repeated::().map(|s| (s, 0..42)); - let pair = |p| (Tree::Delim(Delim::Paren, p), 0..42); let interpol = str::matches("\\").ignore_then(paren.clone().map(pair)); - let string = chars - .clone() - .then(interpol.then(chars).repeated()) - .delimited_by(str::matches("\""), str::matches("\"")); + let string = str::matches("\"").ignore_then(parcours::from_fn(|i, _| { + string(i).map(|(parts, rest)| (parts_to_interpol(parts), rest)) + })); space().ignore_then(any(( paren.map(|t| Tree::Delim(Delim::Paren, t)), From 6293c1440881cd865381621ea3079b43c04707f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 8 May 2024 20:46:12 +0200 Subject: [PATCH 004/135] Finish lexer conversion. --- jaq-parse/src/lex.rs | 84 +++++++++++++++++++++++++----------------- jaq-parse/src/lib.rs | 20 ++++------ jaq-parse/src/token.rs | 2 +- 3 files changed, 60 insertions(+), 46 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index c28e43caf..fce73dcf7 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -100,6 +100,7 @@ fn string(mut i: &str) -> Option<(Vec>, &str)> { parts.push(Part::Str(i[..i.len() - rest.len()].to_string())); let mut chars = rest.chars(); let c = match chars.next()? { + '"' => return Some((parts, chars.as_str())), '\\' => match chars.next()? { c @ ('\\' | '/' | '"') => c, 'b' => '\x08', @@ -115,11 +116,15 @@ fn string(mut i: &str) -> Option<(Vec>, &str)> { //emit(Simple::custom(span, "invalid unicode character")); '\u{FFFD}' // unicode replacement character }) - }, - '(' => todo!(), + } + '(' => { + let (trees, rest) = trees(chars.as_str(), Delim::Paren); + parts.push(Part::Fun(trees)); + i = rest; + continue; + } _ => todo!("add error"), }, - '"' => return Some((parts, chars.as_str())), _ => unreachable!(), }; parts.push(Part::Str(c.into())); @@ -128,7 +133,7 @@ fn string(mut i: &str) -> Option<(Vec>, &str)> { } /// Whitespace and comments. 
-fn space_(i: &str) -> &str { +fn trim_space(i: &str) -> &str { let mut i = i.trim_start(); while let Some(comment) = i.strip_prefix('#') { i = comment.trim_start_matches(|c| c != '\n').trim_start(); @@ -136,10 +141,6 @@ fn space_(i: &str) -> &str { i } -fn space<'a>() -> impl Parser<&'a str, O = ()> + Clone { - parcours::from_fn(|i, _| Some(((), space_(i)))) -} - use jaq_syn::Spanned; fn parts_to_interpol( parts: Vec>, @@ -154,7 +155,7 @@ fn parts_to_interpol( tail.push(((f, 0..42), (String::new(), 0..42))); while let Some(part) = parts.next() { match part { - Part::Str(s) => tail.last_mut().unwrap().1.0.extend(s.chars()), + Part::Str(s) => tail.last_mut().unwrap().1 .0.extend(s.chars()), Part::Fun(f) => tail.push(((f, 0..42), (String::new(), 0..42))), } } @@ -164,30 +165,47 @@ fn parts_to_interpol( (init, tail) } -fn tree<'a>() -> impl Parser<&'a str, O = Tree> { - // TODO: span! - let trees = lazy!(tree).map(|t| (t, 0..42)).repeated(); - let close = |s| space().ignore_then(str::matches(s)); - let paren = trees.delimited_by(str::matches("("), close(")")); - let brack = trees.delimited_by(str::matches("["), close("]")); - let brace = trees.delimited_by(str::matches("{"), close("}")); - - let pair = |p| (Tree::Delim(Delim::Paren, p), 0..42); - let interpol = str::matches("\\").ignore_then(paren.clone().map(pair)); - - let string = str::matches("\"").ignore_then(parcours::from_fn(|i, _| { - string(i).map(|(parts, rest)| (parts_to_interpol(parts), rest)) - })); - - space().ignore_then(any(( - paren.map(|t| Tree::Delim(Delim::Paren, t)), - brack.map(|t| Tree::Delim(Delim::Brack, t)), - brace.map(|t| Tree::Delim(Delim::Brace, t)), - string.map(|(s, interpol)| Tree::String(s, interpol)), - parcours::from_fn(|i, _| token(i)).map(Tree::Token), - ))) +fn trees2(mut i: &str) -> (Vec>, &str) { + let mut trees = Vec::new(); + while let Some((tree, rest)) = tree_(i) { + trees.push((tree, 0..42)); + i = rest; + } + (trees, i) +} + +fn trees(mut i: &str, delim: Delim) -> (Tree, &str) { + let (trees, i) = trees2(i); + let i = trim_space(i); + let i = i.strip_prefix(delim.close()).unwrap_or_else(|| { + todo!("add error"); + i + }); + (Tree::Delim(delim, trees), i) +} + +fn tree_(i: &str) -> Option<(Tree, &str)> { + let i = trim_space(i); + let mut chars = i.chars(); + + Some(match chars.next()? 
{ + '"' => { + let (parts, rest) = string(chars.as_str())?; + let (init, tail) = parts_to_interpol(parts); + (Tree::String(init, tail), rest) + } + '(' => trees(chars.as_str(), Delim::Paren), + '[' => trees(chars.as_str(), Delim::Brack), + '{' => trees(chars.as_str(), Delim::Brace), + _ => { + let (token, rest) = token(i)?; + (Tree::Token(token), rest) + } + }) } -pub fn lex<'a>() -> impl Parser<&'a str, O = Vec> { - lazy!(tree).repeated().then_ignore(space()) +pub fn lex_(i: &str) -> (Vec>, &str) { + let (trees, i) = trees2(i); + let i = trim_space(i); + (trees, i) } diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 6bc9bfc38..cf4df693a 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -52,18 +52,14 @@ where } } */ - use parcours::Parser; - if let Some((tokens2, rest)) = crate::lex::lex().parse(src, &mut ()) { - let v: Vec<_> = tokens2 - .into_iter() - .map(|tree| tree.tokens(0..42)) - .flatten() - .collect(); - std::println!("{v:?}"); - std::println!("finished: {}", rest.is_empty()); - } else { - std::println!("parse error"); - } + let (tokens2, rest) = crate::lex::lex_(src); + let v: Vec<_> = tokens2 + .into_iter() + .map(|(tree, span)| tree.tokens(0..42)) + .flatten() + .collect(); + std::println!("{v:?}"); + std::println!("finished: {}", rest.is_empty()); let (tokens, lex_errs) = lex() .then_ignore(end()) .recover_with(skip_then_retry_until([])) diff --git a/jaq-parse/src/token.rs b/jaq-parse/src/token.rs index 18ab296e7..c01f00fcf 100644 --- a/jaq-parse/src/token.rs +++ b/jaq-parse/src/token.rs @@ -19,7 +19,7 @@ impl Delim { } } - fn close(self) -> char { + pub(crate) fn close(self) -> char { match self { Self::Paren => ')', Self::Brack => ']', From db2b78e3ec4a812ac61c95ff0ceed26d7a86e407 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 9 May 2024 10:19:57 +0200 Subject: [PATCH 005/135] New Token type for simpler and faster lexing. --- jaq-parse/src/lex.rs | 142 +++++++++++++++++++++++-------------------- jaq-parse/src/lib.rs | 7 +-- 2 files changed, 78 insertions(+), 71 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index fce73dcf7..0f936ce36 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -1,7 +1,42 @@ -use crate::token::{Delim, Token, Tree}; +use crate::token::Delim; use alloc::string::{String, ToString}; use alloc::vec::Vec; -use parcours::{any, lazy, str, Combinator, Parser}; + +/// Token (tree) generic over string type `S`. +#[derive(Debug)] +pub enum Token { + /// keywords such as `def`, but also identifiers such as `map` or `@csv`. + Word(S), + /// variable, including leading `$` + Var(S), + /// number + Num(S), + /// interpolated string + Str(Vec>), + /// operator, such as `|` or `+=` + Op(S), + /// punctuation, such as `.` or `;` + Punct(Punct, S), + /// delimited tokens, e.g. `(...)` or `{...}` + Delim(Delim, Vec), +} + +/// Punctuation. 
+#[derive(Debug)] +pub enum Punct { + /// `.` + Dot, + /// `..` + DotDot, + /// `?` + Question, + /// `,` + Comma, + /// `:` + Colon, + /// `;` + Semicolon, +} fn strip_digits(i: &str) -> Option<&str> { i.strip_prefix(|c: char| c.is_numeric()) @@ -38,53 +73,45 @@ fn strip_ident(i: &str) -> Option<&str> { .map(trim_ident) } -fn token(i: &str) -> Option<(Token, &str)> { +fn token(i: &str) -> Option<(Token<&str>, &str)> { + let i = trim_space(i); + let is_op = |c| "|=!<>+-*/%".contains(c); let prefix = |rest: &str| &i[..i.len() - rest.len()]; - let single = |tk: Token| (tk, &i[1..]); + let punct = |len: usize, p: Punct| (Token::Punct(p, &i[..len]), &i[len..]); let mut chars = i.chars(); Some(match chars.next()? { 'a'..='z' | 'A'..='Z' | '@' | '_' => { let rest = trim_ident(chars.as_str()); - let tk = match prefix(rest) { - "def" => Token::Def, - "if" => Token::If, - "then" => Token::Then, - "elif" => Token::Elif, - "else" => Token::Else, - "end" => Token::End, - "or" => Token::Or, - "and" => Token::And, - "as" => Token::As, - "reduce" => Token::Reduce, - "for" => Token::For, - "foreach" => Token::Foreach, - "try" => Token::Try, - "catch" => Token::Catch, - ident => Token::Ident(ident.to_string()), - }; - (tk, rest) + (Token::Word(prefix(rest)), rest) } '$' => { // TODO: handle error let rest = strip_ident(chars.as_str()).unwrap(); - (Token::Var(i[1..i.len() - rest.len()].to_string()), rest) + (Token::Var(prefix(rest)), rest) } '0'..='9' => { let rest = trim_num(chars.as_str()); - (Token::Num(prefix(rest).to_string()), rest) + (Token::Num(prefix(rest)), rest) } - '.' if chars.next()? == '.' => (Token::DotDot, &i[2..]), - '.' => single(Token::Dot), - ':' => single(Token::Colon), - ';' => single(Token::Semicolon), - ',' => single(Token::Comma), - '?' => single(Token::Question), + '.' if chars.next()? == '.' => punct(2, Punct::DotDot), + '.' => punct(1, Punct::Dot), + ':' => punct(1, Punct::Colon), + ';' => punct(1, Punct::Semicolon), + ',' => punct(1, Punct::Comma), + '?' => punct(1, Punct::Question), c if is_op(c) => { let rest = chars.as_str().trim_start_matches(is_op); - (Token::Op(prefix(rest).to_string()), rest) + (Token::Op(prefix(rest)), rest) } + '"' => { + let (parts, rest) = string(chars.as_str())?; + (Token::Str(parts), rest) + } + '(' => tokens_then(chars.as_str(), Delim::Paren), + '[' => tokens_then(chars.as_str(), Delim::Brack), + '{' => tokens_then(chars.as_str(), Delim::Brace), _ => return None, }) } @@ -92,12 +119,15 @@ fn token(i: &str) -> Option<(Token, &str)> { use jaq_syn::string::Part; /// Returns `None` when an unexpected EOF was encountered. -fn string(mut i: &str) -> Option<(Vec>, &str)> { +fn string(mut i: &str) -> Option<(Vec>>, &str)> { let mut parts = Vec::new(); loop { let rest = i.trim_start_matches(|c| c != '\\' && c != '"'); - parts.push(Part::Str(i[..i.len() - rest.len()].to_string())); + let s = &i[..i.len() - rest.len()]; + if !s.is_empty() { + parts.push(Part::Str(s.to_string())) + } let mut chars = rest.chars(); let c = match chars.next()? 
{ '"' => return Some((parts, chars.as_str())), @@ -118,7 +148,7 @@ fn string(mut i: &str) -> Option<(Vec>, &str)> { }) } '(' => { - let (trees, rest) = trees(chars.as_str(), Delim::Paren); + let (trees, rest) = tokens_then(chars.as_str(), Delim::Paren); parts.push(Part::Fun(trees)); i = rest; continue; @@ -141,6 +171,7 @@ fn trim_space(i: &str) -> &str { i } +/* use jaq_syn::Spanned; fn parts_to_interpol( parts: Vec>, @@ -164,48 +195,29 @@ fn parts_to_interpol( } (init, tail) } +*/ -fn trees2(mut i: &str) -> (Vec>, &str) { - let mut trees = Vec::new(); - while let Some((tree, rest)) = tree_(i) { - trees.push((tree, 0..42)); +fn tokens(mut i: &str) -> (Vec>, &str) { + let mut tokens = Vec::new(); + while let Some((tk, rest)) = token(i) { + tokens.push(tk); i = rest; } - (trees, i) + (tokens, i) } -fn trees(mut i: &str, delim: Delim) -> (Tree, &str) { - let (trees, i) = trees2(i); +fn tokens_then(i: &str, delim: Delim) -> (Token<&str>, &str) { + let (tokens, i) = tokens(i); let i = trim_space(i); let i = i.strip_prefix(delim.close()).unwrap_or_else(|| { todo!("add error"); i }); - (Tree::Delim(delim, trees), i) -} - -fn tree_(i: &str) -> Option<(Tree, &str)> { - let i = trim_space(i); - let mut chars = i.chars(); - - Some(match chars.next()? { - '"' => { - let (parts, rest) = string(chars.as_str())?; - let (init, tail) = parts_to_interpol(parts); - (Tree::String(init, tail), rest) - } - '(' => trees(chars.as_str(), Delim::Paren), - '[' => trees(chars.as_str(), Delim::Brack), - '{' => trees(chars.as_str(), Delim::Brace), - _ => { - let (token, rest) = token(i)?; - (Tree::Token(token), rest) - } - }) + (Token::Delim(delim, tokens), i) } -pub fn lex_(i: &str) -> (Vec>, &str) { - let (trees, i) = trees2(i); +pub fn lex_(i: &str) -> (Vec>, &str) { + let (tokens, i) = tokens(i); let i = trim_space(i); - (trees, i) + (tokens, i) } diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index cf4df693a..464fb5610 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -53,12 +53,7 @@ where } */ let (tokens2, rest) = crate::lex::lex_(src); - let v: Vec<_> = tokens2 - .into_iter() - .map(|(tree, span)| tree.tokens(0..42)) - .flatten() - .collect(); - std::println!("{v:?}"); + std::println!("{tokens2:?}"); std::println!("finished: {}", rest.is_empty()); let (tokens, lex_errs) = lex() .then_ignore(end()) From 69091747a5037c7efd1dedab3a1b8c2e92b48c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 9 May 2024 10:20:24 +0200 Subject: [PATCH 006/135] Remove parcours dependency. 
--- Cargo.lock | 5 ----- jaq-parse/Cargo.toml | 1 - 2 files changed, 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 28ee4bc4f..06caaf260 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -317,7 +317,6 @@ version = "1.0.2" dependencies = [ "chumsky", "jaq-syn", - "parcours", ] [[package]] @@ -433,10 +432,6 @@ version = "6.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" -[[package]] -name = "parcours" -version = "0.3.0" - [[package]] name = "proc-macro-error" version = "1.0.4" diff --git a/jaq-parse/Cargo.toml b/jaq-parse/Cargo.toml index bc2defa82..9c2160266 100644 --- a/jaq-parse/Cargo.toml +++ b/jaq-parse/Cargo.toml @@ -14,4 +14,3 @@ rust-version = "1.64" [dependencies] chumsky = { version = "0.9.0", default-features = false } jaq-syn = { version = "1.0.0", path = "../jaq-syn" } -parcours = { version = "0.3.0", path = "../../parcours" } From 0c671f17223c0233ccb2266719a1c22e2776721c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 9 May 2024 11:23:03 +0200 Subject: [PATCH 007/135] Nicer handling of words. --- jaq-parse/src/lex.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 0f936ce36..2ce505452 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -5,10 +5,8 @@ use alloc::vec::Vec; /// Token (tree) generic over string type `S`. #[derive(Debug)] pub enum Token { - /// keywords such as `def`, but also identifiers such as `map` or `@csv`. + /// keywords such as `def`, but also identifiers such as `map`, `$x`, or `@csv` Word(S), - /// variable, including leading `$` - Var(S), /// number Num(S), /// interpolated string @@ -82,14 +80,14 @@ fn token(i: &str) -> Option<(Token<&str>, &str)> { let mut chars = i.chars(); Some(match chars.next()? { - 'a'..='z' | 'A'..='Z' | '@' | '_' => { + 'a'..='z' | 'A'..='Z' | '_' => { let rest = trim_ident(chars.as_str()); (Token::Word(prefix(rest)), rest) } - '$' => { + '$' | '@' => { // TODO: handle error let rest = strip_ident(chars.as_str()).unwrap(); - (Token::Var(prefix(rest)), rest) + (Token::Word(prefix(rest)), rest) } '0'..='9' => { let rest = trim_num(chars.as_str()); From 3bde32622d253ca0fdaa2e1a1571ebf1d41b65ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 9 May 2024 12:30:17 +0200 Subject: [PATCH 008/135] Error reporting. --- jaq-parse/src/lex.rs | 96 ++++++++++++++++++++++++++++---------------- jaq-parse/src/lib.rs | 6 ++- 2 files changed, 66 insertions(+), 36 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 2ce505452..8b89a039d 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -15,7 +15,7 @@ pub enum Token { Op(S), /// punctuation, such as `.` or `;` Punct(Punct, S), - /// delimited tokens, e.g. `(...)` or `{...}` + /// delimited tokens, e.g. `(...)` or `[...]` Delim(Delim, Vec), } @@ -36,28 +36,35 @@ pub enum Punct { Semicolon, } +#[derive(Debug)] +pub enum Expect<'a> { + Digit, + Ident, + Delim(&'a str), + Escape, +} + +type Errors<'a> = Vec<(Expect<'a>, &'a str)>; + +fn fail<'a>(e: Expect<'a>, i: &'a str, errs: &mut Errors<'a>) -> &'a str { + errs.push((e, i)); + i +} + fn strip_digits(i: &str) -> Option<&str> { i.strip_prefix(|c: char| c.is_numeric()) .map(|i| i.trim_start_matches(|c: char| c.is_numeric())) } /// Decimal with optional exponent. 
-fn trim_num(i: &str) -> &str { +fn trim_num<'a>(i: &'a str, e: &mut Errors<'a>) -> &'a str { let i = i.trim_start_matches(|c: char| c.is_numeric()); let i = i.strip_prefix('.').map_or(i, |i| { - strip_digits(i).unwrap_or_else(|| { - // TODO: register error - todo!(); - i - }) + strip_digits(i).unwrap_or_else(|| fail(Expect::Digit, i, e)) }); let i = i.strip_prefix(['e', 'E']).map_or(i, |i| { let i = i.strip_prefix(['+', '-']).unwrap_or(i); - strip_digits(i).unwrap_or_else(|| { - // TODO: register error - todo!(); - i - }) + strip_digits(i).unwrap_or_else(|| fail(Expect::Digit, i, e)) }); i } @@ -71,7 +78,7 @@ fn strip_ident(i: &str) -> Option<&str> { .map(trim_ident) } -fn token(i: &str) -> Option<(Token<&str>, &str)> { +fn token<'a>(i: &'a str, e: &mut Errors<'a>) -> Option<(Token<&'a str>, &'a str)> { let i = trim_space(i); let is_op = |c| "|=!<>+-*/%".contains(c); @@ -85,15 +92,15 @@ fn token(i: &str) -> Option<(Token<&str>, &str)> { (Token::Word(prefix(rest)), rest) } '$' | '@' => { - // TODO: handle error - let rest = strip_ident(chars.as_str()).unwrap(); + let rest = strip_ident(chars.as_str()) + .unwrap_or_else(|| fail(Expect::Ident, chars.as_str(), e)); (Token::Word(prefix(rest)), rest) } '0'..='9' => { - let rest = trim_num(chars.as_str()); + let rest = trim_num(chars.as_str(), e); (Token::Num(prefix(rest)), rest) } - '.' if chars.next()? == '.' => punct(2, Punct::DotDot), + '.' if chars.next() == Some('.') => punct(2, Punct::DotDot), '.' => punct(1, Punct::Dot), ':' => punct(1, Punct::Colon), ';' => punct(1, Punct::Semicolon), @@ -104,12 +111,10 @@ fn token(i: &str) -> Option<(Token<&str>, &str)> { (Token::Op(prefix(rest)), rest) } '"' => { - let (parts, rest) = string(chars.as_str())?; + let (parts, rest) = string(chars.as_str(), e)?; (Token::Str(parts), rest) } - '(' => tokens_then(chars.as_str(), Delim::Paren), - '[' => tokens_then(chars.as_str(), Delim::Brack), - '{' => tokens_then(chars.as_str(), Delim::Brace), + '(' | '[' | '{' => delim(i, e), _ => return None, }) } @@ -117,7 +122,7 @@ fn token(i: &str) -> Option<(Token<&str>, &str)> { use jaq_syn::string::Part; /// Returns `None` when an unexpected EOF was encountered. -fn string(mut i: &str) -> Option<(Vec>>, &str)> { +fn string<'a>(mut i: &'a str, e: &mut Errors<'a>) -> Option<(Vec>>, &'a str)> { let mut parts = Vec::new(); loop { @@ -146,12 +151,15 @@ fn string(mut i: &str) -> Option<(Vec>>, &str)> { }) } '(' => { - let (trees, rest) = tokens_then(chars.as_str(), Delim::Paren); + let (trees, rest) = delim(&rest[1..], e); parts.push(Part::Fun(trees)); i = rest; continue; } - _ => todo!("add error"), + _ => { + e.push((Expect::Escape, &rest[1..])); + continue; + } }, _ => unreachable!(), }; @@ -195,27 +203,47 @@ fn parts_to_interpol( } */ -fn tokens(mut i: &str) -> (Vec>, &str) { +fn tokens<'a>(mut i: &'a str, e: &mut Errors<'a>) -> (Vec>, &'a str) { let mut tokens = Vec::new(); - while let Some((tk, rest)) = token(i) { + while let Some((tk, rest)) = token(i, e) { tokens.push(tk); i = rest; } (tokens, i) } -fn tokens_then(i: &str, delim: Delim) -> (Token<&str>, &str) { - let (tokens, i) = tokens(i); +/// Parse a delimited sequence of tokens. +/// +/// The input string has to start with either '(', '[', or '{'. 
+fn delim<'a>(i: &'a str, e: &mut Errors<'a>) -> (Token<&'a str>, &'a str) { + let mut chars = i.chars(); + let delim = match chars.next().unwrap() { + '(' => Delim::Paren, + '[' => Delim::Brack, + '{' => Delim::Brace, + _ => panic!(), + }; + let (tokens, rest) = tokens(chars.as_str(), e); + let rest = trim_space(rest); + let rest = rest + .strip_prefix(delim.close()) + .unwrap_or_else(|| fail(Expect::Delim(i), rest, e)); + (Token::Delim(delim, tokens), rest) +} + +/* +fn tokens_then<'a>(i: &'a str, e: &mut Errors<'a>, delim: Delim) -> (Token<&'a str>, &'a str) { + let (tokens, i) = tokens(i, e); let i = trim_space(i); - let i = i.strip_prefix(delim.close()).unwrap_or_else(|| { - todo!("add error"); - i - }); + let i = i + .strip_prefix(delim.close()) + .unwrap_or_else(|| fail(Expect::Delim(delim), i, e)); (Token::Delim(delim, tokens), i) } +*/ -pub fn lex_(i: &str) -> (Vec>, &str) { - let (tokens, i) = tokens(i); +pub fn lex<'a>(i: &'a str, e: &mut Errors<'a>) -> (Vec>, &'a str) { + let (tokens, i) = tokens(i, e); let i = trim_space(i); (tokens, i) } diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 464fb5610..46d49742f 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -52,8 +52,10 @@ where } } */ - let (tokens2, rest) = crate::lex::lex_(src); - std::println!("{tokens2:?}"); + let mut lex_errs = Vec::new(); + let (tokens, rest) = crate::lex::lex(src, &mut lex_errs); + std::println!("Tokens: {tokens:?}"); + std::println!("Errors: {lex_errs:?}"); std::println!("finished: {}", rest.is_empty()); let (tokens, lex_errs) = lex() .then_ignore(end()) From 8ac7319a3fc35e30f967443ef7be5d11293b470c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 9 May 2024 18:08:33 +0200 Subject: [PATCH 009/135] Report Unicode errors. --- jaq-parse/src/lex.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 8b89a039d..6aaec51c4 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -42,6 +42,7 @@ pub enum Expect<'a> { Ident, Delim(&'a str), Escape, + Unicode, } type Errors<'a> = Vec<(Expect<'a>, &'a str)>; @@ -144,10 +145,10 @@ fn string<'a>(mut i: &'a str, e: &mut Errors<'a>) -> Option<(Vec { let mut hex = String::with_capacity(4); (0..4).try_for_each(|_| Some(hex.push(chars.next()?)))?; - let num = u32::from_str_radix(&hex, 16).unwrap(); - char::from_u32(num).unwrap_or_else(|| { - //emit(Simple::custom(span, "invalid unicode character")); - '\u{FFFD}' // unicode replacement character + let c = u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32); + c.unwrap_or_else(|| { + e.push((Expect::Unicode, &rest[2..])); + '\u{FFFD}' // Unicode replacement character }) } '(' => { @@ -161,6 +162,7 @@ fn string<'a>(mut i: &'a str, e: &mut Errors<'a>) -> Option<(Vec unreachable!(), }; parts.push(Part::Str(c.into())); From 9c1a3e9d4a4f3392115851d0520ba1ed3472c8f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 9 May 2024 18:12:47 +0200 Subject: [PATCH 010/135] Correct lexing of incorrect string escapes, e.g. "\0". 
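Previously an invalid escape such as \0 made the string loop `continue`
without advancing the cursor, so lexing made no progress past the bad
escape; the error is still recorded, but a NUL placeholder is emitted so
the cursor keeps moving. A small self-contained sketch of that pattern
(the names below are illustrative, not taken from this diff):

    // Consume every backslash escape; an unknown escape yields a
    // placeholder character instead of leaving the cursor stuck on
    // the same backslash.
    fn lex_escapes(mut i: &str) -> Vec<char> {
        let mut out = Vec::new();
        while let Some(rest) = i.strip_prefix('\\') {
            let mut chars = rest.chars();
            out.push(match chars.next() {
                Some('n') => '\n',
                Some('t') => '\t',
                _ => '\0', // unknown escape: placeholder, but still advance
            });
            i = chars.as_str(); // always move past the escape
        }
        out
    }

    fn main() {
        assert_eq!(lex_escapes(r"\n\0\t"), vec!['\n', '\0', '\t']);
    }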
--- jaq-parse/src/lex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 6aaec51c4..fe5353d25 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -159,7 +159,7 @@ fn string<'a>(mut i: &'a str, e: &mut Errors<'a>) -> Option<(Vec { e.push((Expect::Escape, &rest[1..])); - continue; + '\0' } }, // SAFETY: due to `trim_start_matches` From 4f001de09b77bdfbbeb8e3733b357a05c4af4840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 9 May 2024 18:14:33 +0200 Subject: [PATCH 011/135] Remove unused function. --- jaq-parse/src/lex.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index fe5353d25..86599ae91 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -233,17 +233,6 @@ fn delim<'a>(i: &'a str, e: &mut Errors<'a>) -> (Token<&'a str>, &'a str) { (Token::Delim(delim, tokens), rest) } -/* -fn tokens_then<'a>(i: &'a str, e: &mut Errors<'a>, delim: Delim) -> (Token<&'a str>, &'a str) { - let (tokens, i) = tokens(i, e); - let i = trim_space(i); - let i = i - .strip_prefix(delim.close()) - .unwrap_or_else(|| fail(Expect::Delim(delim), i, e)); - (Token::Delim(delim, tokens), i) -} -*/ - pub fn lex<'a>(i: &'a str, e: &mut Errors<'a>) -> (Vec>, &'a str) { let (tokens, i) = tokens(i, e); let i = trim_space(i); From 03df6628d2852e7ff6966f460a11d6e5eb452b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 10 May 2024 09:02:54 +0200 Subject: [PATCH 012/135] Make lexer an object. --- jaq-parse/src/lex.rs | 333 ++++++++++++++++++++++++------------------- jaq-parse/src/lib.rs | 9 +- 2 files changed, 192 insertions(+), 150 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 86599ae91..48ead9329 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -1,6 +1,7 @@ use crate::token::Delim; use alloc::string::{String, ToString}; use alloc::vec::Vec; +use jaq_syn::string::Part; /// Token (tree) generic over string type `S`. #[derive(Debug)] @@ -10,7 +11,7 @@ pub enum Token { /// number Num(S), /// interpolated string - Str(Vec>), + Str(Vec>), /// operator, such as `|` or `+=` Op(S), /// punctuation, such as `.` or `;` @@ -20,7 +21,7 @@ pub enum Token { } /// Punctuation. -#[derive(Debug)] +#[derive(Copy, Clone, Debug)] pub enum Punct { /// `.` Dot, @@ -36,6 +37,19 @@ pub enum Punct { Semicolon, } +impl Punct { + fn as_str(self) -> &'static str { + match self { + Self::Dot => ".", + Self::DotDot => "..", + Self::Question => "?", + Self::Comma => ",", + Self::Colon => ":", + Self::Semicolon => ";", + } + } +} + #[derive(Debug)] pub enum Expect<'a> { Digit, @@ -47,136 +61,197 @@ pub enum Expect<'a> { type Errors<'a> = Vec<(Expect<'a>, &'a str)>; -fn fail<'a>(e: Expect<'a>, i: &'a str, errs: &mut Errors<'a>) -> &'a str { - errs.push((e, i)); - i +pub struct Lex<'a> { + i: &'a str, + e: Errors<'a>, } -fn strip_digits(i: &str) -> Option<&str> { - i.strip_prefix(|c: char| c.is_numeric()) - .map(|i| i.trim_start_matches(|c: char| c.is_numeric())) -} +impl<'a> Lex<'a> { + pub fn new(i: &'a str) -> Self { + let e = Vec::new(); + Self { i, e } + } -/// Decimal with optional exponent. 
-fn trim_num<'a>(i: &'a str, e: &mut Errors<'a>) -> &'a str { - let i = i.trim_start_matches(|c: char| c.is_numeric()); - let i = i.strip_prefix('.').map_or(i, |i| { - strip_digits(i).unwrap_or_else(|| fail(Expect::Digit, i, e)) - }); - let i = i.strip_prefix(['e', 'E']).map_or(i, |i| { - let i = i.strip_prefix(['+', '-']).unwrap_or(i); - strip_digits(i).unwrap_or_else(|| fail(Expect::Digit, i, e)) - }); - i -} + pub fn lex(&mut self) -> Vec> { + let tokens = self.tokens(); + self.space(); + tokens + } -fn trim_ident(i: &str) -> &str { - i.trim_start_matches(|c: char| c.is_ascii_alphanumeric() || c == '_') -} + pub fn input(&self) -> &'a str { + self.i + } -fn strip_ident(i: &str) -> Option<&str> { - i.strip_prefix(|c: char| c.is_ascii_alphabetic() || c == '_') - .map(trim_ident) -} + pub fn errors(&self) -> &Errors<'a> { + &self.e + } -fn token<'a>(i: &'a str, e: &mut Errors<'a>) -> Option<(Token<&'a str>, &'a str)> { - let i = trim_space(i); + fn next(&mut self) -> Option { + let mut chars = self.i.chars(); + let c = chars.next()?; + self.i = chars.as_str(); + Some(c) + } + + fn trim(&mut self, f: impl FnMut(char) -> bool) { + self.i = self.i.trim_start_matches(f); + } - let is_op = |c| "|=!<>+-*/%".contains(c); - let prefix = |rest: &str| &i[..i.len() - rest.len()]; - let punct = |len: usize, p: Punct| (Token::Punct(p, &i[..len]), &i[len..]); + fn consumed(&mut self, chars: core::str::Chars<'a>, f: impl FnOnce(&mut Self)) -> &'a str { + let start = self.i; + self.i = chars.as_str(); + f(self); + &start[..start.len() - self.i.len()] + } - let mut chars = i.chars(); - Some(match chars.next()? { - 'a'..='z' | 'A'..='Z' | '_' => { - let rest = trim_ident(chars.as_str()); - (Token::Word(prefix(rest)), rest) + /// Whitespace and comments. + fn space(&mut self) { + self.i = self.i.trim_start(); + while let Some(comment) = self.i.strip_prefix('#') { + self.i = comment.trim_start_matches(|c| c != '\n').trim_start(); } - '$' | '@' => { - let rest = strip_ident(chars.as_str()) - .unwrap_or_else(|| fail(Expect::Ident, chars.as_str(), e)); - (Token::Word(prefix(rest)), rest) + } + + fn ident0(&mut self) { + self.trim(|c: char| c.is_ascii_alphanumeric() || c == '_'); + } + + fn ident1(&mut self) { + let f = |c: char| c.is_ascii_alphabetic() || c == '_'; + if let Some(rest) = self.i.strip_prefix(f) { + self.i = rest; + self.ident0(); + } else { + self.e.push((Expect::Ident, self.i)); } - '0'..='9' => { - let rest = trim_num(chars.as_str(), e); - (Token::Num(prefix(rest)), rest) + } + + fn digits1(&mut self) { + if let Some(rest) = self.i.strip_prefix(|c: char| c.is_numeric()) { + self.i = rest.trim_start_matches(|c: char| c.is_numeric()); + } else { + self.e.push((Expect::Digit, self.i)); } - '.' if chars.next() == Some('.') => punct(2, Punct::DotDot), - '.' => punct(1, Punct::Dot), - ':' => punct(1, Punct::Colon), - ';' => punct(1, Punct::Semicolon), - ',' => punct(1, Punct::Comma), - '?' => punct(1, Punct::Question), - c if is_op(c) => { - let rest = chars.as_str().trim_start_matches(is_op); - (Token::Op(prefix(rest)), rest) + } + + /// Decimal with optional exponent. 
+ fn num(&mut self) { + self.trim(|c| c.is_numeric()); + if let Some(i) = self.i.strip_prefix('.') { + self.i = i; + self.digits1(); } - '"' => { - let (parts, rest) = string(chars.as_str(), e)?; - (Token::Str(parts), rest) + if let Some(i) = self.i.strip_prefix(['e', 'E']) { + self.i = i.strip_prefix(['+', '-']).unwrap_or(i); + self.digits1(); } - '(' | '[' | '{' => delim(i, e), - _ => return None, - }) -} + } -use jaq_syn::string::Part; + /// Returns `None` when an unexpected EOF was encountered. + fn str(&mut self) -> Option>>> { + assert_eq!(self.next(), Some('"')); + let mut parts = Vec::new(); -/// Returns `None` when an unexpected EOF was encountered. -fn string<'a>(mut i: &'a str, e: &mut Errors<'a>) -> Option<(Vec>>, &'a str)> { - let mut parts = Vec::new(); + loop { + let s = self.consumed(self.i.chars(), |lex| lex.trim(|c| c != '\\' && c != '"')); + if !s.is_empty() { + parts.push(Part::Str(s.to_string())) + } + match self.next()? { + '"' => return Some(parts), + '\\' => { + let mut chars = self.i.chars(); + let c = match chars.next()? { + c @ ('\\' | '/' | '"') => c, + 'b' => '\x08', + 'f' => '\x0C', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + 'u' => { + let mut hex = String::with_capacity(4); + for _ in 0..4 { + hex.push(chars.next()?); + } + let c = u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32); + c.unwrap_or_else(|| { + self.e.push((Expect::Unicode, self.i)); + '\u{FFFD}' // Unicode replacement character + }) + } + '(' => { + parts.push(Part::Fun(self.delim())); + continue; + } + _ => { + self.e.push((Expect::Escape, self.i)); + '\0' + } + }; - loop { - let rest = i.trim_start_matches(|c| c != '\\' && c != '"'); - let s = &i[..i.len() - rest.len()]; - if !s.is_empty() { - parts.push(Part::Str(s.to_string())) - } - let mut chars = rest.chars(); - let c = match chars.next()? { - '"' => return Some((parts, chars.as_str())), - '\\' => match chars.next()? { - c @ ('\\' | '/' | '"') => c, - 'b' => '\x08', - 'f' => '\x0C', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - 'u' => { - let mut hex = String::with_capacity(4); - (0..4).try_for_each(|_| Some(hex.push(chars.next()?)))?; - let c = u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32); - c.unwrap_or_else(|| { - e.push((Expect::Unicode, &rest[2..])); - '\u{FFFD}' // Unicode replacement character - }) + self.i = chars.as_str(); + parts.push(Part::Str(c.into())); } - '(' => { - let (trees, rest) = delim(&rest[1..], e); - parts.push(Part::Fun(trees)); - i = rest; - continue; - } - _ => { - e.push((Expect::Escape, &rest[1..])); - '\0' - } - }, - // SAFETY: due to `trim_start_matches` - _ => unreachable!(), - }; - parts.push(Part::Str(c.into())); - i = chars.as_str(); + // SAFETY: due to `lex.trim()` + _ => unreachable!(), + }; + } } -} -/// Whitespace and comments. -fn trim_space(i: &str) -> &str { - let mut i = i.trim_start(); - while let Some(comment) = i.strip_prefix('#') { - i = comment.trim_start_matches(|c| c != '\n').trim_start(); + fn punct(&mut self, p: Punct) -> Token<&'a str> { + let (s, after) = self.i.split_at(p.as_str().len()); + self.i = after; + Token::Punct(p, s) + } + + fn token(&mut self) -> Option> { + self.space(); + + let is_op = |c| "|=!<>+-*/%".contains(c); + + let mut chars = self.i.chars(); + Some(match chars.next()? 
{ + 'a'..='z' | 'A'..='Z' | '_' => Token::Word(self.consumed(chars, |lex| lex.ident0())), + '$' | '@' => Token::Word(self.consumed(chars, |lex| lex.ident1())), + '0'..='9' => Token::Num(self.consumed(chars, |lex| lex.num())), + c if is_op(c) => Token::Op(self.consumed(chars, |lex| lex.trim(is_op))), + '.' if chars.next() == Some('.') => self.punct(Punct::DotDot), + '.' => self.punct(Punct::Dot), + ':' => self.punct(Punct::Colon), + ';' => self.punct(Punct::Semicolon), + ',' => self.punct(Punct::Comma), + '?' => self.punct(Punct::Question), + '"' => Token::Str(self.str()?), + '(' | '[' | '{' => self.delim(), + _ => return None, + }) + } + + fn tokens(&mut self) -> Vec> { + core::iter::from_fn(|| self.token()).collect() + } + + /// Parse a delimited sequence of tokens. + /// + /// The input string has to start with either '(', '[', or '{'. + fn delim(&mut self) -> Token<&'a str> { + let start = self.i; + let delim = match self.next() { + Some('(') => Delim::Paren, + Some('[') => Delim::Brack, + Some('{') => Delim::Brace, + _ => panic!(), + }; + let tokens = self.tokens(); + + self.space(); + if let Some(rest) = self.i.strip_prefix(delim.close()) { + self.i = rest + } else { + self.e.push((Expect::Delim(start), self.i)); + } + Token::Delim(delim, tokens) } - i } /* @@ -204,37 +279,3 @@ fn parts_to_interpol( (init, tail) } */ - -fn tokens<'a>(mut i: &'a str, e: &mut Errors<'a>) -> (Vec>, &'a str) { - let mut tokens = Vec::new(); - while let Some((tk, rest)) = token(i, e) { - tokens.push(tk); - i = rest; - } - (tokens, i) -} - -/// Parse a delimited sequence of tokens. -/// -/// The input string has to start with either '(', '[', or '{'. -fn delim<'a>(i: &'a str, e: &mut Errors<'a>) -> (Token<&'a str>, &'a str) { - let mut chars = i.chars(); - let delim = match chars.next().unwrap() { - '(' => Delim::Paren, - '[' => Delim::Brack, - '{' => Delim::Brace, - _ => panic!(), - }; - let (tokens, rest) = tokens(chars.as_str(), e); - let rest = trim_space(rest); - let rest = rest - .strip_prefix(delim.close()) - .unwrap_or_else(|| fail(Expect::Delim(i), rest, e)); - (Token::Delim(delim, tokens), rest) -} - -pub fn lex<'a>(i: &'a str, e: &mut Errors<'a>) -> (Vec>, &'a str) { - let (tokens, i) = tokens(i, e); - let i = trim_space(i); - (tokens, i) -} diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 46d49742f..d3e1608f7 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -52,11 +52,12 @@ where } } */ - let mut lex_errs = Vec::new(); - let (tokens, rest) = crate::lex::lex(src, &mut lex_errs); + let mut lexer = crate::lex::Lex::new(src); + let tokens = lexer.lex(); std::println!("Tokens: {tokens:?}"); - std::println!("Errors: {lex_errs:?}"); - std::println!("finished: {}", rest.is_empty()); + std::println!("Errors: {:?}", lexer.errors()); + std::println!("finished: {}", lexer.input().is_empty()); + let (tokens, lex_errs) = lex() .then_ignore(end()) .recover_with(skip_then_retry_until([])) From 0f91ca2caa21bcd224252e382a9f0a224e090dd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 10 May 2024 09:17:35 +0200 Subject: [PATCH 013/135] Improve string handling. 
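The literal text between escapes is still carved out with the
slice-length trick of `consumed`: remember the input before lexing a
piece and cut off as many bytes as remain afterwards. A standalone
sketch of that trick (the free function below is illustrative; in the
lexer it is a method on `Lex`):

    // Return the prefix of `before` that was consumed to reach `after`,
    // assuming `after` is a tail of `before`.
    fn consumed<'a>(before: &'a str, after: &str) -> &'a str {
        &before[..before.len() - after.len()]
    }

    fn main() {
        let before = "abc\\n rest";
        let after = before.trim_start_matches(|c| c != '\\' && c != '"');
        assert_eq!(consumed(before, after), "abc");
        assert_eq!(after, "\\n rest");
    }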
--- jaq-parse/src/lex.rs | 60 ++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 48ead9329..4168e938b 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -55,6 +55,7 @@ pub enum Expect<'a> { Digit, Ident, Delim(&'a str), + String(&'a str), Escape, Unicode, } @@ -147,8 +148,8 @@ impl<'a> Lex<'a> { } } - /// Returns `None` when an unexpected EOF was encountered. - fn str(&mut self) -> Option>>> { + fn str(&mut self) -> Vec>> { + let start = self.i; assert_eq!(self.next(), Some('"')); let mut parts = Vec::new(); @@ -157,33 +158,26 @@ impl<'a> Lex<'a> { if !s.is_empty() { parts.push(Part::Str(s.to_string())) } - match self.next()? { - '"' => return Some(parts), - '\\' => { + match self.next() { + Some('"') => return parts, + Some('\\') => { let mut chars = self.i.chars(); - let c = match chars.next()? { - c @ ('\\' | '/' | '"') => c, - 'b' => '\x08', - 'f' => '\x0C', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - 'u' => { - let mut hex = String::with_capacity(4); - for _ in 0..4 { - hex.push(chars.next()?); - } - let c = u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32); - c.unwrap_or_else(|| { - self.e.push((Expect::Unicode, self.i)); - '\u{FFFD}' // Unicode replacement character - }) - } - '(' => { + let c = match chars.next() { + Some(c @ ('\\' | '/' | '"')) => c, + Some('b') => '\x08', + Some('f') => '\x0C', + Some('n') => '\n', + Some('r') => '\r', + Some('t') => '\t', + Some('u') => unicode(&mut chars).unwrap_or_else(|| { + self.e.push((Expect::Unicode, self.i)); + '\u{FFFD}' // Unicode replacement character + }), + Some('(') => { parts.push(Part::Fun(self.delim())); continue; } - _ => { + Some(_) | None => { self.e.push((Expect::Escape, self.i)); '\0' } @@ -193,7 +187,11 @@ impl<'a> Lex<'a> { parts.push(Part::Str(c.into())); } // SAFETY: due to `lex.trim()` - _ => unreachable!(), + Some(_) => unreachable!(), + None => { + self.e.push((Expect::String(start), self.i)); + return parts; + } }; } } @@ -221,7 +219,7 @@ impl<'a> Lex<'a> { ';' => self.punct(Punct::Semicolon), ',' => self.punct(Punct::Comma), '?' => self.punct(Punct::Question), - '"' => Token::Str(self.str()?), + '"' => Token::Str(self.str()), '(' | '[' | '{' => self.delim(), _ => return None, }) @@ -254,6 +252,14 @@ impl<'a> Lex<'a> { } } +fn unicode(chars: &mut core::str::Chars) -> Option { + let mut hex = String::with_capacity(4); + for _ in 0..4 { + hex.push(chars.next()?); + } + u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) +} + /* use jaq_syn::Spanned; fn parts_to_interpol( From c596dd9fe06ce609aee3efa9a2c70c7538589e58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 10 May 2024 09:27:18 +0200 Subject: [PATCH 014/135] Document. --- jaq-parse/src/lex.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 4168e938b..55a85fb82 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -148,6 +148,9 @@ impl<'a> Lex<'a> { } } + /// Lex a (possibly interpolated) string. + /// + /// The input string has to start with '"'. fn str(&mut self) -> Vec>> { let start = self.i; assert_eq!(self.next(), Some('"')); @@ -229,7 +232,7 @@ impl<'a> Lex<'a> { core::iter::from_fn(|| self.token()).collect() } - /// Parse a delimited sequence of tokens. + /// Lex a sequence of tokens that is surrounded by parentheses, curly braces, or brackets. 
/// /// The input string has to start with either '(', '[', or '{'. fn delim(&mut self) -> Token<&'a str> { From 3928fc883d78641918ae0ed757fbd96a53cee1e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 10 May 2024 09:44:28 +0200 Subject: [PATCH 015/135] Documentation, a bit of refactoring. --- jaq-parse/src/lex.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 55a85fb82..abdee3932 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -50,7 +50,7 @@ impl Punct { } } -#[derive(Debug)] +#[derive(Clone, Debug)] pub enum Expect<'a> { Digit, Ident, @@ -60,11 +60,11 @@ pub enum Expect<'a> { Unicode, } -type Errors<'a> = Vec<(Expect<'a>, &'a str)>; +type Error<'a> = (Expect<'a>, &'a str); pub struct Lex<'a> { i: &'a str, - e: Errors<'a>, + e: Vec>, } impl<'a> Lex<'a> { @@ -83,7 +83,7 @@ impl<'a> Lex<'a> { self.i } - pub fn errors(&self) -> &Errors<'a> { + pub fn errors(&self) -> &[Error<'a>] { &self.e } @@ -113,13 +113,15 @@ impl<'a> Lex<'a> { } } + /// Lex a sequence matching `[a-zA-Z0-9_]*`. fn ident0(&mut self) { self.trim(|c: char| c.is_ascii_alphanumeric() || c == '_'); } + /// Lex a sequence matching `[a-zA-Z_][a-zA-Z0-9_]*`. fn ident1(&mut self) { - let f = |c: char| c.is_ascii_alphabetic() || c == '_'; - if let Some(rest) = self.i.strip_prefix(f) { + let first = |c: char| c.is_ascii_alphabetic() || c == '_'; + if let Some(rest) = self.i.strip_prefix(first) { self.i = rest; self.ident0(); } else { @@ -127,6 +129,7 @@ impl<'a> Lex<'a> { } } + /// Lex a non-empty digit sequence. fn digits1(&mut self) { if let Some(rest) = self.i.strip_prefix(|c: char| c.is_numeric()) { self.i = rest.trim_start_matches(|c: char| c.is_numeric()); From fd85d661c945bb0243785beb0018e2485ad764c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 10 May 2024 09:46:15 +0200 Subject: [PATCH 016/135] '"' is a delimiter, too. --- jaq-parse/src/lex.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index abdee3932..162327483 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -55,7 +55,6 @@ pub enum Expect<'a> { Digit, Ident, Delim(&'a str), - String(&'a str), Escape, Unicode, } @@ -195,7 +194,7 @@ impl<'a> Lex<'a> { // SAFETY: due to `lex.trim()` Some(_) => unreachable!(), None => { - self.e.push((Expect::String(start), self.i)); + self.e.push((Expect::Delim(start), self.i)); return parts; } }; From 9ec392f53e4addd2331e7d5be10e5a3e2458b527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 10 May 2024 18:42:15 +0200 Subject: [PATCH 017/135] Enable new lexer! --- jaq-parse/src/lex.rs | 91 ++++++++++++++++++++++++++++++++------------ jaq-parse/src/lib.rs | 27 ++++++------- 2 files changed, 77 insertions(+), 41 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 162327483..a7e064220 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -1,7 +1,8 @@ -use crate::token::Delim; +use crate::token::{Delim, Token as OToken}; use alloc::string::{String, ToString}; use alloc::vec::Vec; use jaq_syn::string::Part; +use jaq_syn::{Span, Spanned}; /// Token (tree) generic over string type `S`. 
#[derive(Debug)] @@ -160,9 +161,9 @@ impl<'a> Lex<'a> { loop { let s = self.consumed(self.i.chars(), |lex| lex.trim(|c| c != '\\' && c != '"')); - if !s.is_empty() { - parts.push(Part::Str(s.to_string())) - } + //if !s.is_empty() { + parts.push(Part::Str(s.to_string())); + //} match self.next() { Some('"') => return parts, Some('\\') => { @@ -265,28 +266,68 @@ fn unicode(chars: &mut core::str::Chars) -> Option { u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) } -/* -use jaq_syn::Spanned; -fn parts_to_interpol( - parts: Vec>, -) -> (Spanned, Vec<(Spanned, Spanned)>) { - let mut init = (String::new(), 0..42); - let mut tail = Vec::new(); - let mut parts = parts.into_iter(); - while let Some(part) = parts.next() { - match part { - Part::Str(s) => init.0.extend(s.chars()), - Part::Fun(f) => { - tail.push(((f, 0..42), (String::new(), 0..42))); - while let Some(part) = parts.next() { - match part { - Part::Str(s) => tail.last_mut().unwrap().1 .0.extend(s.chars()), - Part::Fun(f) => tail.push(((f, 0..42), (String::new(), 0..42))), - } - } +fn span(whole_buffer: &str, part: &str) -> Span { + let start = part.as_ptr() as usize - whole_buffer.as_ptr() as usize; + let end = start + part.len(); + start..end +} + +impl<'a> Token<&'a str> { + pub fn tokens(self, i: &'a str) -> Box> + 'a> { + use core::iter::once; + match self { + Self::Word(w) => Box::new(once(( + match w { + "def" => OToken::Def, + "if" => OToken::If, + "then" => OToken::Then, + "elif" => OToken::Elif, + "else" => OToken::Else, + "end" => OToken::End, + "or" => OToken::Or, + "and" => OToken::And, + "as" => OToken::As, + "reduce" => OToken::Reduce, + "for" => OToken::For, + "foreach" => OToken::Foreach, + "try" => OToken::Try, + "catch" => OToken::Catch, + w if w.starts_with("$") => OToken::Var(w[1..].to_string()), + w => OToken::Ident(w.to_string()), + }, + span(i, w), + ))), + Self::Num(n) => Box::new(once((OToken::Num(n.to_string()), span(i, n)))), + Self::Op(o) => Box::new(once((OToken::Op(o.to_string()), span(i, o)))), + Self::Punct(p, s) => Box::new(once(( + match p { + Punct::Dot => OToken::Dot, + Punct::DotDot => OToken::DotDot, + Punct::Question => OToken::Question, + Punct::Comma => OToken::Comma, + Punct::Colon => OToken::Colon, + Punct::Semicolon => OToken::Semicolon, + }, + span(i, s), + ))), + Self::Delim(delim, tokens) => { + let init = once((OToken::Open(delim), 0..0)); + let last = once((OToken::Close(delim), 0..0)); + Box::new(init.chain(tokens.into_iter().flat_map(|t| t.tokens(i)).chain(last))) + } + Self::Str(parts) => { + let quote = once((OToken::Quote, 0..0)); + let f = |part: Part>| match part { + Part::Fun(t) => t.tokens(i), + Part::Str(s) => Box::new(once((OToken::Str(s.to_string()), 0..0))), + }; + Box::new( + quote + .clone() + .chain(parts.into_iter().flat_map(f)) + .chain(quote), + ) } } } - (init, tail) } -*/ diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index d3e1608f7..267375ba2 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -40,31 +40,26 @@ pub fn parse(src: &str, parser: P) -> (Option, Vec) where P: Parser> + Clone, { - /* - for i in 0..500 { - - let (tokens, lex_errs) = lex() - .then_ignore(end()) - .recover_with(skip_then_retry_until([])) - .parse_recovery(src); - if let Some((tokens2, rest)) = crate::lex::lex().parse(src, &mut ()) { - let v: Vec<_> = tokens2.into_iter().map(|tree| tree.tokens(0..42)).flatten().collect(); - } - } - */ let mut lexer = crate::lex::Lex::new(src); - let tokens = lexer.lex(); - std::println!("Tokens: {tokens:?}"); + let tokens: 
Vec<_> = lexer + .lex() + .into_iter() + .flat_map(|t| t.tokens(src)) + .collect(); + /* std::println!("Errors: {:?}", lexer.errors()); std::println!("finished: {}", lexer.input().is_empty()); + std::println!("Tokens: {tokens:?}"); + */ - let (tokens, lex_errs) = lex() + let (tokens2, lex_errs) = lex() .then_ignore(end()) .recover_with(skip_then_retry_until([])) .parse_recovery(src); let lex_errs: Vec> = lex_errs; - let (parsed, parse_errs) = if let Some(tokens) = tokens { + let (parsed, parse_errs) = if let Some(_tokens2) = tokens2 { + //std::println!("Tokens: {tokens2:?} (old)"); let len = src.chars().count(); let stream = chumsky::Stream::from_iter(len..len + 1, tokens.into_iter()); parser.then_ignore(end()).parse_recovery(stream) From 8eb8ed680be9830736f65c3654415cdaff5485e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 10 May 2024 22:47:07 +0200 Subject: [PATCH 018/135] Compress strings. --- jaq-parse/src/lex.rs | 10 ++++++++-- jaq-parse/src/lib.rs | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index a7e064220..c5008af43 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -162,7 +162,10 @@ impl<'a> Lex<'a> { loop { let s = self.consumed(self.i.chars(), |lex| lex.trim(|c| c != '\\' && c != '"')); //if !s.is_empty() { - parts.push(Part::Str(s.to_string())); + match parts.last_mut() { + Some(Part::Str(prev)) => prev.push_str(s), + Some(_) | None => parts.push(Part::Str(s.to_string())), + } //} match self.next() { Some('"') => return parts, @@ -190,7 +193,10 @@ impl<'a> Lex<'a> { }; self.i = chars.as_str(); - parts.push(Part::Str(c.into())); + match parts.last_mut() { + Some(Part::Str(prev)) => prev.push(c), + Some(_) | None => parts.push(Part::Str(c.to_string())), + } } // SAFETY: due to `lex.trim()` Some(_) => unreachable!(), diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 267375ba2..23e70e39a 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -59,7 +59,7 @@ where let lex_errs: Vec> = lex_errs; let (parsed, parse_errs) = if let Some(_tokens2) = tokens2 { - //std::println!("Tokens: {tokens2:?} (old)"); + //std::println!("Tokens: {_tokens2:?} (old)"); let len = src.chars().count(); let stream = chumsky::Stream::from_iter(len..len + 1, tokens.into_iter()); parser.then_ignore(end()).parse_recovery(stream) From 08453507bb731df1ce34647bb02fc0ffd172791e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 13 May 2024 09:03:12 +0200 Subject: [PATCH 019/135] Removed the old lexer. --- jaq-interpret/tests/tests.rs | 6 +- jaq-parse/src/lex.rs | 36 +++++--- jaq-parse/src/lib.rs | 39 ++------- jaq-parse/src/token.rs | 156 +---------------------------------- 4 files changed, 38 insertions(+), 199 deletions(-) diff --git a/jaq-interpret/tests/tests.rs b/jaq-interpret/tests/tests.rs index ae7773b69..613ecf8c8 100644 --- a/jaq-interpret/tests/tests.rs +++ b/jaq-interpret/tests/tests.rs @@ -20,9 +20,9 @@ yields!(cartesian_arith, "[(1,2) * (3,4)]", [3, 4, 6, 8]); #[test] fn add() { give(json!(1), ". + 2", json!(3)); - give(json!(1.0), ". + 2.", json!(3.0)); + give(json!(1.0), ". + 2.0", json!(3.0)); give(json!(1), "2.0 + .", json!(3.0)); - give(json!(null), "1.e1 + 2.1e2", json!(220.0)); + give(json!(null), "1.0e1 + 2.1e2", json!(220.0)); give(json!("Hello "), ". + \"world\"", json!("Hello world")); give(json!([1, 2]), ". 
+ [3, 4]", json!([1, 2, 3, 4])); @@ -48,7 +48,7 @@ yields!(sub_arr, "[1, 2, 3] - [2, 3, 4]", json!([1])); #[test] fn mul() { give(json!(1), ". * 2", json!(2)); - give(json!(1.0), ". * 2.", json!(2.0)); + give(json!(1.0), ". * 2.0", json!(2.0)); give(json!(1), "2.0 * .", json!(2.0)); give(json!("Hello"), "2 * .", json!("HelloHello")); diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index c5008af43..1c584ae61 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -58,6 +58,27 @@ pub enum Expect<'a> { Delim(&'a str), Escape, Unicode, + Token, +} + +impl<'a> Expect<'a> { + pub fn to_simple_error(&self, pos: &'a str, full: &'a str) -> (&'static str, Span) { + let mut pos = span(full, pos); + pos.end = pos.start; + let s = match self { + Self::Digit => "expected digit", + Self::Ident => "expected identifier", + Self::Delim(start) => { + let mut start = span(full, start); + start.end = pos.start; + return ("unclosed delimiter", start); + } + Self::Escape => "expected string escape sequence", + Self::Unicode => "expected 4-digit hexadecimal UTF-8 code point", + Self::Token => "expected token", + }; + (s, pos) + } } type Error<'a> = (Expect<'a>, &'a str); @@ -73,18 +94,13 @@ impl<'a> Lex<'a> { Self { i, e } } - pub fn lex(&mut self) -> Vec> { + pub fn lex(mut self) -> (Vec>, Vec>) { let tokens = self.tokens(); self.space(); - tokens - } - - pub fn input(&self) -> &'a str { - self.i - } - - pub fn errors(&self) -> &[Error<'a>] { - &self.e + if !self.i.is_empty() { + self.e.push((Expect::Token, self.i)); + } + (tokens, self.e) } fn next(&mut self) -> Option { diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 23e70e39a..4ef166359 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -13,26 +13,15 @@ mod prec_climb; mod string; mod token; -use jaq_syn as syn; - pub use def::{defs, main}; use token::{Delim, Token}; use alloc::{string::String, string::ToString, vec::Vec}; use chumsky::prelude::*; -use syn::Spanned; /// Lex/parse error. pub type Error = Simple; -fn lex() -> impl Parser>, Error = Simple> { - recursive(token::tree) - .map_with_span(|tree, span| tree.tokens(span)) - .repeated() - .flatten() - .collect() -} - /// Parse a string with a given parser. /// /// May produce `Some` output even if there were errors. 
@@ -40,26 +29,11 @@ pub fn parse(src: &str, parser: P) -> (Option, Vec) where P: Parser> + Clone, { - let mut lexer = crate::lex::Lex::new(src); - let tokens: Vec<_> = lexer - .lex() - .into_iter() - .flat_map(|t| t.tokens(src)) - .collect(); - /* - std::println!("Errors: {:?}", lexer.errors()); - std::println!("finished: {}", lexer.input().is_empty()); - std::println!("Tokens: {tokens:?}"); - */ - - let (tokens2, lex_errs) = lex() - .then_ignore(end()) - .recover_with(skip_then_retry_until([])) - .parse_recovery(src); - let lex_errs: Vec> = lex_errs; + let (tokens, lex_errs) = crate::lex::Lex::new(src).lex(); + let tokens: Vec<_> = tokens.into_iter().flat_map(|t| t.tokens(src)).collect(); + //std::println!("Tokens: {tokens:?}"); - let (parsed, parse_errs) = if let Some(_tokens2) = tokens2 { - //std::println!("Tokens: {_tokens2:?} (old)"); + let (parsed, parse_errs) = if lex_errs.is_empty() { let len = src.chars().count(); let stream = chumsky::Stream::from_iter(len..len + 1, tokens.into_iter()); parser.then_ignore(end()).parse_recovery(stream) @@ -67,7 +41,10 @@ where (None, Vec::new()) }; - let lex_errs = lex_errs.into_iter().map(|e| e.map(|c| c.to_string())); + let lex_errs = lex_errs.iter().map(|(e, s)| { + let (e, span) = e.to_simple_error(s, src); + Simple::custom(span, e) + }); let parse_errs = parse_errs.into_iter().map(|e| e.map(|tok| tok.to_string())); let errs: Vec<_> = lex_errs.chain(parse_errs).collect(); diff --git a/jaq-parse/src/token.rs b/jaq-parse/src/token.rs index c01f00fcf..d12ba0275 100644 --- a/jaq-parse/src/token.rs +++ b/jaq-parse/src/token.rs @@ -1,7 +1,6 @@ -use alloc::{boxed::Box, string::String, vec::Vec}; +use alloc::{string::String}; use chumsky::prelude::*; use core::fmt; -use jaq_syn::{Span, Spanned}; #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub enum Delim { @@ -36,38 +35,6 @@ impl Delim { } } -#[derive(Clone, Debug, PartialEq, Eq, Hash)] -pub enum Tree { - Token(Token), - Delim(Delim, Vec>), - String(Spanned, Vec<(Spanned, Spanned)>), -} - -impl Tree { - pub fn tokens(self, span: Span) -> Box>> { - let ft = |(tree, span): Spanned| tree.tokens(span); - let fs = |(s, span): Spanned| (Token::Str(s), span); - use core::iter::once; - match self { - Self::Token(token) => Box::new(once((token, span))), - Self::Delim(delim, tree) => { - let s = (Token::Open(delim), span.start..span.start + 1); - let e = (Token::Close(delim), span.end - 1..span.end); - let tokens = tree.into_iter().flat_map(ft); - Box::new(once(s).chain(tokens).chain(once(e))) - } - Self::String(head, tail) => { - let s = (Token::Quote, span.start..span.start + 1); - let e = (Token::Quote, span.end - 1..span.end); - let tail = tail - .into_iter() - .flat_map(move |(tree, str_)| ft(tree).chain(once(fs(str_)))); - Box::new(once(s).chain(once(fs(head))).chain(tail).chain(once(e))) - } - } - } -} - #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub enum Token { Num(String), @@ -131,124 +98,3 @@ impl fmt::Display for Token { } } } - -// A parser for numbers -fn num() -> impl Parser> { - let comma = just('.').chain(text::digits(10).or_not()); - - let exp = one_of("eE") - .chain(one_of("+-").or_not()) - .chain::(text::digits(10)); - - text::int(10) - .chain::(comma.or_not()) - .chain::(exp.or_not()) - .collect() -} - -// A parser for strings; adapted from Chumsky's JSON example parser. 
-fn char_() -> impl Parser> { - let unicode = filter(|c: &char| c.is_ascii_hexdigit()) - .repeated() - .exactly(4) - .collect::() - .validate(|digits, span, emit| { - char::from_u32(u32::from_str_radix(&digits, 16).unwrap()).unwrap_or_else(|| { - emit(Simple::custom(span, "invalid unicode character")); - '\u{FFFD}' // unicode replacement character - }) - }); - - let escape = just('\\').ignore_then(choice(( - just('\\'), - just('/'), - just('"'), - just('b').to('\x08'), - just('f').to('\x0C'), - just('n').to('\n'), - just('r').to('\r'), - just('t').to('\t'), - just('u').ignore_then(unicode), - ))); - - filter(|c| *c != '\\' && *c != '"').or(escape) -} - -pub fn tree( - tree: impl Parser> + Clone, -) -> impl Parser> { - let trees = || tree.clone().map_with_span(|t, span| (t, span)).repeated(); - let paren = trees().delimited_by(just('('), just(')')); - let brack = trees().delimited_by(just('['), just(']')); - let brace = trees().delimited_by(just('{'), just('}')); - - let pair = |s, span| (s, span); - let chars = || char_().repeated().collect().map_with_span(pair); - - let pair = |p, span| (Tree::Delim(Delim::Paren, p), span); - let interpol = just('\\').ignore_then(paren.clone().map_with_span(pair)); - - let string = chars() - .then(interpol.then(chars()).repeated().collect()) - .delimited_by(just('"'), just('"')) - .labelled("string"); - - let comment = just("#").then(take_until(just('\n'))).padded(); - - let strategy = |open, close, others| { - nested_delimiters(open, close, others, |_span| Tree::Token(Token::Dot)) - }; - - choice(( - paren.map(|t| Tree::Delim(Delim::Paren, t)), - brack.map(|t| Tree::Delim(Delim::Brack, t)), - brace.map(|t| Tree::Delim(Delim::Brace, t)), - string.map(|(s, interpol)| Tree::String(s, interpol)), - token().map(Tree::Token), - )) - .recover_with(strategy('(', ')', [('[', ']'), ('{', '}')])) - .recover_with(strategy('[', ']', [('{', '}'), ('(', ')')])) - .recover_with(strategy('{', '}', [('(', ')'), ('[', ']')])) - .padded_by(comment.repeated()) - .padded() -} - -pub fn token() -> impl Parser> { - // A parser for operators - let op = one_of("|=!<>+-*/%").chain(one_of("=/").or_not()).collect(); - - let var = just('$').ignore_then(text::ident()); - - // A parser for identifiers and keywords - let ident = just('@').or_not().chain::(text::ident()); - let ident = ident.collect().map(|ident: String| match ident.as_str() { - "def" => Token::Def, - "if" => Token::If, - "then" => Token::Then, - "elif" => Token::Elif, - "else" => Token::Else, - "end" => Token::End, - "or" => Token::Or, - "and" => Token::And, - "as" => Token::As, - "reduce" => Token::Reduce, - "for" => Token::For, - "foreach" => Token::Foreach, - "try" => Token::Try, - "catch" => Token::Catch, - _ => Token::Ident(ident), - }); - - choice(( - ident, - just("..").to(Token::DotDot), - just('.').to(Token::Dot), - just(':').to(Token::Colon), - just(';').to(Token::Semicolon), - just(',').to(Token::Comma), - just('?').to(Token::Question), - op.map(Token::Op), - var.map(Token::Var), - num().map(Token::Num), - )) -} From 0a126d1d7e565be69a322ea41fe3f663ea101716 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 15 May 2024 17:43:13 +0200 Subject: [PATCH 020/135] Make `Punct` derive `Eq`. --- jaq-parse/src/lex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 1c584ae61..943d8c42c 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -22,7 +22,7 @@ pub enum Token { } /// Punctuation. 
-#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum Punct { /// `.` Dot, From 195f6941a9e7b8cb68a8d4e4bffe5939afdbb3e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 15 May 2024 17:43:33 +0200 Subject: [PATCH 021/135] Format. --- jaq-parse/src/token.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jaq-parse/src/token.rs b/jaq-parse/src/token.rs index d12ba0275..8c0ecb450 100644 --- a/jaq-parse/src/token.rs +++ b/jaq-parse/src/token.rs @@ -1,4 +1,4 @@ -use alloc::{string::String}; +use alloc::string::String; use chumsky::prelude::*; use core::fmt; From 03ea6f85329a5db2716f7de5e742ec490571ceef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 15 May 2024 18:59:19 +0200 Subject: [PATCH 022/135] Work on new term parser. --- jaq-parse/src/lib.rs | 5 + jaq-parse/src/term.rs | 280 ++++++++++++++++++++++++++++++++++++++++++ jaq-std/Cargo.toml | 3 +- jaq/src/main.rs | 2 +- 4 files changed, 288 insertions(+), 2 deletions(-) create mode 100644 jaq-parse/src/term.rs diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 4ef166359..7f54663d0 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -11,6 +11,7 @@ mod lex; mod path; mod prec_climb; mod string; +mod term; mod token; pub use def::{defs, main}; @@ -30,6 +31,10 @@ where P: Parser> + Clone, { let (tokens, lex_errs) = crate::lex::Lex::new(src).lex(); + + let mut new_parser = term::Parser::new(&tokens); + std::println!("{:?}", new_parser.term()); + let tokens: Vec<_> = tokens.into_iter().flat_map(|t| t.tokens(src)).collect(); //std::println!("Tokens: {tokens:?}"); diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs new file mode 100644 index 000000000..75e9c5916 --- /dev/null +++ b/jaq-parse/src/term.rs @@ -0,0 +1,280 @@ +use crate::lex::{Punct, Token}; +use crate::Delim; +use jaq_syn::filter::KeyVal; +use jaq_syn::{path, string}; + +type Error<'a> = (Expect, Option<&'a Token<&'a str>>); +enum Expect {} + +pub(crate) struct Parser<'a> { + i: core::slice::Iter<'a, Token<&'a str>>, + e: Vec>, +} + +#[derive(Debug)] +pub enum Term { + Num(S), + Str(string::Str), + Arr(Option>), + Obj(Vec>), + Id, + Recurse, + Neg(Box), + BinOp(Box, Vec<(S, Self)>), + IfThenElse(Box, Box, Option>), + TryCatch(Box, Option>), + Var(S), + Call(S, Vec), + Key(S), + Path(Box, Vec<(path::Part, path::Opt)>), +} + +/// Keywords that may not appear at the beginning of an expression. +/// +/// Note that for example `reduce` is not part of this list, +/// because it *can* appear at the beginning of an expression. 
+const KEYWORDS: &[&str] = &["include", "import", "def", "as", "and", "or"]; + +impl<'a> Parser<'a> { + pub fn new(i: &'a [Token<&'a str>]) -> Self { + Self { + i: i.iter(), + e: Vec::new(), + } + } + + fn with(&mut self, tokens: &'a [Token<&'a str>], f: impl FnOnce(&mut Self) -> T) -> T { + let i = core::mem::replace(&mut self.i, tokens.iter()); + let tm = f(self); + self.i = i; + tm + } + + fn maybe(&mut self, f: impl Fn(&mut Self) -> Option) -> Option { + let i = self.i.clone(); + let y = f(self); + // rewind to previous state in case of non-match + if y.is_none() { + self.i = i; + } + y + } + + fn sep_by1(&mut self, punct: Punct, f: impl Fn(&mut Self) -> T) -> Vec { + let mut ys = Vec::from([f(self)]); + loop { + match self.i.next() { + Some(Token::Punct(p, _)) if *p == punct => ys.push(f(self)), + None => break, + _ => todo!(), + } + } + ys + } + + fn args(&mut self, f: impl Fn(&mut Self) -> T + Copy) -> Vec { + self.maybe(|p| match p.i.next() { + Some(Token::Delim(Delim::Paren, tokens)) => { + Some(p.with(tokens, |p| p.sep_by1(Punct::Semicolon, f))) + } + _ => None, + }) + .unwrap_or_else(|| Vec::new()) + } + + fn op(&mut self, with_comma: bool) -> Option<&'a str> { + self.maybe(|p| match p.i.next() { + Some(Token::Op(o) | Token::Word(o @ ("and" | "or"))) => Some(*o), + Some(Token::Punct(Punct::Comma, o)) if with_comma => Some(*o), + _ => None, + }) + } + + fn punct(&mut self, punct: Punct) -> Option<&'a str> { + self.maybe(|p| match p.i.next() { + Some(Token::Punct(p, s)) if *p == punct => Some(*s), + _ => None, + }) + } + + pub fn term_with_comma(&mut self, with_comma: bool) -> Term<&'a str> { + let head = self.atom(); + let mut tail = Vec::new(); + while let Some(op) = self.op(with_comma) { + tail.push((op, self.atom())); + } + + if tail.is_empty() { + head + } else { + Term::BinOp(Box::new(head), tail) + } + } + + pub fn term(&mut self) -> Term<&'a str> { + self.term_with_comma(true) + } + + fn atom(&mut self) -> Term<&'a str> { + let mut tm = match self.i.next() { + Some(Token::Op("-")) => Term::Neg(Box::new(self.atom())), + Some(Token::Word("if")) => { + let if_ = self.term(); + if !matches!(self.i.next(), Some(&Token::Word("then"))) { + todo!(); + } + let then_ = self.term(); + let else_ = match self.i.next() { + Some(Token::Word("else")) => { + let else_ = self.term(); + if !matches!(self.i.next(), Some(&Token::Word("end"))) { + todo!(); + } + Some(else_) + } + Some(Token::Word("end")) => None, + _ => todo!(), + }; + Term::IfThenElse(Box::new(if_), Box::new(then_), else_.map(Box::new)) + } + Some(Token::Word("try")) => { + let try_ = self.atom(); + let catch = self.maybe(|p| match p.i.next() { + Some(Token::Word("catch")) => Some(p.atom()), + _ => None, + }); + Term::TryCatch(Box::new(try_), catch.map(Box::new)) + } + Some(Token::Word("reduce")) => todo!(), + Some(Token::Word("foreach")) => todo!(), + Some(Token::Word(id)) if id.starts_with('$') => Term::Var(*id), + Some(Token::Word(id)) if !KEYWORDS.contains(id) => { + let head = Term::Call(*id, self.args(|p| p.term())); + let s = self.maybe(|p| match p.i.next() { + Some(Token::Str(parts)) if id.starts_with('@') => Some(p.str_parts(parts)), + _ => None, + }); + match s { + None => head, + Some(parts) => Term::Str(string::Str { + fmt: Some(Box::new(head)), + parts, + }), + } + } + Some(Token::Punct(Punct::Dot, _)) => self + .maybe(|p| p.i.next().and_then(ident_key)) + .map_or(Term::Id, Term::Key), + Some(Token::Punct(Punct::DotDot, _)) => Term::Recurse, + Some(Token::Num(n)) => Term::Num(*n), + Some(Token::Delim(Delim::Paren, 
tokens)) => self.with(tokens, |p| p.term()), + Some(Token::Delim(Delim::Brack, tokens)) if tokens.is_empty() => Term::Arr(None), + Some(Token::Delim(Delim::Brack, tokens)) => { + Term::Arr(Some(Box::new(self.with(tokens, |p| p.term())))) + } + Some(Token::Delim(Delim::Brace, tokens)) if tokens.is_empty() => Term::Obj(Vec::new()), + Some(Token::Delim(Delim::Brace, tokens)) => self.with(tokens, |p| { + Term::Obj(p.sep_by1(Punct::Comma, |p| p.obj_entry())) + }), + Some(Token::Str(parts)) => Term::Str(string::Str { + fmt: None, + parts: self.str_parts(parts), + }), + _ => todo!(), + }; + if matches!(self.opt(), path::Opt::Optional) { + tm = Term::TryCatch(Box::new(tm), None); + } + + let mut path: Vec<_> = core::iter::from_fn(|| self.path_part_opt()).collect(); + while self.punct(Punct::Dot).is_some() { + use path::Opt; + let key = self.i.next().and_then(ident_key).unwrap(); + let opt = self.punct(Punct::Question).is_some(); + let key = Term::Str(string::Str::from(key.to_string())); + let opt = if opt { Opt::Optional } else { Opt::Essential }; + path.push((path::Part::Index(key), opt)); + path.extend(core::iter::from_fn(|| self.path_part_opt())); + } + if path.is_empty() { + tm + } else { + Term::Path(Box::new(tm), path) + } + } + + fn obj_entry(&mut self) -> KeyVal> { + self.maybe(|p| match p.i.next() { + Some(Token::Word(k)) if !k.starts_with(['$', '@']) => { + let k = string::Str::from(k.to_string()); + let v = p.punct(Punct::Colon).map(|_| p.term_with_comma(false)); + Some(KeyVal::Str(k, v)) + } + _ => None, + }) + .unwrap_or_else(|| { + let k = self.atom(); + if self.punct(Punct::Colon).is_none() { + todo!() + } + KeyVal::Filter(k, self.term()) + }) + } + + fn str_parts( + &mut self, + parts: &'a [string::Part>], + ) -> Vec>> { + let parts = parts.iter().map(|part| match part { + string::Part::Str(s) => string::Part::Str(s.clone()), + string::Part::Fun(Token::Delim(Delim::Paren, tokens)) => { + string::Part::Fun(self.with(tokens, |p| p.term())) + } + string::Part::Fun(_) => unreachable!(), + }); + parts.collect() + } + + fn path_part(&mut self) -> path::Part> { + use path::Part::{Index, Range}; + if self.i.as_slice().is_empty() { + Range(None, None) + } else if self.punct(Punct::Colon).is_some() { + Range(None, Some(self.term())) + } else { + let tm = self.term(); + if self.punct(Punct::Colon).is_some() { + if self.i.as_slice().is_empty() { + Range(Some(tm), None) + } else { + Range(Some(tm), Some(self.term())) + } + } else { + Index(tm) + } + } + } + + fn path_part_opt(&mut self) -> Option<(path::Part>, path::Opt)> { + let part = self.maybe(|p| match p.i.next() { + Some(Token::Delim(Delim::Brack, tokens)) => Some(p.with(&tokens, |p| p.path_part())), + _ => None, + })?; + Some((part, self.opt())) + } + + fn opt(&mut self) -> path::Opt { + let mut opt = path::Opt::Essential; + while self.punct(Punct::Question).is_some() { + opt = path::Opt::Optional; + } + opt + } +} + +fn ident_key<'a>(token: &Token<&'a str>) -> Option<&'a str> { + match token { + Token::Word(id) if !id.starts_with(['$', '@']) => Some(*id), + _ => None, + } +} diff --git a/jaq-std/Cargo.toml b/jaq-std/Cargo.toml index 79606d60f..db5d878dc 100644 --- a/jaq-std/Cargo.toml +++ b/jaq-std/Cargo.toml @@ -11,7 +11,8 @@ keywords = ["json", "query", "jq"] rust-version = "1.64" [features] -default = ["bincode"] +#default = ["bincode"] +default = [] [build-dependencies] jaq-parse = { version = "1.0.0", path = "../jaq-parse" } diff --git a/jaq/src/main.rs b/jaq/src/main.rs index 373aff029..c370bf21f 100644 --- a/jaq/src/main.rs +++ 
b/jaq/src/main.rs @@ -243,7 +243,7 @@ fn args_named(var_val: &[(String, Val)]) -> Val { fn parse(filter_str: &str, vars: Vec) -> Result> { let mut defs = ParseCtx::new(vars); defs.insert_natives(jaq_core::core()); - defs.insert_defs(jaq_std::std()); + //defs.insert_defs(jaq_std::std()); assert!(defs.errs.is_empty()); let (filter, errs) = jaq_parse::parse(filter_str, jaq_parse::main()); if !errs.is_empty() { From 50232d899525f15e77e40129231553c5bab95f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 16 May 2024 10:37:06 +0200 Subject: [PATCH 023/135] Variable bindings. --- jaq-parse/src/term.rs | 51 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 75e9c5916..e65224477 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -20,9 +20,11 @@ pub enum Term { Id, Recurse, Neg(Box), + Pipe(Box, Option, Box), BinOp(Box, Vec<(S, Self)>), IfThenElse(Box, Box, Option>), TryCatch(Box, Option>), + Fold(S, Box, S, Vec), Var(S), Call(S, Vec), Key(S), @@ -84,6 +86,8 @@ impl<'a> Parser<'a> { fn op(&mut self, with_comma: bool) -> Option<&'a str> { self.maybe(|p| match p.i.next() { + // handle pipe directly in `term()` + Some(Token::Op("|")) => None, Some(Token::Op(o) | Token::Word(o @ ("and" | "or"))) => Some(*o), Some(Token::Punct(Punct::Comma, o)) if with_comma => Some(*o), _ => None, @@ -97,6 +101,13 @@ impl<'a> Parser<'a> { }) } + fn var(&mut self) -> &'a str { + match self.i.next() { + Some(Token::Word(x)) if x.starts_with('$') => *x, + _ => todo!(), + } + } + pub fn term_with_comma(&mut self, with_comma: bool) -> Term<&'a str> { let head = self.atom(); let mut tail = Vec::new(); @@ -104,10 +115,26 @@ impl<'a> Parser<'a> { tail.push((op, self.atom())); } - if tail.is_empty() { + let tm = if tail.is_empty() { head } else { Term::BinOp(Box::new(head), tail) + }; + + let pipe = self.maybe(|p| match p.i.next() { + Some(Token::Op("|")) => Some(None), + Some(Token::Word("as")) => { + let x = p.var(); + match p.i.next() { + Some(Token::Op("|")) => Some(Some(x)), + _ => None, + } + } + _ => None, + }); + match pipe { + None => tm, + Some(x) => Term::Pipe(Box::new(tm), x, Box::new(self.term_with_comma(with_comma))), } } @@ -116,7 +143,7 @@ impl<'a> Parser<'a> { } fn atom(&mut self) -> Term<&'a str> { - let mut tm = match self.i.next() { + let tm = match self.i.next() { Some(Token::Op("-")) => Term::Neg(Box::new(self.atom())), Some(Token::Word("if")) => { let if_ = self.term(); @@ -145,8 +172,16 @@ impl<'a> Parser<'a> { }); Term::TryCatch(Box::new(try_), catch.map(Box::new)) } - Some(Token::Word("reduce")) => todo!(), - Some(Token::Word("foreach")) => todo!(), + Some(Token::Word(fold @ ("reduce" | "foreach"))) => { + let xs = self.term(); + assert!(matches!(self.i.next(), Some(Token::Word("as")))); + let x = self.var(); + let args = self.args(|p| p.term()); + if args.is_empty() { + todo!() + } + Term::Fold(*fold, Box::new(xs), x, args) + }, Some(Token::Word(id)) if id.starts_with('$') => Term::Var(*id), Some(Token::Word(id)) if !KEYWORDS.contains(id) => { let head = Term::Call(*id, self.args(|p| p.term())); @@ -182,9 +217,11 @@ impl<'a> Parser<'a> { }), _ => todo!(), }; - if matches!(self.opt(), path::Opt::Optional) { - tm = Term::TryCatch(Box::new(tm), None); - } + + let tm = match self.opt() { + path::Opt::Optional => Term::TryCatch(Box::new(tm), None), + path::Opt::Essential => tm, + }; let mut path: Vec<_> = core::iter::from_fn(|| 
self.path_part_opt()).collect(); while self.punct(Punct::Dot).is_some() { From 378539b3018f28c842126f01a9cc000c1a3774c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 16 May 2024 12:16:02 +0200 Subject: [PATCH 024/135] Fix a typo. --- jaq-syn/src/def.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jaq-syn/src/def.rs b/jaq-syn/src/def.rs index 24158fe9b..23b237142 100644 --- a/jaq-syn/src/def.rs +++ b/jaq-syn/src/def.rs @@ -127,6 +127,6 @@ impl Arg { pub struct Main { /// Definitions at the top of the filter pub defs: Vec>, - /// Body of the filter, e.g. `[.[] | f`. + /// Body of the filter, e.g. `[.[] | f]`. pub body: Spanned, } From bbf75f10d66571578748557c3a30bb5d6f0aa7f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 17 May 2024 15:53:56 +0200 Subject: [PATCH 025/135] Parse definitions, less verbose error reporting. --- jaq-parse/src/lib.rs | 3 +- jaq-parse/src/term.rs | 330 ++++++++++++++++++++++++++++++------------ jaq-syn/src/path.rs | 6 + 3 files changed, 246 insertions(+), 93 deletions(-) diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 7f54663d0..338798a49 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -33,7 +33,8 @@ where let (tokens, lex_errs) = crate::lex::Lex::new(src).lex(); let mut new_parser = term::Parser::new(&tokens); - std::println!("{:?}", new_parser.term()); + std::println!("{:?}", new_parser.main()); + std::println!("{:?}", new_parser.e); let tokens: Vec<_> = tokens.into_iter().flat_map(|t| t.tokens(src)).collect(); //std::println!("Tokens: {tokens:?}"); diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index e65224477..678977777 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -1,32 +1,56 @@ use crate::lex::{Punct, Token}; use crate::Delim; use jaq_syn::filter::KeyVal; -use jaq_syn::{path, string}; +use jaq_syn::{path, string, Arg, Call, Def}; +// TODO: problem: we cannot get a position from a None +// example: (0 | ) +// include a scope? type Error<'a> = (Expect, Option<&'a Token<&'a str>>); -enum Expect {} +#[derive(Debug)] +pub enum Expect { + Keyword(&'static str), + Punct(Punct), + Var, + ElseOrEnd, + Term, + Key, + Ident, + Arg, + Nothing, +} + +type Result<'a, T> = core::result::Result>; pub(crate) struct Parser<'a> { i: core::slice::Iter<'a, Token<&'a str>>, - e: Vec>, + pub e: Vec>, + /// names of fold-like filters, e.g. "reduce" and "foreach" + fold: &'a [&'a str], } -#[derive(Debug)] +#[derive(Debug, Default)] pub enum Term { + #[default] + Id, + Recurse, + Num(S), Str(string::Str), Arr(Option>), Obj(Vec>), - Id, - Recurse, + Neg(Box), Pipe(Box, Option, Box), BinOp(Box, Vec<(S, Self)>), - IfThenElse(Box, Box, Option>), - TryCatch(Box, Option>), + Fold(S, Box, S, Vec), - Var(S), + TryCatch(Box, Option>), + IfThenElse(Box, Box, Option>), + Call(S, Vec), + Var(S), + Key(S), Path(Box, Vec<(path::Part, path::Opt)>), } @@ -35,21 +59,38 @@ pub enum Term { /// /// Note that for example `reduce` is not part of this list, /// because it *can* appear at the beginning of an expression. 
-const KEYWORDS: &[&str] = &["include", "import", "def", "as", "and", "or"]; +const KEYWORDS: &[&str] = &[ + "include", "import", "def", "as", "and", "or", "catch", "then", "else", "end", +]; impl<'a> Parser<'a> { pub fn new(i: &'a [Token<&'a str>]) -> Self { Self { i: i.iter(), e: Vec::new(), + fold: &["reduce", "foreach"], } } - fn with(&mut self, tokens: &'a [Token<&'a str>], f: impl FnOnce(&mut Self) -> T) -> T { + fn with(&mut self, tokens: &'a [Token<&'a str>], f: F) -> T + where + F: FnOnce(&mut Self) -> Result<'a, T>, + { let i = core::mem::replace(&mut self.i, tokens.iter()); - let tm = f(self); + let y = match f(self) { + Ok(y) => { + if let Some(next) = self.i.next() { + self.e.push((Expect::Nothing, Some(next))); + } + y + } + Err(e) => { + self.e.push(e); + T::default() + } + }; self.i = i; - tm + y } fn maybe(&mut self, f: impl Fn(&mut Self) -> Option) -> Option { @@ -62,26 +103,41 @@ impl<'a> Parser<'a> { y } - fn sep_by1(&mut self, punct: Punct, f: impl Fn(&mut Self) -> T) -> Vec { - let mut ys = Vec::from([f(self)]); + fn try_maybe(&mut self, f: F) -> Result<'a, Option> + where + F: Fn(&mut Self) -> Result<'a, Option>, + { + let i = self.i.clone(); + let y = f(self)?; + // rewind to previous state in case of non-match + if y.is_none() { + self.i = i; + } + Ok(y) + } + + fn sep_by1(&mut self, punct: Punct, f: F) -> Result<'a, Vec> + where + F: Fn(&mut Self) -> Result<'a, T>, + { + let mut ys = Vec::from([f(self)?]); loop { match self.i.next() { - Some(Token::Punct(p, _)) if *p == punct => ys.push(f(self)), - None => break, - _ => todo!(), + Some(Token::Punct(p, _)) if *p == punct => ys.push(f(self)?), + None => return Ok(ys), + next => return Err((Expect::Punct(punct), next)), } } - ys } - fn args(&mut self, f: impl Fn(&mut Self) -> T + Copy) -> Vec { + fn args(&mut self, f: impl Fn(&mut Self) -> Result<'a, T> + Copy) -> Vec { self.maybe(|p| match p.i.next() { Some(Token::Delim(Delim::Paren, tokens)) => { Some(p.with(tokens, |p| p.sep_by1(Punct::Semicolon, f))) } _ => None, }) - .unwrap_or_else(|| Vec::new()) + .unwrap_or_default() } fn op(&mut self, with_comma: bool) -> Option<&'a str> { @@ -101,18 +157,32 @@ impl<'a> Parser<'a> { }) } - fn var(&mut self) -> &'a str { + fn punct1(&mut self, punct: Punct) -> Result<'a, &'a str> { match self.i.next() { - Some(Token::Word(x)) if x.starts_with('$') => *x, - _ => todo!(), + Some(Token::Punct(p, s)) if *p == punct => Ok(*s), + next => Err((Expect::Punct(punct), next)), } } - pub fn term_with_comma(&mut self, with_comma: bool) -> Term<&'a str> { - let head = self.atom(); + fn keyword(&mut self, kw: &'static str) -> Result<'a, ()> { + match self.i.next() { + Some(Token::Word(w)) if *w == kw => Ok(()), + next => Err((Expect::Keyword(kw), next)), + } + } + + fn var(&mut self) -> Result<'a, &'a str> { + match self.i.next() { + Some(Token::Word(x)) if x.starts_with('$') => Ok(*x), + next => Err((Expect::Var, next)), + } + } + + pub fn term_with_comma(&mut self, with_comma: bool) -> Result<'a, Term<&'a str>> { + let head = self.atom_path()?; let mut tail = Vec::new(); while let Some(op) = self.op(with_comma) { - tail.push((op, self.atom())); + tail.push((op, self.atom_path()?)); } let tm = if tail.is_empty() { @@ -121,67 +191,56 @@ impl<'a> Parser<'a> { Term::BinOp(Box::new(head), tail) }; - let pipe = self.maybe(|p| match p.i.next() { - Some(Token::Op("|")) => Some(None), + let pipe = self.try_maybe(|p| match p.i.next() { + Some(Token::Op("|")) => Ok(Some(None)), Some(Token::Word("as")) => { - let x = p.var(); + let x = p.var()?; 
match p.i.next() { - Some(Token::Op("|")) => Some(Some(x)), - _ => None, + Some(Token::Op("|")) => Ok(Some(Some(x))), + next => Err((todo!(), next)), } } - _ => None, - }); - match pipe { + _ => Ok(None), + })?; + Ok(match pipe { None => tm, - Some(x) => Term::Pipe(Box::new(tm), x, Box::new(self.term_with_comma(with_comma))), - } - } - - pub fn term(&mut self) -> Term<&'a str> { - self.term_with_comma(true) + Some(x) => Term::Pipe(Box::new(tm), x, Box::new(self.term_with_comma(with_comma)?)), + }) } - fn atom(&mut self) -> Term<&'a str> { - let tm = match self.i.next() { - Some(Token::Op("-")) => Term::Neg(Box::new(self.atom())), + fn atom(&mut self) -> Result<'a, Term<&'a str>> { + Ok(match self.i.next() { + Some(Token::Op("-")) => Term::Neg(Box::new(self.atom_path()?)), Some(Token::Word("if")) => { - let if_ = self.term(); - if !matches!(self.i.next(), Some(&Token::Word("then"))) { - todo!(); - } - let then_ = self.term(); + let if_ = self.term()?; + self.keyword("then")?; + let then_ = self.term()?; let else_ = match self.i.next() { Some(Token::Word("else")) => { - let else_ = self.term(); - if !matches!(self.i.next(), Some(&Token::Word("end"))) { - todo!(); - } + let else_ = self.term()?; + self.keyword("end")?; Some(else_) } Some(Token::Word("end")) => None, - _ => todo!(), + next => return Err((Expect::ElseOrEnd, next)), }; Term::IfThenElse(Box::new(if_), Box::new(then_), else_.map(Box::new)) } Some(Token::Word("try")) => { - let try_ = self.atom(); - let catch = self.maybe(|p| match p.i.next() { - Some(Token::Word("catch")) => Some(p.atom()), - _ => None, - }); + let try_ = self.atom_path()?; + let catch = self.try_maybe(|p| match p.i.next() { + Some(Token::Word("catch")) => Ok(Some(p.atom_path()?)), + _ => Ok(None), + })?; Term::TryCatch(Box::new(try_), catch.map(Box::new)) } - Some(Token::Word(fold @ ("reduce" | "foreach"))) => { - let xs = self.term(); - assert!(matches!(self.i.next(), Some(Token::Word("as")))); - let x = self.var(); + Some(Token::Word(fold)) if self.fold.contains(fold) => { + let xs = self.atom_path()?; + self.keyword("as")?; + let x = self.var()?; let args = self.args(|p| p.term()); - if args.is_empty() { - todo!() - } Term::Fold(*fold, Box::new(xs), x, args) - }, + } Some(Token::Word(id)) if id.starts_with('$') => Term::Var(*id), Some(Token::Word(id)) if !KEYWORDS.contains(id) => { let head = Term::Call(*id, self.args(|p| p.term())); @@ -209,14 +268,18 @@ impl<'a> Parser<'a> { } Some(Token::Delim(Delim::Brace, tokens)) if tokens.is_empty() => Term::Obj(Vec::new()), Some(Token::Delim(Delim::Brace, tokens)) => self.with(tokens, |p| { - Term::Obj(p.sep_by1(Punct::Comma, |p| p.obj_entry())) + p.sep_by1(Punct::Comma, |p| p.obj_entry()).map(Term::Obj) }), Some(Token::Str(parts)) => Term::Str(string::Str { fmt: None, parts: self.str_parts(parts), }), - _ => todo!(), - }; + next => return Err((Expect::Term, next)), + }) + } + + fn atom_path(&mut self) -> Result<'a, Term<&'a str>> { + let tm = self.atom()?; let tm = match self.opt() { path::Opt::Optional => Term::TryCatch(Box::new(tm), None), @@ -226,36 +289,45 @@ impl<'a> Parser<'a> { let mut path: Vec<_> = core::iter::from_fn(|| self.path_part_opt()).collect(); while self.punct(Punct::Dot).is_some() { use path::Opt; - let key = self.i.next().and_then(ident_key).unwrap(); + let key = match self.i.next() { + Some(Token::Word(id)) if !id.starts_with(['$', '@']) => Some(*id), + next => return Err((Expect::Key, next)), + }; let opt = self.punct(Punct::Question).is_some(); - let key = 
Term::Str(string::Str::from(key.to_string())); + let key = Term::Str(string::Str::from(key.unwrap_or("").to_string())); let opt = if opt { Opt::Optional } else { Opt::Essential }; path.push((path::Part::Index(key), opt)); path.extend(core::iter::from_fn(|| self.path_part_opt())); } - if path.is_empty() { + Ok(if path.is_empty() { tm } else { Term::Path(Box::new(tm), path) - } + }) } - fn obj_entry(&mut self) -> KeyVal> { - self.maybe(|p| match p.i.next() { + fn term(&mut self) -> Result<'a, Term<&'a str>> { + self.term_with_comma(true) + } + + fn obj_entry(&mut self) -> Result<'a, KeyVal>> { + match self.i.next() { Some(Token::Word(k)) if !k.starts_with(['$', '@']) => { let k = string::Str::from(k.to_string()); - let v = p.punct(Punct::Colon).map(|_| p.term_with_comma(false)); - Some(KeyVal::Str(k, v)) + let v = self + .punct(Punct::Colon) + .map(|_| self.term_with_comma(false)) + .transpose()?; + Ok(KeyVal::Str(k, v)) } - _ => None, - }) - .unwrap_or_else(|| { - let k = self.atom(); - if self.punct(Punct::Colon).is_none() { - todo!() + // TODO: handle $x + Some(Token::Delim(Delim::Paren, tokens)) => { + let k = self.with(tokens, |p| p.term()); + self.punct1(Punct::Colon)?; + Ok(KeyVal::Filter(k, self.term()?)) } - KeyVal::Filter(k, self.term()) - }) + next => Err((todo!(), next)), + } } fn str_parts( @@ -272,24 +344,24 @@ impl<'a> Parser<'a> { parts.collect() } - fn path_part(&mut self) -> path::Part> { + fn path_part(&mut self) -> Result<'a, path::Part>> { use path::Part::{Index, Range}; - if self.i.as_slice().is_empty() { + Ok(if self.i.as_slice().is_empty() { Range(None, None) } else if self.punct(Punct::Colon).is_some() { - Range(None, Some(self.term())) + Range(None, Some(self.term()?)) } else { - let tm = self.term(); + let tm = self.term()?; if self.punct(Punct::Colon).is_some() { if self.i.as_slice().is_empty() { Range(Some(tm), None) } else { - Range(Some(tm), Some(self.term())) + Range(Some(tm), Some(self.term()?)) } } else { Index(tm) } - } + }) } fn path_part_opt(&mut self) -> Option<(path::Part>, path::Opt)> { @@ -307,6 +379,80 @@ impl<'a> Parser<'a> { } opt } + + pub fn main(&mut self) -> Result<'a, Main>> { + Ok(Main { + defs: self.defs()?, + body: self.i.as_slice(), + } + .map(&mut |tokens| self.with(tokens, |p| p.term()))) + } + + pub fn defs(&mut self) -> Result<'a, Vec]>>>> { + core::iter::from_fn(|| self.def_head().map(|()| self.def_tail())).collect() + } + + pub fn def_rhs(&mut self) -> Result<'a, Main<&'a [Token<&'a str>]>> { + let defs = self.defs()?; + let i = self.i.as_slice(); + let is_semicolon = |tk| matches!(tk, &Token::Punct(Punct::Semicolon, _)); + let body = match self.i.position(is_semicolon) { + None => return Err((Expect::Punct(Punct::Semicolon), None)), + Some(p) => &i[..p], + }; + Ok(Main { defs, body }) + } + + fn def_head(&mut self) -> Option<()> { + self.maybe(|p| match p.i.next() { + Some(Token::Word("def")) => Some(()), + _ => None, + }) + } + + fn def_lhs(&mut self) -> Result<'a, Call> { + let name = match self.i.next() { + Some(Token::Word(name)) if !name.starts_with(['$', '@']) => name.to_string(), + next => return Err((Expect::Ident, next)), + }; + let args = self.args(|p| { + Ok(match p.i.next() { + Some(Token::Word(v)) if v.starts_with('$') => Arg::Var(v.to_string()), + Some(Token::Word(arg)) if !arg.starts_with('@') => Arg::Fun(arg.to_string()), + next => return Err((Expect::Arg, next)), + }) + }); + Ok(Call { name, args }) + } + + fn def_tail(&mut self) -> Result<'a, Def]>>> { + let lhs = self.def_lhs()?; + self.punct1(Punct::Colon)?; + 
let rhs = self.def_rhs()?; + + Ok(Def { lhs, rhs }) + } +} + +#[derive(Debug)] +pub struct Main { + /// Definitions at the top of the filter + pub defs: Vec>, + /// Body of the filter, e.g. `[.[] | f]`. + pub body: F, +} + +impl Main { + fn map(self, f: &mut impl FnMut(F) -> G) -> Main { + let defs = self.defs.into_iter().map(|def| Def { + lhs: def.lhs, + rhs: def.rhs.map(f), + }); + Main { + defs: defs.collect(), + body: f(self.body), + } + } } fn ident_key<'a>(token: &Token<&'a str>) -> Option<&'a str> { diff --git a/jaq-syn/src/path.rs b/jaq-syn/src/path.rs index f491a109f..2fd3ff353 100644 --- a/jaq-syn/src/path.rs +++ b/jaq-syn/src/path.rs @@ -16,6 +16,12 @@ pub enum Part { Range(Option, Option), } +impl Default for Part { + fn default() -> Self { + Self::Range(None, None) + } +} + /// Optionality of a path part, i.e. whether `?` is present. /// /// For example, `[] | .a` fails with an error, while `[] | .a?` returns nothing. From b31f014d5a24f300e48884317e55408139a7365b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Sat, 18 May 2024 10:56:00 +0200 Subject: [PATCH 026/135] Properly report next token in blocks. --- jaq-parse/src/lex.rs | 109 ++++++++++++++++----------------------- jaq-parse/src/term.rs | 114 ++++++++++++++++++++++------------------- jaq-parse/src/token.rs | 2 +- 3 files changed, 104 insertions(+), 121 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 943d8c42c..879adec51 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -16,39 +16,9 @@ pub enum Token { /// operator, such as `|` or `+=` Op(S), /// punctuation, such as `.` or `;` - Punct(Punct, S), + Char(S), /// delimited tokens, e.g. `(...)` or `[...]` - Delim(Delim, Vec), -} - -/// Punctuation. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum Punct { - /// `.` - Dot, - /// `..` - DotDot, - /// `?` - Question, - /// `,` - Comma, - /// `:` - Colon, - /// `;` - Semicolon, -} - -impl Punct { - fn as_str(self) -> &'static str { - match self { - Self::Dot => ".", - Self::DotDot => "..", - Self::Question => "?", - Self::Comma => ",", - Self::Colon => ":", - Self::Semicolon => ";", - } - } + Block(S, Vec), } #[derive(Clone, Debug)] @@ -110,6 +80,12 @@ impl<'a> Lex<'a> { Some(c) } + fn take(&mut self, len: usize) -> &'a str { + let (head, tail) = self.i.split_at(len); + self.i = tail; + head + } + fn trim(&mut self, f: impl FnMut(char) -> bool) { self.i = self.i.trim_start_matches(f); } @@ -224,12 +200,6 @@ impl<'a> Lex<'a> { } } - fn punct(&mut self, p: Punct) -> Token<&'a str> { - let (s, after) = self.i.split_at(p.as_str().len()); - self.i = after; - Token::Punct(p, s) - } - fn token(&mut self) -> Option> { self.space(); @@ -241,12 +211,8 @@ impl<'a> Lex<'a> { '$' | '@' => Token::Word(self.consumed(chars, |lex| lex.ident1())), '0'..='9' => Token::Num(self.consumed(chars, |lex| lex.num())), c if is_op(c) => Token::Op(self.consumed(chars, |lex| lex.trim(is_op))), - '.' if chars.next() == Some('.') => self.punct(Punct::DotDot), - '.' => self.punct(Punct::Dot), - ':' => self.punct(Punct::Colon), - ';' => self.punct(Punct::Semicolon), - ',' => self.punct(Punct::Comma), - '?' => self.punct(Punct::Question), + '.' if chars.next() == Some('.') => Token::Char(self.take(2)), + '.' | ':' | ';' | ',' | '?' => Token::Char(self.take(1)), '"' => Token::Str(self.str()), '(' | '[' | '{' => self.delim(), _ => return None, @@ -262,21 +228,23 @@ impl<'a> Lex<'a> { /// The input string has to start with either '(', '[', or '{'. 
fn delim(&mut self) -> Token<&'a str> { let start = self.i; - let delim = match self.next() { - Some('(') => Delim::Paren, - Some('[') => Delim::Brack, - Some('{') => Delim::Brace, + let open = &self.i[..1]; + let close = match self.next() { + Some('(') => ')', + Some('[') => ']', + Some('{') => '}', _ => panic!(), }; - let tokens = self.tokens(); + let mut tokens = self.tokens(); self.space(); - if let Some(rest) = self.i.strip_prefix(delim.close()) { + if let Some(rest) = self.i.strip_prefix(close) { + tokens.push(Token::Char(&self.i[..1])); self.i = rest } else { self.e.push((Expect::Delim(start), self.i)); } - Token::Delim(delim, tokens) + Token::Block(open, tokens) } } @@ -321,21 +289,30 @@ impl<'a> Token<&'a str> { ))), Self::Num(n) => Box::new(once((OToken::Num(n.to_string()), span(i, n)))), Self::Op(o) => Box::new(once((OToken::Op(o.to_string()), span(i, o)))), - Self::Punct(p, s) => Box::new(once(( - match p { - Punct::Dot => OToken::Dot, - Punct::DotDot => OToken::DotDot, - Punct::Question => OToken::Question, - Punct::Comma => OToken::Comma, - Punct::Colon => OToken::Colon, - Punct::Semicolon => OToken::Semicolon, - }, - span(i, s), - ))), - Self::Delim(delim, tokens) => { - let init = once((OToken::Open(delim), 0..0)); - let last = once((OToken::Close(delim), 0..0)); - Box::new(init.chain(tokens.into_iter().flat_map(|t| t.tokens(i)).chain(last))) + Self::Char(c) => { + let token = match c { + ".." => OToken::DotDot, + "." => OToken::Dot, + "?" => OToken::Question, + "," => OToken::Comma, + ":" => OToken::Colon, + ";" => OToken::Semicolon, + ")" => OToken::Close(Delim::Paren), + "]" => OToken::Close(Delim::Brack), + "}" => OToken::Close(Delim::Brace), + _ => panic!("{}", c), + }; + Box::new(once((token, span(i, c)))) + } + Self::Block(open, tokens) => { + let delim = match open { + "(" => Delim::Paren, + "[" => Delim::Brack, + "{" => Delim::Brace, + _ => panic!(), + }; + let init = once((OToken::Open(delim), span(i, open))); + Box::new(init.chain(tokens.into_iter().flat_map(|t| t.tokens(i)))) } Self::Str(parts) => { let quote = once((OToken::Quote, 0..0)); diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 678977777..25f327719 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -1,16 +1,12 @@ -use crate::lex::{Punct, Token}; -use crate::Delim; +use crate::lex::Token; use jaq_syn::filter::KeyVal; use jaq_syn::{path, string, Arg, Call, Def}; -// TODO: problem: we cannot get a position from a None -// example: (0 | ) -// include a scope? 
type Error<'a> = (Expect, Option<&'a Token<&'a str>>); #[derive(Debug)] pub enum Expect { Keyword(&'static str), - Punct(Punct), + Char(char), Var, ElseOrEnd, Term, @@ -72,15 +68,21 @@ impl<'a> Parser<'a> { } } - fn with(&mut self, tokens: &'a [Token<&'a str>], f: F) -> T + fn with(&mut self, tokens: &'a [Token<&'a str>], last: &'a str, f: F) -> T where F: FnOnce(&mut Self) -> Result<'a, T>, { let i = core::mem::replace(&mut self.i, tokens.iter()); let y = match f(self) { Ok(y) => { - if let Some(next) = self.i.next() { - self.e.push((Expect::Nothing, Some(next))); + match (self.i.as_slice(), last) { + ([], "") => (), + ([], _) => panic!(), + ([next, ..], "") => self.e.push((Expect::Nothing, Some(next))), + ([Token::Char(c)], last) if *c == last => (), + ([next, ..], last) => self + .e + .push((Expect::Char(last.chars().next().unwrap()), Some(next))), } y } @@ -116,25 +118,23 @@ impl<'a> Parser<'a> { Ok(y) } - fn sep_by1(&mut self, punct: Punct, f: F) -> Result<'a, Vec> + fn sep_by1(&mut self, sep: char, f: F) -> Result<'a, Vec> where F: Fn(&mut Self) -> Result<'a, T>, { let mut ys = Vec::from([f(self)?]); loop { match self.i.next() { - Some(Token::Punct(p, _)) if *p == punct => ys.push(f(self)?), - None => return Ok(ys), - next => return Err((Expect::Punct(punct), next)), + Some(Token::Char(c)) if c.chars().eq([sep]) => ys.push(f(self)?), + Some(Token::Char(")" | "}")) => return Ok(ys), + next => return Err((Expect::Char(sep), next)), } } } fn args(&mut self, f: impl Fn(&mut Self) -> Result<'a, T> + Copy) -> Vec { self.maybe(|p| match p.i.next() { - Some(Token::Delim(Delim::Paren, tokens)) => { - Some(p.with(tokens, |p| p.sep_by1(Punct::Semicolon, f))) - } + Some(Token::Block("(", tokens)) => Some(p.with(tokens, "", |p| p.sep_by1(';', f))), _ => None, }) .unwrap_or_default() @@ -145,22 +145,22 @@ impl<'a> Parser<'a> { // handle pipe directly in `term()` Some(Token::Op("|")) => None, Some(Token::Op(o) | Token::Word(o @ ("and" | "or"))) => Some(*o), - Some(Token::Punct(Punct::Comma, o)) if with_comma => Some(*o), + Some(Token::Char(o @ ",")) if with_comma => Some(*o), _ => None, }) } - fn punct(&mut self, punct: Punct) -> Option<&'a str> { + fn char0(&mut self, c: char) -> Option<&'a str> { self.maybe(|p| match p.i.next() { - Some(Token::Punct(p, s)) if *p == punct => Some(*s), + Some(Token::Char(s)) if s.chars().eq([c]) => Some(*s), _ => None, }) } - fn punct1(&mut self, punct: Punct) -> Result<'a, &'a str> { + fn char1(&mut self, c: char) -> Result<'a, &'a str> { match self.i.next() { - Some(Token::Punct(p, s)) if *p == punct => Ok(*s), - next => Err((Expect::Punct(punct), next)), + Some(Token::Char(s)) if s.chars().eq([c]) => Ok(*s), + next => Err((Expect::Char(c), next)), } } @@ -197,7 +197,7 @@ impl<'a> Parser<'a> { let x = p.var()?; match p.i.next() { Some(Token::Op("|")) => Ok(Some(Some(x))), - next => Err((todo!(), next)), + next => Err((Expect::Char('|'), next)), } } _ => Ok(None), @@ -256,19 +256,23 @@ impl<'a> Parser<'a> { }), } } - Some(Token::Punct(Punct::Dot, _)) => self + Some(Token::Char(".")) => self .maybe(|p| p.i.next().and_then(ident_key)) .map_or(Term::Id, Term::Key), - Some(Token::Punct(Punct::DotDot, _)) => Term::Recurse, + Some(Token::Char("..")) => Term::Recurse, Some(Token::Num(n)) => Term::Num(*n), - Some(Token::Delim(Delim::Paren, tokens)) => self.with(tokens, |p| p.term()), - Some(Token::Delim(Delim::Brack, tokens)) if tokens.is_empty() => Term::Arr(None), - Some(Token::Delim(Delim::Brack, tokens)) => { - Term::Arr(Some(Box::new(self.with(tokens, |p| 
p.term())))) + Some(Token::Block("[", tokens)) if matches!(tokens[..], [Token::Char("]")]) => { + Term::Arr(None) + } + Some(Token::Block("{", tokens)) if matches!(tokens[..], [Token::Char("}")]) => { + Term::Obj(Vec::new()) + } + Some(Token::Block("(", tokens)) => self.with(tokens, ")", |p| p.term()), + Some(Token::Block("[", tokens)) => { + Term::Arr(Some(Box::new(self.with(tokens, "]", |p| p.term())))) } - Some(Token::Delim(Delim::Brace, tokens)) if tokens.is_empty() => Term::Obj(Vec::new()), - Some(Token::Delim(Delim::Brace, tokens)) => self.with(tokens, |p| { - p.sep_by1(Punct::Comma, |p| p.obj_entry()).map(Term::Obj) + Some(Token::Block("{", tokens)) => self.with(tokens, "", |p| { + p.sep_by1(',', |p| p.obj_entry()).map(Term::Obj) }), Some(Token::Str(parts)) => Term::Str(string::Str { fmt: None, @@ -287,13 +291,13 @@ impl<'a> Parser<'a> { }; let mut path: Vec<_> = core::iter::from_fn(|| self.path_part_opt()).collect(); - while self.punct(Punct::Dot).is_some() { + while self.char0('.').is_some() { use path::Opt; let key = match self.i.next() { Some(Token::Word(id)) if !id.starts_with(['$', '@']) => Some(*id), next => return Err((Expect::Key, next)), }; - let opt = self.punct(Punct::Question).is_some(); + let opt = self.char0('?').is_some(); let key = Term::Str(string::Str::from(key.unwrap_or("").to_string())); let opt = if opt { Opt::Optional } else { Opt::Essential }; path.push((path::Part::Index(key), opt)); @@ -315,18 +319,18 @@ impl<'a> Parser<'a> { Some(Token::Word(k)) if !k.starts_with(['$', '@']) => { let k = string::Str::from(k.to_string()); let v = self - .punct(Punct::Colon) + .char0(':') .map(|_| self.term_with_comma(false)) .transpose()?; Ok(KeyVal::Str(k, v)) } // TODO: handle $x - Some(Token::Delim(Delim::Paren, tokens)) => { - let k = self.with(tokens, |p| p.term()); - self.punct1(Punct::Colon)?; + Some(Token::Block("(", tokens)) => { + let k = self.with(tokens, ")", |p| p.term()); + self.char1(':')?; Ok(KeyVal::Filter(k, self.term()?)) } - next => Err((todo!(), next)), + next => Err((Expect::Key, next)), } } @@ -336,8 +340,8 @@ impl<'a> Parser<'a> { ) -> Vec>> { let parts = parts.iter().map(|part| match part { string::Part::Str(s) => string::Part::Str(s.clone()), - string::Part::Fun(Token::Delim(Delim::Paren, tokens)) => { - string::Part::Fun(self.with(tokens, |p| p.term())) + string::Part::Fun(Token::Block("(", tokens)) => { + string::Part::Fun(self.with(tokens, ")", |p| p.term())) } string::Part::Fun(_) => unreachable!(), }); @@ -346,14 +350,15 @@ impl<'a> Parser<'a> { fn path_part(&mut self) -> Result<'a, path::Part>> { use path::Part::{Index, Range}; - Ok(if self.i.as_slice().is_empty() { + let done = |p: &Self| matches!(p.i.as_slice(), [Token::Char("]")]); + Ok(if done(self) { Range(None, None) - } else if self.punct(Punct::Colon).is_some() { + } else if self.char0(':').is_some() { Range(None, Some(self.term()?)) } else { let tm = self.term()?; - if self.punct(Punct::Colon).is_some() { - if self.i.as_slice().is_empty() { + if self.char0(':').is_some() { + if done(self) { Range(Some(tm), None) } else { Range(Some(tm), Some(self.term()?)) @@ -366,7 +371,7 @@ impl<'a> Parser<'a> { fn path_part_opt(&mut self) -> Option<(path::Part>, path::Opt)> { let part = self.maybe(|p| match p.i.next() { - Some(Token::Delim(Delim::Brack, tokens)) => Some(p.with(&tokens, |p| p.path_part())), + Some(Token::Block("[", tokens)) => Some(p.with(&tokens, "]", |p| p.path_part())), _ => None, })?; Some((part, self.opt())) @@ -374,18 +379,20 @@ impl<'a> Parser<'a> { fn opt(&mut self) -> 
path::Opt { let mut opt = path::Opt::Essential; - while self.punct(Punct::Question).is_some() { + while self.char0('?').is_some() { opt = path::Opt::Optional; } opt } pub fn main(&mut self) -> Result<'a, Main>> { - Ok(Main { + let mut main = Main { defs: self.defs()?, body: self.i.as_slice(), } - .map(&mut |tokens| self.with(tokens, |p| p.term()))) + .map(&mut |tokens| (tokens, ";")); + main.body.1 = ""; + Ok(main.map(&mut |(tokens, last)| self.with(tokens, last, |p| p.term()))) } pub fn defs(&mut self) -> Result<'a, Vec]>>>> { @@ -395,10 +402,9 @@ impl<'a> Parser<'a> { pub fn def_rhs(&mut self) -> Result<'a, Main<&'a [Token<&'a str>]>> { let defs = self.defs()?; let i = self.i.as_slice(); - let is_semicolon = |tk| matches!(tk, &Token::Punct(Punct::Semicolon, _)); - let body = match self.i.position(is_semicolon) { - None => return Err((Expect::Punct(Punct::Semicolon), None)), - Some(p) => &i[..p], + let body = match self.i.position(|tk| matches!(tk, &Token::Char(";"))) { + None => return Err((Expect::Char(';'), None)), + Some(p) => &i[..p + 1], }; Ok(Main { defs, body }) } @@ -427,7 +433,7 @@ impl<'a> Parser<'a> { fn def_tail(&mut self) -> Result<'a, Def]>>> { let lhs = self.def_lhs()?; - self.punct1(Punct::Colon)?; + self.char1(':')?; let rhs = self.def_rhs()?; Ok(Def { lhs, rhs }) diff --git a/jaq-parse/src/token.rs b/jaq-parse/src/token.rs index 8c0ecb450..116bb3cae 100644 --- a/jaq-parse/src/token.rs +++ b/jaq-parse/src/token.rs @@ -18,7 +18,7 @@ impl Delim { } } - pub(crate) fn close(self) -> char { + fn close(self) -> char { match self { Self::Paren => ')', Self::Brack => ']', From 28a4994aefe664bdfd1b3994af81e6bc787d0983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 21 May 2024 17:43:35 +0200 Subject: [PATCH 027/135] Lex `?//` operator. --- jaq-parse/src/lex.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 879adec51..4a68e19f1 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -211,6 +211,9 @@ impl<'a> Lex<'a> { '$' | '@' => Token::Word(self.consumed(chars, |lex| lex.ident1())), '0'..='9' => Token::Num(self.consumed(chars, |lex| lex.num())), c if is_op(c) => Token::Op(self.consumed(chars, |lex| lex.trim(is_op))), + '?' if (chars.next(), chars.next()) == (Some('/'), Some('/')) => { + Token::Op(self.take(3)) + } '.' if chars.next() == Some('.') => Token::Char(self.take(2)), '.' | ':' | ';' | ',' | '?' => Token::Char(self.take(1)), '"' => Token::Str(self.str()), From fef8322abb2cd2f16a716fad8856808e24a04d01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 21 May 2024 17:44:34 +0200 Subject: [PATCH 028/135] Simplify definitions. 
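
Rough sketch of the new shape (illustrative only; field names as in the diff below, example values approximate): a definition such as `def fst($x; $y): $x;` should now be captured directly as

    Def { name: "fst", args: ["$x", "$y"], body: /* right-hand side up to the `;` */ }

i.e. plain `name`/`args` fields replace the former `Call`-based left-hand side, and `body` is whatever `def_rhs` yields for the tokens before the terminating `;`.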
--- jaq-parse/src/term.rs | 44 ++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 25f327719..74f19af46 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -1,6 +1,6 @@ use crate::lex::Token; use jaq_syn::filter::KeyVal; -use jaq_syn::{path, string, Arg, Call, Def}; +use jaq_syn::{path, string}; type Error<'a> = (Expect, Option<&'a Token<&'a str>>); #[derive(Debug)] @@ -385,7 +385,7 @@ impl<'a> Parser<'a> { opt } - pub fn main(&mut self) -> Result<'a, Main>> { + pub fn main(&mut self) -> Result<'a, Main<&'a str, Term<&'a str>>> { let mut main = Main { defs: self.defs()?, body: self.i.as_slice(), @@ -395,11 +395,11 @@ impl<'a> Parser<'a> { Ok(main.map(&mut |(tokens, last)| self.with(tokens, last, |p| p.term()))) } - pub fn defs(&mut self) -> Result<'a, Vec]>>>> { + pub fn defs(&mut self) -> Result<'a, Vec]>>>> { core::iter::from_fn(|| self.def_head().map(|()| self.def_tail())).collect() } - pub fn def_rhs(&mut self) -> Result<'a, Main<&'a [Token<&'a str>]>> { + pub fn def_rhs(&mut self) -> Result<'a, Main<&'a str, &'a [Token<&'a str>]>> { let defs = self.defs()?; let i = self.i.as_slice(); let body = match self.i.position(|tk| matches!(tk, &Token::Char(";"))) { @@ -416,43 +416,45 @@ impl<'a> Parser<'a> { }) } - fn def_lhs(&mut self) -> Result<'a, Call> { + fn def_tail(&mut self) -> Result<'a, Def<&'a str, Main<&'a str, &'a [Token<&'a str>]>>> { let name = match self.i.next() { - Some(Token::Word(name)) if !name.starts_with(['$', '@']) => name.to_string(), + Some(Token::Word(name)) if !name.starts_with(['$', '@']) => name, next => return Err((Expect::Ident, next)), }; let args = self.args(|p| { Ok(match p.i.next() { - Some(Token::Word(v)) if v.starts_with('$') => Arg::Var(v.to_string()), - Some(Token::Word(arg)) if !arg.starts_with('@') => Arg::Fun(arg.to_string()), + Some(Token::Word(arg)) if !arg.starts_with('@') => *arg, next => return Err((Expect::Arg, next)), }) }); - Ok(Call { name, args }) - } - - fn def_tail(&mut self) -> Result<'a, Def]>>> { - let lhs = self.def_lhs()?; self.char1(':')?; - let rhs = self.def_rhs()?; + let body = self.def_rhs()?; - Ok(Def { lhs, rhs }) + Ok(Def { name, args, body }) } } #[derive(Debug)] -pub struct Main { +pub struct Main { /// Definitions at the top of the filter - pub defs: Vec>, + pub defs: Vec>, /// Body of the filter, e.g. `[.[] | f]`. pub body: F, } -impl Main { - fn map(self, f: &mut impl FnMut(F) -> G) -> Main { +#[derive(Debug)] +pub struct Def { + name: S, + args: Vec, + body: F, +} + +impl Main { + fn map(self, f: &mut impl FnMut(F) -> G) -> Main { let defs = self.defs.into_iter().map(|def| Def { - lhs: def.lhs, - rhs: def.rhs.map(f), + name: def.name, + args: def.args, + body: def.body.map(f), }); Main { defs: defs.collect(), From 022d913d528e652d3ad0b4538212af2e296e0aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 21 May 2024 18:16:54 +0200 Subject: [PATCH 029/135] Definitions inside terms. 
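Definitions are now picked up wherever a term may start, and their bodies are parsed as terms right away instead of being deferred as token slices. For illustration, a filter along the lines of

    def sq: . * .; [.[] | sq]

parses to a `Term::Def(defs, body)`, and the same works nested inside brackets and parentheses.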
--- jaq-parse/src/term.rs | 60 +++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 74f19af46..ac7458959 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -44,6 +44,7 @@ pub enum Term { TryCatch(Box, Option>), IfThenElse(Box, Box, Option>), + Def(Vec>, Box), Call(S, Vec), Var(S), @@ -179,6 +180,8 @@ impl<'a> Parser<'a> { } pub fn term_with_comma(&mut self, with_comma: bool) -> Result<'a, Term<&'a str>> { + let defs = self.defs()?; + let head = self.atom_path()?; let mut tail = Vec::new(); while let Some(op) = self.op(with_comma) { @@ -202,10 +205,17 @@ impl<'a> Parser<'a> { } _ => Ok(None), })?; - Ok(match pipe { + let tm = match pipe { None => tm, Some(x) => Term::Pipe(Box::new(tm), x, Box::new(self.term_with_comma(with_comma)?)), - }) + }; + let tm = if defs.is_empty() { + tm + } else { + Term::Def(defs, Box::new(tm)) + }; + + Ok(tm) } fn atom(&mut self) -> Result<'a, Term<&'a str>> { @@ -386,29 +396,16 @@ impl<'a> Parser<'a> { } pub fn main(&mut self) -> Result<'a, Main<&'a str, Term<&'a str>>> { - let mut main = Main { + Ok(Main { defs: self.defs()?, - body: self.i.as_slice(), - } - .map(&mut |tokens| (tokens, ";")); - main.body.1 = ""; - Ok(main.map(&mut |(tokens, last)| self.with(tokens, last, |p| p.term()))) + body: self.term()?, + }) } - pub fn defs(&mut self) -> Result<'a, Vec]>>>> { + pub fn defs(&mut self) -> Result<'a, Vec>>> { core::iter::from_fn(|| self.def_head().map(|()| self.def_tail())).collect() } - pub fn def_rhs(&mut self) -> Result<'a, Main<&'a str, &'a [Token<&'a str>]>> { - let defs = self.defs()?; - let i = self.i.as_slice(); - let body = match self.i.position(|tk| matches!(tk, &Token::Char(";"))) { - None => return Err((Expect::Char(';'), None)), - Some(p) => &i[..p + 1], - }; - Ok(Main { defs, body }) - } - fn def_head(&mut self) -> Option<()> { self.maybe(|p| match p.i.next() { Some(Token::Word("def")) => Some(()), @@ -416,7 +413,7 @@ impl<'a> Parser<'a> { }) } - fn def_tail(&mut self) -> Result<'a, Def<&'a str, Main<&'a str, &'a [Token<&'a str>]>>> { + fn def_tail(&mut self) -> Result<'a, Def<&'a str, Term<&'a str>>> { let name = match self.i.next() { Some(Token::Word(name)) if !name.starts_with(['$', '@']) => name, next => return Err((Expect::Ident, next)), @@ -428,7 +425,12 @@ impl<'a> Parser<'a> { }) }); self.char1(':')?; - let body = self.def_rhs()?; + + let body = self.term()?; + match self.i.next() { + Some(Token::Char(";")) => (), + next => return Err((Expect::Char(';'), next)), + }; Ok(Def { name, args, body }) } @@ -437,7 +439,7 @@ impl<'a> Parser<'a> { #[derive(Debug)] pub struct Main { /// Definitions at the top of the filter - pub defs: Vec>, + pub defs: Vec>, /// Body of the filter, e.g. `[.[] | f]`. pub body: F, } @@ -449,20 +451,6 @@ pub struct Def { body: F, } -impl Main { - fn map(self, f: &mut impl FnMut(F) -> G) -> Main { - let defs = self.defs.into_iter().map(|def| Def { - name: def.name, - args: def.args, - body: def.body.map(f), - }); - Main { - defs: defs.collect(), - body: f(self.body), - } - } -} - fn ident_key<'a>(token: &Token<&'a str>) -> Option<&'a str> { match token { Token::Word(id) if !id.starts_with(['$', '@']) => Some(*id), From 458333a128bb7e5c1b9c24984cb090e1da57b95d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 00:01:20 +0200 Subject: [PATCH 030/135] Remove `Main`; restrict object construction. 
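With `Main` gone, a whole filter is just a `Term`: a program like

    def f: . + 1; [.[] | f]

is a single `Term::Def(defs, body)` rather than a separate defs/body pair. The restriction on object construction comes from moving the `defs()` call from `term_with_comma` up into `term()`, so a bare `def` is presumably no longer accepted in comma-restricted positions such as object entry values.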
--- jaq-parse/src/lib.rs | 2 +- jaq-parse/src/term.rs | 40 ++++++++++++---------------------------- 2 files changed, 13 insertions(+), 29 deletions(-) diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 338798a49..37aa8dd7a 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -33,7 +33,7 @@ where let (tokens, lex_errs) = crate::lex::Lex::new(src).lex(); let mut new_parser = term::Parser::new(&tokens); - std::println!("{:?}", new_parser.main()); + std::println!("{:?}", new_parser.term()); std::println!("{:?}", new_parser.e); let tokens: Vec<_> = tokens.into_iter().flat_map(|t| t.tokens(src)).collect(); diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index ac7458959..1ec0e7e66 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -180,8 +180,6 @@ impl<'a> Parser<'a> { } pub fn term_with_comma(&mut self, with_comma: bool) -> Result<'a, Term<&'a str>> { - let defs = self.defs()?; - let head = self.atom_path()?; let mut tail = Vec::new(); while let Some(op) = self.op(with_comma) { @@ -205,17 +203,10 @@ impl<'a> Parser<'a> { } _ => Ok(None), })?; - let tm = match pipe { + Ok(match pipe { None => tm, Some(x) => Term::Pipe(Box::new(tm), x, Box::new(self.term_with_comma(with_comma)?)), - }; - let tm = if defs.is_empty() { - tm - } else { - Term::Def(defs, Box::new(tm)) - }; - - Ok(tm) + }) } fn atom(&mut self) -> Result<'a, Term<&'a str>> { @@ -320,8 +311,15 @@ impl<'a> Parser<'a> { }) } - fn term(&mut self) -> Result<'a, Term<&'a str>> { - self.term_with_comma(true) + pub fn term(&mut self) -> Result<'a, Term<&'a str>> { + let defs = self.defs()?; + let tm = self.term_with_comma(true)?; + + Ok(if defs.is_empty() { + tm + } else { + Term::Def(defs, Box::new(tm)) + }) } fn obj_entry(&mut self) -> Result<'a, KeyVal>> { @@ -395,13 +393,6 @@ impl<'a> Parser<'a> { opt } - pub fn main(&mut self) -> Result<'a, Main<&'a str, Term<&'a str>>> { - Ok(Main { - defs: self.defs()?, - body: self.term()?, - }) - } - pub fn defs(&mut self) -> Result<'a, Vec>>> { core::iter::from_fn(|| self.def_head().map(|()| self.def_tail())).collect() } @@ -436,18 +427,11 @@ impl<'a> Parser<'a> { } } -#[derive(Debug)] -pub struct Main { - /// Definitions at the top of the filter - pub defs: Vec>, - /// Body of the filter, e.g. `[.[] | f]`. - pub body: F, -} - #[derive(Debug)] pub struct Def { name: S, args: Vec, + /// Body of the filter, e.g. `[.[] | f]`. body: F, } From 50872381c1fcba235c104f3b3b8a2e5f3986e449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 09:51:24 +0200 Subject: [PATCH 031/135] Implement `elif`. --- jaq-parse/src/term.rs | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 1ec0e7e66..6a0f20c72 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -42,7 +42,7 @@ pub enum Term { Fold(S, Box, S, Vec), TryCatch(Box, Option>), - IfThenElse(Box, Box, Option>), + IfThenElse(Vec<(Self, Self)>, Option>), Def(Vec>, Box), Call(S, Vec), @@ -57,7 +57,7 @@ pub enum Term { /// Note that for example `reduce` is not part of this list, /// because it *can* appear at the beginning of an expression. 
const KEYWORDS: &[&str] = &[ - "include", "import", "def", "as", "and", "or", "catch", "then", "else", "end", + "include", "import", "def", "as", "and", "or", "catch", "then", "elif", "else", "end", ]; impl<'a> Parser<'a> { @@ -213,19 +213,25 @@ impl<'a> Parser<'a> { Ok(match self.i.next() { Some(Token::Op("-")) => Term::Neg(Box::new(self.atom_path()?)), Some(Token::Word("if")) => { - let if_ = self.term()?; - self.keyword("then")?; - let then_ = self.term()?; - let else_ = match self.i.next() { - Some(Token::Word("else")) => { - let else_ = self.term()?; - self.keyword("end")?; - Some(else_) + let if_then = |p: &mut Self| { + let if_ = p.term()?; + p.keyword("then")?; + Ok((if_, p.term()?)) + }; + let mut if_thens = Vec::from([if_then(self)?]); + let else_ = loop { + match self.i.next() { + Some(Token::Word("elif")) => if_thens.push(if_then(self)?), + Some(Token::Word("else")) => { + let else_ = self.term()?; + self.keyword("end")?; + break Some(else_); + } + Some(Token::Word("end")) => break None, + next => return Err((Expect::ElseOrEnd, next)), } - Some(Token::Word("end")) => None, - next => return Err((Expect::ElseOrEnd, next)), }; - Term::IfThenElse(Box::new(if_), Box::new(then_), else_.map(Box::new)) + Term::IfThenElse(if_thens, else_.map(Box::new)) } Some(Token::Word("try")) => { let try_ = self.atom_path()?; @@ -437,7 +443,7 @@ pub struct Def { fn ident_key<'a>(token: &Token<&'a str>) -> Option<&'a str> { match token { - Token::Word(id) if !id.starts_with(['$', '@']) => Some(*id), + Token::Word(id) if !id.starts_with(['$', '@']) && !KEYWORDS.contains(id) => Some(*id), _ => None, } } From bb7452c350f0e812e4078e56aef7ca115577eac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 10:46:05 +0200 Subject: [PATCH 032/135] More powerful object construction. --- jaq-parse/src/term.rs | 57 +++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 6a0f20c72..8ed9051a0 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -13,12 +13,13 @@ pub enum Expect { Key, Ident, Arg, + Str, Nothing, } type Result<'a, T> = core::result::Result>; -pub(crate) struct Parser<'a> { +pub struct Parser<'a> { i: core::slice::Iter<'a, Token<&'a str>>, pub e: Vec>, /// names of fold-like filters, e.g. 
"reduce" and "foreach" @@ -32,9 +33,9 @@ pub enum Term { Recurse, Num(S), - Str(string::Str), + Str(Option, Vec>), Arr(Option>), - Obj(Vec>), + Obj(Vec<(Self, Option)>), Neg(Box), Pipe(Box, Option, Box), @@ -249,20 +250,19 @@ impl<'a> Parser<'a> { Term::Fold(*fold, Box::new(xs), x, args) } Some(Token::Word(id)) if id.starts_with('$') => Term::Var(*id), - Some(Token::Word(id)) if !KEYWORDS.contains(id) => { - let head = Term::Call(*id, self.args(|p| p.term())); + Some(Token::Word(id)) if id.starts_with('@') => { let s = self.maybe(|p| match p.i.next() { Some(Token::Str(parts)) if id.starts_with('@') => Some(p.str_parts(parts)), _ => None, }); match s { - None => head, - Some(parts) => Term::Str(string::Str { - fmt: Some(Box::new(head)), - parts, - }), + None => Term::Call(*id, Vec::new()), + Some(parts) => Term::Str(Some(id), parts), } } + Some(Token::Word(id)) if !KEYWORDS.contains(id) => { + Term::Call(*id, self.args(|p| p.term())) + } Some(Token::Char(".")) => self .maybe(|p| p.i.next().and_then(ident_key)) .map_or(Term::Id, Term::Key), @@ -281,10 +281,7 @@ impl<'a> Parser<'a> { Some(Token::Block("{", tokens)) => self.with(tokens, "", |p| { p.sep_by1(',', |p| p.obj_entry()).map(Term::Obj) }), - Some(Token::Str(parts)) => Term::Str(string::Str { - fmt: None, - parts: self.str_parts(parts), - }), + Some(Token::Str(parts)) => Term::Str(None, self.str_parts(parts)), next => return Err((Expect::Term, next)), }) } @@ -301,11 +298,11 @@ impl<'a> Parser<'a> { while self.char0('.').is_some() { use path::Opt; let key = match self.i.next() { - Some(Token::Word(id)) if !id.starts_with(['$', '@']) => Some(*id), + Some(Token::Word(id)) if !id.starts_with(['$', '@']) => *id, next => return Err((Expect::Key, next)), }; let opt = self.char0('?').is_some(); - let key = Term::Str(string::Str::from(key.unwrap_or("").to_string())); + let key = Term::Str(None, Vec::from([string::Part::Str(key.to_string())])); let opt = if opt { Opt::Optional } else { Opt::Essential }; path.push((path::Part::Index(key), opt)); path.extend(core::iter::from_fn(|| self.path_part_opt())); @@ -328,24 +325,26 @@ impl<'a> Parser<'a> { }) } - fn obj_entry(&mut self) -> Result<'a, KeyVal>> { - match self.i.next() { - Some(Token::Word(k)) if !k.starts_with(['$', '@']) => { - let k = string::Str::from(k.to_string()); - let v = self - .char0(':') - .map(|_| self.term_with_comma(false)) - .transpose()?; - Ok(KeyVal::Str(k, v)) + fn obj_entry(&mut self) -> Result<'a, (Term<&'a str>, Option>)> { + let key = match self.i.next() { + Some(Token::Str(parts)) => Term::Str(None, self.str_parts(parts)), + Some(Token::Word(k)) if k.starts_with('@') => match self.i.next() { + Some(Token::Str(parts)) => Term::Str(Some(*k), self.str_parts(parts)), + next => return Err((Expect::Str, next)), + }, + Some(Token::Word(k)) if k.starts_with('$') => Term::Var(*k), + Some(Token::Word(k)) if !!KEYWORDS.contains(k) => { + Term::Str(None, Vec::from([string::Part::Str(k.to_string())])) } - // TODO: handle $x Some(Token::Block("(", tokens)) => { let k = self.with(tokens, ")", |p| p.term()); self.char1(':')?; - Ok(KeyVal::Filter(k, self.term()?)) + return Ok((k, Some(self.term()?))); } - next => Err((Expect::Key, next)), - } + next => return Err((Expect::Key, next)), + }; + let v = self.char0(':').map(|_| self.term_with_comma(false)); + Ok((key, v.transpose()?)) } fn str_parts( From e5e6d945f480e0f80c7f1016852b6c719504ea96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 11:39:07 +0200 Subject: [PATCH 033/135] 
String parsing with non-owned strings. --- jaq-parse/src/lex.rs | 50 +++++++++++++++++++++++++------------------ jaq-parse/src/term.rs | 22 ++++++++++--------- 2 files changed, 41 insertions(+), 31 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 4a68e19f1..35be07c7a 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -4,6 +4,14 @@ use alloc::vec::Vec; use jaq_syn::string::Part; use jaq_syn::{Span, Spanned}; +#[derive(Debug)] +pub enum StrPart { + Str(S), + Filter(F), + Char(char), + Unicode(u32), +} + /// Token (tree) generic over string type `S`. #[derive(Debug)] pub enum Token { @@ -12,7 +20,7 @@ pub enum Token { /// number Num(S), /// interpolated string - Str(Vec>), + Str(Vec>), /// operator, such as `|` or `+=` Op(S), /// punctuation, such as `.` or `;` @@ -146,19 +154,16 @@ impl<'a> Lex<'a> { /// Lex a (possibly interpolated) string. /// /// The input string has to start with '"'. - fn str(&mut self) -> Vec>> { + fn str(&mut self) -> Vec>> { let start = self.i; assert_eq!(self.next(), Some('"')); let mut parts = Vec::new(); loop { let s = self.consumed(self.i.chars(), |lex| lex.trim(|c| c != '\\' && c != '"')); - //if !s.is_empty() { - match parts.last_mut() { - Some(Part::Str(prev)) => prev.push_str(s), - Some(_) | None => parts.push(Part::Str(s.to_string())), + if !s.is_empty() { + parts.push(StrPart::Str(s)); } - //} match self.next() { Some('"') => return parts, Some('\\') => { @@ -170,12 +175,16 @@ impl<'a> Lex<'a> { Some('n') => '\n', Some('r') => '\r', Some('t') => '\t', - Some('u') => unicode(&mut chars).unwrap_or_else(|| { - self.e.push((Expect::Unicode, self.i)); - '\u{FFFD}' // Unicode replacement character - }), + Some('u') => { + let unicode = unicode(&mut chars).unwrap_or_else(|| { + self.e.push((Expect::Unicode, self.i)); + 0xFFFD // Unicode replacement character + }); + parts.push(StrPart::Unicode(unicode)); + continue; + } Some('(') => { - parts.push(Part::Fun(self.delim())); + parts.push(StrPart::Filter(self.delim())); continue; } Some(_) | None => { @@ -185,10 +194,7 @@ impl<'a> Lex<'a> { }; self.i = chars.as_str(); - match parts.last_mut() { - Some(Part::Str(prev)) => prev.push(c), - Some(_) | None => parts.push(Part::Str(c.to_string())), - } + parts.push(StrPart::Char(c)); } // SAFETY: due to `lex.trim()` Some(_) => unreachable!(), @@ -251,12 +257,12 @@ impl<'a> Lex<'a> { } } -fn unicode(chars: &mut core::str::Chars) -> Option { +fn unicode(chars: &mut core::str::Chars) -> Option { let mut hex = String::with_capacity(4); for _ in 0..4 { hex.push(chars.next()?); } - u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) + u32::from_str_radix(&hex, 16).ok() } fn span(whole_buffer: &str, part: &str) -> Span { @@ -319,9 +325,11 @@ impl<'a> Token<&'a str> { } Self::Str(parts) => { let quote = once((OToken::Quote, 0..0)); - let f = |part: Part>| match part { - Part::Fun(t) => t.tokens(i), - Part::Str(s) => Box::new(once((OToken::Str(s.to_string()), 0..0))), + let f = |part: StrPart<&'a str, Token<&'a str>>| match part { + StrPart::Filter(t) => t.tokens(i), + StrPart::Str(s) => Box::new(once((OToken::Str(s.to_string()), 0..0))), + StrPart::Char(c) => Box::new(once((OToken::Str(c.to_string()), 0..0))), + StrPart::Unicode(_) => todo!(), }; Box::new( quote diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 8ed9051a0..017d1e382 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -1,4 +1,4 @@ -use crate::lex::Token; +use crate::lex::{StrPart, Token}; use jaq_syn::filter::KeyVal; use jaq_syn::{path, 
string}; @@ -33,7 +33,7 @@ pub enum Term { Recurse, Num(S), - Str(Option, Vec>), + Str(Option, Vec>), Arr(Option>), Obj(Vec<(Self, Option)>), @@ -302,7 +302,7 @@ impl<'a> Parser<'a> { next => return Err((Expect::Key, next)), }; let opt = self.char0('?').is_some(); - let key = Term::Str(None, Vec::from([string::Part::Str(key.to_string())])); + let key = Term::Str(None, Vec::from([StrPart::Str(key)])); let opt = if opt { Opt::Optional } else { Opt::Essential }; path.push((path::Part::Index(key), opt)); path.extend(core::iter::from_fn(|| self.path_part_opt())); @@ -334,7 +334,7 @@ impl<'a> Parser<'a> { }, Some(Token::Word(k)) if k.starts_with('$') => Term::Var(*k), Some(Token::Word(k)) if !!KEYWORDS.contains(k) => { - Term::Str(None, Vec::from([string::Part::Str(k.to_string())])) + Term::Str(None, Vec::from([StrPart::Str(*k)])) } Some(Token::Block("(", tokens)) => { let k = self.with(tokens, ")", |p| p.term()); @@ -349,14 +349,16 @@ impl<'a> Parser<'a> { fn str_parts( &mut self, - parts: &'a [string::Part>], - ) -> Vec>> { + parts: &'a [StrPart<&'a str, Token<&'a str>>], + ) -> Vec>> { let parts = parts.iter().map(|part| match part { - string::Part::Str(s) => string::Part::Str(s.clone()), - string::Part::Fun(Token::Block("(", tokens)) => { - string::Part::Fun(self.with(tokens, ")", |p| p.term())) + StrPart::Str(s) => StrPart::Str(*s), + StrPart::Filter(Token::Block("(", tokens)) => { + StrPart::Filter(self.with(tokens, ")", |p| p.term())) } - string::Part::Fun(_) => unreachable!(), + StrPart::Filter(_) => unreachable!(), + StrPart::Char(c) => StrPart::Char(*c), + StrPart::Unicode(u) => StrPart::Unicode(*u), }); parts.collect() } From da082566b6e3660976c5b7ece88e4371ec9f5935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 11:52:17 +0200 Subject: [PATCH 034/135] Restore old lexer. --- jaq-parse/src/lex.rs | 71 ------------------- jaq-parse/src/lib.rs | 32 +++++---- jaq-parse/src/token.rs | 156 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 172 insertions(+), 87 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 35be07c7a..d1d26962e 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -270,74 +270,3 @@ fn span(whole_buffer: &str, part: &str) -> Span { let end = start + part.len(); start..end } - -impl<'a> Token<&'a str> { - pub fn tokens(self, i: &'a str) -> Box> + 'a> { - use core::iter::once; - match self { - Self::Word(w) => Box::new(once(( - match w { - "def" => OToken::Def, - "if" => OToken::If, - "then" => OToken::Then, - "elif" => OToken::Elif, - "else" => OToken::Else, - "end" => OToken::End, - "or" => OToken::Or, - "and" => OToken::And, - "as" => OToken::As, - "reduce" => OToken::Reduce, - "for" => OToken::For, - "foreach" => OToken::Foreach, - "try" => OToken::Try, - "catch" => OToken::Catch, - w if w.starts_with("$") => OToken::Var(w[1..].to_string()), - w => OToken::Ident(w.to_string()), - }, - span(i, w), - ))), - Self::Num(n) => Box::new(once((OToken::Num(n.to_string()), span(i, n)))), - Self::Op(o) => Box::new(once((OToken::Op(o.to_string()), span(i, o)))), - Self::Char(c) => { - let token = match c { - ".." => OToken::DotDot, - "." => OToken::Dot, - "?" 
=> OToken::Question, - "," => OToken::Comma, - ":" => OToken::Colon, - ";" => OToken::Semicolon, - ")" => OToken::Close(Delim::Paren), - "]" => OToken::Close(Delim::Brack), - "}" => OToken::Close(Delim::Brace), - _ => panic!("{}", c), - }; - Box::new(once((token, span(i, c)))) - } - Self::Block(open, tokens) => { - let delim = match open { - "(" => Delim::Paren, - "[" => Delim::Brack, - "{" => Delim::Brace, - _ => panic!(), - }; - let init = once((OToken::Open(delim), span(i, open))); - Box::new(init.chain(tokens.into_iter().flat_map(|t| t.tokens(i)))) - } - Self::Str(parts) => { - let quote = once((OToken::Quote, 0..0)); - let f = |part: StrPart<&'a str, Token<&'a str>>| match part { - StrPart::Filter(t) => t.tokens(i), - StrPart::Str(s) => Box::new(once((OToken::Str(s.to_string()), 0..0))), - StrPart::Char(c) => Box::new(once((OToken::Str(c.to_string()), 0..0))), - StrPart::Unicode(_) => todo!(), - }; - Box::new( - quote - .clone() - .chain(parts.into_iter().flat_map(f)) - .chain(quote), - ) - } - } - } -} diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 37aa8dd7a..0aaf2ef12 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -7,11 +7,11 @@ extern crate alloc; mod def; mod filter; -mod lex; +pub mod lex; mod path; mod prec_climb; mod string; -mod term; +pub mod term; mod token; pub use def::{defs, main}; @@ -19,10 +19,19 @@ use token::{Delim, Token}; use alloc::{string::String, string::ToString, vec::Vec}; use chumsky::prelude::*; +use jaq_syn::Spanned; /// Lex/parse error. pub type Error = Simple; +fn lex() -> impl Parser>, Error = Simple> { + recursive(token::tree) + .map_with_span(|tree, span| tree.tokens(span)) + .repeated() + .flatten() + .collect() +} + /// Parse a string with a given parser. /// /// May produce `Some` output even if there were errors. 
@@ -30,16 +39,12 @@ pub fn parse(src: &str, parser: P) -> (Option, Vec) where P: Parser> + Clone, { - let (tokens, lex_errs) = crate::lex::Lex::new(src).lex(); - - let mut new_parser = term::Parser::new(&tokens); - std::println!("{:?}", new_parser.term()); - std::println!("{:?}", new_parser.e); - - let tokens: Vec<_> = tokens.into_iter().flat_map(|t| t.tokens(src)).collect(); - //std::println!("Tokens: {tokens:?}"); + let (tokens, lex_errs) = lex() + .then_ignore(end()) + .recover_with(skip_then_retry_until([])) + .parse_recovery(src); - let (parsed, parse_errs) = if lex_errs.is_empty() { + let (parsed, parse_errs) = if let Some(tokens) = tokens { let len = src.chars().count(); let stream = chumsky::Stream::from_iter(len..len + 1, tokens.into_iter()); parser.then_ignore(end()).parse_recovery(stream) @@ -47,10 +52,7 @@ where (None, Vec::new()) }; - let lex_errs = lex_errs.iter().map(|(e, s)| { - let (e, span) = e.to_simple_error(s, src); - Simple::custom(span, e) - }); + let lex_errs = lex_errs.into_iter().map(|e| e.map(|c| c.to_string())); let parse_errs = parse_errs.into_iter().map(|e| e.map(|tok| tok.to_string())); let errs: Vec<_> = lex_errs.chain(parse_errs).collect(); diff --git a/jaq-parse/src/token.rs b/jaq-parse/src/token.rs index 116bb3cae..933e43cc3 100644 --- a/jaq-parse/src/token.rs +++ b/jaq-parse/src/token.rs @@ -1,6 +1,7 @@ -use alloc::string::String; +use alloc::{boxed::Box, string::String, vec::Vec}; use chumsky::prelude::*; use core::fmt; +use jaq_syn::{Span, Spanned}; #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub enum Delim { @@ -35,6 +36,38 @@ impl Delim { } } +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum Tree { + Token(Token), + Delim(Delim, Vec>), + String(Spanned, Vec<(Spanned, Spanned)>), +} + +impl Tree { + pub fn tokens(self, span: Span) -> Box>> { + let ft = |(tree, span): Spanned| tree.tokens(span); + let fs = |(s, span): Spanned| (Token::Str(s), span); + use core::iter::once; + match self { + Self::Token(token) => Box::new(once((token, span))), + Self::Delim(delim, tree) => { + let s = (Token::Open(delim), span.start..span.start + 1); + let e = (Token::Close(delim), span.end - 1..span.end); + let tokens = tree.into_iter().flat_map(ft); + Box::new(once(s).chain(tokens).chain(once(e))) + } + Self::String(head, tail) => { + let s = (Token::Quote, span.start..span.start + 1); + let e = (Token::Quote, span.end - 1..span.end); + let tail = tail + .into_iter() + .flat_map(move |(tree, str_)| ft(tree).chain(once(fs(str_)))); + Box::new(once(s).chain(once(fs(head))).chain(tail).chain(once(e))) + } + } + } +} + #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub enum Token { Num(String), @@ -98,3 +131,124 @@ impl fmt::Display for Token { } } } + +// A parser for numbers +fn num() -> impl Parser> { + let comma = just('.').chain(text::digits(10).or_not()); + + let exp = one_of("eE") + .chain(one_of("+-").or_not()) + .chain::(text::digits(10)); + + text::int(10) + .chain::(comma.or_not()) + .chain::(exp.or_not()) + .collect() +} + +// A parser for strings; adapted from Chumsky's JSON example parser. 
+fn char_() -> impl Parser> { + let unicode = filter(|c: &char| c.is_ascii_hexdigit()) + .repeated() + .exactly(4) + .collect::() + .validate(|digits, span, emit| { + char::from_u32(u32::from_str_radix(&digits, 16).unwrap()).unwrap_or_else(|| { + emit(Simple::custom(span, "invalid unicode character")); + '\u{FFFD}' // unicode replacement character + }) + }); + + let escape = just('\\').ignore_then(choice(( + just('\\'), + just('/'), + just('"'), + just('b').to('\x08'), + just('f').to('\x0C'), + just('n').to('\n'), + just('r').to('\r'), + just('t').to('\t'), + just('u').ignore_then(unicode), + ))); + + filter(|c| *c != '\\' && *c != '"').or(escape) +} + +pub fn tree( + tree: impl Parser> + Clone, +) -> impl Parser> { + let trees = || tree.clone().map_with_span(|t, span| (t, span)).repeated(); + let paren = trees().delimited_by(just('('), just(')')); + let brack = trees().delimited_by(just('['), just(']')); + let brace = trees().delimited_by(just('{'), just('}')); + + let pair = |s, span| (s, span); + let chars = || char_().repeated().collect().map_with_span(pair); + + let pair = |p, span| (Tree::Delim(Delim::Paren, p), span); + let interpol = just('\\').ignore_then(paren.clone().map_with_span(pair)); + + let string = chars() + .then(interpol.then(chars()).repeated().collect()) + .delimited_by(just('"'), just('"')) + .labelled("string"); + + let comment = just("#").then(take_until(just('\n'))).padded(); + + let strategy = |open, close, others| { + nested_delimiters(open, close, others, |_span| Tree::Token(Token::Dot)) + }; + + choice(( + paren.map(|t| Tree::Delim(Delim::Paren, t)), + brack.map(|t| Tree::Delim(Delim::Brack, t)), + brace.map(|t| Tree::Delim(Delim::Brace, t)), + string.map(|(s, interpol)| Tree::String(s, interpol)), + token().map(Tree::Token), + )) + .recover_with(strategy('(', ')', [('[', ']'), ('{', '}')])) + .recover_with(strategy('[', ']', [('{', '}'), ('(', ')')])) + .recover_with(strategy('{', '}', [('(', ')'), ('[', ']')])) + .padded_by(comment.repeated()) + .padded() +} + +fn token() -> impl Parser> { + // A parser for operators + let op = one_of("|=!<>+-*/%").chain(one_of("=/").or_not()).collect(); + + let var = just('$').ignore_then(text::ident()); + + // A parser for identifiers and keywords + let ident = just('@').or_not().chain::(text::ident()); + let ident = ident.collect().map(|ident: String| match ident.as_str() { + "def" => Token::Def, + "if" => Token::If, + "then" => Token::Then, + "elif" => Token::Elif, + "else" => Token::Else, + "end" => Token::End, + "or" => Token::Or, + "and" => Token::And, + "as" => Token::As, + "reduce" => Token::Reduce, + "for" => Token::For, + "foreach" => Token::Foreach, + "try" => Token::Try, + "catch" => Token::Catch, + _ => Token::Ident(ident), + }); + + choice(( + ident, + just("..").to(Token::DotDot), + just('.').to(Token::Dot), + just(':').to(Token::Colon), + just(';').to(Token::Semicolon), + just(',').to(Token::Comma), + just('?').to(Token::Question), + op.map(Token::Op), + var.map(Token::Var), + num().map(Token::Num), + )) +} From 9e45931260f61b2be0e0393a21515f52bc180b62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 12:45:53 +0200 Subject: [PATCH 035/135] Restore bincode. 
--- jaq-std/Cargo.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jaq-std/Cargo.toml b/jaq-std/Cargo.toml index db5d878dc..79606d60f 100644 --- a/jaq-std/Cargo.toml +++ b/jaq-std/Cargo.toml @@ -11,8 +11,7 @@ keywords = ["json", "query", "jq"] rust-version = "1.64" [features] -#default = ["bincode"] -default = [] +default = ["bincode"] [build-dependencies] jaq-parse = { version = "1.0.0", path = "../jaq-parse" } From 6d9271f3672a7b509d57403bd246574df76250eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 12:46:05 +0200 Subject: [PATCH 036/135] Make jaq-parse no_std again!!! --- jaq-parse/src/lib.rs | 2 +- jaq-parse/src/term.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 0aaf2ef12..4d19b608a 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -1,5 +1,5 @@ //! JSON query language parser. -//#![no_std] +#![no_std] #![forbid(unsafe_code)] #![warn(missing_docs)] diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 017d1e382..7bca1d466 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -1,6 +1,6 @@ use crate::lex::{StrPart, Token}; -use jaq_syn::filter::KeyVal; -use jaq_syn::{path, string}; +use alloc::{boxed::Box, vec::Vec}; +use jaq_syn::path; type Error<'a> = (Expect, Option<&'a Token<&'a str>>); #[derive(Debug)] From 0541ee23d1a66a152065e4198c7f3aea74e511fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 12:54:08 +0200 Subject: [PATCH 037/135] Reenable jaq-std, measure performance. To parse 100_000 times the jaq standard library (std.jq), jaq-std (using bincode) took 3.2 seconds, whereas jaq-parse (using the new lexer/parser) took 8.6 seconds. In comparison, to parse only 1000 times (!!) the jaq standard library, jaq-parse (using the old lexer/parser with chumsky) took 9.6 seconds. That means that the new parser is fast enough to replace the bincode solution. --- jaq/src/main.rs | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/jaq/src/main.rs b/jaq/src/main.rs index c370bf21f..0cff43640 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -243,7 +243,31 @@ fn args_named(var_val: &[(String, Val)]) -> Val { fn parse(filter_str: &str, vars: Vec) -> Result> { let mut defs = ParseCtx::new(vars); defs.insert_natives(jaq_core::core()); - //defs.insert_defs(jaq_std::std()); + defs.insert_defs(jaq_std::std()); + + /* + let std = include_str!("../../jaq-std/src/std.jq"); + for i in 0..1000 { + //let (filter, errs) = jaq_parse::parse(std, jaq_parse::defs()); + + //let _ = jaq_std::std(); + + let (tokens, lex_errs) = jaq_parse::lex::Lex::new(std).lex(); + + let mut new_parser = jaq_parse::term::Parser::new(&tokens); + let defs = new_parser.defs(); + //std::println!("{:?}", new_parser.e); + } + */ + + /* + let (tokens, lex_errs) = jaq_parse::lex::Lex::new(filter_str).lex(); + + let mut new_parser = jaq_parse::term::Parser::new(&tokens); + std::println!("{:?}", new_parser.term()); + std::println!("{:?}", new_parser.e); + */ + assert!(defs.errs.is_empty()); let (filter, errs) = jaq_parse::parse(filter_str, jaq_parse::main()); if !errs.is_empty() { From 26ada807bf1c9eeb549ac8ff88db2e05906335bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 13:38:45 +0200 Subject: [PATCH 038/135] Thanks to clippy! 
--- jaq-parse/src/term.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 7bca1d466..8a95be475 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -333,7 +333,7 @@ impl<'a> Parser<'a> { next => return Err((Expect::Str, next)), }, Some(Token::Word(k)) if k.starts_with('$') => Term::Var(*k), - Some(Token::Word(k)) if !!KEYWORDS.contains(k) => { + Some(Token::Word(k)) if !KEYWORDS.contains(k) => { Term::Str(None, Vec::from([StrPart::Str(*k)])) } Some(Token::Block("(", tokens)) => { @@ -386,7 +386,7 @@ impl<'a> Parser<'a> { fn path_part_opt(&mut self) -> Option<(path::Part>, path::Opt)> { let part = self.maybe(|p| match p.i.next() { - Some(Token::Block("[", tokens)) => Some(p.with(&tokens, "]", |p| p.path_part())), + Some(Token::Block("[", tokens)) => Some(p.with(tokens, "]", |p| p.path_part())), _ => None, })?; Some((part, self.opt())) From 2694dc0745e24e936cc72a7d2e87f1265551f974 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 13:38:56 +0200 Subject: [PATCH 039/135] Avoid unnecessary allocation. --- jaq-parse/src/lex.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index d1d26962e..9312e9cef 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -1,8 +1,4 @@ -use crate::token::{Delim, Token as OToken}; -use alloc::string::{String, ToString}; use alloc::vec::Vec; -use jaq_syn::string::Part; -use jaq_syn::{Span, Spanned}; #[derive(Debug)] pub enum StrPart { @@ -40,7 +36,7 @@ pub enum Expect<'a> { } impl<'a> Expect<'a> { - pub fn to_simple_error(&self, pos: &'a str, full: &'a str) -> (&'static str, Span) { + pub fn to_simple_error(&self, pos: &'a str, full: &'a str) -> (&'static str, jaq_syn::Span) { let mut pos = span(full, pos); pos.end = pos.start; let s = match self { @@ -258,14 +254,14 @@ impl<'a> Lex<'a> { } fn unicode(chars: &mut core::str::Chars) -> Option { - let mut hex = String::with_capacity(4); - for _ in 0..4 { - hex.push(chars.next()?); + let s = chars.as_str(); + for i in 0..4 { + chars.next()?; } - u32::from_str_radix(&hex, 16).ok() + u32::from_str_radix(&s[..4], 16).ok() } -fn span(whole_buffer: &str, part: &str) -> Span { +fn span(whole_buffer: &str, part: &str) -> jaq_syn::Span { let start = part.as_ptr() as usize - whole_buffer.as_ptr() as usize; let end = start + part.len(); start..end From 408aa889e254eec7ecef83d8496b2298514ec558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 13:43:07 +0200 Subject: [PATCH 040/135] Remove unused variable. --- jaq-parse/src/lex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 9312e9cef..84be987cf 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -255,7 +255,7 @@ impl<'a> Lex<'a> { fn unicode(chars: &mut core::str::Chars) -> Option { let s = chars.as_str(); - for i in 0..4 { + for _ in 0..4 { chars.next()?; } u32::from_str_radix(&s[..4], 16).ok() From 07cd6e4d86699afb7b718fa856b4f1a207439f25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 13:59:18 +0200 Subject: [PATCH 041/135] Correctly advance input after Unicode escape sequence. 
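Previously the lexer did not move `self.i` past a `\uXXXX` escape, so in a string like `"\u0041bc"` the `u` and the hex digits were lexed a second time as literal string content. The added `self.i = chars.as_str();` advances the input before continuing.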
--- jaq-parse/src/lex.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 84be987cf..c31d0248e 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -176,6 +176,7 @@ impl<'a> Lex<'a> { self.e.push((Expect::Unicode, self.i)); 0xFFFD // Unicode replacement character }); + self.i = chars.as_str(); parts.push(StrPart::Unicode(unicode)); continue; } From 26dee39e4153ecc2e18e483f9d0aede1f7a43f31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 14:10:09 +0200 Subject: [PATCH 042/135] Load standard library with new parser. --- jaq/src/main.rs | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/jaq/src/main.rs b/jaq/src/main.rs index 0cff43640..e9f686b6a 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -245,29 +245,25 @@ fn parse(filter_str: &str, vars: Vec) -> Result> defs.insert_natives(jaq_core::core()); defs.insert_defs(jaq_std::std()); - /* let std = include_str!("../../jaq-std/src/std.jq"); - for i in 0..1000 { - //let (filter, errs) = jaq_parse::parse(std, jaq_parse::defs()); - - //let _ = jaq_std::std(); - - let (tokens, lex_errs) = jaq_parse::lex::Lex::new(std).lex(); - - let mut new_parser = jaq_parse::term::Parser::new(&tokens); - let defs = new_parser.defs(); - //std::println!("{:?}", new_parser.e); - } - */ + let (tokens, lex_errs) = jaq_parse::lex::Lex::new(std).lex(); + assert!(lex_errs.is_empty()); + let mut parser = jaq_parse::term::Parser::new(&tokens); + let _std = parser.defs(); + assert!(parser.e.is_empty()); /* let (tokens, lex_errs) = jaq_parse::lex::Lex::new(filter_str).lex(); - - let mut new_parser = jaq_parse::term::Parser::new(&tokens); - std::println!("{:?}", new_parser.term()); - std::println!("{:?}", new_parser.e); + if lex_errs.is_empty() { + let mut parser = jaq_parse::term::Parser::new(&tokens); + std::println!("{:?}", parser.term()); + std::println!("{:?}", parser.e); + } else { + std::println!("{:?}", lex_errs); + } */ + assert!(defs.errs.is_empty()); let (filter, errs) = jaq_parse::parse(filter_str, jaq_parse::main()); if !errs.is_empty() { From 0ad3dc4b25b6247cd228d4e28b87b6f513200a3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 22 May 2024 18:04:18 +0200 Subject: [PATCH 043/135] More robust Unicode escape handling. 
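Escape handling moves into a dedicated `escape()` helper that reports errors at the offending position. A malformed escape such as `"\q"` or a truncated `"\u12"` no longer turns into a NUL or replacement character; the lexer records the error there and carries on.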
--- jaq-parse/src/lex.rs | 75 +++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index c31d0248e..08bea9f69 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -147,6 +147,41 @@ impl<'a> Lex<'a> { } } + fn escape(&mut self) -> Option>> { + let mut chars = self.i.chars(); + let part = match chars.next() { + Some(c @ ('\\' | '/' | '"')) => StrPart::Char(c), + Some('b') => StrPart::Char('\x08'), + Some('f') => StrPart::Char('\x0C'), + Some('n') => StrPart::Char('\n'), + Some('r') => StrPart::Char('\r'), + Some('t') => StrPart::Char('\t'), + Some('u') => { + let mut hex = 0; + for _ in 0..4 { + let i = chars.as_str(); + match chars.next().and_then(|c| c.to_digit(16)) { + Some(digit) => hex = (hex << 4) + digit, + None => { + self.i = i; + self.e.push((Expect::Unicode, self.i)); + return None; + } + } + } + StrPart::Unicode(hex) + } + Some('(') => return Some(StrPart::Filter(self.delim())), + Some(_) | None => { + self.e.push((Expect::Escape, self.i)); + return None; + } + }; + + self.i = chars.as_str(); + Some(part) + } + /// Lex a (possibly interpolated) string. /// /// The input string has to start with '"'. @@ -162,37 +197,7 @@ impl<'a> Lex<'a> { } match self.next() { Some('"') => return parts, - Some('\\') => { - let mut chars = self.i.chars(); - let c = match chars.next() { - Some(c @ ('\\' | '/' | '"')) => c, - Some('b') => '\x08', - Some('f') => '\x0C', - Some('n') => '\n', - Some('r') => '\r', - Some('t') => '\t', - Some('u') => { - let unicode = unicode(&mut chars).unwrap_or_else(|| { - self.e.push((Expect::Unicode, self.i)); - 0xFFFD // Unicode replacement character - }); - self.i = chars.as_str(); - parts.push(StrPart::Unicode(unicode)); - continue; - } - Some('(') => { - parts.push(StrPart::Filter(self.delim())); - continue; - } - Some(_) | None => { - self.e.push((Expect::Escape, self.i)); - '\0' - } - }; - - self.i = chars.as_str(); - parts.push(StrPart::Char(c)); - } + Some('\\') => self.escape().map(|part| parts.push(part)), // SAFETY: due to `lex.trim()` Some(_) => unreachable!(), None => { @@ -254,14 +259,6 @@ impl<'a> Lex<'a> { } } -fn unicode(chars: &mut core::str::Chars) -> Option { - let s = chars.as_str(); - for _ in 0..4 { - chars.next()?; - } - u32::from_str_radix(&s[..4], 16).ok() -} - fn span(whole_buffer: &str, part: &str) -> jaq_syn::Span { let start = part.as_ptr() as usize - whole_buffer.as_ptr() as usize; let end = start + part.len(); From ef0b298ced351f0b69f1a853bb3f09e2132e8a2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 24 May 2024 18:44:08 +0200 Subject: [PATCH 044/135] Parse module syntax. 
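Roughly the shape of module preamble this accepts (an illustrative example, not taken from a test):

    module {"name": "example"};
    include "prelude";
    import "utils" as utils;
    def f: .; f

`module` is followed by a metadata term, `include` by a plain string, and `import` by a string plus `as` and a name; each is terminated by `;`.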
--- jaq-parse/src/term.rs | 67 ++++++++++++++++++++++++++++++++++++++++--- jaq/src/main.rs | 3 +- 2 files changed, 64 insertions(+), 6 deletions(-) diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 8a95be475..01cd47e1e 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -159,6 +159,12 @@ impl<'a> Parser<'a> { }) } + fn terminated(&mut self, f: impl FnOnce(&mut Self) -> Result<'a, T>) -> Result<'a, T> { + let y = f(self)?; + self.char1(';')?; + Ok(y) + } + fn char1(&mut self, c: char) -> Result<'a, &'a str> { match self.i.next() { Some(Token::Char(s)) if s.chars().eq([c]) => Ok(*s), @@ -413,7 +419,7 @@ impl<'a> Parser<'a> { fn def_tail(&mut self) -> Result<'a, Def<&'a str, Term<&'a str>>> { let name = match self.i.next() { - Some(Token::Word(name)) if !name.starts_with(['$', '@']) => name, + Some(Token::Word(name)) if !name.starts_with(['$']) => name, next => return Err((Expect::Ident, next)), }; let args = self.args(|p| { @@ -425,15 +431,68 @@ impl<'a> Parser<'a> { self.char1(':')?; let body = self.term()?; + self.char1(';')?; + + Ok(Def { name, args, body }) + } + + fn bare_str(&mut self) -> Result<'a, &'a str> { match self.i.next() { - Some(Token::Char(";")) => (), - next => return Err((Expect::Char(';'), next)), + Some(Token::Str(parts)) => match parts[..] { + [StrPart::Str(s)] => Ok(s), + _ => todo!(), + }, + next => Err((Expect::Str, next)), + } + } + + fn include(&mut self) -> Result<'a, (&'a str, Option<&'a str>)> { + self.bare_str().map(|path| (path, None)) + } + + fn import(&mut self) -> Result<'a, (&'a str, Option<&'a str>)> { + let path = self.bare_str()?; + self.keyword("as")?; + let name = match self.i.next() { + Some(Token::Word(name)) if !name.starts_with(['$', '@']) => *name, + next => return Err((Expect::Ident, next)), }; + Ok((path, Some(name))) + } - Ok(Def { name, args, body }) + pub fn module(&mut self, f: F) -> Result<'a, Module<&'a str, B>> + where + F: FnOnce(&mut Self) -> Result<'a, B>, + { + let meta = self + .maybe(|p| match p.i.next() { + Some(Token::Word("module")) => Some(p.terminated(|p| p.term())), + _ => None, + }) + .transpose()?; + + let mods = core::iter::from_fn(|| { + self.maybe(|p| match p.i.next() { + Some(Token::Word("include")) => Some(p.terminated(|p| p.include())), + Some(Token::Word("import")) => Some(p.terminated(|p| p.import())), + _ => None, + }) + }) + .collect::>()?; + + let body = f(self)?; + + Ok(Module { meta, mods, body }) } } +#[derive(Debug)] +pub struct Module { + meta: Option>, + mods: Vec<(S, Option)>, + body: B, +} + #[derive(Debug)] pub struct Def { name: S, diff --git a/jaq/src/main.rs b/jaq/src/main.rs index e9f686b6a..f13d223ac 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -256,14 +256,13 @@ fn parse(filter_str: &str, vars: Vec) -> Result> let (tokens, lex_errs) = jaq_parse::lex::Lex::new(filter_str).lex(); if lex_errs.is_empty() { let mut parser = jaq_parse::term::Parser::new(&tokens); - std::println!("{:?}", parser.term()); + std::println!("{:?}", parser.module(|p| p.term())); std::println!("{:?}", parser.e); } else { std::println!("{:?}", lex_errs); } */ - assert!(defs.errs.is_empty()); let (filter, errs) = jaq_parse::parse(filter_str, jaq_parse::main()); if !errs.is_empty() { From 7f7bd0adce15572c1b6d79ccac2cbbe1405d4d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 28 May 2024 17:12:54 +0200 Subject: [PATCH 045/135] Pedantic clippy. 
--- jaq-parse/src/lex.rs | 12 +++++++----- jaq-parse/src/term.rs | 23 ++++++++++++----------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index 08bea9f69..d4f2139d5 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -63,11 +63,13 @@ pub struct Lex<'a> { } impl<'a> Lex<'a> { + #[must_use] pub fn new(i: &'a str) -> Self { let e = Vec::new(); Self { i, e } } + #[must_use] pub fn lex(mut self) -> (Vec>, Vec>) { let tokens = self.tokens(); self.space(); @@ -136,7 +138,7 @@ impl<'a> Lex<'a> { /// Decimal with optional exponent. fn num(&mut self) { - self.trim(|c| c.is_numeric()); + self.trim(char::is_numeric); if let Some(i) = self.i.strip_prefix('.') { self.i = i; self.digits1(); @@ -215,9 +217,9 @@ impl<'a> Lex<'a> { let mut chars = self.i.chars(); Some(match chars.next()? { - 'a'..='z' | 'A'..='Z' | '_' => Token::Word(self.consumed(chars, |lex| lex.ident0())), - '$' | '@' => Token::Word(self.consumed(chars, |lex| lex.ident1())), - '0'..='9' => Token::Num(self.consumed(chars, |lex| lex.num())), + 'a'..='z' | 'A'..='Z' | '_' => Token::Word(self.consumed(chars, Self::ident0)), + '$' | '@' => Token::Word(self.consumed(chars, Self::ident1)), + '0'..='9' => Token::Num(self.consumed(chars, Self::num)), c if is_op(c) => Token::Op(self.consumed(chars, |lex| lex.trim(is_op))), '?' if (chars.next(), chars.next()) == (Some('/'), Some('/')) => { Token::Op(self.take(3)) @@ -251,7 +253,7 @@ impl<'a> Lex<'a> { self.space(); if let Some(rest) = self.i.strip_prefix(close) { tokens.push(Token::Char(&self.i[..1])); - self.i = rest + self.i = rest; } else { self.e.push((Expect::Delim(start), self.i)); } diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 01cd47e1e..26d16ea29 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -62,6 +62,7 @@ const KEYWORDS: &[&str] = &[ ]; impl<'a> Parser<'a> { + #[must_use] pub fn new(i: &'a [Token<&'a str>]) -> Self { Self { i: i.iter(), @@ -252,7 +253,7 @@ impl<'a> Parser<'a> { let xs = self.atom_path()?; self.keyword("as")?; let x = self.var()?; - let args = self.args(|p| p.term()); + let args = self.args(Self::term); Term::Fold(*fold, Box::new(xs), x, args) } Some(Token::Word(id)) if id.starts_with('$') => Term::Var(*id), @@ -267,7 +268,7 @@ impl<'a> Parser<'a> { } } Some(Token::Word(id)) if !KEYWORDS.contains(id) => { - Term::Call(*id, self.args(|p| p.term())) + Term::Call(*id, self.args(Self::term)) } Some(Token::Char(".")) => self .maybe(|p| p.i.next().and_then(ident_key)) @@ -280,12 +281,12 @@ impl<'a> Parser<'a> { Some(Token::Block("{", tokens)) if matches!(tokens[..], [Token::Char("}")]) => { Term::Obj(Vec::new()) } - Some(Token::Block("(", tokens)) => self.with(tokens, ")", |p| p.term()), + Some(Token::Block("(", tokens)) => self.with(tokens, ")", Self::term), Some(Token::Block("[", tokens)) => { - Term::Arr(Some(Box::new(self.with(tokens, "]", |p| p.term())))) + Term::Arr(Some(Box::new(self.with(tokens, "]", Self::term)))) } Some(Token::Block("{", tokens)) => self.with(tokens, "", |p| { - p.sep_by1(',', |p| p.obj_entry()).map(Term::Obj) + p.sep_by1(',', Self::obj_entry).map(Term::Obj) }), Some(Token::Str(parts)) => Term::Str(None, self.str_parts(parts)), next => return Err((Expect::Term, next)), @@ -343,7 +344,7 @@ impl<'a> Parser<'a> { Term::Str(None, Vec::from([StrPart::Str(*k)])) } Some(Token::Block("(", tokens)) => { - let k = self.with(tokens, ")", |p| p.term()); + let k = self.with(tokens, ")", Self::term); self.char1(':')?; return Ok((k, Some(self.term()?))); 
} @@ -360,7 +361,7 @@ impl<'a> Parser<'a> { let parts = parts.iter().map(|part| match part { StrPart::Str(s) => StrPart::Str(*s), StrPart::Filter(Token::Block("(", tokens)) => { - StrPart::Filter(self.with(tokens, ")", |p| p.term())) + StrPart::Filter(self.with(tokens, ")", Self::term)) } StrPart::Filter(_) => unreachable!(), StrPart::Char(c) => StrPart::Char(*c), @@ -392,7 +393,7 @@ impl<'a> Parser<'a> { fn path_part_opt(&mut self) -> Option<(path::Part>, path::Opt)> { let part = self.maybe(|p| match p.i.next() { - Some(Token::Block("[", tokens)) => Some(p.with(tokens, "]", |p| p.path_part())), + Some(Token::Block("[", tokens)) => Some(p.with(tokens, "]", Self::path_part)), _ => None, })?; Some((part, self.opt())) @@ -466,15 +467,15 @@ impl<'a> Parser<'a> { { let meta = self .maybe(|p| match p.i.next() { - Some(Token::Word("module")) => Some(p.terminated(|p| p.term())), + Some(Token::Word("module")) => Some(p.terminated(Self::term)), _ => None, }) .transpose()?; let mods = core::iter::from_fn(|| { self.maybe(|p| match p.i.next() { - Some(Token::Word("include")) => Some(p.terminated(|p| p.include())), - Some(Token::Word("import")) => Some(p.terminated(|p| p.import())), + Some(Token::Word("include")) => Some(p.terminated(Self::include)), + Some(Token::Word("import")) => Some(p.terminated(Self::import)), _ => None, }) }) From a4cbd9772b069f75822e95749aacf88102919f6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 5 Jun 2024 10:16:02 +0200 Subject: [PATCH 046/135] Parse module syntax. --- jaq-parse/src/lex.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/jaq-parse/src/lex.rs b/jaq-parse/src/lex.rs index d4f2139d5..9593d1575 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-parse/src/lex.rs @@ -111,6 +111,14 @@ impl<'a> Lex<'a> { } } + fn mod_then_ident(&mut self) { + self.ident0(); + if let Some(rest) = self.i.strip_prefix("::") { + self.i = rest.strip_prefix(['@', '$']).unwrap_or(rest); + self.ident1(); + } + } + /// Lex a sequence matching `[a-zA-Z0-9_]*`. fn ident0(&mut self) { self.trim(|c: char| c.is_ascii_alphanumeric() || c == '_'); @@ -217,7 +225,7 @@ impl<'a> Lex<'a> { let mut chars = self.i.chars(); Some(match chars.next()? { - 'a'..='z' | 'A'..='Z' | '_' => Token::Word(self.consumed(chars, Self::ident0)), + 'a'..='z' | 'A'..='Z' | '_' => Token::Word(self.consumed(chars, Self::mod_then_ident)), '$' | '@' => Token::Word(self.consumed(chars, Self::ident1)), '0'..='9' => Token::Num(self.consumed(chars, Self::num)), c if is_op(c) => Token::Op(self.consumed(chars, |lex| lex.trim(is_op))), From c4e42c2862cfe64065023be3ef47dc819f9b0472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 19 Jun 2024 19:05:27 +0200 Subject: [PATCH 047/135] Parse label-break. 
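For illustration, a filter like

    label $out | .[] | if . == null then break $out else . end

now parses, with `label $x | f` becoming `Term::Label` and `break $x` becoming `Term::Break`.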
--- jaq-parse/src/term.rs | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 26d16ea29..bc103b176 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -41,6 +41,9 @@ pub enum Term { Pipe(Box, Option, Box), BinOp(Box, Vec<(S, Self)>), + Label(S, Box), + Break(S), + Fold(S, Box, S, Vec), TryCatch(Box, Option>), IfThenElse(Vec<(Self, Self)>, Option>), @@ -187,7 +190,26 @@ impl<'a> Parser<'a> { } } + fn pipe(&mut self) -> Result<'a, ()> { + match self.i.next() { + Some(Token::Op("|")) => Ok(()), + next => Err((Expect::Char('|'), next)), + } + } + pub fn term_with_comma(&mut self, with_comma: bool) -> Result<'a, Term<&'a str>> { + if let Some(tm) = self.try_maybe(|p| match p.i.next() { + Some(Token::Word("label")) => { + let v = p.var()?; + p.pipe()?; + let tm = p.term_with_comma(with_comma)?; + Ok(Some(Term::Label(v, Box::new(tm)))) + } + _ => Ok(None), + })? { + return Ok(tm); + } + let head = self.atom_path()?; let mut tail = Vec::new(); while let Some(op) = self.op(with_comma) { @@ -204,10 +226,8 @@ impl<'a> Parser<'a> { Some(Token::Op("|")) => Ok(Some(None)), Some(Token::Word("as")) => { let x = p.var()?; - match p.i.next() { - Some(Token::Op("|")) => Ok(Some(Some(x))), - next => Err((Expect::Char('|'), next)), - } + p.pipe()?; + Ok(Some(Some(x))) } _ => Ok(None), })?; @@ -249,6 +269,7 @@ impl<'a> Parser<'a> { })?; Term::TryCatch(Box::new(try_), catch.map(Box::new)) } + Some(Token::Word("break")) => Term::Break(self.var()?), Some(Token::Word(fold)) if self.fold.contains(fold) => { let xs = self.atom_path()?; self.keyword("as")?; From a5c6b73af64068e54d3e288e36196ffc5c3b24f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Sun, 23 Jun 2024 18:24:50 +0200 Subject: [PATCH 048/135] Labels and definitions are atoms. --- jaq-parse/src/term.rs | 86 ++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index bc103b176..44bc4a4f4 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -198,22 +198,10 @@ impl<'a> Parser<'a> { } pub fn term_with_comma(&mut self, with_comma: bool) -> Result<'a, Term<&'a str>> { - if let Some(tm) = self.try_maybe(|p| match p.i.next() { - Some(Token::Word("label")) => { - let v = p.var()?; - p.pipe()?; - let tm = p.term_with_comma(with_comma)?; - Ok(Some(Term::Label(v, Box::new(tm)))) - } - _ => Ok(None), - })? 
{ - return Ok(tm); - } - - let head = self.atom_path()?; + let head = self.atom()?; let mut tail = Vec::new(); while let Some(op) = self.op(with_comma) { - tail.push((op, self.atom_path()?)); + tail.push((op, self.atom()?)); } let tm = if tail.is_empty() { @@ -238,8 +226,14 @@ impl<'a> Parser<'a> { } fn atom(&mut self) -> Result<'a, Term<&'a str>> { - Ok(match self.i.next() { - Some(Token::Op("-")) => Term::Neg(Box::new(self.atom_path()?)), + let tm = match self.i.next() { + Some(Token::Op("-")) => Term::Neg(Box::new(self.atom()?)), + Some(Token::Word("def")) => { + let head = self.def_tail()?; + let tail = self.defs()?; + let tm = self.term()?; + Term::Def(core::iter::once(head).chain(tail).collect(), Box::new(tm)) + } Some(Token::Word("if")) => { let if_then = |p: &mut Self| { let if_ = p.term()?; @@ -262,16 +256,22 @@ impl<'a> Parser<'a> { Term::IfThenElse(if_thens, else_.map(Box::new)) } Some(Token::Word("try")) => { - let try_ = self.atom_path()?; + let try_ = self.atom()?; let catch = self.try_maybe(|p| match p.i.next() { - Some(Token::Word("catch")) => Ok(Some(p.atom_path()?)), + Some(Token::Word("catch")) => Ok(Some(p.atom()?)), _ => Ok(None), })?; Term::TryCatch(Box::new(try_), catch.map(Box::new)) } + Some(Token::Word("label")) => { + let x = self.var()?; + self.pipe()?; + let tm = self.term()?; + Term::Label(x, Box::new(tm)) + } Some(Token::Word("break")) => Term::Break(self.var()?), Some(Token::Word(fold)) if self.fold.contains(fold) => { - let xs = self.atom_path()?; + let xs = self.term()?; self.keyword("as")?; let x = self.var()?; let args = self.args(Self::term); @@ -285,7 +285,7 @@ impl<'a> Parser<'a> { }); match s { None => Term::Call(*id, Vec::new()), - Some(parts) => Term::Str(Some(id), parts), + Some(parts) => Term::Str(Some(*id), parts), } } Some(Token::Word(id)) if !KEYWORDS.contains(id) => { @@ -311,30 +311,14 @@ impl<'a> Parser<'a> { }), Some(Token::Str(parts)) => Term::Str(None, self.str_parts(parts)), next => return Err((Expect::Term, next)), - }) - } - - fn atom_path(&mut self) -> Result<'a, Term<&'a str>> { - let tm = self.atom()?; + }; let tm = match self.opt() { path::Opt::Optional => Term::TryCatch(Box::new(tm), None), path::Opt::Essential => tm, }; - let mut path: Vec<_> = core::iter::from_fn(|| self.path_part_opt()).collect(); - while self.char0('.').is_some() { - use path::Opt; - let key = match self.i.next() { - Some(Token::Word(id)) if !id.starts_with(['$', '@']) => *id, - next => return Err((Expect::Key, next)), - }; - let opt = self.char0('?').is_some(); - let key = Term::Str(None, Vec::from([StrPart::Str(key)])); - let opt = if opt { Opt::Optional } else { Opt::Essential }; - path.push((path::Part::Index(key), opt)); - path.extend(core::iter::from_fn(|| self.path_part_opt())); - } + let path = self.path()?; Ok(if path.is_empty() { tm } else { @@ -343,14 +327,7 @@ impl<'a> Parser<'a> { } pub fn term(&mut self) -> Result<'a, Term<&'a str>> { - let defs = self.defs()?; - let tm = self.term_with_comma(true)?; - - Ok(if defs.is_empty() { - tm - } else { - Term::Def(defs, Box::new(tm)) - }) + self.term_with_comma(true) } fn obj_entry(&mut self) -> Result<'a, (Term<&'a str>, Option>)> { @@ -391,6 +368,23 @@ impl<'a> Parser<'a> { parts.collect() } + fn path(&mut self) -> Result<'a, Vec<(path::Part>, path::Opt)>> { + let mut path: Vec<_> = core::iter::from_fn(|| self.path_part_opt()).collect(); + while self.char0('.').is_some() { + use path::Opt; + let key = match self.i.next() { + Some(Token::Word(id)) if !id.starts_with(['$', '@']) => *id, + next => return 
Err((Expect::Key, next)), + }; + let opt = self.char0('?').is_some(); + let key = Term::Str(None, Vec::from([StrPart::Str(key)])); + let opt = if opt { Opt::Optional } else { Opt::Essential }; + path.push((path::Part::Index(key), opt)); + path.extend(core::iter::from_fn(|| self.path_part_opt())); + } + Ok(path) + } + fn path_part(&mut self) -> Result<'a, path::Part>> { use path::Part::{Index, Range}; let done = |p: &Self| matches!(p.i.as_slice(), [Token::Char("]")]); From 899a8829543c5b234cd0acc6f8b4efebe2d488d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Sun, 23 Jun 2024 18:28:07 +0200 Subject: [PATCH 049/135] For `{(k): v}`, `v` must not have commas. --- jaq-parse/src/term.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jaq-parse/src/term.rs b/jaq-parse/src/term.rs index 44bc4a4f4..1fdb8e9c2 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-parse/src/term.rs @@ -344,7 +344,7 @@ impl<'a> Parser<'a> { Some(Token::Block("(", tokens)) => { let k = self.with(tokens, ")", Self::term); self.char1(':')?; - return Ok((k, Some(self.term()?))); + return Ok((k, Some(self.term_with_comma(false)?))); } next => return Err((Expect::Key, next)), }; From edcd73159acae2ec874005aa57b486e0a45a9b1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 27 Jun 2024 09:01:24 +0200 Subject: [PATCH 050/135] Move new lexer/parser to jaq-syn. --- jaq-parse/src/lib.rs | 2 -- {jaq-parse => jaq-syn}/src/lex.rs | 8 ++++---- jaq-syn/src/lib.rs | 3 +++ jaq-parse/src/term.rs => jaq-syn/src/parse.rs | 2 +- jaq/src/main.rs | 4 ++-- 5 files changed, 10 insertions(+), 9 deletions(-) rename {jaq-parse => jaq-syn}/src/lex.rs (98%) rename jaq-parse/src/term.rs => jaq-syn/src/parse.rs (99%) diff --git a/jaq-parse/src/lib.rs b/jaq-parse/src/lib.rs index 4d19b608a..5650f54f8 100644 --- a/jaq-parse/src/lib.rs +++ b/jaq-parse/src/lib.rs @@ -7,11 +7,9 @@ extern crate alloc; mod def; mod filter; -pub mod lex; mod path; mod prec_climb; mod string; -pub mod term; mod token; pub use def::{defs, main}; diff --git a/jaq-parse/src/lex.rs b/jaq-syn/src/lex.rs similarity index 98% rename from jaq-parse/src/lex.rs rename to jaq-syn/src/lex.rs index 9593d1575..577522517 100644 --- a/jaq-parse/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -36,7 +36,7 @@ pub enum Expect<'a> { } impl<'a> Expect<'a> { - pub fn to_simple_error(&self, pos: &'a str, full: &'a str) -> (&'static str, jaq_syn::Span) { + pub fn to_simple_error(&self, pos: &'a str, full: &'a str) -> (&'static str, crate::Span) { let mut pos = span(full, pos); pos.end = pos.start; let s = match self { @@ -57,12 +57,12 @@ impl<'a> Expect<'a> { type Error<'a> = (Expect<'a>, &'a str); -pub struct Lex<'a> { +pub struct Lexer<'a> { i: &'a str, e: Vec>, } -impl<'a> Lex<'a> { +impl<'a> Lexer<'a> { #[must_use] pub fn new(i: &'a str) -> Self { let e = Vec::new(); @@ -269,7 +269,7 @@ impl<'a> Lex<'a> { } } -fn span(whole_buffer: &str, part: &str) -> jaq_syn::Span { +fn span(whole_buffer: &str, part: &str) -> crate::Span { let start = part.as_ptr() as usize - whole_buffer.as_ptr() as usize; let end = start + part.len(); start..end diff --git a/jaq-syn/src/lib.rs b/jaq-syn/src/lib.rs index 415fe743c..938d652af 100644 --- a/jaq-syn/src/lib.rs +++ b/jaq-syn/src/lib.rs @@ -12,6 +12,9 @@ pub mod path; pub mod string; pub mod test; +pub mod lex; +pub mod parse; + pub use def::{Arg, Call, Def, Main}; pub use ops::{MathOp, OrdOp}; use path::Path; diff --git a/jaq-parse/src/term.rs b/jaq-syn/src/parse.rs similarity 
index 99% rename from jaq-parse/src/term.rs rename to jaq-syn/src/parse.rs index 1fdb8e9c2..5344a4754 100644 --- a/jaq-parse/src/term.rs +++ b/jaq-syn/src/parse.rs @@ -1,6 +1,6 @@ use crate::lex::{StrPart, Token}; use alloc::{boxed::Box, vec::Vec}; -use jaq_syn::path; +use crate::path; type Error<'a> = (Expect, Option<&'a Token<&'a str>>); #[derive(Debug)] diff --git a/jaq/src/main.rs b/jaq/src/main.rs index 47dc39603..dd35eb580 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -258,9 +258,9 @@ fn parse(filter_str: &str, vars: Vec) -> Result> defs.insert_defs(jaq_std::std()); let std = include_str!("../../jaq-std/src/std.jq"); - let (tokens, lex_errs) = jaq_parse::lex::Lex::new(std).lex(); + let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(std).lex(); assert!(lex_errs.is_empty()); - let mut parser = jaq_parse::term::Parser::new(&tokens); + let mut parser = jaq_syn::parse::Parser::new(&tokens); let _std = parser.defs(); assert!(parser.e.is_empty()); From ef6ee2ca6e03345eca5d9f7537b4b2e066d39a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 27 Jun 2024 15:24:22 +0200 Subject: [PATCH 051/135] Start work on conversion to legacy terms. --- jaq-syn/src/filter.rs | 149 ++++++++++++++++++++++++++++++++++++++ jaq-syn/src/lex.rs | 3 +- jaq-syn/src/lib.rs | 1 + jaq-syn/src/parse.rs | 3 +- jaq-syn/src/prec_climb.rs | 111 ++++++++++++++++++++++++++++ 5 files changed, 263 insertions(+), 4 deletions(-) create mode 100644 jaq-syn/src/prec_climb.rs diff --git a/jaq-syn/src/filter.rs b/jaq-syn/src/filter.rs index 8c16ef70c..fbd48ff75 100644 --- a/jaq-syn/src/filter.rs +++ b/jaq-syn/src/filter.rs @@ -143,6 +143,155 @@ pub enum Filter { Binary(Box>, BinaryOp, Box>), } +use crate::parse; +use alloc::string::ToString; + +impl From<&parse::Term<&str>> for Filter { + fn from(tm: &parse::Term<&str>) -> Self { + use crate::path::{Opt, Part}; + let span = |tm: &parse::Term<_>| Box::new((tm.into(), 0..42)); + let from_part = |(part, opt): &(Part<_>, Opt)| { + let part = match part { + Part::Index(i) => Part::Index(*span(i)), + Part::Range(l, h) => { + Part::Range(l.as_ref().map(|l| *span(l)), h.as_ref().map(|h| *span(h))) + } + }; + (part, *opt) + }; + let from_str = |part: &StrPart<&str, _>| match part { + StrPart::Str(s) => string::Part::Str(s.to_string()), + StrPart::Filter(tm) => string::Part::Fun(*span(tm)), + StrPart::Char(c) => string::Part::Str(c.to_string()), + }; + // TODO: this is wrong when v is not given! 
+ let from_obj = |(k, v): &(_, Option<_>)| { + KeyVal::Filter(*span(k), v.as_ref().map_or_else(|| *span(k), |v| *span(v))) + }; + let from_op = |op| match op { + "," => BinaryOp::Comma, + "//" => BinaryOp::Alt, + "or" => BinaryOp::Or, + "and" => BinaryOp::And, + "+" => BinaryOp::Math(MathOp::Add), + "-" => BinaryOp::Math(MathOp::Sub), + "*" => BinaryOp::Math(MathOp::Mul), + "/" => BinaryOp::Math(MathOp::Div), + "%" => BinaryOp::Math(MathOp::Rem), + "=" => BinaryOp::Assign(AssignOp::Assign), + "|=" => BinaryOp::Assign(AssignOp::Update), + "+=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Add)), + "-=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Sub)), + "*=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Mul)), + "/=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Div)), + "%=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Rem)), + "<" => BinaryOp::Ord(OrdOp::Lt), + ">" => BinaryOp::Ord(OrdOp::Gt), + "<=" => BinaryOp::Ord(OrdOp::Le), + ">=" => BinaryOp::Ord(OrdOp::Ge), + "==" => BinaryOp::Ord(OrdOp::Eq), + "!=" => BinaryOp::Ord(OrdOp::Ne), + _ => todo!("unknown operator"), + }; + use crate::lex::StrPart; + use crate::string; + use parse::Term::*; + match tm { + Id => Self::Id, + Recurse => Self::Recurse, + Num(n) => Self::Num(n.to_string()), + Str(fmt, parts) => Self::Str(Box::new(crate::Str { + fmt: fmt.map(|fmt| span(&Call(fmt, Vec::new()))), + parts: parts.iter().map(from_str).collect(), + })), + Arr(a) => Self::Array(a.as_deref().map(span)), + Obj(o) => Self::Object(o.iter().map(from_obj).collect()), + Neg(tm) => Self::Neg(span(&*tm)), + Pipe(l, v, r) => Self::Binary( + span(&*l), + BinaryOp::Pipe(v.map(ToString::to_string)), + span(&*r), + ), + BinOp(head, tail) => { + let head = *span(head); + let tail = tail.iter().map(|(op, tm)| (from_op(op), *span(tm))); + prec_climb::climb(head, tail).0 + } + + Label(v, ..) | Break(v) => unimplemented!("label-break is not supported yet"), + + Fold(fold, xs, v, args) => { + let fold_type = match *fold { + "reduce" => FoldType::Reduce, + "foreach" => FoldType::Foreach, + "for" => FoldType::For, + _ => panic!(), + }; + let [init, update] = &args[..] 
else { panic!() }; + let fold = self::Fold { + xs: span(&*xs), + x: v.to_string(), + init: span(&init), + f: span(&update), + }; + Self::Fold(fold_type, fold) + } + TryCatch(try_, catch) => Self::TryCatch(span(try_), catch.as_deref().map(span)), + IfThenElse(if_thens, else_) => Self::Ite( + if_thens + .iter() + .map(|(if_, then_)| (*span(if_), *span(then_))) + .collect(), + else_.as_deref().map(span), + ), + + Def(defs, tm) => panic!(), + Call(c, args) => Self::Call(c.to_string(), args.iter().map(|a| *span(a)).collect()), + Var(v) => Self::Var(v.to_string()), + + Key(s) => { + let s = Self::Str(Box::new(crate::Str::from(s.to_string()))); + let part = (Part::Index((s, 0..42)), Opt::Essential); + Self::Path(span(&Id), Vec::from([part])) + } + Path(tm, path) => Self::Path(span(tm), path.iter().map(from_part).collect()), + } + } +} + +use crate::prec_climb::{self, Associativity}; + +impl prec_climb::Op for BinaryOp { + fn precedence(&self) -> usize { + match self { + Self::Pipe(_) => 0, + Self::Comma => 1, + Self::Assign(_) => 2, + Self::Alt => 3, + Self::Or => Self::Alt.precedence() + 1, + Self::And => Self::Or.precedence() + 1, + Self::Ord(OrdOp::Eq | OrdOp::Ne) => Self::And.precedence() + 1, + Self::Ord(OrdOp::Lt | OrdOp::Gt | OrdOp::Le | OrdOp::Ge) => Self::And.precedence() + 2, + Self::Math(MathOp::Add | MathOp::Sub) => Self::And.precedence() + 3, + Self::Math(MathOp::Mul | MathOp::Div) => Self::Math(MathOp::Add).precedence() + 1, + Self::Math(MathOp::Rem) => Self::Math(MathOp::Mul).precedence() + 1, + } + } + + fn associativity(&self) -> Associativity { + match self { + Self::Pipe(_) | Self::Assign(_) => Associativity::Right, + _ => Associativity::Left, + } + } +} + +impl prec_climb::Expr for Spanned { + fn from_op(lhs: Self, op: BinaryOp, rhs: Self) -> Self { + Filter::binary(lhs, op, rhs) + } +} + impl From>> for Filter { fn from(s: Str>) -> Self { Self::Str(Box::new(s)) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index 577522517..89c988072 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -5,7 +5,6 @@ pub enum StrPart { Str(S), Filter(F), Char(char), - Unicode(u32), } /// Token (tree) generic over string type `S`. @@ -179,7 +178,7 @@ impl<'a> Lexer<'a> { } } } - StrPart::Unicode(hex) + StrPart::Char(char::from_u32(hex).unwrap()) } Some('(') => return Some(StrPart::Filter(self.delim())), Some(_) | None => { diff --git a/jaq-syn/src/lib.rs b/jaq-syn/src/lib.rs index 938d652af..a39200de9 100644 --- a/jaq-syn/src/lib.rs +++ b/jaq-syn/src/lib.rs @@ -14,6 +14,7 @@ pub mod test; pub mod lex; pub mod parse; +mod prec_climb; pub use def::{Arg, Call, Def, Main}; pub use ops::{MathOp, OrdOp}; diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 5344a4754..014eb498e 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -1,6 +1,6 @@ use crate::lex::{StrPart, Token}; -use alloc::{boxed::Box, vec::Vec}; use crate::path; +use alloc::{boxed::Box, vec::Vec}; type Error<'a> = (Expect, Option<&'a Token<&'a str>>); #[derive(Debug)] @@ -363,7 +363,6 @@ impl<'a> Parser<'a> { } StrPart::Filter(_) => unreachable!(), StrPart::Char(c) => StrPart::Char(*c), - StrPart::Unicode(u) => StrPart::Unicode(*u), }); parts.collect() } diff --git a/jaq-syn/src/prec_climb.rs b/jaq-syn/src/prec_climb.rs new file mode 100644 index 000000000..7b97ee78f --- /dev/null +++ b/jaq-syn/src/prec_climb.rs @@ -0,0 +1,111 @@ +//! Precedence climbing for parsing expressions with binary operators. +//! +//! This allows you to parse expressions that are +//! 
separated by binary operators with precedence and associativity. +//! For example, in the expression `1 + 2 * 3`, we usually want to +//! parse this into `1 + (2 * 3)`, not `(1 + 2) * 3`. +//! This is handled by saying that `*` has higher *precedence* than `+`. +//! Also, when we have a power operator `^`, we want +//! `2 ^ 3 ^ 4` to mean `(2 ^ 3) ^ 4`, not `2 ^ (3 ^ 4)`. +//! This is handled by saying that `^` is *left-associative*. +//! +//! This was adapted from +//! . +//! +//! +//! ~~~ +//! ~~~ + +use core::iter::Peekable; + +/// Associativity of an operator. +pub enum Associativity { + /// `(x + y) + z` + Left, + /// `x + (y + z)` + Right, +} + +/// Binary operator. +pub trait Op { + /// "Stickiness" of the operator + fn precedence(&self) -> usize; + /// Is the operator left- or right-associative? + fn associativity(&self) -> Associativity; +} + +/// An expression that can be built from other expressions with some operator. +pub trait Expr { + /// Combine two expressions with an operator. + fn from_op(lhs: Self, op: O, rhs: Self) -> Self; +} + +/// Perform precedence climbing. +pub fn climb>(head: T, iter: impl IntoIterator) -> T { + climb1(head, &mut iter.into_iter().peekable(), 0) +} + +fn climb1, I>(mut x: T, iter: &mut Peekable, min_prec: usize) -> T +where + I: Iterator, +{ + while let Some((op, mut rhs)) = iter.next_if(|(op, _)| op.precedence() >= min_prec) { + let right_assoc = matches!(op.associativity(), Associativity::Right); + let this_prec = op.precedence(); + + while let Some(next) = iter.peek() { + let next_prec = next.0.precedence(); + + if next_prec > this_prec || (right_assoc && next_prec == this_prec) { + rhs = climb1(rhs, iter, next_prec) + } else { + break; + } + } + x = T::from_op(x, op, rhs); + } + x +} + +/// Simple arithmetic expressions +#[test] +fn test() { + enum Op { + Add, + Sub, + Mul, + Div, + } + + impl crate::prec_climb::Op for Op { + fn precedence(&self) -> usize { + match self { + Op::Add | Op::Sub => 0, + Op::Mul | Op::Div => 1, + } + } + + fn associativity(&self) -> Associativity { + Associativity::Right + } + } + + impl Expr for isize { + fn from_op(lhs: Self, op: Op, rhs: Self) -> Self { + match op { + Op::Add => lhs + rhs, + Op::Sub => lhs - rhs, + Op::Mul => lhs * rhs, + Op::Div => lhs / rhs, + } + } + } + + use Op::{Add, Div, Mul, Sub}; + // 1 + 2 * 3 - 6 / 2 = + // 1 + 6 - 3 = 4 + let head: isize = 1; + let tail = [(Add, 2), (Mul, 3), (Sub, 6), (Div, 2)]; + let out = climb(head, tail); + assert_eq!(out, 4); +} From bc2b0da666bb58cad30e2460941208635d85904f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 27 Jun 2024 17:28:53 +0200 Subject: [PATCH 052/135] Remove empty test. --- jaq-syn/src/prec_climb.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/jaq-syn/src/prec_climb.rs b/jaq-syn/src/prec_climb.rs index 7b97ee78f..83eb2f2b3 100644 --- a/jaq-syn/src/prec_climb.rs +++ b/jaq-syn/src/prec_climb.rs @@ -11,10 +11,6 @@ //! //! This was adapted from //! . -//! -//! -//! ~~~ -//! ~~~ use core::iter::Peekable; From 645dccf35cedebe3e20650a516d3c7625d51db6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 27 Jun 2024 17:29:10 +0200 Subject: [PATCH 053/135] Make Def fields public. 
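The fields remain invisible outside the crate; `pub(crate)` is just enough for the
conversion code in a sibling module of jaq-syn to read them. Minimal standalone
illustration of the visibility rule (made-up names, not part of this patch):

    mod parse {
        pub struct Def<S, F> {
            pub(crate) name: S,
            pub(crate) args: Vec<S>,
            pub(crate) body: F,
        }
    }

    mod convert {
        // same crate, so the pub(crate) fields are readable here ...
        pub fn arity<S, F>(def: &super::parse::Def<S, F>) -> usize {
            def.args.len()
        }
    }
    // ... whereas a downstream crate still sees only the public API of `Def`.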
--- jaq-syn/src/parse.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 014eb498e..04ebc4ad4 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -510,10 +510,10 @@ pub struct Module { #[derive(Debug)] pub struct Def { - name: S, - args: Vec, + pub(crate) name: S, + pub(crate) args: Vec, /// Body of the filter, e.g. `[.[] | f]`. - body: F, + pub(crate) body: F, } fn ident_key<'a>(token: &Token<&'a str>) -> Option<&'a str> { From 320919169ca12333be42891838764725879e7f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 27 Jun 2024 17:29:30 +0200 Subject: [PATCH 054/135] Convert definitions and main. --- jaq-syn/src/def.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/jaq-syn/src/def.rs b/jaq-syn/src/def.rs index 23b237142..06db54295 100644 --- a/jaq-syn/src/def.rs +++ b/jaq-syn/src/def.rs @@ -130,3 +130,41 @@ pub struct Main { /// Body of the filter, e.g. `[.[] | f]`. pub body: Spanned, } + +use crate::parse; + +impl From<&parse::Def<&str, parse::Term<&str>>> for Def { + fn from(def: &parse::Def<&str, parse::Term<&str>>) -> Self { + use alloc::string::ToString; + let args = def.args.iter().map(|arg| { + if let Some(v) = arg.strip_prefix('$') { + Arg::Var(v.to_string()) + } else { + Arg::Fun(arg.to_string()) + } + }); + Def { + lhs: Call { + name: def.name.to_string(), + args: args.collect(), + }, + rhs: (&def.body).into(), + } + } +} + +impl From<&parse::Term<&str>> for Main { + fn from(tm: &parse::Term<&str>) -> Self { + use alloc::string::ToString; + match tm { + parse::Term::Def(defs, tm) => Main { + defs: defs.iter().map(Def::from).collect(), + body: ((&**tm).into(), 0..42), + }, + tm => Main { + defs: Vec::new(), + body: ((&*tm).into(), 0..42), + }, + } + } +} From b31f89f4a5ec5423997ad2c90808c1cd05475722 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 27 Jun 2024 17:29:50 +0200 Subject: [PATCH 055/135] More descriptive panic. --- jaq-syn/src/filter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jaq-syn/src/filter.rs b/jaq-syn/src/filter.rs index fbd48ff75..f9695cfb5 100644 --- a/jaq-syn/src/filter.rs +++ b/jaq-syn/src/filter.rs @@ -245,7 +245,7 @@ impl From<&parse::Term<&str>> for Filter { else_.as_deref().map(span), ), - Def(defs, tm) => panic!(), + Def(defs, tm) => unimplemented!("definitions inside terms are not supported yet"), Call(c, args) => Self::Call(c.to_string(), args.iter().map(|a| *span(a)).collect()), Var(v) => Self::Var(v.to_string()), From b6b515d9c612fff17f0e98397913c65753f7063e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 28 Jun 2024 09:08:38 +0200 Subject: [PATCH 056/135] Split away leading `$` for variables. --- jaq-syn/src/filter.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jaq-syn/src/filter.rs b/jaq-syn/src/filter.rs index f9695cfb5..d3efc7212 100644 --- a/jaq-syn/src/filter.rs +++ b/jaq-syn/src/filter.rs @@ -209,7 +209,7 @@ impl From<&parse::Term<&str>> for Filter { Neg(tm) => Self::Neg(span(&*tm)), Pipe(l, v, r) => Self::Binary( span(&*l), - BinaryOp::Pipe(v.map(ToString::to_string)), + BinaryOp::Pipe(v.map(|v| v[1..].to_string())), span(&*r), ), BinOp(head, tail) => { @@ -230,7 +230,7 @@ impl From<&parse::Term<&str>> for Filter { let [init, update] = &args[..] 
else { panic!() }; let fold = self::Fold { xs: span(&*xs), - x: v.to_string(), + x: v[1..].to_string(), init: span(&init), f: span(&update), }; @@ -247,7 +247,7 @@ impl From<&parse::Term<&str>> for Filter { Def(defs, tm) => unimplemented!("definitions inside terms are not supported yet"), Call(c, args) => Self::Call(c.to_string(), args.iter().map(|a| *span(a)).collect()), - Var(v) => Self::Var(v.to_string()), + Var(v) => Self::Var(v[1..].to_string()), Key(s) => { let s = Self::Str(Box::new(crate::Str::from(s.to_string()))); From 8737856c29a546f188411ecfbcc2c70cacc3eb97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 28 Jun 2024 09:10:52 +0200 Subject: [PATCH 057/135] Make folding operations accept only atoms for now. Otherwise, `reduce .[] as $x (...)` tries to interpret `.[] as $x` as term. --- jaq-syn/src/parse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 04ebc4ad4..207b1151e 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -271,7 +271,7 @@ impl<'a> Parser<'a> { } Some(Token::Word("break")) => Term::Break(self.var()?), Some(Token::Word(fold)) if self.fold.contains(fold) => { - let xs = self.term()?; + let xs = self.atom()?; self.keyword("as")?; let x = self.var()?; let args = self.args(Self::term); From 7374b0ca32e4453de3de6e1d58d0efb50f608c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 28 Jun 2024 09:11:27 +0200 Subject: [PATCH 058/135] Refactor. --- jaq-syn/src/parse.rs | 49 +++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 207b1151e..562973e80 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -74,31 +74,38 @@ impl<'a> Parser<'a> { } } + pub fn verify_last(&mut self, last: &'a str) { + let last_char = || last.chars().next().unwrap(); + match (self.i.as_slice(), last) { + ([], "") => (), + ([Token::Char(c)], last) if *c == last => (), + ([], _) => self.e.push((Expect::Char(last_char()), None)), + ([next, ..], "") => self.e.push((Expect::Nothing, Some(next))), + ([next, ..], _) => self.e.push((Expect::Char(last_char()), Some(next))), + } + } + + /// Run given parse function with given tokens, then reset tokens to previous tokens. 
+ fn with_tok(&mut self, tokens: &'a [Token<&'a str>], f: impl FnOnce(&mut Self) -> T) -> T { + let i = core::mem::replace(&mut self.i, tokens.iter()); + let y = f(self); + self.i = i; + y + } + + pub fn ok_or_default(&mut self, y: Result<'a, T>) -> T { + y.unwrap_or_else(|e| { + self.e.push(e); + T::default() + }) + } + fn with(&mut self, tokens: &'a [Token<&'a str>], last: &'a str, f: F) -> T where F: FnOnce(&mut Self) -> Result<'a, T>, { - let i = core::mem::replace(&mut self.i, tokens.iter()); - let y = match f(self) { - Ok(y) => { - match (self.i.as_slice(), last) { - ([], "") => (), - ([], _) => panic!(), - ([next, ..], "") => self.e.push((Expect::Nothing, Some(next))), - ([Token::Char(c)], last) if *c == last => (), - ([next, ..], last) => self - .e - .push((Expect::Char(last.chars().next().unwrap()), Some(next))), - } - y - } - Err(e) => { - self.e.push(e); - T::default() - } - }; - self.i = i; - y + let y = self.with_tok(tokens, |p| f(p).inspect(|_| p.verify_last(last))); + self.ok_or_default(y) } fn maybe(&mut self, f: impl Fn(&mut Self) -> Option) -> Option { From 4201dd5f9f44e7f94599b5c64584a4a8d51100de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 28 Jun 2024 09:11:52 +0200 Subject: [PATCH 059/135] Make `Module` usable. --- jaq-syn/src/parse.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 562973e80..fb3f4ba78 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -508,11 +508,11 @@ impl<'a> Parser<'a> { } } -#[derive(Debug)] +#[derive(Debug, Default)] pub struct Module { meta: Option>, mods: Vec<(S, Option)>, - body: B, + pub body: B, } #[derive(Debug)] From 08947bd0364dbdef0985e7b4919b91ff568f9599 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 28 Jun 2024 09:28:01 +0200 Subject: [PATCH 060/135] Remove dependency on jaq-std! 
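The standard library is now lexed and parsed from its jq source with the new
pipeline instead of relying on the precompiled definitions from jaq-std.
Roughly (simplified sketch of the flow in the diff below, error handling elided):

    let std = include_str!("../../jaq-std/src/std.jq");
    // lex and parse the definitions of the standard library ...
    let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(std).lex();
    assert!(lex_errs.is_empty());
    let mut parser = jaq_syn::parse::Parser::new(&tokens);
    let module = parser.module(|p| p.defs());
    let module = parser.ok_or_default(module);
    // ... and convert them to the legacy representation understood by ParseCtx
    let defs: Vec<_> = module.body.iter().map(jaq_syn::Def::from).collect();
    ctx.insert_defs(defs);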
--- Cargo.lock | 1 - jaq/Cargo.toml | 1 - jaq/src/main.rs | 17 ++++++++++++----- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f91e1d2e1..631f20586 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -255,7 +255,6 @@ dependencies = [ "jaq-core", "jaq-interpret", "jaq-parse", - "jaq-std", "jaq-syn", "memmap2", "mimalloc", diff --git a/jaq/Cargo.toml b/jaq/Cargo.toml index 2f1b06fdb..046491d36 100644 --- a/jaq/Cargo.toml +++ b/jaq/Cargo.toml @@ -19,7 +19,6 @@ jaq-syn = { version = "1.1.0", path = "../jaq-syn" } jaq-parse = { version = "1.0.0", path = "../jaq-parse" } jaq-interpret = { version = "1.2.0", path = "../jaq-interpret" } jaq-core = { version = "1.2.0", path = "../jaq-core" } -jaq-std = { version = "1.2.0", path = "../jaq-std" } atty = "0.2" chumsky = { version = "0.9.0", default-features = false } codesnake = { version = "0.1" } diff --git a/jaq/src/main.rs b/jaq/src/main.rs index dd35eb580..bd35aedc0 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -255,20 +255,27 @@ fn args_named(var_val: &[(String, Val)]) -> Val { fn parse(filter_str: &str, vars: Vec) -> Result> { let mut defs = ParseCtx::new(vars); defs.insert_natives(jaq_core::core()); - defs.insert_defs(jaq_std::std()); let std = include_str!("../../jaq-std/src/std.jq"); let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(std).lex(); assert!(lex_errs.is_empty()); let mut parser = jaq_syn::parse::Parser::new(&tokens); - let _std = parser.defs(); + let std = parser + .module(|p| p.defs()) + .inspect(|_| parser.verify_last("")); + let std = parser.ok_or_default(std); assert!(parser.e.is_empty()); + let std: Vec<_> = std.body.iter().map(jaq_syn::Def::from).collect(); + defs.insert_defs(std); /* - let (tokens, lex_errs) = jaq_parse::lex::Lex::new(filter_str).lex(); + let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(filter_str).lex(); if lex_errs.is_empty() { - let mut parser = jaq_parse::term::Parser::new(&tokens); - std::println!("{:?}", parser.module(|p| p.term())); + let mut parser = jaq_syn::parse::Parser::new(&tokens); + let main = parser.module(|p| p.term()).inspect(|_| parser.verify_last("")); + let main = parser.ok_or_default(main); + std::println!("{:?}", main); + std::println!("{:?}", jaq_syn::Main::from(&main.body)); std::println!("{:?}", parser.e); } else { std::println!("{:?}", lex_errs); From 9cc0187da527202e798b67bdb45b1f399fedc8da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Fri, 28 Jun 2024 12:15:20 +0200 Subject: [PATCH 061/135] Simplify final parsing, make compile with MSRV. 
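The `verify_last` / `ok_or_default` combination that every caller had to spell out
is folded into a single `finish` function. A call site now looks like this
(sketch mirroring the new call sites in the diff below):

    let mut parser = jaq_syn::parse::Parser::new(&tokens);
    // run a parse function, then demand that no input remains ("" = end of input);
    // on failure, the error is recorded in parser.e and a Default value is returned
    let main = parser.finish("", |p| p.module(|p| p.term()));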
--- jaq-syn/src/def.rs | 3 +-- jaq-syn/src/filter.rs | 19 +++++++++++-------- jaq-syn/src/parse.rs | 33 ++++++++++++++++++++------------- jaq/src/main.rs | 8 ++------ 4 files changed, 34 insertions(+), 29 deletions(-) diff --git a/jaq-syn/src/def.rs b/jaq-syn/src/def.rs index 06db54295..c2f7e75d2 100644 --- a/jaq-syn/src/def.rs +++ b/jaq-syn/src/def.rs @@ -155,7 +155,6 @@ impl From<&parse::Def<&str, parse::Term<&str>>> for Def { impl From<&parse::Term<&str>> for Main { fn from(tm: &parse::Term<&str>) -> Self { - use alloc::string::ToString; match tm { parse::Term::Def(defs, tm) => Main { defs: defs.iter().map(Def::from).collect(), @@ -163,7 +162,7 @@ impl From<&parse::Term<&str>> for Main { }, tm => Main { defs: Vec::new(), - body: ((&*tm).into(), 0..42), + body: (tm.into(), 0..42), }, } } diff --git a/jaq-syn/src/filter.rs b/jaq-syn/src/filter.rs index d3efc7212..a869c05b8 100644 --- a/jaq-syn/src/filter.rs +++ b/jaq-syn/src/filter.rs @@ -206,11 +206,11 @@ impl From<&parse::Term<&str>> for Filter { })), Arr(a) => Self::Array(a.as_deref().map(span)), Obj(o) => Self::Object(o.iter().map(from_obj).collect()), - Neg(tm) => Self::Neg(span(&*tm)), + Neg(tm) => Self::Neg(span(tm)), Pipe(l, v, r) => Self::Binary( - span(&*l), + span(l), BinaryOp::Pipe(v.map(|v| v[1..].to_string())), - span(&*r), + span(r), ), BinOp(head, tail) => { let head = *span(head); @@ -218,7 +218,7 @@ impl From<&parse::Term<&str>> for Filter { prec_climb::climb(head, tail).0 } - Label(v, ..) | Break(v) => unimplemented!("label-break is not supported yet"), + Label(_v, ..) | Break(_v) => unimplemented!("label-break is not supported yet"), Fold(fold, xs, v, args) => { let fold_type = match *fold { @@ -227,12 +227,15 @@ impl From<&parse::Term<&str>> for Filter { "for" => FoldType::For, _ => panic!(), }; - let [init, update] = &args[..] else { panic!() }; + let (init, update) = match &args[..] 
{ + [init, update] => (init, update), + _ => todo!(), + }; let fold = self::Fold { - xs: span(&*xs), + xs: span(xs), x: v[1..].to_string(), - init: span(&init), - f: span(&update), + init: span(init), + f: span(update), }; Self::Fold(fold_type, fold) } diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index fb3f4ba78..85910c103 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -74,14 +74,14 @@ impl<'a> Parser<'a> { } } - pub fn verify_last(&mut self, last: &'a str) { + fn verify_last(&mut self, last: &'a str) -> Result<'a, ()> { let last_char = || last.chars().next().unwrap(); match (self.i.as_slice(), last) { - ([], "") => (), - ([Token::Char(c)], last) if *c == last => (), - ([], _) => self.e.push((Expect::Char(last_char()), None)), - ([next, ..], "") => self.e.push((Expect::Nothing, Some(next))), - ([next, ..], _) => self.e.push((Expect::Char(last_char()), Some(next))), + ([], "") => Ok(()), + ([Token::Char(c)], last) if *c == last => Ok(()), + ([], _) => Err((Expect::Char(last_char()), None)), + ([next, ..], "") => Err((Expect::Nothing, Some(next))), + ([next, ..], _) => Err((Expect::Char(last_char()), Some(next))), } } @@ -93,19 +93,26 @@ impl<'a> Parser<'a> { y } - pub fn ok_or_default(&mut self, y: Result<'a, T>) -> T { - y.unwrap_or_else(|e| { - self.e.push(e); - T::default() - }) + pub fn finish(&mut self, last: &'a str, f: F) -> T + where + F: FnOnce(&mut Self) -> Result<'a, T>, + { + f(self) + .and_then(|y| { + self.verify_last(last)?; + Ok(y) + }) + .unwrap_or_else(|e| { + self.e.push(e); + T::default() + }) } fn with(&mut self, tokens: &'a [Token<&'a str>], last: &'a str, f: F) -> T where F: FnOnce(&mut Self) -> Result<'a, T>, { - let y = self.with_tok(tokens, |p| f(p).inspect(|_| p.verify_last(last))); - self.ok_or_default(y) + self.with_tok(tokens, |p| p.finish(last, f)) } fn maybe(&mut self, f: impl Fn(&mut Self) -> Option) -> Option { diff --git a/jaq/src/main.rs b/jaq/src/main.rs index bd35aedc0..0043559f5 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -260,10 +260,7 @@ fn parse(filter_str: &str, vars: Vec) -> Result> let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(std).lex(); assert!(lex_errs.is_empty()); let mut parser = jaq_syn::parse::Parser::new(&tokens); - let std = parser - .module(|p| p.defs()) - .inspect(|_| parser.verify_last("")); - let std = parser.ok_or_default(std); + let std = parser.finish("", |p| p.module(|p| p.defs())); assert!(parser.e.is_empty()); let std: Vec<_> = std.body.iter().map(jaq_syn::Def::from).collect(); defs.insert_defs(std); @@ -272,8 +269,7 @@ fn parse(filter_str: &str, vars: Vec) -> Result> let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(filter_str).lex(); if lex_errs.is_empty() { let mut parser = jaq_syn::parse::Parser::new(&tokens); - let main = parser.module(|p| p.term()).inspect(|_| parser.verify_last("")); - let main = parser.ok_or_default(main); + let main = parser.finish("", |p| p.module(|p| p.term())); std::println!("{:?}", main); std::println!("{:?}", jaq_syn::Main::from(&main.body)); std::println!("{:?}", parser.e); From b444f6b4b950a36505d7a1a0341c341a6b7db967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 2 Jul 2024 08:16:30 +0200 Subject: [PATCH 062/135] =?UTF-8?q?Exclude=20unpermitted=20numbers=20such?= =?UTF-8?q?=20as=20`=E4=B8=89`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- jaq-syn/src/lex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jaq-syn/src/lex.rs 
b/jaq-syn/src/lex.rs index 89c988072..85699ed4f 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -145,7 +145,7 @@ impl<'a> Lexer<'a> { /// Decimal with optional exponent. fn num(&mut self) { - self.trim(char::is_numeric); + self.trim(|c| c.is_ascii_digit()); if let Some(i) = self.i.strip_prefix('.') { self.i = i; self.digits1(); From 695d012be451490e101ebb98f2264d243426cec1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 2 Jul 2024 08:18:10 +0200 Subject: [PATCH 063/135] Use more `iter::from_fn`. --- jaq-syn/src/parse.rs | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 85910c103..6713329f2 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -142,14 +142,13 @@ impl<'a> Parser<'a> { where F: Fn(&mut Self) -> Result<'a, T>, { - let mut ys = Vec::from([f(self)?]); - loop { - match self.i.next() { - Some(Token::Char(c)) if c.chars().eq([sep]) => ys.push(f(self)?), - Some(Token::Char(")" | "}")) => return Ok(ys), - next => return Err((Expect::Char(sep), next)), - } - } + let head = core::iter::once(f(self)); + let tail = core::iter::from_fn(|| match self.i.next() { + Some(Token::Char(c)) if c.chars().eq([sep]) => Some(f(self)), + Some(Token::Char(")" | "}")) => None, + next => Some(Err((Expect::Char(sep), next))), + }); + head.chain(tail).collect() } fn args(&mut self, f: impl Fn(&mut Self) -> Result<'a, T> + Copy) -> Vec { @@ -213,10 +212,8 @@ impl<'a> Parser<'a> { pub fn term_with_comma(&mut self, with_comma: bool) -> Result<'a, Term<&'a str>> { let head = self.atom()?; - let mut tail = Vec::new(); - while let Some(op) = self.op(with_comma) { - tail.push((op, self.atom()?)); - } + let tail = core::iter::from_fn(|| self.op(with_comma).map(|op| Ok((op, self.atom()?)))) + .collect::>>()?; let tm = if tail.is_empty() { head From 95e40a7ec509113de0798da643f3c0c5ca46892f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 2 Jul 2024 19:12:25 +0200 Subject: [PATCH 064/135] Correct more numeric lexing. --- jaq-syn/src/lex.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index 85699ed4f..e8b5b73b9 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -136,8 +136,8 @@ impl<'a> Lexer<'a> { /// Lex a non-empty digit sequence. fn digits1(&mut self) { - if let Some(rest) = self.i.strip_prefix(|c: char| c.is_numeric()) { - self.i = rest.trim_start_matches(|c: char| c.is_numeric()); + if let Some(rest) = self.i.strip_prefix(|c: char| c.is_ascii_digit()) { + self.i = rest.trim_start_matches(|c: char| c.is_ascii_digit()); } else { self.e.push((Expect::Digit, self.i)); } From f75d26ce90a520861fb69b7cf79815e28499a142 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 2 Jul 2024 19:13:12 +0200 Subject: [PATCH 065/135] Make consumed take usize instead of Chars. 
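`consumed` only needs to know how many bytes the caller has already peeked;
the consumed slice itself falls out of the input lengths before and after running
the sub-lexer. Standalone demo of that trick (hypothetical input, not part of the patch):

    fn main() {
        let start = "reduce .[] as $x (0; . + $x)";
        let mut rest = start;
        // pretend a sub-lexer eats the leading keyword
        rest = rest.trim_start_matches(|c: char| c.is_ascii_alphabetic());
        // recover the consumed prefix purely from length arithmetic
        let consumed = &start[..start.len() - rest.len()];
        assert_eq!(consumed, "reduce");
    }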
--- jaq-syn/src/lex.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index e8b5b73b9..deaedeb15 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -95,9 +95,9 @@ impl<'a> Lexer<'a> { self.i = self.i.trim_start_matches(f); } - fn consumed(&mut self, chars: core::str::Chars<'a>, f: impl FnOnce(&mut Self)) -> &'a str { + fn consumed(&mut self, skip: usize, f: impl FnOnce(&mut Self)) -> &'a str { let start = self.i; - self.i = chars.as_str(); + self.i = &self.i[skip..]; f(self); &start[..start.len() - self.i.len()] } @@ -200,7 +200,7 @@ impl<'a> Lexer<'a> { let mut parts = Vec::new(); loop { - let s = self.consumed(self.i.chars(), |lex| lex.trim(|c| c != '\\' && c != '"')); + let s = self.consumed(0, |lex| lex.trim(|c| c != '\\' && c != '"')); if !s.is_empty() { parts.push(StrPart::Str(s)); } @@ -224,10 +224,10 @@ impl<'a> Lexer<'a> { let mut chars = self.i.chars(); Some(match chars.next()? { - 'a'..='z' | 'A'..='Z' | '_' => Token::Word(self.consumed(chars, Self::mod_then_ident)), - '$' | '@' => Token::Word(self.consumed(chars, Self::ident1)), - '0'..='9' => Token::Num(self.consumed(chars, Self::num)), - c if is_op(c) => Token::Op(self.consumed(chars, |lex| lex.trim(is_op))), + 'a'..='z' | 'A'..='Z' | '_' => Token::Word(self.consumed(1, Self::mod_then_ident)), + '$' | '@' => Token::Word(self.consumed(1, Self::ident1)), + '0'..='9' => Token::Num(self.consumed(1, Self::num)), + c if is_op(c) => Token::Op(self.consumed(1, |lex| lex.trim(is_op))), '?' if (chars.next(), chars.next()) == (Some('/'), Some('/')) => { Token::Op(self.take(3)) } From 17c77fc826861b9553ae75bf71aa6812716a8e56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 2 Jul 2024 19:15:15 +0200 Subject: [PATCH 066/135] Move conversion functions to own module. 
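The `From` impls become explicit `conv` / `conv_main` functions that additionally
take the original source text. Usage as in jaq/src/main.rs further below:

    // convert the parsed standard library definitions ...
    let std: Vec<_> = std.body.iter().map(|def| def.conv(std_str)).collect();
    // ... and the parsed main filter to the legacy representation
    let main = main.body.conv_main(filter_str);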
--- jaq-syn/src/convert.rs | 193 +++++++++++++++++++++++++++++++++++++++++ jaq-syn/src/def.rs | 37 -------- jaq-syn/src/filter.rs | 152 -------------------------------- jaq-syn/src/lib.rs | 1 + jaq/src/main.rs | 8 +- 5 files changed, 198 insertions(+), 193 deletions(-) create mode 100644 jaq-syn/src/convert.rs diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs new file mode 100644 index 000000000..064fefe60 --- /dev/null +++ b/jaq-syn/src/convert.rs @@ -0,0 +1,193 @@ +use crate::filter::{AssignOp, BinaryOp, Filter, Fold, FoldType, KeyVal}; +use crate::prec_climb::{self, Associativity}; +use crate::{parse, Arg, Call, Def, Main, MathOp, OrdOp, Path, Span, Spanned, Str}; +use alloc::string::ToString; +use alloc::{boxed::Box, string::String, vec::Vec}; + +impl parse::Term<&str> { + fn conv(&self, s: &str) -> Filter { + use crate::lex::StrPart; + use crate::path::{Opt, Part}; + use crate::string; + use Filter::*; + + let span = |tm: &Self| Box::new((tm.conv(s), 0..42)); + let from_part = |(part, opt): &(Part<_>, Opt)| { + let part = match part { + Part::Index(i) => Part::Index(*span(i)), + Part::Range(l, h) => { + Part::Range(l.as_ref().map(|l| *span(l)), h.as_ref().map(|h| *span(h))) + } + }; + (part, *opt) + }; + let index_path = |k| { + let path = Vec::from([(Part::Index(k), Opt::Essential)]); + Filter::Path(span(&Self::Id), path) + }; + let from_str = |part: &StrPart<&str, _>| match part { + StrPart::Str(s) => string::Part::Str(s.to_string()), + StrPart::Filter(tm) => string::Part::Fun(*span(tm)), + StrPart::Char(c) => string::Part::Str(c.to_string()), + }; + let from_obj = |(k, v): &(_, Option<_>)| { + let f = || (index_path(*span(k)), 0..42); + KeyVal::Filter(*span(k), v.as_ref().map_or_else(|| f(), |v| *span(v))) + }; + let from_op = |op| match op { + "," => BinaryOp::Comma, + "//" => BinaryOp::Alt, + "or" => BinaryOp::Or, + "and" => BinaryOp::And, + "+" => BinaryOp::Math(MathOp::Add), + "-" => BinaryOp::Math(MathOp::Sub), + "*" => BinaryOp::Math(MathOp::Mul), + "/" => BinaryOp::Math(MathOp::Div), + "%" => BinaryOp::Math(MathOp::Rem), + "=" => BinaryOp::Assign(AssignOp::Assign), + "|=" => BinaryOp::Assign(AssignOp::Update), + "+=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Add)), + "-=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Sub)), + "*=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Mul)), + "/=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Div)), + "%=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Rem)), + "<" => BinaryOp::Ord(OrdOp::Lt), + ">" => BinaryOp::Ord(OrdOp::Gt), + "<=" => BinaryOp::Ord(OrdOp::Le), + ">=" => BinaryOp::Ord(OrdOp::Ge), + "==" => BinaryOp::Ord(OrdOp::Eq), + "!=" => BinaryOp::Ord(OrdOp::Ne), + _ => todo!("unknown operator"), + }; + match self { + Self::Id => Id, + Self::Recurse => Recurse, + Self::Num(n) => Num(n.to_string()), + Self::Str(fmt, parts) => Str(Box::new(crate::Str { + fmt: fmt.map(|fmt| span(&Self::Call(fmt, Vec::new()))), + parts: parts.iter().map(from_str).collect(), + })), + Self::Arr(a) => Array(a.as_deref().map(span)), + Self::Obj(o) => Object(o.iter().map(from_obj).collect()), + Self::Neg(tm) => Neg(span(tm)), + Self::Pipe(l, v, r) => Binary( + span(l), + BinaryOp::Pipe(v.map(|v| v[1..].to_string())), + span(r), + ), + Self::BinOp(head, tail) => { + let head = *span(head); + let tail = tail.iter().map(|(op, tm)| (from_op(op), *span(tm))); + prec_climb::climb(head, tail).0 + } + + Self::Label(_v, ..) 
| Self::Break(_v) => { + unimplemented!("label-break is not supported yet") + } + + Self::Fold(fold, xs, v, args) => { + let fold_type = match *fold { + "reduce" => FoldType::Reduce, + "foreach" => FoldType::Foreach, + "for" => FoldType::For, + _ => panic!(), + }; + let (init, update) = match &args[..] { + [init, update] => (init, update), + _ => todo!(), + }; + let fold = self::Fold { + xs: span(xs), + x: v[1..].to_string(), + init: span(init), + f: span(update), + }; + Fold(fold_type, fold) + } + Self::TryCatch(try_, catch) => TryCatch(span(try_), catch.as_deref().map(span)), + Self::IfThenElse(if_thens, else_) => Ite( + if_thens + .iter() + .map(|(if_, then_)| (*span(if_), *span(then_))) + .collect(), + else_.as_deref().map(span), + ), + + Self::Def(defs, tm) => unimplemented!("definitions inside terms are not supported yet"), + Self::Call(c, args) => Call(c.to_string(), args.iter().map(|a| *span(a)).collect()), + Self::Var(v) => Var(v[1..].to_string()), + + Self::Key(s) => index_path((Str(Box::new(crate::Str::from(s.to_string()))), 0..42)), + Self::Path(tm, path) => Path(span(tm), path.iter().map(from_part).collect()), + } + } + + pub fn conv_main(&self, s: &str) -> Main { + match self { + parse::Term::Def(defs, tm) => Main { + defs: defs.iter().map(|def| def.conv(s)).collect(), + body: (tm.conv(s), 0..42), + }, + tm => Main { + defs: Vec::new(), + body: (tm.conv(s), 0..42), + }, + } + } +} + +impl From<&parse::Term<&str>> for Filter { + fn from(tm: &parse::Term<&str>) -> Self { + tm.conv("") + } +} + +impl prec_climb::Op for BinaryOp { + fn precedence(&self) -> usize { + match self { + Self::Pipe(_) => 0, + Self::Comma => 1, + Self::Assign(_) => 2, + Self::Alt => 3, + Self::Or => Self::Alt.precedence() + 1, + Self::And => Self::Or.precedence() + 1, + Self::Ord(OrdOp::Eq | OrdOp::Ne) => Self::And.precedence() + 1, + Self::Ord(OrdOp::Lt | OrdOp::Gt | OrdOp::Le | OrdOp::Ge) => Self::And.precedence() + 2, + Self::Math(MathOp::Add | MathOp::Sub) => Self::And.precedence() + 3, + Self::Math(MathOp::Mul | MathOp::Div) => Self::Math(MathOp::Add).precedence() + 1, + Self::Math(MathOp::Rem) => Self::Math(MathOp::Mul).precedence() + 1, + } + } + + fn associativity(&self) -> Associativity { + match self { + Self::Pipe(_) | Self::Assign(_) => Associativity::Right, + _ => Associativity::Left, + } + } +} + +impl prec_climb::Expr for Spanned { + fn from_op(lhs: Self, op: BinaryOp, rhs: Self) -> Self { + Filter::binary(lhs, op, rhs) + } +} + +impl parse::Def<&str, parse::Term<&str>> { + pub fn conv(&self, s: &str) -> Def { + let args = self.args.iter().map(|arg| { + if let Some(v) = arg.strip_prefix('$') { + Arg::Var(v.to_string()) + } else { + Arg::Fun(arg.to_string()) + } + }); + Def { + lhs: Call { + name: self.name.to_string(), + args: args.collect(), + }, + rhs: self.body.conv_main(s), + } + } +} diff --git a/jaq-syn/src/def.rs b/jaq-syn/src/def.rs index c2f7e75d2..23b237142 100644 --- a/jaq-syn/src/def.rs +++ b/jaq-syn/src/def.rs @@ -130,40 +130,3 @@ pub struct Main { /// Body of the filter, e.g. `[.[] | f]`. 
pub body: Spanned, } - -use crate::parse; - -impl From<&parse::Def<&str, parse::Term<&str>>> for Def { - fn from(def: &parse::Def<&str, parse::Term<&str>>) -> Self { - use alloc::string::ToString; - let args = def.args.iter().map(|arg| { - if let Some(v) = arg.strip_prefix('$') { - Arg::Var(v.to_string()) - } else { - Arg::Fun(arg.to_string()) - } - }); - Def { - lhs: Call { - name: def.name.to_string(), - args: args.collect(), - }, - rhs: (&def.body).into(), - } - } -} - -impl From<&parse::Term<&str>> for Main { - fn from(tm: &parse::Term<&str>) -> Self { - match tm { - parse::Term::Def(defs, tm) => Main { - defs: defs.iter().map(Def::from).collect(), - body: ((&**tm).into(), 0..42), - }, - tm => Main { - defs: Vec::new(), - body: (tm.into(), 0..42), - }, - } - } -} diff --git a/jaq-syn/src/filter.rs b/jaq-syn/src/filter.rs index a869c05b8..8c16ef70c 100644 --- a/jaq-syn/src/filter.rs +++ b/jaq-syn/src/filter.rs @@ -143,158 +143,6 @@ pub enum Filter { Binary(Box>, BinaryOp, Box>), } -use crate::parse; -use alloc::string::ToString; - -impl From<&parse::Term<&str>> for Filter { - fn from(tm: &parse::Term<&str>) -> Self { - use crate::path::{Opt, Part}; - let span = |tm: &parse::Term<_>| Box::new((tm.into(), 0..42)); - let from_part = |(part, opt): &(Part<_>, Opt)| { - let part = match part { - Part::Index(i) => Part::Index(*span(i)), - Part::Range(l, h) => { - Part::Range(l.as_ref().map(|l| *span(l)), h.as_ref().map(|h| *span(h))) - } - }; - (part, *opt) - }; - let from_str = |part: &StrPart<&str, _>| match part { - StrPart::Str(s) => string::Part::Str(s.to_string()), - StrPart::Filter(tm) => string::Part::Fun(*span(tm)), - StrPart::Char(c) => string::Part::Str(c.to_string()), - }; - // TODO: this is wrong when v is not given! - let from_obj = |(k, v): &(_, Option<_>)| { - KeyVal::Filter(*span(k), v.as_ref().map_or_else(|| *span(k), |v| *span(v))) - }; - let from_op = |op| match op { - "," => BinaryOp::Comma, - "//" => BinaryOp::Alt, - "or" => BinaryOp::Or, - "and" => BinaryOp::And, - "+" => BinaryOp::Math(MathOp::Add), - "-" => BinaryOp::Math(MathOp::Sub), - "*" => BinaryOp::Math(MathOp::Mul), - "/" => BinaryOp::Math(MathOp::Div), - "%" => BinaryOp::Math(MathOp::Rem), - "=" => BinaryOp::Assign(AssignOp::Assign), - "|=" => BinaryOp::Assign(AssignOp::Update), - "+=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Add)), - "-=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Sub)), - "*=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Mul)), - "/=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Div)), - "%=" => BinaryOp::Assign(AssignOp::UpdateWith(MathOp::Rem)), - "<" => BinaryOp::Ord(OrdOp::Lt), - ">" => BinaryOp::Ord(OrdOp::Gt), - "<=" => BinaryOp::Ord(OrdOp::Le), - ">=" => BinaryOp::Ord(OrdOp::Ge), - "==" => BinaryOp::Ord(OrdOp::Eq), - "!=" => BinaryOp::Ord(OrdOp::Ne), - _ => todo!("unknown operator"), - }; - use crate::lex::StrPart; - use crate::string; - use parse::Term::*; - match tm { - Id => Self::Id, - Recurse => Self::Recurse, - Num(n) => Self::Num(n.to_string()), - Str(fmt, parts) => Self::Str(Box::new(crate::Str { - fmt: fmt.map(|fmt| span(&Call(fmt, Vec::new()))), - parts: parts.iter().map(from_str).collect(), - })), - Arr(a) => Self::Array(a.as_deref().map(span)), - Obj(o) => Self::Object(o.iter().map(from_obj).collect()), - Neg(tm) => Self::Neg(span(tm)), - Pipe(l, v, r) => Self::Binary( - span(l), - BinaryOp::Pipe(v.map(|v| v[1..].to_string())), - span(r), - ), - BinOp(head, tail) => { - let head = *span(head); - let tail = tail.iter().map(|(op, tm)| (from_op(op), 
*span(tm))); - prec_climb::climb(head, tail).0 - } - - Label(_v, ..) | Break(_v) => unimplemented!("label-break is not supported yet"), - - Fold(fold, xs, v, args) => { - let fold_type = match *fold { - "reduce" => FoldType::Reduce, - "foreach" => FoldType::Foreach, - "for" => FoldType::For, - _ => panic!(), - }; - let (init, update) = match &args[..] { - [init, update] => (init, update), - _ => todo!(), - }; - let fold = self::Fold { - xs: span(xs), - x: v[1..].to_string(), - init: span(init), - f: span(update), - }; - Self::Fold(fold_type, fold) - } - TryCatch(try_, catch) => Self::TryCatch(span(try_), catch.as_deref().map(span)), - IfThenElse(if_thens, else_) => Self::Ite( - if_thens - .iter() - .map(|(if_, then_)| (*span(if_), *span(then_))) - .collect(), - else_.as_deref().map(span), - ), - - Def(defs, tm) => unimplemented!("definitions inside terms are not supported yet"), - Call(c, args) => Self::Call(c.to_string(), args.iter().map(|a| *span(a)).collect()), - Var(v) => Self::Var(v[1..].to_string()), - - Key(s) => { - let s = Self::Str(Box::new(crate::Str::from(s.to_string()))); - let part = (Part::Index((s, 0..42)), Opt::Essential); - Self::Path(span(&Id), Vec::from([part])) - } - Path(tm, path) => Self::Path(span(tm), path.iter().map(from_part).collect()), - } - } -} - -use crate::prec_climb::{self, Associativity}; - -impl prec_climb::Op for BinaryOp { - fn precedence(&self) -> usize { - match self { - Self::Pipe(_) => 0, - Self::Comma => 1, - Self::Assign(_) => 2, - Self::Alt => 3, - Self::Or => Self::Alt.precedence() + 1, - Self::And => Self::Or.precedence() + 1, - Self::Ord(OrdOp::Eq | OrdOp::Ne) => Self::And.precedence() + 1, - Self::Ord(OrdOp::Lt | OrdOp::Gt | OrdOp::Le | OrdOp::Ge) => Self::And.precedence() + 2, - Self::Math(MathOp::Add | MathOp::Sub) => Self::And.precedence() + 3, - Self::Math(MathOp::Mul | MathOp::Div) => Self::Math(MathOp::Add).precedence() + 1, - Self::Math(MathOp::Rem) => Self::Math(MathOp::Mul).precedence() + 1, - } - } - - fn associativity(&self) -> Associativity { - match self { - Self::Pipe(_) | Self::Assign(_) => Associativity::Right, - _ => Associativity::Left, - } - } -} - -impl prec_climb::Expr for Spanned { - fn from_op(lhs: Self, op: BinaryOp, rhs: Self) -> Self { - Filter::binary(lhs, op, rhs) - } -} - impl From>> for Filter { fn from(s: Str>) -> Self { Self::Str(Box::new(s)) diff --git a/jaq-syn/src/lib.rs b/jaq-syn/src/lib.rs index a39200de9..759c4f6f5 100644 --- a/jaq-syn/src/lib.rs +++ b/jaq-syn/src/lib.rs @@ -12,6 +12,7 @@ pub mod path; pub mod string; pub mod test; +mod convert; pub mod lex; pub mod parse; mod prec_climb; diff --git a/jaq/src/main.rs b/jaq/src/main.rs index 0043559f5..ea7238c4e 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -256,13 +256,13 @@ fn parse(filter_str: &str, vars: Vec) -> Result> let mut defs = ParseCtx::new(vars); defs.insert_natives(jaq_core::core()); - let std = include_str!("../../jaq-std/src/std.jq"); - let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(std).lex(); + let std_str = include_str!("../../jaq-std/src/std.jq"); + let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(std_str).lex(); assert!(lex_errs.is_empty()); let mut parser = jaq_syn::parse::Parser::new(&tokens); let std = parser.finish("", |p| p.module(|p| p.defs())); assert!(parser.e.is_empty()); - let std: Vec<_> = std.body.iter().map(jaq_syn::Def::from).collect(); + let std: Vec<_> = std.body.iter().map(|def| def.conv(std_str)).collect(); defs.insert_defs(std); /* @@ -271,7 +271,7 @@ fn parse(filter_str: &str, vars: Vec) -> Result> let 
mut parser = jaq_syn::parse::Parser::new(&tokens); let main = parser.finish("", |p| p.module(|p| p.term())); std::println!("{:?}", main); - std::println!("{:?}", jaq_syn::Main::from(&main.body)); + std::println!("{:?}", main.body.conv_main(filter_str)); std::println!("{:?}", parser.e); } else { std::println!("{:?}", lex_errs); From a36cde28b5eb26cb1bbc073cd73607639f05f342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 2 Jul 2024 19:31:24 +0200 Subject: [PATCH 067/135] Calculate (more) correct spans! --- jaq-syn/src/convert.rs | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs index 064fefe60..53e124131 100644 --- a/jaq-syn/src/convert.rs +++ b/jaq-syn/src/convert.rs @@ -4,14 +4,34 @@ use crate::{parse, Arg, Call, Def, Main, MathOp, OrdOp, Path, Span, Spanned, Str use alloc::string::ToString; use alloc::{boxed::Box, string::String, vec::Vec}; +fn str_offset(large: &str, inner: &str) -> Option { + let large_beg = large.as_ptr() as usize; + let inner = inner.as_ptr() as usize; + if inner < large_beg || inner > large_beg.wrapping_add(large.len()) { + None + } else { + Some(inner.wrapping_sub(large_beg)) + } +} + impl parse::Term<&str> { + fn span(&self, code: &str) -> Span { + match self { + Self::Num(s) | Self::Call(s, ..) | Self::Var(s) => { + let offset = str_offset(code, s).unwrap(); + (offset..offset + s.len()) + } + _ => (0..42), + } + } + fn conv(&self, s: &str) -> Filter { use crate::lex::StrPart; use crate::path::{Opt, Part}; use crate::string; use Filter::*; - let span = |tm: &Self| Box::new((tm.conv(s), 0..42)); + let span = |tm: &Self| Box::new((tm.conv(s), tm.span(s))); let from_part = |(part, opt): &(Part<_>, Opt)| { let part = match part { Part::Index(i) => Part::Index(*span(i)), @@ -126,11 +146,11 @@ impl parse::Term<&str> { match self { parse::Term::Def(defs, tm) => Main { defs: defs.iter().map(|def| def.conv(s)).collect(), - body: (tm.conv(s), 0..42), + body: (tm.conv(s), tm.span(s)), }, tm => Main { defs: Vec::new(), - body: (tm.conv(s), 0..42), + body: (tm.conv(s), tm.span(s)), }, } } From 3a94958fb77a049aabc846efa856c7244c33fbdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 3 Jul 2024 11:14:51 +0200 Subject: [PATCH 068/135] Use lex::span instead of duplicate code. --- jaq-syn/src/convert.rs | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs index 53e124131..57f9d343f 100644 --- a/jaq-syn/src/convert.rs +++ b/jaq-syn/src/convert.rs @@ -4,24 +4,11 @@ use crate::{parse, Arg, Call, Def, Main, MathOp, OrdOp, Path, Span, Spanned, Str use alloc::string::ToString; use alloc::{boxed::Box, string::String, vec::Vec}; -fn str_offset(large: &str, inner: &str) -> Option { - let large_beg = large.as_ptr() as usize; - let inner = inner.as_ptr() as usize; - if inner < large_beg || inner > large_beg.wrapping_add(large.len()) { - None - } else { - Some(inner.wrapping_sub(large_beg)) - } -} - impl parse::Term<&str> { fn span(&self, code: &str) -> Span { match self { - Self::Num(s) | Self::Call(s, ..) | Self::Var(s) => { - let offset = str_offset(code, s).unwrap(); - (offset..offset + s.len()) - } - _ => (0..42), + Self::Num(s) | Self::Call(s, ..) 
| Self::Var(s) => crate::lex::span(code, s), + _ => 0..42, } } From cdd880608f53020a9544f579448e73ee647a0368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 3 Jul 2024 11:16:04 +0200 Subject: [PATCH 069/135] Make span() public. --- jaq-syn/src/lex.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index deaedeb15..957d370a5 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -268,8 +268,7 @@ impl<'a> Lexer<'a> { } } -fn span(whole_buffer: &str, part: &str) -> crate::Span { +pub fn span(whole_buffer: &str, part: &str) -> crate::Span { let start = part.as_ptr() as usize - whole_buffer.as_ptr() as usize; - let end = start + part.len(); - start..end + start..start + part.len() } From 0aefb7de1cda995778d931536886b6b4dece969b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 3 Jul 2024 11:16:46 +0200 Subject: [PATCH 070/135] String representation for lex::Expect. --- jaq-syn/src/lex.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index 957d370a5..0b3d10bb4 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -35,6 +35,19 @@ pub enum Expect<'a> { } impl<'a> Expect<'a> { + pub fn as_str(&self) -> &'static str { + match self { + Self::Digit => "digit", + Self::Ident => "identifier", + Self::Delim("(") => "closing parenthesis", + Self::Delim("[") => "closing bracket", + Self::Delim("{") => "closing brace", + Self::Delim(_) => panic!(), + Self::Escape => "string escape sequence", + Self::Unicode => "4-digit hexadecimal UTF-8 code point", + Self::Token => "token", + } + } pub fn to_simple_error(&self, pos: &'a str, full: &'a str) -> (&'static str, crate::Span) { let mut pos = span(full, pos); pos.end = pos.start; From 637f58cd7d9519b996401e69423afbbb883bffa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 3 Jul 2024 11:18:24 +0200 Subject: [PATCH 071/135] Make Expect::Delim return only delimiter, not whole remaining input. --- jaq-syn/src/lex.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index 0b3d10bb4..27641ad6d 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -260,12 +260,11 @@ impl<'a> Lexer<'a> { /// /// The input string has to start with either '(', '[', or '{'. fn delim(&mut self) -> Token<&'a str> { - let start = self.i; - let open = &self.i[..1]; - let close = match self.next() { - Some('(') => ')', - Some('[') => ']', - Some('{') => '}', + let open = self.take(1); + let close = match open { + "(" => ')', + "[" => ']', + "{" => '}', _ => panic!(), }; let mut tokens = self.tokens(); @@ -275,7 +274,7 @@ impl<'a> Lexer<'a> { tokens.push(Token::Char(&self.i[..1])); self.i = rest; } else { - self.e.push((Expect::Delim(start), self.i)); + self.e.push((Expect::Delim(open), self.i)); } Token::Block(open, tokens) } From 95c511b369b93524ef5fe2ea3b065b8daba2189f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 3 Jul 2024 11:19:20 +0200 Subject: [PATCH 072/135] Make a few lexer types abstract over S, not over 'a. 
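Only the `impl` blocks remain tied to `&'a str`; the types themselves are now generic
over the string type `S`. This makes it possible, for instance, to detach errors from
the input buffer. Rough sketch of such a use (hypothetical helper, not part of this patch):

    use jaq_syn::lex::{Error, Expect};

    // turn a borrowed lexer error into an owned one
    fn own_error(e: Error<&str>) -> Error<String> {
        let (expect, found) = e;
        let expect = match expect {
            Expect::Delim(s) => Expect::Delim(s.to_string()),
            Expect::Digit => Expect::Digit,
            Expect::Ident => Expect::Ident,
            Expect::Escape => Expect::Escape,
            Expect::Unicode => Expect::Unicode,
            Expect::Token => Expect::Token,
        };
        (expect, found.to_string())
    }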
--- jaq-syn/src/lex.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index 27641ad6d..bb5d6e0ff 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -25,16 +25,16 @@ pub enum Token { } #[derive(Clone, Debug)] -pub enum Expect<'a> { +pub enum Expect { Digit, Ident, - Delim(&'a str), + Delim(S), Escape, Unicode, Token, } -impl<'a> Expect<'a> { +impl<'a> Expect<&'a str> { pub fn as_str(&self) -> &'static str { match self { Self::Digit => "digit", @@ -48,6 +48,7 @@ impl<'a> Expect<'a> { Self::Token => "token", } } + pub fn to_simple_error(&self, pos: &'a str, full: &'a str) -> (&'static str, crate::Span) { let mut pos = span(full, pos); pos.end = pos.start; @@ -67,14 +68,14 @@ impl<'a> Expect<'a> { } } -type Error<'a> = (Expect<'a>, &'a str); +pub type Error = (Expect, S); -pub struct Lexer<'a> { - i: &'a str, - e: Vec>, +pub struct Lexer { + i: S, + e: Vec>, } -impl<'a> Lexer<'a> { +impl<'a> Lexer<&'a str> { #[must_use] pub fn new(i: &'a str) -> Self { let e = Vec::new(); @@ -82,7 +83,7 @@ impl<'a> Lexer<'a> { } #[must_use] - pub fn lex(mut self) -> (Vec>, Vec>) { + pub fn lex(mut self) -> (Vec>, Vec>) { let tokens = self.tokens(); self.space(); if !self.i.is_empty() { From 8a4a78b59ffd78a84f965e93cb2a8ad37df49198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 3 Jul 2024 11:20:26 +0200 Subject: [PATCH 073/135] Report lexer errors! --- jaq/src/main.rs | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/jaq/src/main.rs b/jaq/src/main.rs index ea7238c4e..e4b3d9df8 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -265,18 +265,21 @@ fn parse(filter_str: &str, vars: Vec) -> Result> let std: Vec<_> = std.body.iter().map(|def| def.conv(std_str)).collect(); defs.insert_defs(std); - /* let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(filter_str).lex(); if lex_errs.is_empty() { let mut parser = jaq_syn::parse::Parser::new(&tokens); let main = parser.finish("", |p| p.module(|p| p.term())); + /* std::println!("{:?}", main); std::println!("{:?}", main.body.conv_main(filter_str)); std::println!("{:?}", parser.e); + */ } else { - std::println!("{:?}", lex_errs); + //std::println!("{:?}", lex_errs); + for e in lex_errs { + println!("{}", report_lex(filter_str, e)) + } } - */ assert!(defs.errs.is_empty()); let (filter, errs) = jaq_parse::parse(filter_str, jaq_parse::main()); @@ -618,6 +621,32 @@ impl Color { } } +fn report_lex<'a>(code: &'a str, (expected, found): jaq_syn::lex::Error<&'a str>) -> Report<'a> { + use jaq_syn::lex::{span, Expect}; + + let mut found_range = span(code, found); + found_range.end = found_range.start; + let found = match found { + "" => "end of input", + _ => "character", + }; + let label = (found_range, format!("unexpected {found}"), Color::Red); + + let labels = match expected { + Expect::Delim(open) => { + let unclosed = "unclosed delimiter".to_string(); + Vec::from([(span(code, open), unclosed, Color::Yellow), label]) + } + _ => Vec::from([label]), + }; + + Report { + code, + message: format!("expected {}", expected.as_str()), + labels, + } +} + fn report<'a>(code: &'a str, e: &chumsky::error::Simple) -> Report<'a> { use chumsky::error::SimpleReason; From edd90700ba789f1a29336444dace863794d075be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 4 Jul 2024 10:43:35 +0200 Subject: [PATCH 074/135] Proper reporting of reports. 
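A `Report` no longer borrows the code, and its labels can now carry several
differently colored text parts. Rendering goes through codesnake blocks; printing
a batch of reports looks like this (condensed from the diff below):

    let idx = codesnake::LineIndex::new(&code);
    for report in reports {
        eprintln!("Error: {}", report.message);
        let block = report.to_block(&idx);
        eprintln!("{}\n{}{}", block.prologue(), block, block.epilogue());
    }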
--- jaq/src/main.rs | 143 ++++++++++++++++++++++++++++-------------------- 1 file changed, 85 insertions(+), 58 deletions(-) diff --git a/jaq/src/main.rs b/jaq/src/main.rs index e4b3d9df8..553e5a5c2 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -154,18 +154,24 @@ fn real_main(cli: &Cli) -> Result { let mut args = cli.args.iter(); let filter = match &cli.from_file { - Some(file) => parse(&std::fs::read_to_string(file)?, vars)?, - None => { - if let Some(filter) = args.next() { - parse(filter, vars)? - } else { - Filter::default() - } - } + Some(file) => Some(std::fs::read_to_string(file)?), + None => args.next().cloned(), }; - //println!("Filter: {:?}", filter); let files: Vec<_> = args.collect(); + /* + let filter2 = match filter.clone() { + None => todo!(), + Some(filter) => parse2(&filter).map_err(|e| Error::Report(filter, e))?, + }; + */ + + let filter = match filter { + Some(filter) => parse(&filter, vars)?, + None => Filter::default(), + }; + //println!("Filter: {:?}", filter); + let last = if files.is_empty() { let inputs = read_buffered(cli, io::stdin().lock()); with_stdout(|out| run(cli, &filter, ctx, inputs, |v| print(out, cli, &v)))? @@ -252,6 +258,23 @@ fn args_named(var_val: &[(String, Val)]) -> Val { Val::obj(args.collect()) } +fn parse2(filter_str: &str) -> Result> { + let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(filter_str).lex(); + if lex_errs.is_empty() { + let mut parser = jaq_syn::parse::Parser::new(&tokens); + let main = parser.finish("", |p| p.module(|p| p.term())); + //std::println!("{:?}", main); + //std::println!("{:?}", parser.e); + let main = main.body.conv_main(filter_str); + Ok(main) + } else { + Err(lex_errs + .into_iter() + .map(|e| report_lex(filter_str, e)) + .collect()) + } +} + fn parse(filter_str: &str, vars: Vec) -> Result> { let mut defs = ParseCtx::new(vars); defs.insert_natives(jaq_core::core()); @@ -265,22 +288,6 @@ fn parse(filter_str: &str, vars: Vec) -> Result> let std: Vec<_> = std.body.iter().map(|def| def.conv(std_str)).collect(); defs.insert_defs(std); - let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(filter_str).lex(); - if lex_errs.is_empty() { - let mut parser = jaq_syn::parse::Parser::new(&tokens); - let main = parser.finish("", |p| p.module(|p| p.term())); - /* - std::println!("{:?}", main); - std::println!("{:?}", main.body.conv_main(filter_str)); - std::println!("{:?}", parser.e); - */ - } else { - //std::println!("{:?}", lex_errs); - for e in lex_errs { - println!("{}", report_lex(filter_str, e)) - } - } - assert!(defs.errs.is_empty()); let (filter, errs) = jaq_parse::parse(filter_str, jaq_parse::main()); if !errs.is_empty() { @@ -394,6 +401,7 @@ struct ParseError { #[derive(Debug)] enum Error { Io(Option, io::Error), + Report(String, Vec), Chumsky(Vec), Parse(String), Jaq(jaq_interpret::Error), @@ -418,9 +426,22 @@ impl Termination for Error { eprintln!("Error: {e}"); 2 } + Self::Report(code, reports) => { + let idx = codesnake::LineIndex::new(&code); + for e in reports { + eprintln!("Error: {}", e.message); + let block = e.to_block(&idx); + eprintln!("{}\n{}{}", block.prologue(), block, block.epilogue()) + } + 3 + } Self::Chumsky(errs) => { for e in errs { - eprintln!("Error: {}", report(&e.filter, &e.error)); + let idx = codesnake::LineIndex::new(&e.filter); + let report = report(&e.filter, &e.error); + eprintln!("Error: {}", report.message); + let block = report.to_block(&idx); + eprintln!("{}\n{}{}", block.prologue(), block, block.epilogue()) } 3 } @@ -598,10 +619,9 @@ fn with_stdout(f: impl FnOnce(&mut 
io::StdoutLock) -> Result) -> Re } #[derive(Debug)] -struct Report<'a> { - code: &'a str, +struct Report { message: String, - labels: Vec<(core::ops::Range, String, Color)>, + labels: Vec<(core::ops::Range, Vec<(String, Option)>, Color)>, } #[derive(Clone, Debug)] @@ -621,33 +641,35 @@ impl Color { } } -fn report_lex<'a>(code: &'a str, (expected, found): jaq_syn::lex::Error<&'a str>) -> Report<'a> { +fn report_lex(code: &str, (expected, found): jaq_syn::lex::Error<&str>) -> Report { use jaq_syn::lex::{span, Expect}; let mut found_range = span(code, found); - found_range.end = found_range.start; + found_range.end = core::cmp::min(found_range.start + 1, code.len()); let found = match found { - "" => "end of input", - _ => "character", + "" => [("unexpected end of input".to_string(), None)].into(), + c => [("unexpected character ", None), (c, Some(Color::Red))] + .map(|(s, c)| (s.into(), c)) + .into(), }; - let label = (found_range, format!("unexpected {found}"), Color::Red); + let label = (found_range, found, Color::Red); let labels = match expected { Expect::Delim(open) => { - let unclosed = "unclosed delimiter".to_string(); - Vec::from([(span(code, open), unclosed, Color::Yellow), label]) + let text = [("unclosed delimiter ", None), (open, Some(Color::Yellow))] + .map(|(s, c)| (s.into(), c)); + Vec::from([(span(code, open), text.into(), Color::Yellow), label]) } _ => Vec::from([label]), }; Report { - code, message: format!("expected {}", expected.as_str()), labels, } } -fn report<'a>(code: &'a str, e: &chumsky::error::Simple) -> Report<'a> { +fn report<'a>(code: &'a str, e: &chumsky::error::Simple) -> Report { use chumsky::error::SimpleReason; let eof = || "end of input".to_string(); @@ -675,10 +697,14 @@ fn report<'a>(code: &'a str, e: &chumsky::error::Simple) -> Report<'a> { }; let label = if let SimpleReason::Custom(msg) = e.reason() { - msg.clone() + [(msg.clone(), None)].into() } else { - let token = |c: &String| format!("token {}", Color::Red.apply(c)); - format!("Unexpected {}", e.found().map_or_else(eof, token)) + match e.found() { + None => [("Unexpected end of input".to_string(), None)].into(), + Some(c) => [("Unexpected token ", None), (c, Some(Color::Red))] + .map(|(s, c)| (s.into(), c)) + .into(), + } }; // convert character indices to byte offsets let char_to_byte = |i| { @@ -692,30 +718,31 @@ fn report<'a>(code: &'a str, e: &chumsky::error::Simple) -> Report<'a> { let mut labels = Vec::from([(conv(&e.span()), label, Color::Red)]); if let SimpleReason::Unclosed { span, delimiter } = e.reason() { - let text = format!("Unclosed delimiter {}", Color::Yellow.apply(delimiter)); - labels.insert(0, (conv(span), text, Color::Yellow)); - } - Report { - code, - message, - labels, + let text = ("Unclosed delimiter ".to_string(), None); + let bla = (delimiter.to_string(), Some(Color::Yellow)); + labels.insert(0, (conv(span), [text, bla].into(), Color::Yellow)); } + Report { message, labels } } -impl Display for Report<'_> { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - use codesnake::{Block, CodeWidth, Label, LineIndex}; - let idx = LineIndex::new(self.code); - let labels = self.labels.clone().into_iter().map(|(range, text, color)| { - Label::new(range, text).with_style(move |s| color.apply(s).to_string()) +type CodeBlock = codesnake::Block, String>; + +impl Report { + fn to_block(self, idx: &codesnake::LineIndex) -> CodeBlock { + use codesnake::{Block, CodeWidth, Label}; + let color_maybe = |(text, color): (_, Option)| match color { + None => text, + Some(color) => 
color.apply(text).to_string(), + }; + let labels = self.labels.into_iter().map(|(range, text, color)| { + let text = text.into_iter().map(color_maybe).collect::>(); + Label::new(range, text.join("")).with_style(move |s| color.apply(s).to_string()) }); - let block = Block::new(&idx, labels).unwrap().map_code(|c| { + Block::new(&idx, labels).unwrap().map_code(|c| { let c = c.replace('\t', " "); let w = unicode_width::UnicodeWidthStr::width(&*c); CodeWidth::new(c, core::cmp::max(w, 1)) - }); - writeln!(f, "{}", self.message)?; - write!(f, "{}\n{}{}", block.prologue(), block, block.epilogue()) + }) } } From 18fc877248a7cfd1e0e6e2785f1a5a29b5d22330 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 4 Jul 2024 11:40:21 +0200 Subject: [PATCH 075/135] Make parse error public. --- jaq-syn/src/parse.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 6713329f2..81093e894 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -2,7 +2,8 @@ use crate::lex::{StrPart, Token}; use crate::path; use alloc::{boxed::Box, vec::Vec}; -type Error<'a> = (Expect, Option<&'a Token<&'a str>>); +pub type Error<'a> = (Expect, Option<&'a Token<&'a str>>); + #[derive(Debug)] pub enum Expect { Keyword(&'static str), From 4a2a15c65a87d48cdea9176329f7532d656e45b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 4 Jul 2024 11:40:45 +0200 Subject: [PATCH 076/135] Handle unclosed quotes. --- jaq-syn/src/lex.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index bb5d6e0ff..6e49b4d29 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -42,6 +42,7 @@ impl<'a> Expect<&'a str> { Self::Delim("(") => "closing parenthesis", Self::Delim("[") => "closing bracket", Self::Delim("{") => "closing brace", + Self::Delim("\"") => "closing quote", Self::Delim(_) => panic!(), Self::Escape => "string escape sequence", Self::Unicode => "4-digit hexadecimal UTF-8 code point", @@ -209,8 +210,8 @@ impl<'a> Lexer<&'a str> { /// /// The input string has to start with '"'. fn str(&mut self) -> Vec>> { - let start = self.i; - assert_eq!(self.next(), Some('"')); + let start = self.take(1); + assert_eq!(start, "\""); let mut parts = Vec::new(); loop { From 201f9ee4c64bd46cc7f25760b872807d971ab709 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 4 Jul 2024 12:01:13 +0200 Subject: [PATCH 077/135] Store start and end of strings. --- jaq-syn/src/lex.rs | 25 +++++++++++++++++++------ jaq-syn/src/parse.rs | 12 +++++++----- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index 6e49b4d29..198489beb 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -14,8 +14,8 @@ pub enum Token { Word(S), /// number Num(S), - /// interpolated string - Str(Vec>), + /// (interpolated) string, surrounded by opening and closing '"' + Str(S, Vec>, S), /// operator, such as `|` or `+=` Op(S), /// punctuation, such as `.` or `;` @@ -209,7 +209,7 @@ impl<'a> Lexer<&'a str> { /// Lex a (possibly interpolated) string. /// /// The input string has to start with '"'. 
- fn str(&mut self) -> Vec>> { + fn str(&mut self) -> Token<&'a str> { let start = self.take(1); assert_eq!(start, "\""); let mut parts = Vec::new(); @@ -219,14 +219,15 @@ impl<'a> Lexer<&'a str> { if !s.is_empty() { parts.push(StrPart::Str(s)); } + let i = self.i; match self.next() { - Some('"') => return parts, + Some('"') => return Token::Str(start, parts, &i[..1]), Some('\\') => self.escape().map(|part| parts.push(part)), // SAFETY: due to `lex.trim()` Some(_) => unreachable!(), None => { self.e.push((Expect::Delim(start), self.i)); - return parts; + return Token::Str(start, parts, &i[..0]); } }; } @@ -248,7 +249,7 @@ impl<'a> Lexer<&'a str> { } '.' if chars.next() == Some('.') => Token::Char(self.take(2)), '.' | ':' | ';' | ',' | '?' => Token::Char(self.take(1)), - '"' => Token::Str(self.str()), + '"' => self.str(), '(' | '[' | '{' => self.delim(), _ => return None, }) @@ -282,6 +283,18 @@ impl<'a> Lexer<&'a str> { } } +impl<'a> Token<&'a str> { + pub fn span(&self, code: &str) -> crate::Span { + match self { + Self::Word(s) | Self::Char(s) | Self::Op(s) | Self::Num(s) => span(code, s), + Self::Str(open, _, close) => span(code, open).start..span(code, close).end, + Self::Block(open, block) => { + span(code, open).start..block.last().unwrap().span(code).end + } + } + } +} + pub fn span(whole_buffer: &str, part: &str) -> crate::Span { let start = part.as_ptr() as usize - whole_buffer.as_ptr() as usize; start..start + part.len() diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 81093e894..84e948f37 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -292,7 +292,9 @@ impl<'a> Parser<'a> { Some(Token::Word(id)) if id.starts_with('$') => Term::Var(*id), Some(Token::Word(id)) if id.starts_with('@') => { let s = self.maybe(|p| match p.i.next() { - Some(Token::Str(parts)) if id.starts_with('@') => Some(p.str_parts(parts)), + Some(Token::Str(_, parts, _)) if id.starts_with('@') => { + Some(p.str_parts(parts)) + } _ => None, }); match s { @@ -321,7 +323,7 @@ impl<'a> Parser<'a> { Some(Token::Block("{", tokens)) => self.with(tokens, "", |p| { p.sep_by1(',', Self::obj_entry).map(Term::Obj) }), - Some(Token::Str(parts)) => Term::Str(None, self.str_parts(parts)), + Some(Token::Str(_, parts, _)) => Term::Str(None, self.str_parts(parts)), next => return Err((Expect::Term, next)), }; @@ -344,9 +346,9 @@ impl<'a> Parser<'a> { fn obj_entry(&mut self) -> Result<'a, (Term<&'a str>, Option>)> { let key = match self.i.next() { - Some(Token::Str(parts)) => Term::Str(None, self.str_parts(parts)), + Some(Token::Str(_, parts, _)) => Term::Str(None, self.str_parts(parts)), Some(Token::Word(k)) if k.starts_with('@') => match self.i.next() { - Some(Token::Str(parts)) => Term::Str(Some(*k), self.str_parts(parts)), + Some(Token::Str(_, parts, _)) => Term::Str(Some(*k), self.str_parts(parts)), next => return Err((Expect::Str, next)), }, Some(Token::Word(k)) if k.starts_with('$') => Term::Var(*k), @@ -465,7 +467,7 @@ impl<'a> Parser<'a> { fn bare_str(&mut self) -> Result<'a, &'a str> { match self.i.next() { - Some(Token::Str(parts)) => match parts[..] { + Some(Token::Str(_, parts, _)) => match parts[..] { [StrPart::Str(s)] => Ok(s), _ => todo!(), }, From 668bc1f79d718e7c3cba3e12eba7a79264064439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 4 Jul 2024 12:54:58 +0200 Subject: [PATCH 078/135] Report parse errors! 
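Parse errors are now rendered through the same `Report` machinery as lexer
errors. A condensed sketch of how `report_parse` turns an error pair
`(expected, found)` into a report (not a verbatim copy of the code added below):

    let message = format!("expected {}", expected.as_str()); // e.g. "expected term"
    let range = match found {
        None => code.len()..code.len(),  // parser ran out of tokens
        Some(token) => token.span(code), // span of the unexpected token
    };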
--- jaq-syn/src/parse.rs | 58 ++++++++++++++++++++++++++++---------------- jaq/src/main.rs | 33 +++++++++++++++++++------ 2 files changed, 62 insertions(+), 29 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 84e948f37..332adcd6e 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -2,12 +2,13 @@ use crate::lex::{StrPart, Token}; use crate::path; use alloc::{boxed::Box, vec::Vec}; -pub type Error<'a> = (Expect, Option<&'a Token<&'a str>>); +// TODO: save which token raised the expectation +pub type Error<'a> = (Expect<&'a str>, Option<&'a Token<&'a str>>); #[derive(Debug)] -pub enum Expect { - Keyword(&'static str), - Char(char), +pub enum Expect { + Keyword(S), + Char(S), Var, ElseOrEnd, Term, @@ -18,6 +19,22 @@ pub enum Expect { Nothing, } +impl<'a> Expect<&'a str> { + pub fn as_str(&self) -> &'a str { + match self { + Self::Keyword(s) | Self::Char(s) => s, + Self::Var => "variable", + Self::ElseOrEnd => "else or end", + Self::Term => "term", + Self::Key => "key", + Self::Ident => "ident", + Self::Arg => "argument", + Self::Str => "string", + Self::Nothing => "nothing", + } + } +} + type Result<'a, T> = core::result::Result>; pub struct Parser<'a> { @@ -75,14 +92,13 @@ impl<'a> Parser<'a> { } } - fn verify_last(&mut self, last: &'a str) -> Result<'a, ()> { - let last_char = || last.chars().next().unwrap(); + fn verify_last(&mut self, last: &'static str) -> Result<'a, ()> { match (self.i.as_slice(), last) { ([], "") => Ok(()), ([Token::Char(c)], last) if *c == last => Ok(()), - ([], _) => Err((Expect::Char(last_char()), None)), + ([], _) => Err((Expect::Char(last), None)), ([next, ..], "") => Err((Expect::Nothing, Some(next))), - ([next, ..], _) => Err((Expect::Char(last_char()), Some(next))), + ([next, ..], _) => Err((Expect::Char(last), Some(next))), } } @@ -94,7 +110,7 @@ impl<'a> Parser<'a> { y } - pub fn finish(&mut self, last: &'a str, f: F) -> T + pub fn finish(&mut self, last: &'static str, f: F) -> T where F: FnOnce(&mut Self) -> Result<'a, T>, { @@ -109,7 +125,7 @@ impl<'a> Parser<'a> { }) } - fn with(&mut self, tokens: &'a [Token<&'a str>], last: &'a str, f: F) -> T + fn with(&mut self, tokens: &'a [Token<&'a str>], last: &'static str, f: F) -> T where F: FnOnce(&mut Self) -> Result<'a, T>, { @@ -139,13 +155,13 @@ impl<'a> Parser<'a> { Ok(y) } - fn sep_by1(&mut self, sep: char, f: F) -> Result<'a, Vec> + fn sep_by1(&mut self, sep: &'static str, f: F) -> Result<'a, Vec> where F: Fn(&mut Self) -> Result<'a, T>, { let head = core::iter::once(f(self)); let tail = core::iter::from_fn(|| match self.i.next() { - Some(Token::Char(c)) if c.chars().eq([sep]) => Some(f(self)), + Some(Token::Char(c)) if *c == sep => Some(f(self)), Some(Token::Char(")" | "}")) => None, next => Some(Err((Expect::Char(sep), next))), }); @@ -154,7 +170,7 @@ impl<'a> Parser<'a> { fn args(&mut self, f: impl Fn(&mut Self) -> Result<'a, T> + Copy) -> Vec { self.maybe(|p| match p.i.next() { - Some(Token::Block("(", tokens)) => Some(p.with(tokens, "", |p| p.sep_by1(';', f))), + Some(Token::Block("(", tokens)) => Some(p.with(tokens, "", |p| p.sep_by1(";", f))), _ => None, }) .unwrap_or_default() @@ -179,13 +195,13 @@ impl<'a> Parser<'a> { fn terminated(&mut self, f: impl FnOnce(&mut Self) -> Result<'a, T>) -> Result<'a, T> { let y = f(self)?; - self.char1(';')?; + self.char1(";")?; Ok(y) } - fn char1(&mut self, c: char) -> Result<'a, &'a str> { + fn char1(&mut self, c: &'static str) -> Result<'a, &'a str> { match self.i.next() { - Some(Token::Char(s)) if s.chars().eq([c]) => 
Ok(*s), + Some(Token::Char(s)) if *s == c => Ok(*s), next => Err((Expect::Char(c), next)), } } @@ -207,7 +223,7 @@ impl<'a> Parser<'a> { fn pipe(&mut self) -> Result<'a, ()> { match self.i.next() { Some(Token::Op("|")) => Ok(()), - next => Err((Expect::Char('|'), next)), + next => Err((Expect::Char("|"), next)), } } @@ -321,7 +337,7 @@ impl<'a> Parser<'a> { Term::Arr(Some(Box::new(self.with(tokens, "]", Self::term)))) } Some(Token::Block("{", tokens)) => self.with(tokens, "", |p| { - p.sep_by1(',', Self::obj_entry).map(Term::Obj) + p.sep_by1(",", Self::obj_entry).map(Term::Obj) }), Some(Token::Str(_, parts, _)) => Term::Str(None, self.str_parts(parts)), next => return Err((Expect::Term, next)), @@ -357,7 +373,7 @@ impl<'a> Parser<'a> { } Some(Token::Block("(", tokens)) => { let k = self.with(tokens, ")", Self::term); - self.char1(':')?; + self.char1(":")?; return Ok((k, Some(self.term_with_comma(false)?))); } next => return Err((Expect::Key, next)), @@ -457,10 +473,10 @@ impl<'a> Parser<'a> { next => return Err((Expect::Arg, next)), }) }); - self.char1(':')?; + self.char1(":")?; let body = self.term()?; - self.char1(';')?; + self.char1(";")?; Ok(Def { name, args, body }) } diff --git a/jaq/src/main.rs b/jaq/src/main.rs index 553e5a5c2..a96f2c560 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -263,15 +263,18 @@ fn parse2(filter_str: &str) -> Result> { if lex_errs.is_empty() { let mut parser = jaq_syn::parse::Parser::new(&tokens); let main = parser.finish("", |p| p.module(|p| p.term())); - //std::println!("{:?}", main); - //std::println!("{:?}", parser.e); - let main = main.body.conv_main(filter_str); - Ok(main) + if parser.e.is_empty() { + //std::println!("{:?}", main); + let main = main.body.conv_main(filter_str); + Ok(main) + } else { + std::println!("{:?}", parser.e); + let errs = parser.e.into_iter(); + Err(errs.map(|e| report_parse(filter_str, e)).collect()) + } } else { - Err(lex_errs - .into_iter() - .map(|e| report_lex(filter_str, e)) - .collect()) + let errs = lex_errs.into_iter(); + Err(errs.map(|e| report_lex(filter_str, e)).collect()) } } @@ -669,6 +672,20 @@ fn report_lex(code: &str, (expected, found): jaq_syn::lex::Error<&str>) -> Repor } } +fn report_parse(code: &str, (expected, found): jaq_syn::parse::Error) -> Report { + let found_range = match found { + None => code.len()..code.len(), + Some(found) => found.span(code), + }; + let found = found.map_or("unexpected end of input", |_| "unexpected token"); + let found = [(found.to_string(), None)].into(); + + Report { + message: format!("expected {}", expected.as_str()), + labels: Vec::from([(found_range, found, Color::Red)]), + } +} + fn report<'a>(code: &'a str, e: &chumsky::error::Simple) -> Report { use chumsky::error::SimpleReason; From 7a68ede0d1c82cde2f1320c79b3021e9a9a736e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 4 Jul 2024 12:59:31 +0200 Subject: [PATCH 079/135] Flatten. 
--- jaq/src/main.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/jaq/src/main.rs b/jaq/src/main.rs index a96f2c560..43313d4ce 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -260,22 +260,22 @@ fn args_named(var_val: &[(String, Val)]) -> Val { fn parse2(filter_str: &str) -> Result> { let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(filter_str).lex(); - if lex_errs.is_empty() { - let mut parser = jaq_syn::parse::Parser::new(&tokens); - let main = parser.finish("", |p| p.module(|p| p.term())); - if parser.e.is_empty() { - //std::println!("{:?}", main); - let main = main.body.conv_main(filter_str); - Ok(main) - } else { - std::println!("{:?}", parser.e); - let errs = parser.e.into_iter(); - Err(errs.map(|e| report_parse(filter_str, e)).collect()) - } - } else { + if !lex_errs.is_empty() { let errs = lex_errs.into_iter(); - Err(errs.map(|e| report_lex(filter_str, e)).collect()) + return Err(errs.map(|e| report_lex(filter_str, e)).collect()); } + + let mut parser = jaq_syn::parse::Parser::new(&tokens); + let main = parser.finish("", |p| p.module(|p| p.term())); + if !parser.e.is_empty() { + std::println!("{:?}", parser.e); + let errs = parser.e.into_iter(); + return Err(errs.map(|e| report_parse(filter_str, e)).collect()); + } + + //std::println!("{:?}", main); + let main = main.body.conv_main(filter_str); + Ok(main) } fn parse(filter_str: &str, vars: Vec) -> Result> { From 5142761c82ac647208d333eb0a2d5a3f9e2816fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 09:45:08 +0200 Subject: [PATCH 080/135] Output compilation errors via new report infrastructure. --- jaq/src/main.rs | 147 ++++++++---------------------------------------- 1 file changed, 24 insertions(+), 123 deletions(-) diff --git a/jaq/src/main.rs b/jaq/src/main.rs index 43313d4ce..54343aae2 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -159,16 +159,9 @@ fn real_main(cli: &Cli) -> Result { }; let files: Vec<_> = args.collect(); - /* - let filter2 = match filter.clone() { - None => todo!(), - Some(filter) => parse2(&filter).map_err(|e| Error::Report(filter, e))?, - }; - */ - let filter = match filter { - Some(filter) => parse(&filter, vars)?, None => Filter::default(), + Some(filter_str) => parse(&filter_str, vars).map_err(|e| Error::Report(filter_str, e))?, }; //println!("Filter: {:?}", filter); @@ -258,7 +251,16 @@ fn args_named(var_val: &[(String, Val)]) -> Val { Val::obj(args.collect()) } -fn parse2(filter_str: &str) -> Result> { +fn parse_defs(std_str: &str) -> Vec { + let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(std_str).lex(); + assert!(lex_errs.is_empty()); + let mut parser = jaq_syn::parse::Parser::new(&tokens); + let std = parser.finish("", |p| p.module(|p| p.defs())); + assert!(parser.e.is_empty()); + std.body.iter().map(|def| def.conv(std_str)).collect() +} + +fn parse_term(filter_str: &str) -> Result> { let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(filter_str).lex(); if !lex_errs.is_empty() { let errs = lex_errs.into_iter(); @@ -278,42 +280,20 @@ fn parse2(filter_str: &str) -> Result> { Ok(main) } -fn parse(filter_str: &str, vars: Vec) -> Result> { - let mut defs = ParseCtx::new(vars); - defs.insert_natives(jaq_core::core()); - - let std_str = include_str!("../../jaq-std/src/std.jq"); - let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(std_str).lex(); - assert!(lex_errs.is_empty()); - let mut parser = jaq_syn::parse::Parser::new(&tokens); - let std = parser.finish("", |p| p.module(|p| 
p.defs())); - assert!(parser.e.is_empty()); - let std: Vec<_> = std.body.iter().map(|def| def.conv(std_str)).collect(); - defs.insert_defs(std); - - assert!(defs.errs.is_empty()); - let (filter, errs) = jaq_parse::parse(filter_str, jaq_parse::main()); - if !errs.is_empty() { - return Err(errs - .into_iter() - .map(|error| ParseError { - error, - filter: filter_str.to_owned(), - }) - .collect()); - } - let filter = defs.compile(filter.unwrap()); - if defs.errs.is_empty() { +fn parse(filter_str: &str, vars: Vec) -> Result> { + let mut ctx = ParseCtx::new(vars); + ctx.insert_natives(jaq_core::core()); + ctx.insert_defs(parse_defs(include_str!("../../jaq-std/src/std.jq"))); + let filter = parse_term(filter_str)?; + let filter = ctx.compile(filter); + if ctx.errs.is_empty() { Ok(filter) } else { - Err(defs - .errs - .into_iter() - .map(|error| ParseError { - error: chumsky::error::Simple::custom(error.1, error.0.to_string()), - filter: filter_str.to_owned(), - }) - .collect()) + let reports = ctx.errs.into_iter().map(|error| Report { + message: error.0.to_string(), + labels: Vec::from([(error.1, [(error.0.to_string(), None)].into(), Color::Red)]), + }); + Err(reports.collect()) } } @@ -395,17 +375,10 @@ fn collect_if<'a, T: 'a, E: 'a>( } } -#[derive(Debug)] -struct ParseError { - error: chumsky::error::Simple, - filter: String, -} - #[derive(Debug)] enum Error { Io(Option, io::Error), Report(String, Vec), - Chumsky(Vec), Parse(String), Jaq(jaq_interpret::Error), Persist(tempfile::PersistError), @@ -438,16 +411,6 @@ impl Termination for Error { } 3 } - Self::Chumsky(errs) => { - for e in errs { - let idx = codesnake::LineIndex::new(&e.filter); - let report = report(&e.filter, &e.error); - eprintln!("Error: {}", report.message); - let block = report.to_block(&idx); - eprintln!("{}\n{}{}", block.prologue(), block, block.epilogue()) - } - 3 - } Self::NoOutput => 4, Self::Parse(e) => { eprintln!("Error: failed to parse: {e}"); @@ -468,12 +431,6 @@ impl From for Error { } } -impl From> for Error { - fn from(e: Vec) -> Self { - Self::Chumsky(e) - } -} - /// Run a filter with given input values and run `f` for every value output. /// /// This function cannot return an `Iterator` because it creates an `RcIter`. 
@@ -686,62 +643,6 @@ fn report_parse(code: &str, (expected, found): jaq_syn::parse::Error) -> Report } } -fn report<'a>(code: &'a str, e: &chumsky::error::Simple) -> Report { - use chumsky::error::SimpleReason; - - let eof = || "end of input".to_string(); - - let message = if let SimpleReason::Custom(msg) = e.reason() { - msg.clone() - } else { - let found = if e.found().is_some() { - "Unexpected token" - } else { - "Unexpected end of input" - }; - let when = if let Some(label) = e.label() { - format!(" while parsing {label}") - } else { - String::new() - }; - let expected = if e.expected().len() == 0 { - "something else".to_string() - } else { - let f = |e: &Option| e.as_ref().map_or_else(eof, |e| e.to_string()); - e.expected().map(f).collect::>().join(", ") - }; - format!("{found}{when}, expected {expected}",) - }; - - let label = if let SimpleReason::Custom(msg) = e.reason() { - [(msg.clone(), None)].into() - } else { - match e.found() { - None => [("Unexpected end of input".to_string(), None)].into(), - Some(c) => [("Unexpected token ", None), (c, Some(Color::Red))] - .map(|(s, c)| (s.into(), c)) - .into(), - } - }; - // convert character indices to byte offsets - let char_to_byte = |i| { - code.char_indices() - .map(|(i, _c)| i) - .chain([code.len(), code.len()]) - .nth(i) - .unwrap() - }; - let conv = |span: &core::ops::Range<_>| char_to_byte(span.start)..char_to_byte(span.end); - let mut labels = Vec::from([(conv(&e.span()), label, Color::Red)]); - - if let SimpleReason::Unclosed { span, delimiter } = e.reason() { - let text = ("Unclosed delimiter ".to_string(), None); - let bla = (delimiter.to_string(), Some(Color::Yellow)); - labels.insert(0, (conv(span), [text, bla].into(), Color::Yellow)); - } - Report { message, labels } -} - type CodeBlock = codesnake::Block, String>; impl Report { @@ -767,7 +668,7 @@ fn run_test(test: jaq_syn::test::Test) -> Result<(Val, Val), Error> { let inputs = RcIter::new(Box::new(core::iter::empty())); let ctx = Ctx::new(Vec::new(), &inputs); - let filter = parse(&test.filter, Vec::new())?; + let filter = parse(&test.filter, Vec::new()).map_err(|e| Error::Report(test.filter, e))?; let json = |s: String| { use hifijson::token::Lex; From 2d9b3c03d8651e92a5d3fc715e51859f329a018b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 09:45:44 +0200 Subject: [PATCH 081/135] Remove chumsky and jaq_parse dependencies from jaq. 
--- Cargo.lock | 2 -- jaq/Cargo.toml | 2 -- 2 files changed, 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 631f20586..a23a794eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,14 +247,12 @@ name = "jaq" version = "1.5.0" dependencies = [ "atty", - "chumsky", "clap", "codesnake", "env_logger", "hifijson", "jaq-core", "jaq-interpret", - "jaq-parse", "jaq-syn", "memmap2", "mimalloc", diff --git a/jaq/Cargo.toml b/jaq/Cargo.toml index 046491d36..8cc1790af 100644 --- a/jaq/Cargo.toml +++ b/jaq/Cargo.toml @@ -16,11 +16,9 @@ default = ["mimalloc"] [dependencies] jaq-syn = { version = "1.1.0", path = "../jaq-syn" } -jaq-parse = { version = "1.0.0", path = "../jaq-parse" } jaq-interpret = { version = "1.2.0", path = "../jaq-interpret" } jaq-core = { version = "1.2.0", path = "../jaq-core" } atty = "0.2" -chumsky = { version = "0.9.0", default-features = false } codesnake = { version = "0.1" } clap = { version = "4.0.0", features = ["derive"] } env_logger = { version = "0.10.0", default-features = false } From c87c67893845ead502afb13f199f69287989ae48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 11:25:16 +0200 Subject: [PATCH 082/135] Remove unused imports. --- jaq-syn/src/convert.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs index 57f9d343f..1481ac7f6 100644 --- a/jaq-syn/src/convert.rs +++ b/jaq-syn/src/convert.rs @@ -1,8 +1,8 @@ use crate::filter::{AssignOp, BinaryOp, Filter, Fold, FoldType, KeyVal}; use crate::prec_climb::{self, Associativity}; -use crate::{parse, Arg, Call, Def, Main, MathOp, OrdOp, Path, Span, Spanned, Str}; +use crate::{parse, Arg, Call, Def, Main, MathOp, OrdOp, Span, Spanned}; use alloc::string::ToString; -use alloc::{boxed::Box, string::String, vec::Vec}; +use alloc::{boxed::Box, vec::Vec}; impl parse::Term<&str> { fn span(&self, code: &str) -> Span { From c4298a5f4f20d9998156f7311392aa59e060fd87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 11:25:46 +0200 Subject: [PATCH 083/135] Warn for unimplemented functionality. --- jaq-syn/src/convert.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs index 1481ac7f6..e31b858d3 100644 --- a/jaq-syn/src/convert.rs +++ b/jaq-syn/src/convert.rs @@ -101,7 +101,7 @@ impl parse::Term<&str> { }; let (init, update) = match &args[..] { [init, update] => (init, update), - _ => todo!(), + _ => unimplemented!("folding filters currently only take two arguments"), }; let fold = self::Fold { xs: span(xs), @@ -120,7 +120,7 @@ impl parse::Term<&str> { else_.as_deref().map(span), ), - Self::Def(defs, tm) => unimplemented!("definitions inside terms are not supported yet"), + Self::Def(_defs, _tm) => unimplemented!("definitions inside terms are not supported yet"), Self::Call(c, args) => Call(c.to_string(), args.iter().map(|a| *span(a)).collect()), Self::Var(v) => Var(v[1..].to_string()), From 91f69165f6ffc1d38d75ee68c739424dcd2faa3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 11:26:13 +0200 Subject: [PATCH 084/135] Document. --- jaq-syn/src/lex.rs | 2 ++ jaq-syn/src/parse.rs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index 198489beb..eb50566ad 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -1,3 +1,5 @@ +//! Lexing. 
+ use alloc::vec::Vec; #[derive(Debug)] diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 332adcd6e..4d266428e 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -1,3 +1,5 @@ +//! Parsing. + use crate::lex::{StrPart, Token}; use crate::path; use alloc::{boxed::Box, vec::Vec}; From fde7a833fcda9b89daab1fc036c796e15d0fbeac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 11:27:34 +0200 Subject: [PATCH 085/135] Nicer conversion. --- jaq-syn/src/convert.rs | 18 +++++++++++++++++- jaq-syn/src/parse.rs | 4 ++-- jaq/src/main.rs | 4 ++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs index e31b858d3..7ed2229ef 100644 --- a/jaq-syn/src/convert.rs +++ b/jaq-syn/src/convert.rs @@ -129,7 +129,7 @@ impl parse::Term<&str> { } } - pub fn conv_main(&self, s: &str) -> Main { + fn conv_main(&self, s: &str) -> Main { match self { parse::Term::Def(defs, tm) => Main { defs: defs.iter().map(|def| def.conv(s)).collect(), @@ -198,3 +198,19 @@ impl parse::Def<&str, parse::Term<&str>> { } } } + +impl parse::Module<&str, Vec>>> { + pub fn conv(&self, s: &str) -> Vec { + self.body.iter().map(|def| def.conv(s)).collect() + } +} + + +impl parse::Module<&str, parse::Term<&str>> { + pub fn conv(&self, s: &str) -> Main { + if !self.mods.is_empty() { + panic!("include / import is not supported yet"); + } + self.body.conv_main(s) + } +} diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 4d266428e..b1f80a1f0 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -536,8 +536,8 @@ impl<'a> Parser<'a> { #[derive(Debug, Default)] pub struct Module { meta: Option>, - mods: Vec<(S, Option)>, - pub body: B, + pub(crate) mods: Vec<(S, Option)>, + pub(crate) body: B, } #[derive(Debug)] diff --git a/jaq/src/main.rs b/jaq/src/main.rs index 54343aae2..04448bd8b 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -257,7 +257,7 @@ fn parse_defs(std_str: &str) -> Vec { let mut parser = jaq_syn::parse::Parser::new(&tokens); let std = parser.finish("", |p| p.module(|p| p.defs())); assert!(parser.e.is_empty()); - std.body.iter().map(|def| def.conv(std_str)).collect() + std.conv(std_str) } fn parse_term(filter_str: &str) -> Result> { @@ -276,7 +276,7 @@ fn parse_term(filter_str: &str) -> Result> { } //std::println!("{:?}", main); - let main = main.body.conv_main(filter_str); + let main = main.conv(filter_str); Ok(main) } From 02079a55a227f513c52b3079164ca9eec2b733d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 11:28:58 +0200 Subject: [PATCH 086/135] Document Term type. --- jaq-syn/src/parse.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index b1f80a1f0..61ccc9992 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -48,31 +48,47 @@ pub struct Parser<'a> { #[derive(Debug, Default)] pub enum Term { + /// Identity, i.e. `.` #[default] Id, + /// Recursion (`..`) Recurse, + /// Integer or floating-point number. Num(S), + /// String Str(Option, Vec>), + /// Array, empty if `None` Arr(Option>), + /// Object, specifying its key-value pairs Obj(Vec<(Self, Option)>), Neg(Box), Pipe(Box, Option, Box), + /// Sequence of binary operations, e.g. `1 + 2 - 3 * 4` BinOp(Box, Vec<(S, Self)>), + /// Control flow variable declaration, e.g. `label $x | ...` Label(S, Box), + /// Break out from control flow to location variable, e.g. 
`break $x` Break(S), + /// `reduce` and `foreach`, e.g. `reduce .[] as $x (0; .+$x)` Fold(S, Box, S, Vec), + /// `try` and optional `catch` TryCatch(Box, Option>), + /// If-then-else IfThenElse(Vec<(Self, Self)>, Option>), + /// Local definition Def(Vec>, Box), + /// Call to another filter, e.g. `map(.+1)` Call(S, Vec), + /// Variable, such as `$x` (including leading '$') Var(S), Key(S), + /// Path such as `.`, `.a`, `.[][]."b"` Path(Box, Vec<(path::Part, path::Opt)>), } From c7b106f691a9cf159d839406f7dfffeba19c11b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 11:48:44 +0200 Subject: [PATCH 087/135] Format. --- jaq-syn/src/convert.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs index 7ed2229ef..9b9887e3c 100644 --- a/jaq-syn/src/convert.rs +++ b/jaq-syn/src/convert.rs @@ -120,7 +120,9 @@ impl parse::Term<&str> { else_.as_deref().map(span), ), - Self::Def(_defs, _tm) => unimplemented!("definitions inside terms are not supported yet"), + Self::Def(_defs, _tm) => { + unimplemented!("definitions inside terms are not supported yet") + } Self::Call(c, args) => Call(c.to_string(), args.iter().map(|a| *span(a)).collect()), Self::Var(v) => Var(v[1..].to_string()), @@ -205,7 +207,6 @@ impl parse::Module<&str, Vec>>> { } } - impl parse::Module<&str, parse::Term<&str>> { pub fn conv(&self, s: &str) -> Main { if !self.mods.is_empty() { From 2007fefad13cb9d7b11ab6ea3970ff61c3f5a31a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 11:51:41 +0200 Subject: [PATCH 088/135] Nicer handling of leading key in path expression. --- jaq-syn/src/convert.rs | 1 - jaq-syn/src/parse.rs | 24 +++++++++++++++--------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs index 9b9887e3c..ee2e30cc1 100644 --- a/jaq-syn/src/convert.rs +++ b/jaq-syn/src/convert.rs @@ -126,7 +126,6 @@ impl parse::Term<&str> { Self::Call(c, args) => Call(c.to_string(), args.iter().map(|a| *span(a)).collect()), Self::Var(v) => Var(v[1..].to_string()), - Self::Key(s) => index_path((Str(Box::new(crate::Str::from(s.to_string()))), 0..42)), Self::Path(tm, path) => Path(span(tm), path.iter().map(from_part).collect()), } } diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 61ccc9992..6a5e04e4e 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -87,11 +87,16 @@ pub enum Term { /// Variable, such as `$x` (including leading '$') Var(S), - Key(S), /// Path such as `.`, `.a`, `.[][]."b"` Path(Box, Vec<(path::Part, path::Opt)>), } +impl Term { + fn str(s: S) -> Self { + Self::Str(None, [StrPart::Str(s)].into()) + } +} + /// Keywords that may not appear at the beginning of an expression. 
/// /// Note that for example `reduce` is not part of this list, @@ -339,9 +344,13 @@ impl<'a> Parser<'a> { Some(Token::Word(id)) if !KEYWORDS.contains(id) => { Term::Call(*id, self.args(Self::term)) } - Some(Token::Char(".")) => self - .maybe(|p| p.i.next().and_then(ident_key)) - .map_or(Term::Id, Term::Key), + Some(Token::Char(".")) => match self.maybe(|p| p.i.next().and_then(ident_key)) { + None => Term::Id, + Some(k) => Term::Path( + Box::new(Term::Id), + [(path::Part::Index(Term::str(k)), path::Opt::Essential)].into(), + ), + }, Some(Token::Char("..")) => Term::Recurse, Some(Token::Num(n)) => Term::Num(*n), Some(Token::Block("[", tokens)) if matches!(tokens[..], [Token::Char("]")]) => { @@ -386,9 +395,7 @@ impl<'a> Parser<'a> { next => return Err((Expect::Str, next)), }, Some(Token::Word(k)) if k.starts_with('$') => Term::Var(*k), - Some(Token::Word(k)) if !KEYWORDS.contains(k) => { - Term::Str(None, Vec::from([StrPart::Str(*k)])) - } + Some(Token::Word(k)) if !KEYWORDS.contains(k) => Term::str(*k), Some(Token::Block("(", tokens)) => { let k = self.with(tokens, ")", Self::term); self.char1(":")?; @@ -424,9 +431,8 @@ impl<'a> Parser<'a> { next => return Err((Expect::Key, next)), }; let opt = self.char0('?').is_some(); - let key = Term::Str(None, Vec::from([StrPart::Str(key)])); let opt = if opt { Opt::Optional } else { Opt::Essential }; - path.push((path::Part::Index(key), opt)); + path.push((path::Part::Index(Term::str(key)), opt)); path.extend(core::iter::from_fn(|| self.path_part_opt())); } Ok(path) From bfb52f1a37735146545361f7cc79b214def3fcba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 12:12:19 +0200 Subject: [PATCH 089/135] Remove duplicate check. --- jaq-syn/src/parse.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 6a5e04e4e..df09fbbcd 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -331,9 +331,7 @@ impl<'a> Parser<'a> { Some(Token::Word(id)) if id.starts_with('$') => Term::Var(*id), Some(Token::Word(id)) if id.starts_with('@') => { let s = self.maybe(|p| match p.i.next() { - Some(Token::Str(_, parts, _)) if id.starts_with('@') => { - Some(p.str_parts(parts)) - } + Some(Token::Str(_, parts, _)) => Some(p.str_parts(parts)), _ => None, }); match s { From d3f918953f8a1296b90319542be9b52f9e75bc67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 13:37:19 +0200 Subject: [PATCH 090/135] More robust handling of paths starting with a key. 
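A leading `.key` now goes through the same `key_opt()` / `path()` logic as keys
appearing later in a path, so filters such as `.`, `.a`, `.a?.b`, and `.a[0].b`
no longer need a special case for the first key. A rough sketch of the expected
result for an optional leading key (shape only, not an exact test from this series):

    let (tokens, errs) = jaq_syn::lex::Lexer::new(".a?.b").lex();
    assert!(errs.is_empty());
    let mut p = jaq_syn::parse::Parser::new(&tokens);
    let t = p.finish("", |p| p.term());
    // roughly: Term::Path(Id, [(Index("a"), Optional), (Index("b"), Essential)])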
--- jaq-syn/src/parse.rs | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index df09fbbcd..787bb9c35 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -342,13 +342,20 @@ impl<'a> Parser<'a> { Some(Token::Word(id)) if !KEYWORDS.contains(id) => { Term::Call(*id, self.args(Self::term)) } - Some(Token::Char(".")) => match self.maybe(|p| p.i.next().and_then(ident_key)) { - None => Term::Id, - Some(k) => Term::Path( - Box::new(Term::Id), - [(path::Part::Index(Term::str(k)), path::Opt::Essential)].into(), - ), - }, + Some(Token::Char(".")) => { + let key = self.maybe(|p| match p.i.next() { + Some(Token::Word(id)) if ident_key(id) => Some(*id), + next => None, + }); + let key = key.map(|key| (path::Part::Index(Term::str(key)), self.opt())); + + let path: Vec<_> = key.into_iter().chain(self.path()?).collect(); + if path.is_empty() { + Term::Id + } else { + Term::Path(Box::new(Term::Id), path) + } + } Some(Token::Char("..")) => Term::Recurse, Some(Token::Num(n)) => Term::Num(*n), Some(Token::Block("[", tokens)) if matches!(tokens[..], [Token::Char("]")]) => { @@ -423,14 +430,11 @@ impl<'a> Parser<'a> { fn path(&mut self) -> Result<'a, Vec<(path::Part>, path::Opt)>> { let mut path: Vec<_> = core::iter::from_fn(|| self.path_part_opt()).collect(); while self.char0('.').is_some() { - use path::Opt; let key = match self.i.next() { - Some(Token::Word(id)) if !id.starts_with(['$', '@']) => *id, + Some(Token::Word(id)) if ident_key(id) => *id, next => return Err((Expect::Key, next)), }; - let opt = self.char0('?').is_some(); - let opt = if opt { Opt::Optional } else { Opt::Essential }; - path.push((path::Part::Index(Term::str(key)), opt)); + path.push((path::Part::Index(Term::str(key)), self.opt())); path.extend(core::iter::from_fn(|| self.path_part_opt())); } Ok(path) @@ -568,9 +572,6 @@ pub struct Def { pub(crate) body: F, } -fn ident_key<'a>(token: &Token<&'a str>) -> Option<&'a str> { - match token { - Token::Word(id) if !id.starts_with(['$', '@']) && !KEYWORDS.contains(id) => Some(*id), - _ => None, - } +fn ident_key(id: &str) -> bool { + !id.starts_with(['$', '@']) && !KEYWORDS.contains(&id) } From 2d16f1aa7d096255c0b93d86461a9f619529e3e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 13:52:22 +0200 Subject: [PATCH 091/135] Document. --- jaq-syn/src/convert.rs | 4 +++- jaq-syn/src/lex.rs | 12 ++++++++++-- jaq-syn/src/parse.rs | 5 ++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs index ee2e30cc1..1aa7c933b 100644 --- a/jaq-syn/src/convert.rs +++ b/jaq-syn/src/convert.rs @@ -182,7 +182,7 @@ impl prec_climb::Expr for Spanned { } impl parse::Def<&str, parse::Term<&str>> { - pub fn conv(&self, s: &str) -> Def { + fn conv(&self, s: &str) -> Def { let args = self.args.iter().map(|arg| { if let Some(v) = arg.strip_prefix('$') { Arg::Var(v.to_string()) @@ -201,12 +201,14 @@ impl parse::Def<&str, parse::Term<&str>> { } impl parse::Module<&str, Vec>>> { + /// Convert a definitions module to a [`Def`] vector. pub fn conv(&self, s: &str) -> Vec { self.body.iter().map(|def| def.conv(s)).collect() } } impl parse::Module<&str, parse::Term<&str>> { + /// Convert a term module to a [`Main`]. 
pub fn conv(&self, s: &str) -> Main { if !self.mods.is_empty() { panic!("include / import is not supported yet"); diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index eb50566ad..e2051b190 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -71,20 +71,24 @@ impl<'a> Expect<&'a str> { } } +/// Lexer error, storing what we expected and what we got instead. pub type Error = (Expect, S); +/// Lexer for jq files. pub struct Lexer { i: S, e: Vec>, } impl<'a> Lexer<&'a str> { + /// Initialise a new lexer for the given input. #[must_use] pub fn new(i: &'a str) -> Self { let e = Vec::new(); Self { i, e } } + /// Lex, returning the resulting tokens and errors. #[must_use] pub fn lex(mut self) -> (Vec>, Vec>) { let tokens = self.tokens(); @@ -286,6 +290,7 @@ impl<'a> Lexer<&'a str> { } impl<'a> Token<&'a str> { + /// Return the span of a token that was lexed from some given input. pub fn span(&self, code: &str) -> crate::Span { match self { Self::Word(s) | Self::Char(s) | Self::Op(s) | Self::Num(s) => span(code, s), @@ -297,7 +302,10 @@ impl<'a> Token<&'a str> { } } -pub fn span(whole_buffer: &str, part: &str) -> crate::Span { - let start = part.as_ptr() as usize - whole_buffer.as_ptr() as usize; +/// Return the span of a string slice `part` relative to a string slice `whole`. +/// +/// The caller must ensure that `part` is fully contained inside `whole`. +pub fn span(whole: &str, part: &str) -> crate::Span { + let start = part.as_ptr() as usize - whole.as_ptr() as usize; start..start + part.len() } diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 787bb9c35..38a09a56f 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -46,6 +46,7 @@ pub struct Parser<'a> { fold: &'a [&'a str], } +/// Function from value to stream of values, such as `.[] | add / length`. #[derive(Debug, Default)] pub enum Term { /// Identity, i.e. `.` @@ -63,7 +64,9 @@ pub enum Term { /// Object, specifying its key-value pairs Obj(Vec<(Self, Option)>), + /// Negation Neg(Box), + /// Application, i.e. `l | r` if no string is given, else `l as $x | r` Pipe(Box, Option, Box), /// Sequence of binary operations, e.g. `1 + 2 - 3 * 4` BinOp(Box, Vec<(S, Self)>), @@ -345,7 +348,7 @@ impl<'a> Parser<'a> { Some(Token::Char(".")) => { let key = self.maybe(|p| match p.i.next() { Some(Token::Word(id)) if ident_key(id) => Some(*id), - next => None, + _ => None, }); let key = key.map(|key| (path::Part::Index(Term::str(key)), self.opt())); From 033ef57a1ab79afecd89d643d12b5282f71d2027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 15 Jul 2024 13:56:51 +0200 Subject: [PATCH 092/135] Remove unused function. 
--- jaq-syn/src/lex.rs | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index e2051b190..8d1b32922 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -51,24 +51,6 @@ impl<'a> Expect<&'a str> { Self::Token => "token", } } - - pub fn to_simple_error(&self, pos: &'a str, full: &'a str) -> (&'static str, crate::Span) { - let mut pos = span(full, pos); - pos.end = pos.start; - let s = match self { - Self::Digit => "expected digit", - Self::Ident => "expected identifier", - Self::Delim(start) => { - let mut start = span(full, start); - start.end = pos.start; - return ("unclosed delimiter", start); - } - Self::Escape => "expected string escape sequence", - Self::Unicode => "expected 4-digit hexadecimal UTF-8 code point", - Self::Token => "expected token", - }; - (s, pos) - } } /// Lexer error, storing what we expected and what we got instead. From 77aa7d00d724baa2555da9e7778dc073723d0575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 08:25:35 +0200 Subject: [PATCH 093/135] Unify logic for key followed by optionality. --- jaq-syn/src/parse.rs | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 38a09a56f..dbacf5840 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -346,13 +346,8 @@ impl<'a> Parser<'a> { Term::Call(*id, self.args(Self::term)) } Some(Token::Char(".")) => { - let key = self.maybe(|p| match p.i.next() { - Some(Token::Word(id)) if ident_key(id) => Some(*id), - _ => None, - }); - let key = key.map(|key| (path::Part::Index(Term::str(key)), self.opt())); - - let path: Vec<_> = key.into_iter().chain(self.path()?).collect(); + let key_opt = self.maybe(|p| p.key_opt().ok()); + let path: Vec<_> = key_opt.into_iter().chain(self.path()?).collect(); if path.is_empty() { Term::Id } else { @@ -433,11 +428,7 @@ impl<'a> Parser<'a> { fn path(&mut self) -> Result<'a, Vec<(path::Part>, path::Opt)>> { let mut path: Vec<_> = core::iter::from_fn(|| self.path_part_opt()).collect(); while self.char0('.').is_some() { - let key = match self.i.next() { - Some(Token::Word(id)) if ident_key(id) => *id, - next => return Err((Expect::Key, next)), - }; - path.push((path::Part::Index(Term::str(key)), self.opt())); + path.push(self.key_opt()?); path.extend(core::iter::from_fn(|| self.path_part_opt())); } Ok(path) @@ -472,6 +463,14 @@ impl<'a> Parser<'a> { Some((part, self.opt())) } + fn key_opt(&mut self) -> Result<'a, (path::Part>, path::Opt)> { + let key = match self.i.next() { + Some(Token::Word(id)) if !id.starts_with(['$', '@']) && !KEYWORDS.contains(id) => *id, + next => return Err((Expect::Key, next)), + }; + Ok((path::Part::Index(Term::str(key)), self.opt())) + } + fn opt(&mut self) -> path::Opt { let mut opt = path::Opt::Essential; while self.char0('?').is_some() { @@ -574,7 +573,3 @@ pub struct Def { /// Body of the filter, e.g. `[.[] | f]`. pub(crate) body: F, } - -fn ident_key(id: &str) -> bool { - !id.starts_with(['$', '@']) && !KEYWORDS.contains(&id) -} From 70b4ab450ffcb3d2042890c06c2199741fe6277f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 09:13:14 +0200 Subject: [PATCH 094/135] More streamlined lexing/parsing API. 
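`Lexer::lex` now returns a `Result` (tokens on success, the collected errors
otherwise) and the new `Parser::parse` consumes the parser, succeeding only if
no error was recorded. Both types are re-exported from the crate root, so the
whole pipeline (error conversion elided) reads:

    let tokens = jaq_syn::Lexer::new(filter_str).lex()?;
    let main = jaq_syn::Parser::new(&tokens).parse(|p| p.module(|p| p.term()))?;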
--- jaq-syn/src/lex.rs | 9 +++++++-- jaq-syn/src/lib.rs | 2 ++ jaq-syn/src/parse.rs | 20 ++++++++++++++++---- jaq/src/main.rs | 36 ++++++++++++++++-------------------- 4 files changed, 41 insertions(+), 26 deletions(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index 8d1b32922..d09b2cb70 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -72,13 +72,18 @@ impl<'a> Lexer<&'a str> { /// Lex, returning the resulting tokens and errors. #[must_use] - pub fn lex(mut self) -> (Vec>, Vec>) { + pub fn lex(mut self) -> Result>, Vec>> { let tokens = self.tokens(); self.space(); if !self.i.is_empty() { self.e.push((Expect::Token, self.i)); } - (tokens, self.e) + + if self.e.is_empty() { + Ok(tokens) + } else { + Err(self.e) + } } fn next(&mut self) -> Option { diff --git a/jaq-syn/src/lib.rs b/jaq-syn/src/lib.rs index 759c4f6f5..75483853b 100644 --- a/jaq-syn/src/lib.rs +++ b/jaq-syn/src/lib.rs @@ -18,7 +18,9 @@ pub mod parse; mod prec_climb; pub use def::{Arg, Call, Def, Main}; +pub use lex::Lexer; pub use ops::{MathOp, OrdOp}; +pub use parse::Parser; use path::Path; pub use string::Str; diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index dbacf5840..fb7402058 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -37,11 +37,11 @@ impl<'a> Expect<&'a str> { } } -type Result<'a, T> = core::result::Result>; +pub type Result<'a, T> = core::result::Result>; pub struct Parser<'a> { i: core::slice::Iter<'a, Token<&'a str>>, - pub e: Vec>, + e: Vec>, /// names of fold-like filters, e.g. "reduce" and "foreach" fold: &'a [&'a str], } @@ -118,6 +118,18 @@ impl<'a> Parser<'a> { } } + pub fn parse(mut self, f: F) -> core::result::Result>> + where + F: FnOnce(&mut Self) -> Result<'a, T>, + { + let y = self.finish("", f); + if self.e.is_empty() { + Ok(y) + } else { + Err(self.e) + } + } + fn verify_last(&mut self, last: &'static str) -> Result<'a, ()> { match (self.i.as_slice(), last) { ([], "") => Ok(()), @@ -136,7 +148,7 @@ impl<'a> Parser<'a> { y } - pub fn finish(&mut self, last: &'static str, f: F) -> T + fn finish(&mut self, last: &'static str, f: F) -> T where F: FnOnce(&mut Self) -> Result<'a, T>, { @@ -253,7 +265,7 @@ impl<'a> Parser<'a> { } } - pub fn term_with_comma(&mut self, with_comma: bool) -> Result<'a, Term<&'a str>> { + fn term_with_comma(&mut self, with_comma: bool) -> Result<'a, Term<&'a str>> { let head = self.atom()?; let tail = core::iter::from_fn(|| self.op(with_comma).map(|op| Ok((op, self.atom()?)))) .collect::>>()?; diff --git a/jaq/src/main.rs b/jaq/src/main.rs index c1e5f810c..3f43cb701 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -258,32 +258,28 @@ fn args_named(var_val: &[(String, Val)]) -> Val { } fn parse_defs(std_str: &str) -> Vec { - let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(std_str).lex(); - assert!(lex_errs.is_empty()); - let mut parser = jaq_syn::parse::Parser::new(&tokens); - let std = parser.finish("", |p| p.module(|p| p.defs())); - assert!(parser.e.is_empty()); - std.conv(std_str) + let tokens = jaq_syn::Lexer::new(std_str).lex().unwrap(); + let std = jaq_syn::Parser::new(&tokens).parse(|p| p.module(|p| p.defs())); + std.unwrap().conv(std_str) } fn parse_term(filter_str: &str) -> Result> { - let (tokens, lex_errs) = jaq_syn::lex::Lexer::new(filter_str).lex(); - if !lex_errs.is_empty() { - let errs = lex_errs.into_iter(); - return Err(errs.map(|e| report_lex(filter_str, e)).collect()); - } + let tokens = jaq_syn::Lexer::new(filter_str).lex().map_err(|errs| { + errs.into_iter() + .map(|e| report_lex(filter_str, e)) + 
.collect::>() + })?; - let mut parser = jaq_syn::parse::Parser::new(&tokens); - let main = parser.finish("", |p| p.module(|p| p.term())); - if !parser.e.is_empty() { - std::println!("{:?}", parser.e); - let errs = parser.e.into_iter(); - return Err(errs.map(|e| report_parse(filter_str, e)).collect()); - } + let main = jaq_syn::Parser::new(&tokens).parse(|p| p.module(|p| p.term())); + let main = main.map_err(|errs| { + //std::println!("{:?}", errs); + errs.into_iter() + .map(|e| report_parse(filter_str, e)) + .collect::>() + })?; //std::println!("{:?}", main); - let main = main.conv(filter_str); - Ok(main) + Ok(main.conv(filter_str)) } fn parse(filter_str: &str, vars: Vec) -> Result> { From c872e4e398ce884ebc29ac97fe5bd239770fa374 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 09:29:29 +0200 Subject: [PATCH 095/135] Document. --- jaq-syn/src/parse.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index fb7402058..5b0edd1d0 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -4,9 +4,10 @@ use crate::lex::{StrPart, Token}; use crate::path; use alloc::{boxed::Box, vec::Vec}; -// TODO: save which token raised the expectation +/// Parse error, storing what we expected and what we got instead. pub type Error<'a> = (Expect<&'a str>, Option<&'a Token<&'a str>>); +/// Type of token that we expected. #[derive(Debug)] pub enum Expect { Keyword(S), @@ -22,6 +23,7 @@ pub enum Expect { } impl<'a> Expect<&'a str> { + /// String representation of an expected token. pub fn as_str(&self) -> &'a str { match self { Self::Keyword(s) | Self::Char(s) => s, @@ -37,8 +39,10 @@ impl<'a> Expect<&'a str> { } } +/// Output of a fallible parsing operation. pub type Result<'a, T> = core::result::Result>; +/// Parser for jq programs. pub struct Parser<'a> { i: core::slice::Iter<'a, Token<&'a str>>, e: Vec>, @@ -109,15 +113,19 @@ const KEYWORDS: &[&str] = &[ ]; impl<'a> Parser<'a> { + /// Initialise a new parser on a sequence of [`Token`]s. #[must_use] pub fn new(i: &'a [Token<&'a str>]) -> Self { Self { i: i.iter(), e: Vec::new(), - fold: &["reduce", "foreach"], + fold: &["reduce", "foreach", "for"], } } + /// Parse tokens with the given function. + /// + /// Returns [`Ok`] if the function consumes the whole output without producing any error. pub fn parse(mut self, f: F) -> core::result::Result>> where F: FnOnce(&mut Self) -> Result<'a, T>, @@ -398,6 +406,7 @@ impl<'a> Parser<'a> { }) } + /// Parse a term such as `.[] | .+1`. pub fn term(&mut self) -> Result<'a, Term<&'a str>> { self.term_with_comma(true) } @@ -491,6 +500,7 @@ impl<'a> Parser<'a> { opt } + /// Parse a sequence of definitions, such as `def x: 1; def y: 2;`. pub fn defs(&mut self) -> Result<'a, Vec>>> { core::iter::from_fn(|| self.def_head().map(|()| self.def_tail())).collect() } @@ -545,6 +555,7 @@ impl<'a> Parser<'a> { Ok((path, Some(name))) } + /// Parse a module with a body returned by the given function. pub fn module(&mut self, f: F) -> Result<'a, Module<&'a str, B>> where F: FnOnce(&mut Self) -> Result<'a, B>, From 00fd324108d96b5006add039e441deeba5a33046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 16:23:02 +0200 Subject: [PATCH 096/135] Distinguish string lifetime from token lifetime. 
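`Parser<'s, 't>` now separates the lifetime of the source string (`'s`) from the
lifetime of the token slice (`'t`). This is what lets the new top-level helper in
lib.rs lex into a temporary token vector and still return output that only
borrows from the input string:

    // convenience wrapper added in this patch; None if lexing or parsing failed
    let main = jaq_syn::parse(filter_str, |p| p.module(|p| p.term()));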
--- jaq-syn/src/lib.rs | 8 ++++ jaq-syn/src/parse.rs | 91 +++++++++++++++++++++++--------------------- jaq/src/main.rs | 6 +-- 3 files changed, 58 insertions(+), 47 deletions(-) diff --git a/jaq-syn/src/lib.rs b/jaq-syn/src/lib.rs index 75483853b..1f2f71a4b 100644 --- a/jaq-syn/src/lib.rs +++ b/jaq-syn/src/lib.rs @@ -29,3 +29,11 @@ pub type Span = core::ops::Range; /// An object with position information. pub type Spanned = (T, Span); + +/// Lex a string and parse resulting tokens, returning [`None`] if any error occurred. +pub fn parse<'s, T: Default, F>(s: &'s str, f: F) -> Option +where + F: for<'t> FnOnce(&mut Parser<'s, 't>) -> parse::Result<'s, 't, T>, +{ + Parser::new(&Lexer::new(s).lex().ok()?).parse(f).ok() +} diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 5b0edd1d0..32ecad3ba 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -5,7 +5,7 @@ use crate::path; use alloc::{boxed::Box, vec::Vec}; /// Parse error, storing what we expected and what we got instead. -pub type Error<'a> = (Expect<&'a str>, Option<&'a Token<&'a str>>); +pub type Error<'s, 't> = (Expect<&'s str>, Option<&'t Token<&'s str>>); /// Type of token that we expected. #[derive(Debug)] @@ -40,14 +40,14 @@ impl<'a> Expect<&'a str> { } /// Output of a fallible parsing operation. -pub type Result<'a, T> = core::result::Result>; +pub type Result<'s, 't, T> = core::result::Result>; /// Parser for jq programs. -pub struct Parser<'a> { - i: core::slice::Iter<'a, Token<&'a str>>, - e: Vec>, +pub struct Parser<'s, 't> { + i: core::slice::Iter<'t, Token<&'s str>>, + e: Vec>, /// names of fold-like filters, e.g. "reduce" and "foreach" - fold: &'a [&'a str], + fold: &'s [&'s str], } /// Function from value to stream of values, such as `.[] | add / length`. @@ -112,10 +112,10 @@ const KEYWORDS: &[&str] = &[ "include", "import", "def", "as", "and", "or", "catch", "then", "elif", "else", "end", ]; -impl<'a> Parser<'a> { +impl<'s, 't> Parser<'s, 't> { /// Initialise a new parser on a sequence of [`Token`]s. #[must_use] - pub fn new(i: &'a [Token<&'a str>]) -> Self { + pub fn new(i: &'t [Token<&'s str>]) -> Self { Self { i: i.iter(), e: Vec::new(), @@ -126,9 +126,9 @@ impl<'a> Parser<'a> { /// Parse tokens with the given function. /// /// Returns [`Ok`] if the function consumes the whole output without producing any error. - pub fn parse(mut self, f: F) -> core::result::Result>> + pub fn parse(mut self, f: F) -> core::result::Result>> where - F: FnOnce(&mut Self) -> Result<'a, T>, + F: FnOnce(&mut Self) -> Result<'s, 't, T>, { let y = self.finish("", f); if self.e.is_empty() { @@ -138,7 +138,7 @@ impl<'a> Parser<'a> { } } - fn verify_last(&mut self, last: &'static str) -> Result<'a, ()> { + fn verify_last(&mut self, last: &'static str) -> Result<'s, 't, ()> { match (self.i.as_slice(), last) { ([], "") => Ok(()), ([Token::Char(c)], last) if *c == last => Ok(()), @@ -149,7 +149,7 @@ impl<'a> Parser<'a> { } /// Run given parse function with given tokens, then reset tokens to previous tokens. 
- fn with_tok(&mut self, tokens: &'a [Token<&'a str>], f: impl FnOnce(&mut Self) -> T) -> T { + fn with_tok(&mut self, tokens: &'t [Token<&'s str>], f: impl FnOnce(&mut Self) -> T) -> T { let i = core::mem::replace(&mut self.i, tokens.iter()); let y = f(self); self.i = i; @@ -158,7 +158,7 @@ impl<'a> Parser<'a> { fn finish(&mut self, last: &'static str, f: F) -> T where - F: FnOnce(&mut Self) -> Result<'a, T>, + F: FnOnce(&mut Self) -> Result<'s, 't, T>, { f(self) .and_then(|y| { @@ -171,9 +171,9 @@ impl<'a> Parser<'a> { }) } - fn with(&mut self, tokens: &'a [Token<&'a str>], last: &'static str, f: F) -> T + fn with(&mut self, tokens: &'t [Token<&'s str>], last: &'static str, f: F) -> T where - F: FnOnce(&mut Self) -> Result<'a, T>, + F: FnOnce(&mut Self) -> Result<'s, 't, T>, { self.with_tok(tokens, |p| p.finish(last, f)) } @@ -188,9 +188,9 @@ impl<'a> Parser<'a> { y } - fn try_maybe(&mut self, f: F) -> Result<'a, Option> + fn try_maybe(&mut self, f: F) -> Result<'s, 't, Option> where - F: Fn(&mut Self) -> Result<'a, Option>, + F: Fn(&mut Self) -> Result<'s, 't, Option>, { let i = self.i.clone(); let y = f(self)?; @@ -201,9 +201,9 @@ impl<'a> Parser<'a> { Ok(y) } - fn sep_by1(&mut self, sep: &'static str, f: F) -> Result<'a, Vec> + fn sep_by1(&mut self, sep: &'static str, f: F) -> Result<'s, 't, Vec> where - F: Fn(&mut Self) -> Result<'a, T>, + F: Fn(&mut Self) -> Result<'s, 't, T>, { let head = core::iter::once(f(self)); let tail = core::iter::from_fn(|| match self.i.next() { @@ -214,7 +214,7 @@ impl<'a> Parser<'a> { head.chain(tail).collect() } - fn args(&mut self, f: impl Fn(&mut Self) -> Result<'a, T> + Copy) -> Vec { + fn args(&mut self, f: fn(&mut Self) -> Result<'s, 't, T>) -> Vec { self.maybe(|p| match p.i.next() { Some(Token::Block("(", tokens)) => Some(p.with(tokens, "", |p| p.sep_by1(";", f))), _ => None, @@ -222,7 +222,7 @@ impl<'a> Parser<'a> { .unwrap_or_default() } - fn op(&mut self, with_comma: bool) -> Option<&'a str> { + fn op(&mut self, with_comma: bool) -> Option<&'s str> { self.maybe(|p| match p.i.next() { // handle pipe directly in `term()` Some(Token::Op("|")) => None, @@ -232,48 +232,51 @@ impl<'a> Parser<'a> { }) } - fn char0(&mut self, c: char) -> Option<&'a str> { + fn char0(&mut self, c: char) -> Option<&'s str> { self.maybe(|p| match p.i.next() { Some(Token::Char(s)) if s.chars().eq([c]) => Some(*s), _ => None, }) } - fn terminated(&mut self, f: impl FnOnce(&mut Self) -> Result<'a, T>) -> Result<'a, T> { + fn terminated(&mut self, f: F) -> Result<'s, 't, T> + where + F: FnOnce(&mut Self) -> Result<'s, 't, T>, + { let y = f(self)?; self.char1(";")?; Ok(y) } - fn char1(&mut self, c: &'static str) -> Result<'a, &'a str> { + fn char1(&mut self, c: &'static str) -> Result<'s, 't, &'s str> { match self.i.next() { Some(Token::Char(s)) if *s == c => Ok(*s), next => Err((Expect::Char(c), next)), } } - fn keyword(&mut self, kw: &'static str) -> Result<'a, ()> { + fn keyword(&mut self, kw: &'static str) -> Result<'s, 't, ()> { match self.i.next() { Some(Token::Word(w)) if *w == kw => Ok(()), next => Err((Expect::Keyword(kw), next)), } } - fn var(&mut self) -> Result<'a, &'a str> { + fn var(&mut self) -> Result<'s, 't, &'s str> { match self.i.next() { Some(Token::Word(x)) if x.starts_with('$') => Ok(*x), next => Err((Expect::Var, next)), } } - fn pipe(&mut self) -> Result<'a, ()> { + fn pipe(&mut self) -> Result<'s, 't, ()> { match self.i.next() { Some(Token::Op("|")) => Ok(()), next => Err((Expect::Char("|"), next)), } } - fn term_with_comma(&mut self, with_comma: bool) 
-> Result<'a, Term<&'a str>> { + fn term_with_comma(&mut self, with_comma: bool) -> Result<'s, 't, Term<&'s str>> { let head = self.atom()?; let tail = core::iter::from_fn(|| self.op(with_comma).map(|op| Ok((op, self.atom()?)))) .collect::>>()?; @@ -299,7 +302,7 @@ impl<'a> Parser<'a> { }) } - fn atom(&mut self) -> Result<'a, Term<&'a str>> { + fn atom(&mut self) -> Result<'s, 't, Term<&'s str>> { let tm = match self.i.next() { Some(Token::Op("-")) => Term::Neg(Box::new(self.atom()?)), Some(Token::Word("def")) => { @@ -407,11 +410,11 @@ impl<'a> Parser<'a> { } /// Parse a term such as `.[] | .+1`. - pub fn term(&mut self) -> Result<'a, Term<&'a str>> { + pub fn term(&mut self) -> Result<'s, 't, Term<&'s str>> { self.term_with_comma(true) } - fn obj_entry(&mut self) -> Result<'a, (Term<&'a str>, Option>)> { + fn obj_entry(&mut self) -> Result<'s, 't, (Term<&'s str>, Option>)> { let key = match self.i.next() { Some(Token::Str(_, parts, _)) => Term::Str(None, self.str_parts(parts)), Some(Token::Word(k)) if k.starts_with('@') => match self.i.next() { @@ -433,8 +436,8 @@ impl<'a> Parser<'a> { fn str_parts( &mut self, - parts: &'a [StrPart<&'a str, Token<&'a str>>], - ) -> Vec>> { + parts: &'t [StrPart<&'s str, Token<&'s str>>], + ) -> Vec>> { let parts = parts.iter().map(|part| match part { StrPart::Str(s) => StrPart::Str(*s), StrPart::Filter(Token::Block("(", tokens)) => { @@ -446,7 +449,7 @@ impl<'a> Parser<'a> { parts.collect() } - fn path(&mut self) -> Result<'a, Vec<(path::Part>, path::Opt)>> { + fn path(&mut self) -> Result<'s, 't, Vec<(path::Part>, path::Opt)>> { let mut path: Vec<_> = core::iter::from_fn(|| self.path_part_opt()).collect(); while self.char0('.').is_some() { path.push(self.key_opt()?); @@ -455,7 +458,7 @@ impl<'a> Parser<'a> { Ok(path) } - fn path_part(&mut self) -> Result<'a, path::Part>> { + fn path_part(&mut self) -> Result<'s, 't, path::Part>> { use path::Part::{Index, Range}; let done = |p: &Self| matches!(p.i.as_slice(), [Token::Char("]")]); Ok(if done(self) { @@ -476,7 +479,7 @@ impl<'a> Parser<'a> { }) } - fn path_part_opt(&mut self) -> Option<(path::Part>, path::Opt)> { + fn path_part_opt(&mut self) -> Option<(path::Part>, path::Opt)> { let part = self.maybe(|p| match p.i.next() { Some(Token::Block("[", tokens)) => Some(p.with(tokens, "]", Self::path_part)), _ => None, @@ -484,7 +487,7 @@ impl<'a> Parser<'a> { Some((part, self.opt())) } - fn key_opt(&mut self) -> Result<'a, (path::Part>, path::Opt)> { + fn key_opt(&mut self) -> Result<'s, 't, (path::Part>, path::Opt)> { let key = match self.i.next() { Some(Token::Word(id)) if !id.starts_with(['$', '@']) && !KEYWORDS.contains(id) => *id, next => return Err((Expect::Key, next)), @@ -501,7 +504,7 @@ impl<'a> Parser<'a> { } /// Parse a sequence of definitions, such as `def x: 1; def y: 2;`. 
- pub fn defs(&mut self) -> Result<'a, Vec>>> { + pub fn defs(&mut self) -> Result<'s, 't, Vec>>> { core::iter::from_fn(|| self.def_head().map(|()| self.def_tail())).collect() } @@ -512,7 +515,7 @@ impl<'a> Parser<'a> { }) } - fn def_tail(&mut self) -> Result<'a, Def<&'a str, Term<&'a str>>> { + fn def_tail(&mut self) -> Result<'s, 't, Def<&'s str, Term<&'s str>>> { let name = match self.i.next() { Some(Token::Word(name)) if !name.starts_with(['$']) => name, next => return Err((Expect::Ident, next)), @@ -531,7 +534,7 @@ impl<'a> Parser<'a> { Ok(Def { name, args, body }) } - fn bare_str(&mut self) -> Result<'a, &'a str> { + fn bare_str(&mut self) -> Result<'s, 't, &'s str> { match self.i.next() { Some(Token::Str(_, parts, _)) => match parts[..] { [StrPart::Str(s)] => Ok(s), @@ -541,11 +544,11 @@ impl<'a> Parser<'a> { } } - fn include(&mut self) -> Result<'a, (&'a str, Option<&'a str>)> { + fn include(&mut self) -> Result<'s, 't, (&'s str, Option<&'s str>)> { self.bare_str().map(|path| (path, None)) } - fn import(&mut self) -> Result<'a, (&'a str, Option<&'a str>)> { + fn import(&mut self) -> Result<'s, 't, (&'s str, Option<&'s str>)> { let path = self.bare_str()?; self.keyword("as")?; let name = match self.i.next() { @@ -556,9 +559,9 @@ impl<'a> Parser<'a> { } /// Parse a module with a body returned by the given function. - pub fn module(&mut self, f: F) -> Result<'a, Module<&'a str, B>> + pub fn module(&mut self, f: F) -> Result<'s, 't, Module<&'s str, B>> where - F: FnOnce(&mut Self) -> Result<'a, B>, + F: FnOnce(&mut Self) -> Result<'s, 't, B>, { let meta = self .maybe(|p| match p.i.next() { diff --git a/jaq/src/main.rs b/jaq/src/main.rs index 3f43cb701..4d4084f46 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -258,9 +258,9 @@ fn args_named(var_val: &[(String, Val)]) -> Val { } fn parse_defs(std_str: &str) -> Vec { - let tokens = jaq_syn::Lexer::new(std_str).lex().unwrap(); - let std = jaq_syn::Parser::new(&tokens).parse(|p| p.module(|p| p.defs())); - std.unwrap().conv(std_str) + jaq_syn::parse(std_str, |p| p.module(|p| p.defs())) + .unwrap() + .conv(std_str) } fn parse_term(filter_str: &str) -> Result> { From bbf2425ce1fce2def9c34f72d527f5160f22e06f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 17:35:37 +0200 Subject: [PATCH 097/135] Correct a few mistakes in the parser. --- jaq-syn/src/parse.rs | 53 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 32ecad3ba..1b4f64a30 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -14,6 +14,8 @@ pub enum Expect { Char(S), Var, ElseOrEnd, + CommaOrRBrace, + SemicolonOrRParen, Term, Key, Ident, @@ -29,6 +31,8 @@ impl<'a> Expect<&'a str> { Self::Keyword(s) | Self::Char(s) => s, Self::Var => "variable", Self::ElseOrEnd => "else or end", + Self::CommaOrRBrace => "comma or right brace", + Self::SemicolonOrRParen => "semicolon or right parenthesis", Self::Term => "term", Self::Key => "key", Self::Ident => "ident", @@ -201,22 +205,43 @@ impl<'s, 't> Parser<'s, 't> { Ok(y) } - fn sep_by1(&mut self, sep: &'static str, f: F) -> Result<'s, 't, Vec> + /// Parse sequence of shape `f ("," f)* ","? "}"`. 
+ fn obj_items(&mut self, f: F) -> Result<'s, 't, Vec> where F: Fn(&mut Self) -> Result<'s, 't, T>, { - let head = core::iter::once(f(self)); - let tail = core::iter::from_fn(|| match self.i.next() { - Some(Token::Char(c)) if *c == sep => Some(f(self)), - Some(Token::Char(")" | "}")) => None, - next => Some(Err((Expect::Char(sep), next))), - }); - head.chain(tail).collect() + let mut y = Vec::from([f(self)?]); + let rbrace = |p: &mut Self| p.i.next().filter(|tk| matches!(tk, Token::Char("}"))); + loop { + match self.i.next() { + Some(Token::Char("}")) => break, + Some(Token::Char(",")) if self.maybe(rbrace).is_some() => break, + Some(Token::Char(",")) => y.push(f(self)?), + next => return Err((Expect::CommaOrRBrace, next)), + } + } + Ok(y) + } + + /// Parse sequence of shape `f (";" f)* ")"`. + fn arg_items(&mut self, f: F) -> Result<'s, 't, Vec> + where + F: Fn(&mut Self) -> Result<'s, 't, T>, + { + let mut y = Vec::from([f(self)?]); + loop { + match self.i.next() { + Some(Token::Char(";")) => y.push(f(self)?), + Some(Token::Char(")")) => break, + next => return Err((Expect::SemicolonOrRParen, next)), + } + } + Ok(y) } fn args(&mut self, f: fn(&mut Self) -> Result<'s, 't, T>) -> Vec { self.maybe(|p| match p.i.next() { - Some(Token::Block("(", tokens)) => Some(p.with(tokens, "", |p| p.sep_by1(";", f))), + Some(Token::Block("(", tokens)) => Some(p.with(tokens, "", |p| p.arg_items(f))), _ => None, }) .unwrap_or_default() @@ -390,7 +415,7 @@ impl<'s, 't> Parser<'s, 't> { Term::Arr(Some(Box::new(self.with(tokens, "]", Self::term)))) } Some(Token::Block("{", tokens)) => self.with(tokens, "", |p| { - p.sep_by1(",", Self::obj_entry).map(Term::Obj) + p.obj_items(Self::obj_entry).map(Term::Obj) }), Some(Token::Str(_, parts, _)) => Term::Str(None, self.str_parts(parts)), next => return Err((Expect::Term, next)), @@ -489,10 +514,14 @@ impl<'s, 't> Parser<'s, 't> { fn key_opt(&mut self) -> Result<'s, 't, (path::Part>, path::Opt)> { let key = match self.i.next() { - Some(Token::Word(id)) if !id.starts_with(['$', '@']) && !KEYWORDS.contains(id) => *id, + Some(Token::Word(id)) if id.starts_with('@') => todo!(), + Some(Token::Str(_, parts, _)) => Term::Str(None, self.str_parts(parts)), + Some(Token::Word(id)) if !id.starts_with('$') && !KEYWORDS.contains(id) => { + Term::str(*id) + } next => return Err((Expect::Key, next)), }; - Ok((path::Part::Index(Term::str(key)), self.opt())) + Ok((path::Part::Index(key), self.opt())) } fn opt(&mut self) -> path::Opt { From aa165a684dd092bf10c88dbab716f0e476cc3d54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 17:36:15 +0200 Subject: [PATCH 098/135] Remove invalid test. 
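The removed assertion relied on `.a.[]` being accepted. With the new parser, a `.` inside a path must be followed by a key (an identifier or a string), so this spelling is rejected; iteration over `.a` is written without the extra dot. Illustrative equivalents that the parser does accept:

~~~ jq
.a[]        # iterate over the value under "a"
.a | .[]    # the same, with an explicit pipe
~~~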
--- jaq-interpret/tests/path.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/jaq-interpret/tests/path.rs b/jaq-interpret/tests/path.rs index ff5b53ecc..c786b15f6 100644 --- a/jaq-interpret/tests/path.rs +++ b/jaq-interpret/tests/path.rs @@ -42,7 +42,6 @@ fn index_access() { fn iter_access() { gives(json!([0, 1, 2]), ".[]", [json!(0), json!(1), json!(2)]); gives(json!({"a": [1, 2]}), ".a[]", [json!(1), json!(2)]); - gives(json!({"a": [1, 2]}), ".a.[]", [json!(1), json!(2)]); gives(json!({"a": 1, "b": 2}), ".[]", [json!(1), json!(2)]); // TODO: correct this //gives(json!({"b": 2, "a": 1}), ".[]", [json!(2), json!(1)]); From 324ea2849cb3609bd63e0a5998dd899323de8378 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 17:39:43 +0200 Subject: [PATCH 099/135] Remove jaq-parse from jaq-interpret. --- jaq-interpret/Cargo.toml | 3 --- jaq-interpret/src/lib.rs | 5 ++--- jaq-interpret/tests/common/mod.rs | 7 ++++--- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/jaq-interpret/Cargo.toml b/jaq-interpret/Cargo.toml index 450ec7d68..6a2baeb8e 100644 --- a/jaq-interpret/Cargo.toml +++ b/jaq-interpret/Cargo.toml @@ -23,6 +23,3 @@ hifijson = { version = "0.2.0", optional = true } indexmap = "2.0" once_cell = "1.16.0" serde_json = { version = "1.0.81", optional = true } - -[dev-dependencies] -jaq-parse = { version = "1.0.0", path = "../jaq-parse" } diff --git a/jaq-interpret/src/lib.rs b/jaq-interpret/src/lib.rs index 5dd2b7d35..46d5107a3 100644 --- a/jaq-interpret/src/lib.rs +++ b/jaq-interpret/src/lib.rs @@ -22,11 +22,10 @@ //! let mut defs = ParseCtx::new(Vec::new()); //! //! // parse the filter -//! let (f, errs) = jaq_parse::parse(filter, jaq_parse::main()); -//! assert_eq!(errs, Vec::new()); +//! let f = jaq_syn::parse(filter, |p| p.module(|p| p.term())).unwrap().conv(filter); //! //! // compile the filter in the context of the given definitions -//! let f = defs.compile(f.unwrap()); +//! let f = defs.compile(f); //! assert!(defs.errs.is_empty()); //! //! let inputs = RcIter::new(core::iter::empty()); diff --git a/jaq-interpret/tests/common/mod.rs b/jaq-interpret/tests/common/mod.rs index 20d0b92a1..12a49fc3d 100644 --- a/jaq-interpret/tests/common/mod.rs +++ b/jaq-interpret/tests/common/mod.rs @@ -2,9 +2,10 @@ use serde_json::Value; fn yields(x: jaq_interpret::Val, f: &str, ys: impl Iterator) { let mut ctx = jaq_interpret::ParseCtx::new(Vec::new()); - let (f, errs) = jaq_parse::parse(f, jaq_parse::main()); - assert!(errs.is_empty()); - ctx.yields(x, f.unwrap(), ys) + let f = jaq_syn::parse(f, |p| p.module(|p| p.term())) + .unwrap() + .conv(f); + ctx.yields(x, f, ys) } pub fn fail(x: Value, f: &str, err: jaq_interpret::Error) { From 4bf1db284779d817b9311e19f2ba549cdb7a2119 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 17:40:25 +0200 Subject: [PATCH 100/135] Remove jaq-parse from jaq-std. 
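Without the `build.rs`/`bincode` cache, `std()` now parses `src/std.jq` with jaq-syn each time it is called. A sketch of how a consumer picks up the definitions; this mirrors what `jaq/src/main.rs` does later in this series and is not a verbatim excerpt:

~~~ rust
fn parse_ctx() -> jaq_interpret::ParseCtx {
    let mut ctx = jaq_interpret::ParseCtx::new(Vec::new());
    ctx.insert_natives(jaq_core::core());
    // parses std.jq at runtime instead of deserialising a precompiled blob
    ctx.insert_defs(jaq_std::std());
    ctx
}
~~~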
--- jaq-std/Cargo.toml | 9 --------- jaq-std/build.rs | 17 ----------------- jaq-std/src/lib.rs | 21 ++++----------------- jaq-std/tests/common/mod.rs | 7 ++++--- 4 files changed, 8 insertions(+), 46 deletions(-) delete mode 100644 jaq-std/build.rs diff --git a/jaq-std/Cargo.toml b/jaq-std/Cargo.toml index 849d7d914..a1edc7774 100644 --- a/jaq-std/Cargo.toml +++ b/jaq-std/Cargo.toml @@ -10,17 +10,8 @@ repository = "https://github.com/01mf02/jaq" keywords = ["json", "query", "jq"] rust-version = "1.64" -[features] -default = ["bincode"] - -[build-dependencies] -jaq-parse = { version = "1.0.0", path = "../jaq-parse" } -bincode = { version = "1.3.3", optional = true } - [dependencies] jaq-syn = { version = "1.0.0", path = "../jaq-syn" } -jaq-parse = { version = "1.0.0", path = "../jaq-parse" } -bincode = { version = "1.3.3", optional = true } [dev-dependencies] jaq-interpret = { version = "1.2.0", path = "../jaq-interpret" } diff --git a/jaq-std/build.rs b/jaq-std/build.rs deleted file mode 100644 index 54373c682..000000000 --- a/jaq-std/build.rs +++ /dev/null @@ -1,17 +0,0 @@ -//! Cache parsed standard library. - -#[cfg(feature = "bincode")] -fn main() { - let out_dir = std::env::var_os("OUT_DIR").unwrap(); - let dest_path = std::path::Path::new(&out_dir).join("std.bin"); - let buffer = std::fs::File::create(dest_path).unwrap(); - - let std = include_str!("src/std.jq"); - let (std, errs) = jaq_parse::parse(std, jaq_parse::defs()); - assert_eq!(errs, Vec::new()); - let std = std.unwrap(); - bincode::serialize_into(buffer, &std).unwrap(); -} - -#[cfg(not(feature = "bincode"))] -fn main() {} diff --git a/jaq-std/src/lib.rs b/jaq-std/src/lib.rs index 34a5f4829..862a6649b 100644 --- a/jaq-std/src/lib.rs +++ b/jaq-std/src/lib.rs @@ -4,12 +4,6 @@ //! The standard library provides a set of filters defined using core filters. //! For example, the standard library provides the `map(f)` filter, //! which is defined using the more elementary filter `[.[] | f]`. -//! -//! The time required to parse the standard library becomes evident -//! when the runtime of the jaq filter is small. -//! Therefore, when the "bincode" feature is enabled, -//! this crate precompiles the standard library, -//! in order to reduce startup time. #![no_std] #![warn(missing_docs)] @@ -18,15 +12,8 @@ use alloc::vec::Vec; /// Return the standard library. pub fn std() -> Vec { - #[cfg(feature = "bincode")] - { - // use preparsed standard library - let std = include_bytes!(concat!(env!("OUT_DIR"), "/std.bin")); - bincode::deserialize(std).unwrap() - } - #[cfg(not(feature = "bincode"))] - { - let std = include_str!("std.jq"); - jaq_parse::parse(std, jaq_parse::defs()).0.unwrap() - } + let std = include_str!("std.jq"); + jaq_syn::parse(std, |p| p.module(|p| p.defs())) + .unwrap() + .conv(std) } diff --git a/jaq-std/tests/common/mod.rs b/jaq-std/tests/common/mod.rs index d8c78f261..be6466881 100644 --- a/jaq-std/tests/common/mod.rs +++ b/jaq-std/tests/common/mod.rs @@ -5,9 +5,10 @@ fn yields(x: jaq_interpret::Val, f: &str, ys: impl Iterator Date: Tue, 16 Jul 2024 17:43:30 +0200 Subject: [PATCH 101/135] Bump jaq-syn to 1.6.0. 
--- jaq-syn/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jaq-syn/Cargo.toml b/jaq-syn/Cargo.toml index e1afbcc55..3cbbf8ab2 100644 --- a/jaq-syn/Cargo.toml +++ b/jaq-syn/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "jaq-syn" -version = "1.1.0" +version = "1.6.0" authors = ["Michael Färber "] edition = "2021" license = "MIT" From a1fb0da9976c85e5fdc8f78877953426694e167a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 17:44:16 +0200 Subject: [PATCH 102/135] Remove jaq-parse from jaq-core. --- jaq-core/Cargo.toml | 2 +- jaq-core/tests/common/mod.rs | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/jaq-core/Cargo.toml b/jaq-core/Cargo.toml index a4611bede..5978da5df 100644 --- a/jaq-core/Cargo.toml +++ b/jaq-core/Cargo.toml @@ -29,5 +29,5 @@ base64 = { version = "0.22", optional = true } urlencoding = { version = "2.1.3", optional = true } [dev-dependencies] -jaq-parse = { version = "1.0.0", path = "../jaq-parse" } +jaq-syn = { version = "1.6.0", path = "../jaq-syn" } serde_json = "1.0" diff --git a/jaq-core/tests/common/mod.rs b/jaq-core/tests/common/mod.rs index e6936a57a..087ee28d5 100644 --- a/jaq-core/tests/common/mod.rs +++ b/jaq-core/tests/common/mod.rs @@ -4,9 +4,10 @@ fn yields(x: jaq_interpret::Val, f: &str, ys: impl Iterator Date: Tue, 16 Jul 2024 17:44:41 +0200 Subject: [PATCH 103/135] Update Cargo.lock. --- Cargo.lock | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f2a73aad1..36aa09abd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -47,15 +47,6 @@ version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bitflags" version = "1.3.2" @@ -269,7 +260,7 @@ dependencies = [ "base64", "hifijson", "jaq-interpret", - "jaq-parse", + "jaq-syn", "libm", "log", "regex", @@ -286,7 +277,6 @@ dependencies = [ "dyn-clone", "hifijson", "indexmap", - "jaq-parse", "jaq-syn", "once_cell", "serde_json", @@ -326,17 +316,15 @@ dependencies = [ name = "jaq-std" version = "1.5.1" dependencies = [ - "bincode", "jaq-core", "jaq-interpret", - "jaq-parse", "jaq-syn", "serde_json", ] [[package]] name = "jaq-syn" -version = "1.1.0" +version = "1.6.0" dependencies = [ "serde", ] From e97afb39d4618d096d71322497caf1c35422aaa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 22:06:24 +0200 Subject: [PATCH 104/135] Use jaq-std in jaq again. 
--- jaq/Cargo.toml | 1 + jaq/src/main.rs | 8 +------- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/jaq/Cargo.toml b/jaq/Cargo.toml index b8a30a39b..aa5bfd8ff 100644 --- a/jaq/Cargo.toml +++ b/jaq/Cargo.toml @@ -18,6 +18,7 @@ default = ["mimalloc"] jaq-syn = { version = "1.1.0", path = "../jaq-syn" } jaq-interpret = { version = "1.2.0", path = "../jaq-interpret" } jaq-core = { version = "1.2.0", path = "../jaq-core" } +jaq-std = { version = "1.5.0", path = "../jaq-std" } atty = "0.2" codesnake = { version = "0.1" } clap = { version = "4.0.0", features = ["derive"] } diff --git a/jaq/src/main.rs b/jaq/src/main.rs index 4d4084f46..fe19a9d02 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -257,12 +257,6 @@ fn args_named(var_val: &[(String, Val)]) -> Val { Val::obj(args.collect()) } -fn parse_defs(std_str: &str) -> Vec { - jaq_syn::parse(std_str, |p| p.module(|p| p.defs())) - .unwrap() - .conv(std_str) -} - fn parse_term(filter_str: &str) -> Result> { let tokens = jaq_syn::Lexer::new(filter_str).lex().map_err(|errs| { errs.into_iter() @@ -285,7 +279,7 @@ fn parse_term(filter_str: &str) -> Result> { fn parse(filter_str: &str, vars: Vec) -> Result> { let mut ctx = ParseCtx::new(vars); ctx.insert_natives(jaq_core::core()); - ctx.insert_defs(parse_defs(include_str!("../../jaq-std/src/std.jq"))); + ctx.insert_defs(jaq_std::std()); let filter = parse_term(filter_str)?; let filter = ctx.compile(filter); if ctx.errs.is_empty() { From bcc3d5a61a3c68fb68a8a390fcbe883b8174c007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 22:06:39 +0200 Subject: [PATCH 105/135] Format. --- jaq-syn/src/parse.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 1b4f64a30..50630cd30 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -414,9 +414,9 @@ impl<'s, 't> Parser<'s, 't> { Some(Token::Block("[", tokens)) => { Term::Arr(Some(Box::new(self.with(tokens, "]", Self::term)))) } - Some(Token::Block("{", tokens)) => self.with(tokens, "", |p| { - p.obj_items(Self::obj_entry).map(Term::Obj) - }), + Some(Token::Block("{", tokens)) => { + self.with(tokens, "", |p| p.obj_items(Self::obj_entry).map(Term::Obj)) + } Some(Token::Str(_, parts, _)) => Term::Str(None, self.str_parts(parts)), next => return Err((Expect::Term, next)), }; From f800a82ee001146615d8fc3951aba0c3227c4eff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 22:08:35 +0200 Subject: [PATCH 106/135] Use new parser in jaq-play. 
--- jaq-play/Cargo.toml | 2 - jaq-play/src/lib.rs | 168 +++++++++++++++++++++++--------------------- 2 files changed, 86 insertions(+), 84 deletions(-) diff --git a/jaq-play/Cargo.toml b/jaq-play/Cargo.toml index 17da27d0f..ecd81b025 100644 --- a/jaq-play/Cargo.toml +++ b/jaq-play/Cargo.toml @@ -17,13 +17,11 @@ crate-type = ["cdylib", "rlib"] [dependencies] jaq-syn = { version = "1.1.0", path = "../jaq-syn" } -jaq-parse = { version = "1.0.0", path = "../jaq-parse" } jaq-interpret = { version = "1.2.0", path = "../jaq-interpret" } jaq-core = { version = "1.2.0", path = "../jaq-core" } jaq-std = { version = "1.2.0", path = "../jaq-std" } aho-corasick = "1.1.2" codesnake = { version = "0.1" } -chumsky = { version = "0.9.0", default-features = false } hifijson = "0.2" log = "0.4.17" unicode-width = "0.1.13" diff --git a/jaq-play/src/lib.rs b/jaq-play/src/lib.rs index 76626bc96..88a038aa4 100644 --- a/jaq-play/src/lib.rs +++ b/jaq-play/src/lib.rs @@ -130,7 +130,7 @@ impl Settings { use web_sys::DedicatedWorkerGlobalScope as Scope; enum Error { - Chumsky(Vec), + Report(String, Vec), Hifijson(String), Jaq(jaq_interpret::Error), } @@ -163,11 +163,14 @@ pub fn run(filter: &str, input: &str, settings: &JsValue, scope: &Scope) { }; match process(filter, input, &settings, post_value) { Ok(()) => (), - Err(Error::Chumsky(errs)) => { - for e in errs { - scope - .post_message(&format!("⚠️ Parse error: {}", report(filter, &e)).into()) - .unwrap(); + Err(Error::Report(code, reports)) => { + let idx = codesnake::LineIndex::new(&code); + for e in reports { + let error = format!("⚠️ Parse error: {}", e.message); + scope.post_message(&error.into()).unwrap(); + let block = e.to_block(&idx); + let block = format!("{}\n{}{}", block.prologue(), block, block.epilogue()); + scope.post_message(&block.into()).unwrap(); } } Err(Error::Hifijson(e)) => { @@ -224,7 +227,7 @@ fn collect_if<'a, T: 'a + FromIterator, E: 'a>( } fn process(filter: &str, input: &str, settings: &Settings, f: impl Fn(Val)) -> Result<(), Error> { - let filter = parse(filter, Vec::new()).map_err(Error::Chumsky)?; + let filter = parse(filter, Vec::new()).map_err(|e| Error::Report(filter.to_owned(), e))?; let inputs = read_str(settings, input); @@ -243,34 +246,44 @@ fn process(filter: &str, input: &str, settings: &Settings, f: impl Fn(Val)) -> R Ok(()) } -type ChumskyError = chumsky::error::Simple; +fn parse_term(filter_str: &str) -> Result> { + let tokens = jaq_syn::Lexer::new(filter_str).lex().map_err(|errs| { + errs.into_iter() + .map(|e| report_lex(filter_str, e)) + .collect::>() + })?; + + let main = jaq_syn::Parser::new(&tokens).parse(|p| p.module(|p| p.term())); + let main = main.map_err(|errs| { + errs.into_iter() + .map(|e| report_parse(filter_str, e)) + .collect::>() + })?; + + Ok(main.conv(filter_str)) +} -fn parse(filter_str: &str, vars: Vec) -> Result> { - let mut defs = ParseCtx::new(vars); - defs.insert_natives(jaq_core::core()); - defs.insert_defs(jaq_std::std()); - assert!(defs.errs.is_empty()); - let (filter, errs) = jaq_parse::parse(filter_str, jaq_parse::main()); - if !errs.is_empty() { - return Err(errs); - } - let filter = defs.compile(filter.unwrap()); - if defs.errs.is_empty() { +fn parse(filter_str: &str, vars: Vec) -> Result> { + let mut ctx = ParseCtx::new(vars); + ctx.insert_natives(jaq_core::core()); + ctx.insert_defs(jaq_std::std()); + let filter = parse_term(filter_str)?; + let filter = ctx.compile(filter); + if ctx.errs.is_empty() { Ok(filter) } else { - Err(defs - .errs - .into_iter() - .map(|error| 
ChumskyError::custom(error.1, error.0.to_string())) - .collect()) + let reports = ctx.errs.into_iter().map(|error| Report { + message: error.0.to_string(), + labels: Vec::from([(error.1, [(error.0.to_string(), None)].into(), Color::Red)]), + }); + Err(reports.collect()) } } #[derive(Debug)] -struct Report<'a> { - code: &'a str, +struct Report { message: String, - labels: Vec<(core::ops::Range, String, Color)>, + labels: Vec<(core::ops::Range, Vec<(String, Option)>, Color)>, } #[derive(Clone, Debug)] @@ -287,74 +300,65 @@ impl Color { } } -fn report<'a>(code: &'a str, e: &chumsky::error::Simple) -> Report<'a> { - use chumsky::error::SimpleReason; +fn report_lex(code: &str, (expected, found): jaq_syn::lex::Error<&str>) -> Report { + use jaq_syn::lex::{span, Expect}; - let eof = || "end of input".to_string(); - - let message = if let SimpleReason::Custom(msg) = e.reason() { - msg.clone() - } else { - let found = if e.found().is_some() { - "Unexpected token" - } else { - "Unexpected end of input" - }; - let when = if let Some(label) = e.label() { - format!(" while parsing {label}") - } else { - String::new() - }; - let expected = if e.expected().len() == 0 { - "something else".to_string() - } else { - let f = |e: &Option| e.as_ref().map_or_else(eof, |e| e.to_string()); - e.expected().map(f).collect::>().join(", ") - }; - format!("{found}{when}, expected {expected}",) + let mut found_range = span(code, found); + found_range.end = core::cmp::min(found_range.start + 1, code.len()); + let found = match found { + "" => [("unexpected end of input".to_string(), None)].into(), + c => [("unexpected character ", None), (c, Some(Color::Red))] + .map(|(s, c)| (s.into(), c)) + .into(), }; + let label = (found_range, found, Color::Red); - let label = if let SimpleReason::Custom(msg) = e.reason() { - msg.clone() - } else { - let token = |c: &String| format!("token {}", Color::Red.apply(c)); - format!("Unexpected {}", e.found().map_or_else(eof, token)) - }; - // convert character indices to byte offsets - let char_to_byte = |i| { - code.char_indices() - .map(|(i, _c)| i) - .chain([code.len(), code.len()]) - .nth(i) - .unwrap() + let labels = match expected { + Expect::Delim(open) => { + let text = [("unclosed delimiter ", None), (open, Some(Color::Yellow))] + .map(|(s, c)| (s.into(), c)); + Vec::from([(span(code, open), text.into(), Color::Yellow), label]) + } + _ => Vec::from([label]), }; - let conv = |span: &core::ops::Range<_>| char_to_byte(span.start)..char_to_byte(span.end); - let mut labels = Vec::from([(conv(&e.span()), label, Color::Red)]); - if let SimpleReason::Unclosed { span, delimiter } = e.reason() { - let text = format!("Unclosed delimiter {}", Color::Yellow.apply(delimiter)); - labels.insert(0, (conv(span), text, Color::Yellow)); - } Report { - code, - message, + message: format!("expected {}", expected.as_str()), labels, } } -impl Display for Report<'_> { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - use codesnake::{Block, CodeWidth, Label, LineIndex}; - let idx = LineIndex::new(self.code); - let labels = self.labels.clone().into_iter().map(|(range, text, color)| { - Label::new(range, text).with_style(move |s| color.apply(s).to_string()) +fn report_parse(code: &str, (expected, found): jaq_syn::parse::Error) -> Report { + let found_range = match found { + None => code.len()..code.len(), + Some(found) => found.span(code), + }; + let found = found.map_or("unexpected end of input", |_| "unexpected token"); + let found = [(found.to_string(), None)].into(); + + Report { + message: 
format!("expected {}", expected.as_str()), + labels: Vec::from([(found_range, found, Color::Red)]), + } +} + +type CodeBlock = codesnake::Block, String>; + +impl Report { + fn to_block(self, idx: &codesnake::LineIndex) -> CodeBlock { + use codesnake::{Block, CodeWidth, Label}; + let color_maybe = |(text, color): (_, Option)| match color { + None => text, + Some(color) => color.apply(text).to_string(), + }; + let labels = self.labels.into_iter().map(|(range, text, color)| { + let text = text.into_iter().map(color_maybe).collect::>(); + Label::new(range, text.join("")).with_style(move |s| color.apply(s).to_string()) }); - let block = Block::new(&idx, labels).unwrap().map_code(|c| { + Block::new(&idx, labels).unwrap().map_code(|c| { let c = c.replace('\t', " "); let w = unicode_width::UnicodeWidthStr::width(&*c); CodeWidth::new(c, core::cmp::max(w, 1)) - }); - writeln!(f, "{}", self.message)?; - write!(f, "{}\n{}{}", block.prologue(), block, block.epilogue()) + }) } } From ee34b4c920de428bfc5cb3186cfa2714fa9c10f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 22:08:58 +0200 Subject: [PATCH 107/135] Remove old parser from Cargo.toml. --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index b57674b2b..bd439b975 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,6 @@ [workspace] members = [ "jaq-syn", - "jaq-parse", "jaq-interpret", "jaq-core", "jaq-std", From f48326669bfede4ea0299d45f8ee6b2c9024a0bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 22:09:19 +0200 Subject: [PATCH 108/135] Update Cargo.lock --- bye bye, chumsky! --- Cargo.lock | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 36aa09abd..1d3ce0f93 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,12 +24,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "allocator-api2" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" - [[package]] name = "atty" version = "0.2.14" @@ -71,15 +65,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" -[[package]] -name = "chumsky" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eebd66744a15ded14960ab4ccdbfb51ad3b81f51f3f04a80adac98c985396c9" -dependencies = [ - "hashbrown", -] - [[package]] name = "clap" version = "4.0.22" @@ -182,10 +167,6 @@ name = "hashbrown" version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f93e7192158dbcda357bdec5fb5788eebf8bbac027f3f33e719d29135ae84156" -dependencies = [ - "ahash", - "allocator-api2", -] [[package]] name = "heck" @@ -244,6 +225,7 @@ dependencies = [ "hifijson", "jaq-core", "jaq-interpret", + "jaq-std", "jaq-syn", "memmap2", "mimalloc", @@ -282,27 +264,17 @@ dependencies = [ "serde_json", ] -[[package]] -name = "jaq-parse" -version = "1.0.2" -dependencies = [ - "chumsky", - "jaq-syn", -] - [[package]] name = "jaq-play" version = "0.1.0" dependencies = [ "aho-corasick", - "chumsky", "codesnake", "console_log", "getrandom", "hifijson", "jaq-core", "jaq-interpret", - "jaq-parse", "jaq-std", "jaq-syn", "js-sys", From 6a2c7b24c37fa647daa92f3a9cdbaa936ba8469b Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Tue, 16 Jul 2024 22:28:24 +0200 Subject: [PATCH 109/135] Clippy. --- jaq-play/src/lib.rs | 10 ++++++---- jaq/src/main.rs | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/jaq-play/src/lib.rs b/jaq-play/src/lib.rs index 88a038aa4..30dc3f649 100644 --- a/jaq-play/src/lib.rs +++ b/jaq-play/src/lib.rs @@ -168,7 +168,7 @@ pub fn run(filter: &str, input: &str, settings: &JsValue, scope: &Scope) { for e in reports { let error = format!("⚠️ Parse error: {}", e.message); scope.post_message(&error.into()).unwrap(); - let block = e.to_block(&idx); + let block = e.into_block(&idx); let block = format!("{}\n{}{}", block.prologue(), block, block.epilogue()); scope.post_message(&block.into()).unwrap(); } @@ -280,10 +280,12 @@ fn parse(filter_str: &str, vars: Vec) -> Result> { } } +type StringColors = Vec<(String, Option)>; + #[derive(Debug)] struct Report { message: String, - labels: Vec<(core::ops::Range, Vec<(String, Option)>, Color)>, + labels: Vec<(core::ops::Range, StringColors, Color)>, } #[derive(Clone, Debug)] @@ -345,7 +347,7 @@ fn report_parse(code: &str, (expected, found): jaq_syn::parse::Error) -> Report type CodeBlock = codesnake::Block, String>; impl Report { - fn to_block(self, idx: &codesnake::LineIndex) -> CodeBlock { + fn into_block(self, idx: &codesnake::LineIndex) -> CodeBlock { use codesnake::{Block, CodeWidth, Label}; let color_maybe = |(text, color): (_, Option)| match color { None => text, @@ -355,7 +357,7 @@ impl Report { let text = text.into_iter().map(color_maybe).collect::>(); Label::new(range, text.join("")).with_style(move |s| color.apply(s).to_string()) }); - Block::new(&idx, labels).unwrap().map_code(|c| { + Block::new(idx, labels).unwrap().map_code(|c| { let c = c.replace('\t', " "); let w = unicode_width::UnicodeWidthStr::width(&*c); CodeWidth::new(c, core::cmp::max(w, 1)) diff --git a/jaq/src/main.rs b/jaq/src/main.rs index fe19a9d02..3f39bf49d 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -402,7 +402,7 @@ impl Termination for Error { let idx = codesnake::LineIndex::new(&code); for e in reports { eprintln!("Error: {}", e.message); - let block = e.to_block(&idx); + let block = e.into_block(&idx); eprintln!("{}\n{}{}", block.prologue(), block, block.epilogue()) } 3 @@ -574,10 +574,12 @@ fn with_stdout(f: impl FnOnce(&mut io::StdoutLock) -> Result) -> Re Ok(y) } +type StringColors = Vec<(String, Option)>; + #[derive(Debug)] struct Report { message: String, - labels: Vec<(core::ops::Range, Vec<(String, Option)>, Color)>, + labels: Vec<(core::ops::Range, StringColors, Color)>, } #[derive(Clone, Debug)] @@ -642,7 +644,7 @@ fn report_parse(code: &str, (expected, found): jaq_syn::parse::Error) -> Report type CodeBlock = codesnake::Block, String>; impl Report { - fn to_block(self, idx: &codesnake::LineIndex) -> CodeBlock { + fn into_block(self, idx: &codesnake::LineIndex) -> CodeBlock { use codesnake::{Block, CodeWidth, Label}; let color_maybe = |(text, color): (_, Option)| match color { None => text, @@ -652,7 +654,7 @@ impl Report { let text = text.into_iter().map(color_maybe).collect::>(); Label::new(range, text.join("")).with_style(move |s| color.apply(s).to_string()) }); - Block::new(&idx, labels).unwrap().map_code(|c| { + Block::new(idx, labels).unwrap().map_code(|c| { let c = c.replace('\t', " "); let w = unicode_width::UnicodeWidthStr::width(&*c); CodeWidth::new(c, core::cmp::max(w, 1)) From 02c05cb847f32fc023b524f887fd50d226af6df1 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 09:06:08 +0200 Subject: [PATCH 110/135] Do not panic on interpolated strings in module paths. --- jaq-syn/src/parse.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 50630cd30..ff5a16238 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -565,9 +565,9 @@ impl<'s, 't> Parser<'s, 't> { fn bare_str(&mut self) -> Result<'s, 't, &'s str> { match self.i.next() { - Some(Token::Str(_, parts, _)) => match parts[..] { + next @ Some(Token::Str(_, parts, _)) => match parts[..] { [StrPart::Str(s)] => Ok(s), - _ => todo!(), + _ => Err((Expect::Str, next)), }, next => Err((Expect::Str, next)), } From 4ec2ca33c0378c35f74201076049e809605a90ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 09:06:51 +0200 Subject: [PATCH 111/135] Identifiers. --- jaq-syn/src/parse.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index ff5a16238..af77da717 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -546,12 +546,12 @@ impl<'s, 't> Parser<'s, 't> { fn def_tail(&mut self) -> Result<'s, 't, Def<&'s str, Term<&'s str>>> { let name = match self.i.next() { - Some(Token::Word(name)) if !name.starts_with(['$']) => name, + Some(Token::Word(name)) if !name.starts_with('$') && is_id(name) => name, next => return Err((Expect::Ident, next)), }; let args = self.args(|p| { Ok(match p.i.next() { - Some(Token::Word(arg)) if !arg.starts_with('@') => *arg, + Some(Token::Word(arg)) if is_id(arg) => *arg, next => return Err((Expect::Arg, next)), }) }); @@ -581,7 +581,7 @@ impl<'s, 't> Parser<'s, 't> { let path = self.bare_str()?; self.keyword("as")?; let name = match self.i.next() { - Some(Token::Word(name)) if !name.starts_with(['$', '@']) => *name, + Some(Token::Word(name)) if !name.starts_with(['$', '@']) && is_id(name) => *name, next => return Err((Expect::Ident, next)), }; Ok((path, Some(name))) @@ -614,6 +614,10 @@ impl<'s, 't> Parser<'s, 't> { } } +fn is_id(s: &str) -> bool { + !s.contains("::") && !KEYWORDS.contains(&s) +} + #[derive(Debug, Default)] pub struct Module { meta: Option>, From fc15a221b67f53aa8b8d92985195172fe96e31dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 09:08:18 +0200 Subject: [PATCH 112/135] More permissive key syntax. 
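Judging from the diff, a key may now also be a (possibly interpolated) string, a `@format` string, or a variable, instead of only a bare identifier. Paths like the following should now parse (illustrative examples, not taken from the test suite):

~~~ jq
."a b"         # quoted key after a dot
."\(.name)"    # interpolated string as a key
~~~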
--- jaq-syn/src/parse.rs | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index af77da717..06a966546 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -512,16 +512,22 @@ impl<'s, 't> Parser<'s, 't> { Some((part, self.opt())) } - fn key_opt(&mut self) -> Result<'s, 't, (path::Part>, path::Opt)> { - let key = match self.i.next() { - Some(Token::Word(id)) if id.starts_with('@') => todo!(), + fn key(&mut self, next: Option<&'t Token<&'s str>>) -> Result<'s, 't, Term<&'s str>> { + Ok(match next { + Some(Token::Word(id)) if id.starts_with('$') => Term::Var(*id), + Some(Token::Word(id)) if id.starts_with('@') => match self.i.next() { + Some(Token::Str(_, parts, _)) => Term::Str(Some(*id), self.str_parts(parts)), + next => return Err((Expect::Str, next)), + }, Some(Token::Str(_, parts, _)) => Term::Str(None, self.str_parts(parts)), - Some(Token::Word(id)) if !id.starts_with('$') && !KEYWORDS.contains(id) => { - Term::str(*id) - } + Some(Token::Word(id)) if !id.contains("::") => Term::str(*id), next => return Err((Expect::Key, next)), - }; - Ok((path::Part::Index(key), self.opt())) + }) + } + + fn key_opt(&mut self) -> Result<'s, 't, (path::Part>, path::Opt)> { + let next = self.i.next(); + Ok((path::Part::Index(self.key(next)?), self.opt())) } fn opt(&mut self) -> path::Opt { From bdbc9a5ebf4528e9a952aeb658d6c1b3da01b0a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 09:10:26 +0200 Subject: [PATCH 113/135] If you can write `{key}`, then you can also write `.key`. --- jaq-syn/src/parse.rs | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 06a966546..134ff05d3 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -440,23 +440,18 @@ impl<'s, 't> Parser<'s, 't> { } fn obj_entry(&mut self) -> Result<'s, 't, (Term<&'s str>, Option>)> { - let key = match self.i.next() { - Some(Token::Str(_, parts, _)) => Term::Str(None, self.str_parts(parts)), - Some(Token::Word(k)) if k.starts_with('@') => match self.i.next() { - Some(Token::Str(_, parts, _)) => Term::Str(Some(*k), self.str_parts(parts)), - next => return Err((Expect::Str, next)), - }, - Some(Token::Word(k)) if k.starts_with('$') => Term::Var(*k), - Some(Token::Word(k)) if !KEYWORDS.contains(k) => Term::str(*k), + match self.i.next() { Some(Token::Block("(", tokens)) => { let k = self.with(tokens, ")", Self::term); self.char1(":")?; - return Ok((k, Some(self.term_with_comma(false)?))); + Ok((k, Some(self.term_with_comma(false)?))) } - next => return Err((Expect::Key, next)), - }; - let v = self.char0(':').map(|_| self.term_with_comma(false)); - Ok((key, v.transpose()?)) + next => { + let key = self.key(next)?; + let v = self.char0(':').map(|_| self.term_with_comma(false)); + Ok((key, v.transpose()?)) + } + } } fn str_parts( From 9968bf6c38d1d181a2a7709abaa05b3a28806749 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 09:10:35 +0200 Subject: [PATCH 114/135] Document. --- jaq-syn/src/parse.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 134ff05d3..04fe8e8c6 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -239,6 +239,7 @@ impl<'s, 't> Parser<'s, 't> { Ok(y) } + /// Parse `("(" arg (";" arg)* ")")?`. 
fn args(&mut self, f: fn(&mut Self) -> Result<'s, 't, T>) -> Vec { self.maybe(|p| match p.i.next() { Some(Token::Block("(", tokens)) => Some(p.with(tokens, "", |p| p.arg_items(f))), @@ -538,6 +539,7 @@ impl<'s, 't> Parser<'s, 't> { core::iter::from_fn(|| self.def_head().map(|()| self.def_tail())).collect() } + /// Parse `def`. fn def_head(&mut self) -> Option<()> { self.maybe(|p| match p.i.next() { Some(Token::Word("def")) => Some(()), @@ -545,6 +547,7 @@ impl<'s, 't> Parser<'s, 't> { }) } + /// Parse `name args ":" term ";"`. fn def_tail(&mut self) -> Result<'s, 't, Def<&'s str, Term<&'s str>>> { let name = match self.i.next() { Some(Token::Word(name)) if !name.starts_with('$') && is_id(name) => name, From 6f2478f24eb9eebcb247686763f32983f68f8b3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 10:52:21 +0200 Subject: [PATCH 115/135] Allow `.key` and `{key}` where `key` is a keyword, and disallow `. key`. --- jaq-syn/src/lex.rs | 8 ++++-- jaq-syn/src/parse.rs | 65 ++++++++++++++++++++++++++++---------------- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index d09b2cb70..a9063ccf6 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -240,8 +240,12 @@ impl<'a> Lexer<&'a str> { '?' if (chars.next(), chars.next()) == (Some('/'), Some('/')) => { Token::Op(self.take(3)) } - '.' if chars.next() == Some('.') => Token::Char(self.take(2)), - '.' | ':' | ';' | ',' | '?' => Token::Char(self.take(1)), + '.' => match chars.next() { + Some('.') => Token::Char(self.take(2)), + Some('a'..='z' | 'A'..='Z' | '_') => Token::Char(self.consumed(2, Self::ident0)), + _ => Token::Char(self.take(1)), + }, + ':' | ';' | ',' | '?' => Token::Char(self.take(1)), '"' => self.str(), '(' | '[' | '{' => self.delim(), _ => return None, diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 04fe8e8c6..8475ef6c8 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -265,6 +265,13 @@ impl<'s, 't> Parser<'s, 't> { }) } + fn dot(&mut self) -> Option<&'s str> { + self.maybe(|p| match p.i.next() { + Some(Token::Char(c)) if *c != ".." 
=> c.strip_prefix("."), + _ => None, + }) + } + fn terminated(&mut self, f: F) -> Result<'s, 't, T> where F: FnOnce(&mut Self) -> Result<'s, 't, T>, @@ -394,16 +401,24 @@ impl<'s, 't> Parser<'s, 't> { Some(Token::Word(id)) if !KEYWORDS.contains(id) => { Term::Call(*id, self.args(Self::term)) } - Some(Token::Char(".")) => { - let key_opt = self.maybe(|p| p.key_opt().ok()); - let path: Vec<_> = key_opt.into_iter().chain(self.path()?).collect(); - if path.is_empty() { - Term::Id + Some(Token::Char("..")) => Term::Recurse, + Some(Token::Char(c)) if c.starts_with('.') => { + let key = if c.len() > 1 { + Some(Term::str(&c[1..])) } else { + // TODO: this returns None on things like "@json .", + // whereas it should return an error instead + self.maybe(|p| p.key().ok()) + }; + + if let Some(key) = key { + let head = (path::Part::Index(key), self.opt()); + let path = core::iter::once(head).chain(self.path()?).collect(); Term::Path(Box::new(Term::Id), path) + } else { + Term::Id } } - Some(Token::Char("..")) => Term::Recurse, Some(Token::Num(n)) => Term::Num(*n), Some(Token::Block("[", tokens)) if matches!(tokens[..], [Token::Char("]")]) => { Term::Arr(None) @@ -441,18 +456,23 @@ impl<'s, 't> Parser<'s, 't> { } fn obj_entry(&mut self) -> Result<'s, 't, (Term<&'s str>, Option>)> { - match self.i.next() { + let i = self.i.clone(); + let key = match self.i.next() { Some(Token::Block("(", tokens)) => { let k = self.with(tokens, ")", Self::term); self.char1(":")?; - Ok((k, Some(self.term_with_comma(false)?))) + return Ok((k, Some(self.term_with_comma(false)?))); } - next => { - let key = self.key(next)?; - let v = self.char0(':').map(|_| self.term_with_comma(false)); - Ok((key, v.transpose()?)) + Some(Token::Word(id)) if !id.starts_with(['$', '@']) && !id.contains("::") => { + Term::str(*id) } - } + _ => { + self.i = i; + self.key()? + } + }; + let v = self.char0(':').map(|_| self.term_with_comma(false)); + Ok((key, v.transpose()?)) } fn str_parts( @@ -472,8 +492,13 @@ impl<'s, 't> Parser<'s, 't> { fn path(&mut self) -> Result<'s, 't, Vec<(path::Part>, path::Opt)>> { let mut path: Vec<_> = core::iter::from_fn(|| self.path_part_opt()).collect(); - while self.char0('.').is_some() { - path.push(self.key_opt()?); + while let Some(key) = self.dot() { + let key = if key.is_empty() { + self.key()? 
+ } else { + Term::str(key) + }; + path.push((path::Part::Index(key), self.opt())); path.extend(core::iter::from_fn(|| self.path_part_opt())); } Ok(path) @@ -508,24 +533,18 @@ impl<'s, 't> Parser<'s, 't> { Some((part, self.opt())) } - fn key(&mut self, next: Option<&'t Token<&'s str>>) -> Result<'s, 't, Term<&'s str>> { - Ok(match next { + fn key(&mut self) -> Result<'s, 't, Term<&'s str>> { + Ok(match self.i.next() { Some(Token::Word(id)) if id.starts_with('$') => Term::Var(*id), Some(Token::Word(id)) if id.starts_with('@') => match self.i.next() { Some(Token::Str(_, parts, _)) => Term::Str(Some(*id), self.str_parts(parts)), next => return Err((Expect::Str, next)), }, Some(Token::Str(_, parts, _)) => Term::Str(None, self.str_parts(parts)), - Some(Token::Word(id)) if !id.contains("::") => Term::str(*id), next => return Err((Expect::Key, next)), }) } - fn key_opt(&mut self) -> Result<'s, 't, (path::Part>, path::Opt)> { - let next = self.i.next(); - Ok((path::Part::Index(self.key(next)?), self.opt())) - } - fn opt(&mut self) -> path::Opt { let mut opt = path::Opt::Essential; while self.char0('?').is_some() { From abd09d53acc26d33908db5ca162b62fe5494ba26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 12:07:04 +0200 Subject: [PATCH 116/135] Remove `def_head`. --- jaq-syn/src/parse.rs | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 8475ef6c8..003bac461 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -555,15 +555,8 @@ impl<'s, 't> Parser<'s, 't> { /// Parse a sequence of definitions, such as `def x: 1; def y: 2;`. pub fn defs(&mut self) -> Result<'s, 't, Vec>>> { - core::iter::from_fn(|| self.def_head().map(|()| self.def_tail())).collect() - } - - /// Parse `def`. - fn def_head(&mut self) -> Option<()> { - self.maybe(|p| match p.i.next() { - Some(Token::Word("def")) => Some(()), - _ => None, - }) + let head = |p: &mut Self| p.keyword("def").ok(); + core::iter::from_fn(|| self.maybe(head).map(|_| self.def_tail())).collect() } /// Parse `name args ":" term ";"`. From f30c3fe3e5a485fd119069b4ed6f6614ed39e490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 13:18:08 +0200 Subject: [PATCH 117/135] Clippy. --- jaq-syn/src/convert.rs | 2 +- jaq-syn/src/lex.rs | 1 - jaq-syn/src/parse.rs | 8 +++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs index 1aa7c933b..60f333171 100644 --- a/jaq-syn/src/convert.rs +++ b/jaq-syn/src/convert.rs @@ -39,7 +39,7 @@ impl parse::Term<&str> { }; let from_obj = |(k, v): &(_, Option<_>)| { let f = || (index_path(*span(k)), 0..42); - KeyVal::Filter(*span(k), v.as_ref().map_or_else(|| f(), |v| *span(v))) + KeyVal::Filter(*span(k), v.as_ref().map_or_else(f, |v| *span(v))) }; let from_op = |op| match op { "," => BinaryOp::Comma, diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index a9063ccf6..eadf48d55 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -71,7 +71,6 @@ impl<'a> Lexer<&'a str> { } /// Lex, returning the resulting tokens and errors. 
- #[must_use] pub fn lex(mut self) -> Result>, Vec>> { let tokens = self.tokens(); self.space(); diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 003bac461..32ebfc517 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -7,6 +7,8 @@ use alloc::{boxed::Box, vec::Vec}; /// Parse error, storing what we expected and what we got instead. pub type Error<'s, 't> = (Expect<&'s str>, Option<&'t Token<&'s str>>); +type Path = Vec<(path::Part, path::Opt)>; + /// Type of token that we expected. #[derive(Debug)] pub enum Expect { @@ -99,7 +101,7 @@ pub enum Term { Var(S), /// Path such as `.`, `.a`, `.[][]."b"` - Path(Box, Vec<(path::Part, path::Opt)>), + Path(Box, Path), } impl Term { @@ -267,7 +269,7 @@ impl<'s, 't> Parser<'s, 't> { fn dot(&mut self) -> Option<&'s str> { self.maybe(|p| match p.i.next() { - Some(Token::Char(c)) if *c != ".." => c.strip_prefix("."), + Some(Token::Char(c)) if *c != ".." => c.strip_prefix('.'), _ => None, }) } @@ -490,7 +492,7 @@ impl<'s, 't> Parser<'s, 't> { parts.collect() } - fn path(&mut self) -> Result<'s, 't, Vec<(path::Part>, path::Opt)>> { + fn path(&mut self) -> Result<'s, 't, Path>> { let mut path: Vec<_> = core::iter::from_fn(|| self.path_part_opt()).collect(); while let Some(key) = self.dot() { let key = if key.is_empty() { From 1b82d32533f09fde3f2957d7c39da9d229abb408 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 13:22:28 +0200 Subject: [PATCH 118/135] Document. --- jaq-syn/src/lex.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index eadf48d55..c9ab1fc9b 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -2,10 +2,17 @@ use alloc::vec::Vec; +/// Component of a string potentially containing escape sequences. +/// +/// `S` is a type of strings (without escape sequences), and +/// `F` is a type of interpolated filters. #[derive(Debug)] pub enum StrPart { + /// string without escape sequences Str(S), + /// interpolated filter (`\(...)`) Filter(F), + /// escaped character (e.g. `\n`, `t`, `\u0041`) Char(char), } From 44e58519372dc18d15e78c3fb6daacbd3aa1f4ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 17:22:04 +0200 Subject: [PATCH 119/135] New test. --- jaq-interpret/tests/path.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/jaq-interpret/tests/path.rs b/jaq-interpret/tests/path.rs index 9416ce243..b8143b5e0 100644 --- a/jaq-interpret/tests/path.rs +++ b/jaq-interpret/tests/path.rs @@ -73,6 +73,9 @@ fn iter_assign() { ); } +yields!(index_keyword, r#"{"if": 0} | .if"#, 0); +yields!(obj_keyword, "{if: 0} | .if", 0); + yields!(key_update1, "{} | .a |= .+1", json!({"a": 1})); yields!(key_update2, "{} | .a? |= .+1", json!({"a": 1})); From c379734dbc735c476af7de6767d245a7694127d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 19:26:56 +0200 Subject: [PATCH 120/135] Document. --- jaq-syn/src/parse.rs | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 32ebfc517..94ad35e67 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -10,19 +10,33 @@ pub type Error<'s, 't> = (Expect<&'s str>, Option<&'t Token<&'s str>>); type Path = Vec<(path::Part, path::Opt)>; /// Type of token that we expected. +/// +/// Each variant is annoted with jq programs that trigger it. 
#[derive(Debug)] pub enum Expect { + /// `if 0`, `reduce .` Keyword(S), + /// `0 as $x`, `{(.)}` Char(S), + /// `0 as`, `label`, `break` Var, + /// `if 0 then 0` ElseOrEnd, + /// `{a;}` CommaOrRBrace, + /// `f(0:)` SemicolonOrRParen, + /// `` (empty input), `-`, `()` Term, + /// `.[].` Key, + /// `def`, `import "foo" as` Ident, + /// `def f()` Arg, + /// `import` Str, + /// `0;` Nothing, } @@ -37,7 +51,7 @@ impl<'a> Expect<&'a str> { Self::SemicolonOrRParen => "semicolon or right parenthesis", Self::Term => "term", Self::Key => "key", - Self::Ident => "ident", + Self::Ident => "identifier", Self::Arg => "argument", Self::Str => "string", Self::Nothing => "nothing", @@ -636,6 +650,18 @@ fn is_id(s: &str) -> bool { !s.contains("::") && !KEYWORDS.contains(&s) } +/// jq module, consisting of metadata, imports/includes, and a body. +/// +/// Example (where the body is a sequence of definitions): +/// +/// ~~~ jq +/// module {}; +/// +/// import "foo" as foo; +/// include "bar"; +/// +/// def iter: .[]; +/// ~~~ #[derive(Debug, Default)] pub struct Module { meta: Option>, @@ -643,6 +669,16 @@ pub struct Module { pub(crate) body: B, } +/// jq definition, consisting of a name, optional arguments, and a body. +/// +/// Examples: +/// +/// ~~~ jq +/// def pi: 3.1415; +/// def double($x): $x + $x; +/// def map(f): [.[] | f]; +/// def recurse(f; cond): recurse(f | select(cond)); +/// ~~~ #[derive(Debug)] pub struct Def { pub(crate) name: S, From b46c2b187d68e6642d9403c33cb9ccf5722e0c4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 19:55:44 +0200 Subject: [PATCH 121/135] =?UTF-8?q?Report=20lex=20errors=20for=20character?= =?UTF-8?q?s=20`c`=20like=20`=F0=9F=92=A3`=20where=20`c.len=5Futf8()=20!?= =?UTF-8?q?=3D=201`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- jaq-play/src/lib.rs | 5 +++-- jaq/src/main.rs | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/jaq-play/src/lib.rs b/jaq-play/src/lib.rs index 30dc3f649..37e320b98 100644 --- a/jaq-play/src/lib.rs +++ b/jaq-play/src/lib.rs @@ -304,9 +304,10 @@ impl Color { fn report_lex(code: &str, (expected, found): jaq_syn::lex::Error<&str>) -> Report { use jaq_syn::lex::{span, Expect}; + // truncate found string to its first character + let found = &found[..found.char_indices().skip(1).next().map_or(found.len(), |(i, _)| i)]; - let mut found_range = span(code, found); - found_range.end = core::cmp::min(found_range.start + 1, code.len()); + let found_range = span(code, found); let found = match found { "" => [("unexpected end of input".to_string(), None)].into(), c => [("unexpected character ", None), (c, Some(Color::Red))] diff --git a/jaq/src/main.rs b/jaq/src/main.rs index 3f39bf49d..596a50bee 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -601,9 +601,10 @@ impl Color { fn report_lex(code: &str, (expected, found): jaq_syn::lex::Error<&str>) -> Report { use jaq_syn::lex::{span, Expect}; + // truncate found string to its first character + let found = &found[..found.char_indices().skip(1).next().map_or(found.len(), |(i, _)| i)]; - let mut found_range = span(code, found); - found_range.end = core::cmp::min(found_range.start + 1, code.len()); + let found_range = span(code, found); let found = match found { "" => [("unexpected end of input".to_string(), None)].into(), c => [("unexpected character ", None), (c, Some(Color::Red))] From 9cde4e4fef34492d8d026acd6b15d8dd44e1e632 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 20:05:01 +0200 Subject: [PATCH 122/135] Clippy. --- jaq-play/src/lib.rs | 2 +- jaq/src/main.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/jaq-play/src/lib.rs b/jaq-play/src/lib.rs index 37e320b98..399917002 100644 --- a/jaq-play/src/lib.rs +++ b/jaq-play/src/lib.rs @@ -305,7 +305,7 @@ impl Color { fn report_lex(code: &str, (expected, found): jaq_syn::lex::Error<&str>) -> Report { use jaq_syn::lex::{span, Expect}; // truncate found string to its first character - let found = &found[..found.char_indices().skip(1).next().map_or(found.len(), |(i, _)| i)]; + let found = &found[..found.char_indices().nth(1).map_or(found.len(), |(i, _)| i)]; let found_range = span(code, found); let found = match found { diff --git a/jaq/src/main.rs b/jaq/src/main.rs index 596a50bee..9159d4acd 100644 --- a/jaq/src/main.rs +++ b/jaq/src/main.rs @@ -602,7 +602,7 @@ impl Color { fn report_lex(code: &str, (expected, found): jaq_syn::lex::Error<&str>) -> Report { use jaq_syn::lex::{span, Expect}; // truncate found string to its first character - let found = &found[..found.char_indices().skip(1).next().map_or(found.len(), |(i, _)| i)]; + let found = &found[..found.char_indices().nth(1).map_or(found.len(), |(i, _)| i)]; let found_range = span(code, found); let found = match found { From 2772166e34c81289dd156456beff60e7b8c18ee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Wed, 17 Jul 2024 20:05:19 +0200 Subject: [PATCH 123/135] Document. --- jaq-syn/src/lex.rs | 10 ++++++++++ jaq-syn/src/parse.rs | 1 + 2 files changed, 11 insertions(+) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index c9ab1fc9b..f4074ac1a 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -33,17 +33,27 @@ pub enum Token { Block(S, Vec), } +/// Type of character that we expected. +/// +/// Each variant is annoted with jq programs that trigger it. #[derive(Clone, Debug)] pub enum Expect { + /// `0e`, `0.` Digit, + /// `$`, `@` Ident, + /// `(`, `[`, `{` Delim(S), + /// `"\a"` Escape, + /// `"\ux"` Unicode, + /// `&`, `§`, `💣` Token, } impl<'a> Expect<&'a str> { + /// Return human-readable description of what we expected. pub fn as_str(&self) -> &'static str { match self { Self::Digit => "digit", diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 94ad35e67..12e0e1185 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -664,6 +664,7 @@ fn is_id(s: &str) -> bool { /// ~~~ #[derive(Debug, Default)] pub struct Module { + #[allow(dead_code)] meta: Option>, pub(crate) mods: Vec<(S, Option)>, pub(crate) body: B, From 1174607e4ac23e3e325b2e587ac953ffd6a8891a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 22 Jul 2024 08:56:49 +0200 Subject: [PATCH 124/135] Document term type. --- jaq-syn/src/parse.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 12e0e1185..e5891ad90 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -79,9 +79,12 @@ pub enum Term { /// Recursion (`..`) Recurse, - /// Integer or floating-point number. + /// Integer or floating-point number Num(S), /// String + /// + /// This consists of an optional format filter starting with `@` (such as `@text`), + /// followed by quoted string parts (such as `"Hello, \(.name)! \u263A"`). 
Str(Option, Vec>), /// Array, empty if `None` Arr(Option>), @@ -114,7 +117,7 @@ pub enum Term { /// Variable, such as `$x` (including leading '$') Var(S), - /// Path such as `.`, `.a`, `.[][]."b"` + /// Path such as `.a`, `.[][]."b"`, `f[0]` Path(Box, Path), } From 2f970c084d0fbc0d92074147ac3e4041a63e04d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 22 Jul 2024 08:56:56 +0200 Subject: [PATCH 125/135] Example for parse function. --- jaq-syn/src/lib.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/jaq-syn/src/lib.rs b/jaq-syn/src/lib.rs index 1f2f71a4b..91f8cbb8f 100644 --- a/jaq-syn/src/lib.rs +++ b/jaq-syn/src/lib.rs @@ -31,6 +31,13 @@ pub type Span = core::ops::Range; pub type Spanned = (T, Span); /// Lex a string and parse resulting tokens, returning [`None`] if any error occurred. +/// +/// Example: +/// +/// ~~~ +/// # use jaq_syn::parse; +/// let t = parse("[] | .[]", |p| p.term()); +/// ~~~ pub fn parse<'s, T: Default, F>(s: &'s str, f: F) -> Option where F: for<'t> FnOnce(&mut Parser<'s, 't>) -> parse::Result<'s, 't, T>, From 88d55f0a214dfc396b8a522db1df33b529aa426b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 22 Jul 2024 09:33:45 +0200 Subject: [PATCH 126/135] Report unsupported operator. --- jaq-syn/src/convert.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs index 60f333171..458ba7bbc 100644 --- a/jaq-syn/src/convert.rs +++ b/jaq-syn/src/convert.rs @@ -64,7 +64,7 @@ impl parse::Term<&str> { ">=" => BinaryOp::Ord(OrdOp::Ge), "==" => BinaryOp::Ord(OrdOp::Eq), "!=" => BinaryOp::Ord(OrdOp::Ne), - _ => todo!("unknown operator"), + op => panic!("unknown operator: {op}"), }; match self { Self::Id => Id, From 39c3429c642b542259f4e042bdde2e1e8291d4e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 22 Jul 2024 11:13:37 +0200 Subject: [PATCH 127/135] Document. --- jaq-syn/src/parse.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index e5891ad90..128a8931c 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -328,6 +328,12 @@ impl<'s, 't> Parser<'s, 't> { } } + /// Parse a term. + /// + /// Only if `with_comma` is true, the parsed term may be of the shape `t, u`. + /// This matters for the parsing of object values, such as `{k1: v1, k2: v2}`: + /// if we would permit terms of the shape `t, u` inside objects, + /// then this would be parsed like `{k1: (v1, k2): v2}`, which is invalid. fn term_with_comma(&mut self, with_comma: bool) -> Result<'s, 't, Term<&'s str>> { let head = self.atom()?; let tail = core::iter::from_fn(|| self.op(with_comma).map(|op| Ok((op, self.atom()?)))) @@ -354,6 +360,11 @@ impl<'s, 't> Parser<'s, 't> { }) } + /// Parse an atomic term. + /// + /// A term `t` is atomic if and only if `try t catch 0` is syntactically correct. + /// For example, the term `1 + 2` is not atomic, because `try 1 + 2 catch 0` is invalid. + /// However, the term `.[]` is atomic, because `try .[] catch 0` is valid. fn atom(&mut self) -> Result<'s, 't, Term<&'s str>> { let tm = match self.i.next() { Some(Token::Op("-")) => Term::Neg(Box::new(self.atom()?)), @@ -474,6 +485,12 @@ impl<'s, 't> Parser<'s, 't> { self.term_with_comma(true) } + /// Parse an object entry. + /// + /// An object is written as `{e1, ..., en}`, where `ei` is an object entry. 
+ /// An example of an object entry is `"key": value` or `(key): value`. + /// When the key is a term surrounded by parentheses, a value is required, + /// otherwise the value may be omitted (e.g. `"key"` or `$x`). fn obj_entry(&mut self) -> Result<'s, 't, (Term<&'s str>, Option>)> { let i = self.i.clone(); let key = match self.i.next() { From cd8085c0591574567460d969768568b84a816df6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 22 Jul 2024 11:30:21 +0200 Subject: [PATCH 128/135] Remove `KEYWORDS`. --- jaq-syn/src/parse.rs | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 128a8931c..88f62e862 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -127,14 +127,6 @@ impl Term { } } -/// Keywords that may not appear at the beginning of an expression. -/// -/// Note that for example `reduce` is not part of this list, -/// because it *can* appear at the beginning of an expression. -const KEYWORDS: &[&str] = &[ - "include", "import", "def", "as", "and", "or", "catch", "then", "elif", "else", "end", -]; - impl<'s, 't> Parser<'s, 't> { /// Initialise a new parser on a sequence of [`Token`]s. #[must_use] @@ -428,9 +420,7 @@ impl<'s, 't> Parser<'s, 't> { Some(parts) => Term::Str(Some(*id), parts), } } - Some(Token::Word(id)) if !KEYWORDS.contains(id) => { - Term::Call(*id, self.args(Self::term)) - } + Some(Token::Word(id)) => Term::Call(*id, self.args(Self::term)), Some(Token::Char("..")) => Term::Recurse, Some(Token::Char(c)) if c.starts_with('.') => { let key = if c.len() > 1 { @@ -598,12 +588,12 @@ impl<'s, 't> Parser<'s, 't> { /// Parse `name args ":" term ";"`. fn def_tail(&mut self) -> Result<'s, 't, Def<&'s str, Term<&'s str>>> { let name = match self.i.next() { - Some(Token::Word(name)) if !name.starts_with('$') && is_id(name) => name, + Some(Token::Word(w)) if !w.starts_with('$') && !w.contains("::") => w, next => return Err((Expect::Ident, next)), }; let args = self.args(|p| { Ok(match p.i.next() { - Some(Token::Word(arg)) if is_id(arg) => *arg, + Some(Token::Word(w)) if !w.contains("::") => *w, next => return Err((Expect::Arg, next)), }) }); @@ -633,7 +623,7 @@ impl<'s, 't> Parser<'s, 't> { let path = self.bare_str()?; self.keyword("as")?; let name = match self.i.next() { - Some(Token::Word(name)) if !name.starts_with(['$', '@']) && is_id(name) => *name, + Some(Token::Word(w)) if !w.starts_with(['$', '@']) && !w.contains("::") => *w, next => return Err((Expect::Ident, next)), }; Ok((path, Some(name))) @@ -666,10 +656,6 @@ impl<'s, 't> Parser<'s, 't> { } } -fn is_id(s: &str) -> bool { - !s.contains("::") && !KEYWORDS.contains(&s) -} - /// jq module, consisting of metadata, imports/includes, and a body. /// /// Example (where the body is a sequence of definitions): From 33f65ab2c18e63df656aacc74aa1bb542ef89703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 22 Jul 2024 11:40:36 +0200 Subject: [PATCH 129/135] Document expectation. --- jaq-syn/src/parse.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 88f62e862..cbf76b6aa 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -14,9 +14,9 @@ type Path = Vec<(path::Part, path::Opt)>; /// Each variant is annoted with jq programs that trigger it. 
#[derive(Debug)] pub enum Expect { - /// `if 0`, `reduce .` + /// `if 0` (expected "then"), `reduce .` (expected "as") Keyword(S), - /// `0 as $x`, `{(.)}` + /// `0 as $x` (expected "|"), `{(.)}` (expected ":") Char(S), /// `0 as`, `label`, `break` Var, From e3df12fb200d7fb355667fd698559891cdc9f91f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 22 Jul 2024 12:09:55 +0200 Subject: [PATCH 130/135] Document. --- jaq-syn/src/lex.rs | 4 +++- jaq-syn/src/parse.rs | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index f4074ac1a..863a014e2 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -25,7 +25,9 @@ pub enum Token { Num(S), /// (interpolated) string, surrounded by opening and closing '"' Str(S, Vec>, S), - /// operator, such as `|` or `+=` + /// binary operator, such as `|` or `+=` + /// + /// Note that this includes `-` (negation) also when it is used as unary operator. Op(S), /// punctuation, such as `.` or `;` Char(S), diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index cbf76b6aa..00cb7505e 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -153,6 +153,7 @@ impl<'s, 't> Parser<'s, 't> { } } + /// Verifies that the remaining input tokens correspond to the given string. fn verify_last(&mut self, last: &'static str) -> Result<'s, 't, ()> { match (self.i.as_slice(), last) { ([], "") => Ok(()), @@ -171,6 +172,9 @@ impl<'s, 't> Parser<'s, 't> { y } + /// Parse with given function, then + /// ensure that remaining input tokens correspond to `last`, and + /// return default if any error occurred. fn finish(&mut self, last: &'static str, f: F) -> T where F: FnOnce(&mut Self) -> Result<'s, 't, T>, @@ -193,6 +197,7 @@ impl<'s, 't> Parser<'s, 't> { self.with_tok(tokens, |p| p.finish(last, f)) } + /// Parse with the given function, and rewind input if it returns `None`. fn maybe(&mut self, f: impl Fn(&mut Self) -> Option) -> Option { let i = self.i.clone(); let y = f(self); @@ -203,6 +208,7 @@ impl<'s, 't> Parser<'s, 't> { y } + /// Parse with the given function, and rewind input if it returns `Ok(None)`. fn try_maybe(&mut self, f: F) -> Result<'s, 't, Option> where F: Fn(&mut Self) -> Result<'s, 't, Option>, @@ -259,6 +265,7 @@ impl<'s, 't> Parser<'s, 't> { .unwrap_or_default() } + /// Parse a binary operator, including `,` if `with_comma` is true. fn op(&mut self, with_comma: bool) -> Option<&'s str> { self.maybe(|p| match p.i.next() { // handle pipe directly in `term()` From dce1814b018a95e5dc62614246728df1d4a2bb79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 22 Jul 2024 12:21:51 +0200 Subject: [PATCH 131/135] Do not attempt to support destructuring alternative operator for now. --- jaq-syn/src/lex.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/jaq-syn/src/lex.rs b/jaq-syn/src/lex.rs index 863a014e2..af5e53993 100644 --- a/jaq-syn/src/lex.rs +++ b/jaq-syn/src/lex.rs @@ -255,9 +255,6 @@ impl<'a> Lexer<&'a str> { '$' | '@' => Token::Word(self.consumed(1, Self::ident1)), '0'..='9' => Token::Num(self.consumed(1, Self::num)), c if is_op(c) => Token::Op(self.consumed(1, |lex| lex.trim(is_op))), - '?' if (chars.next(), chars.next()) == (Some('/'), Some('/')) => { - Token::Op(self.take(3)) - } '.' 
=> match chars.next() { Some('.') => Token::Char(self.take(2)), Some('a'..='z' | 'A'..='Z' | '_') => Token::Char(self.consumed(2, Self::ident0)), From 27798d458f2ee1941c3e082c0676f25be1b78f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 22 Jul 2024 14:34:12 +0200 Subject: [PATCH 132/135] Correctly compute `{$k}`. --- jaq-interpret/tests/tests.rs | 2 ++ jaq-syn/src/convert.rs | 7 ++++++- jaq-syn/src/parse.rs | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/jaq-interpret/tests/tests.rs b/jaq-interpret/tests/tests.rs index 74dac6fa1..2e931fced 100644 --- a/jaq-interpret/tests/tests.rs +++ b/jaq-interpret/tests/tests.rs @@ -164,6 +164,8 @@ yields!( "{a: 1, b: 2} | {a, c: 3}", json!({"a": 1, "c": 3}) ); +yields!(obj_var, r#""x" as $k | {$k}"#, json!({"k": "x"})); +yields!(obj_var_val, r#""x" as $k | {$k: 0}"#, json!({"x": 0})); yields!( obj_multi_keys, r#"[{("a", "b"): 1}]"#, diff --git a/jaq-syn/src/convert.rs b/jaq-syn/src/convert.rs index 458ba7bbc..02c3ba2a8 100644 --- a/jaq-syn/src/convert.rs +++ b/jaq-syn/src/convert.rs @@ -39,7 +39,12 @@ impl parse::Term<&str> { }; let from_obj = |(k, v): &(_, Option<_>)| { let f = || (index_path(*span(k)), 0..42); - KeyVal::Filter(*span(k), v.as_ref().map_or_else(f, |v| *span(v))) + let (k, v) = if let (Self::Var(x), None) = (k, v) { + (*span(&Self::str(&x[1..])), *span(k)) + } else { + (*span(k), v.as_ref().map_or_else(f, |v| *span(v))) + }; + KeyVal::Filter(k, v) }; let from_op = |op| match op { "," => BinaryOp::Comma, diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs index 00cb7505e..c1377bfd9 100644 --- a/jaq-syn/src/parse.rs +++ b/jaq-syn/src/parse.rs @@ -122,7 +122,7 @@ pub enum Term { } impl Term { - fn str(s: S) -> Self { + pub(crate) fn str(s: S) -> Self { Self::Str(None, [StrPart::Str(s)].into()) } } From 7a74ea7e2f07e7c9c1316eaa57e809980846730c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Mon, 22 Jul 2024 18:26:05 +0200 Subject: [PATCH 133/135] Atomicity tests. --- jaq-interpret/tests/tests.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/jaq-interpret/tests/tests.rs b/jaq-interpret/tests/tests.rs index 2e931fced..969552f62 100644 --- a/jaq-interpret/tests/tests.rs +++ b/jaq-interpret/tests/tests.rs @@ -117,6 +117,27 @@ fn precedence() { give(json!(null), "2 * 3 + 1", json!(7)); } +// these tests use the trick that `try t catch c` is valid syntax only for atomic terms `t` +// TODO for v2.0 +//yields!(atomic_def, "try def x: 1; x + x catch 0", 2); +yields!(atomic_neg, "try - 1 catch 0", -1); +yields!(atomic_if, "try if 0 then 1 end catch 2", 1); +yields!(atomic_try, "try try 0[0] catch 1 catch 2", 1); +yields!(atomic_fold, "try reduce [][] as $x (0; 0) catch 1", 0); +yields!(atomic_var, "0 as $x | try $x catch 1", 0); +yields!(atomic_call, "def x: 0; try x catch 1", 0); +yields!(atomic_str1, r#"try "" catch 1"#, ""); +yields!(atomic_str2, r#"def @f: .; try @f "" catch 1"#, ""); +yields!(atomic_rec, "try .. catch 0", json!(null)); +yields!(atomic_id, "try . catch 0", json!(null)); +yields!(atomic_key1, "{key: 0} | try .key catch 1", 0); +yields!(atomic_key2, r#"{key: 0} | try . "key" catch 1"#, 0); +yields!(atomic_key3, r#"def @f: .; {key: 0} | try .@f"key" catch 1"#, 0); +yields!(atomic_num, "try 0 catch 1", 0); +yields!(atomic_block, "try (1 + 1) catch 0", 2); +yields!(atomic_path, "try [1][0] catch 0", 1); +yields!(atomic_opt, "def x: 0; try x? 
catch 0", 0);
 
 yields!(neg_arr_iter1, "[-[][]]", json!([]));
 yields!(neg_arr_iter2, "try (-[])[] catch 0", 0);

From b410289661fa242a9212389001808e5bda6a47fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com>
Date: Mon, 22 Jul 2024 18:26:43 +0200
Subject: [PATCH 134/135] Make test more meaningful.

---
 jaq-interpret/tests/tests.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jaq-interpret/tests/tests.rs b/jaq-interpret/tests/tests.rs
index 969552f62..c8300f9b6 100644
--- a/jaq-interpret/tests/tests.rs
+++ b/jaq-interpret/tests/tests.rs
@@ -136,7 +136,7 @@ yields!(atomic_key3, r#"def @f: .; {key: 0} | try .@f"key" catch 1"#, 0);
 yields!(atomic_num, "try 0 catch 1", 0);
 yields!(atomic_block, "try (1 + 1) catch 0", 2);
 yields!(atomic_path, "try [1][0] catch 0", 1);
-yields!(atomic_opt, "def x: 0; try x? catch 0", 0);
+yields!(atomic_opt, "def x: 0; try x? catch 1", 0);
 
 yields!(neg_arr_iter1, "[-[][]]", json!([]));
 yields!(neg_arr_iter2, "try (-[])[] catch 0", 0);

From cf3fc7138863a8bcfbe4977c885406b57a588406 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com>
Date: Mon, 22 Jul 2024 18:30:54 +0200
Subject: [PATCH 135/135] Document.

---
 jaq-syn/src/parse.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/jaq-syn/src/parse.rs b/jaq-syn/src/parse.rs
index c1377bfd9..20a3f96f0 100644
--- a/jaq-syn/src/parse.rs
+++ b/jaq-syn/src/parse.rs
@@ -537,6 +537,7 @@ impl<'s, 't> Parser<'s, 't> {
         Ok(path)
     }
 
+    /// Parse `[]`, `[t]`, `[t:]`, `[t:t]`, `[:t]` (all without brackets).
     fn path_part(&mut self) -> Result<'s, 't, path::Part<Term<&'s str>>> {
         use path::Part::{Index, Range};
         let done = |p: &Self| matches!(p.i.as_slice(), [Token::Char("]")]);