Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lexer+parser: many improvements and cleanups #985

Merged
merged 20 commits into from
Sep 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
1c39a41
token: remove `DelimiterVariant`
feds01 Sep 14, 2023
601a1a6
lexer: flatten token stream
feds01 Sep 15, 2023
559b4be
parser: remove `AstGenFrame::token_at()`
feds01 Sep 15, 2023
2ec86ca
parser: cleanup accesses to `offset` in `AstGenFrame`
feds01 Sep 15, 2023
dab3d11
token: introduce `->` and `=>` tokens to simplify parsing & error rep…
feds01 Sep 15, 2023
eed9ef2
token: introduce `::` tokens to simplify parsing access expr/ty/pats
feds01 Sep 15, 2023
8b46510
parser: cleanup `begins_pat()` implementation
feds01 Sep 15, 2023
8c314d8
parser: avoid using `peek_nth()` in binary expression parsing
feds01 Sep 15, 2023
e905519
token: introduce `..`, `..<`, `...` tokens to simplify spread/range p…
feds01 Sep 15, 2023
9a6df8b
parser: fix typo in diagnostics
feds01 Sep 15, 2023
bd39bdf
parser: several cleanups, and stricter use of token stream API
feds01 Sep 16, 2023
c753fe6
parser: Integrated new lexer into the parser
feds01 Sep 17, 2023
fe21fa5
parser: directly use `TokenCursor` API
feds01 Sep 17, 2023
6ab48d2
parser: avoid using confusing `next_pos()` function
feds01 Sep 17, 2023
61d6030
analysis: use indexmap in pattern bind analysis to enforce stable err…
feds01 Sep 18, 2023
b3562cb
parser: name errors consistently, and remove a bunch of old un-used v…
feds01 Sep 18, 2023
0e2d585
parser: remove use of confusing `next_pos()` and replace with `eof_po…
feds01 Sep 18, 2023
6c9e599
source: Change `ByteRange` to be inclusive on both ends
feds01 Sep 18, 2023
5c6dfb6
lexer: cleanup + greatly improve lexer errors
feds01 Sep 18, 2023
0a1f509
parser: debug assert in `skip_token()` the desired token kind
feds01 Sep 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion compiler/hash-ast/src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,30 @@ pub struct LocalSpanMap {
}

impl LocalSpanMap {
/// Create a new, empty [LocalSpanMap] for the given `source`.
pub fn new(source: SourceId) -> Self {
Self { map: vec![], source }
}

/// Create a new [LocalSpanMap] with a given capacity.
///
/// The backing vector is pre-allocated to hold `capacity` entries.
pub fn with_capacity(source: SourceId, capacity: usize) -> Self {
Self { map: Vec::with_capacity(capacity), source }
}

/// Add a new node to the map.
///
/// An [AstNodeId] is created via `AstNodeId::new()`, the `(id, range)` pair
/// is appended to the map, and the id is returned to the caller.
pub fn add(&mut self, range: ByteRange) -> AstNodeId {
let id = AstNodeId::new();
self.map.push((id, range));
id
}

/// Get the number of `(id, range)` entries that are stored in the map.
pub fn len(&self) -> usize {
self.map.len()
}

/// Check whether the map has no entries.
pub fn is_empty(&self) -> bool {
self.map.is_empty()
}
}

/// Utilities for working with the [`SPAN_MAP`].
Expand Down
8 changes: 4 additions & 4 deletions compiler/hash-ast/src/origin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ pub enum PatOrigin {
/// This is primarily used for checking if a collection of patterns (within
/// a constructor or tuple) don't introduce ambiguous pattern orders when
/// both named and un-named fields are used.
NamedField,
Arg,

/// The parent pattern is a constructor, i.e `Some(x)`
Constructor,
Expand All @@ -71,7 +71,7 @@ pub enum PatOrigin {
Array,

/// The parent pattern is a namespace, i.e `{ alloc, core }`
Namespace,
Mod,
}

impl PatOrigin {
Expand All @@ -80,10 +80,10 @@ impl PatOrigin {
fn to_str(self) -> &'static str {
match self {
PatOrigin::Tuple => "tuple",
PatOrigin::NamedField => "named field",
PatOrigin::Arg => "named field",
PatOrigin::Constructor => "constructor",
PatOrigin::Array => "array",
PatOrigin::Namespace => "namespace",
PatOrigin::Mod => "namespace",
}
}
}
Expand Down
207 changes: 166 additions & 41 deletions compiler/hash-lexer/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::{cell::Cell, fmt::Display};

use hash_reporting::{
diagnostic::{DiagnosticStore, HasDiagnosticsMut},
report::{Report, ReportElement, ReportNote, ReportNoteKind},
report::{help, info, Report},
reporter::{Reporter, Reports},
};
use hash_source::{identifier::Identifier, location::Span};
Expand Down Expand Up @@ -47,9 +47,6 @@ pub struct LexerError {
/// The kind of the error.
pub(crate) kind: LexerErrorKind,

/// Additional information about the error, if any.
pub(crate) message: Option<String>,

/// The location of the error, this includes the span and the id of the
/// source.
pub(crate) location: Span,
Expand All @@ -59,10 +56,6 @@ pub struct LexerError {
/// additional context to the error with the provided message in [LexerError]
#[derive(Debug)]
pub enum LexerErrorKind {
/// Occurs when a escape sequence (within a character or a string) is
/// malformed.
BadEscapeSequence,

/// Occurs when a numerical literal doesn't follow the language
/// specification, or is too large.
MalformedNumericalLit,
Expand All @@ -76,9 +69,6 @@ pub enum LexerErrorKind {
/// Occurs when a string literal is unclosed.
UnclosedStringLit,

/// Occurs when a character literal is comprised of more than one character
InvalidCharacterLit(String),

/// Occurs when a char is unexpected in the current context
Unexpected(char),

Expand All @@ -94,45 +84,174 @@ pub enum LexerErrorKind {

/// Invalid literal ascription for either `float` or `integer`
InvalidLitSuffix(NumericLitKind, Identifier),

/// Encountered an invalid float exponent.
InvalidFloatExponent,

/// Unclosed character literal, e.g.
/// ```
/// 'a
/// ```
UnclosedCharLit,

/// Occurs when an escape sequence (within a character or a string) is
/// not a valid escape sequence, e.g.
/// ```
/// '\z'
/// ```
///
/// The error contains the encountered invalid character.
UnknownEscapeSequence(char),

/// When a character literal has multiple code points, e.g.
/// ```
/// 'ab'
/// ```
MultipleCharCodePoints,

/// Unclosed unicode literal, when a unicode character literal
/// is missing the closing brace, e.g.
/// ```
/// '\u{1F600'
/// ```
UnclosedUnicodeLit,

/// When a unicode literal begins with `u`, but doesn't continue
/// with `{`, e.g.
/// ```
/// '\u1F600'
/// ```
MalformedUnicodeLit,

/// When a unicode literal has invalid digits, e.g.
/// ```
/// '\u{1F6G00}'
/// ```
///
/// The error contains the encountered invalid character.
InvalidUnicodeEscape(char),

/// When a unicode literal is too long, e.g.
/// ```
/// '\u{1F600000000000}'
/// ```
UnicodeLitTooLong,

/// When an ASCII escape sequence is too short, e.g.
/// ```
/// '\x'
/// ```
NumericEscapeSequenceTooShort,

/// When an ASCII escape sequence has invalid digits, e.g.
/// ```
/// '\xMG'
/// ```
///
/// The error contains the encountered invalid character.
InvalidNumericEscapeSequence(char),
UnicodeLitTooLarge,
}

impl From<LexerError> for Reports {
fn from(err: LexerError) -> Self {
let mut reporter = Reporter::new();

// We can have multiple notes describing what could be done about the error.
let mut span_label = None;
let mut help_notes = vec![];

let mut message = match err.kind {
LexerErrorKind::BadEscapeSequence => "invalid character escape sequence".to_string(),
let message = match err.kind {
LexerErrorKind::UnknownEscapeSequence(ch) => {
format!("unrecognised character escape sequence `{ch}`")
}
LexerErrorKind::MalformedNumericalLit => "malformed numerical literal".to_string(),
LexerErrorKind::MissingExponentDigits => "float exponent to have at least one digit".to_string(),
LexerErrorKind::MissingExponentDigits => {
"float exponent to have at least one digit".to_string()
}
LexerErrorKind::MissingDigits => "missing digits after integer base prefix".to_string(),
LexerErrorKind::UnclosedStringLit => "unclosed string literal".to_string(),
LexerErrorKind::InvalidCharacterLit(char) => format!("invalid character literal `{char}`, character literals may only contain one codepoint"),
LexerErrorKind::Unexpected(char) => format!("encountered unexpected character `{char}`"),
LexerErrorKind::Unexpected(char) => {
format!("encountered unexpected character `{char}`")
}
LexerErrorKind::Expected(token) => format!("expected token `{token}`"),
LexerErrorKind::Unclosed(delim) => format!("encountered unclosed delimiter `{}`, add a `{delim}` after the inner expression", delim.left()),
LexerErrorKind::UnsupportedFloatBaseLiteral(base) => format!("{base} float literal is not supported"),
LexerErrorKind::Unclosed(delim) => format!(
"encountered unclosed delimiter `{}`, add a `{}` after the inner expression",
delim.left(),
delim.right()
),
LexerErrorKind::UnsupportedFloatBaseLiteral(base) => {
format!("{base} float literal is not supported")
}
LexerErrorKind::InvalidLitSuffix(kind, suffix) => {
let suffix_note = match kind {
NumericLitKind::Integer => format!("{kind} suffix must be `u32`, `i64`, etc"),
NumericLitKind::Float => format!("{kind} suffix must be `f32` or `f64`"),
};

// push a note about what kind of suffix is expected
help_notes
.push(ReportElement::Note(ReportNote::new(ReportNoteKind::Info, suffix_note)));

format!("invalid suffix `{suffix}` for {kind} literal")
}
let suffix_note = match kind {
NumericLitKind::Integer => "suffix must be `u32`, `i64`, etc",
NumericLitKind::Float => "suffix must be `f32` or `f64`",
};

// push a note about what kind of suffix is expected
help_notes.push(info!("{kind} {suffix_note}"));

format!("invalid suffix `{suffix}` for {kind} literal")
}
LexerErrorKind::InvalidFloatExponent => {
"float literal has an invalid exponent".to_string()
}
LexerErrorKind::UnclosedCharLit => {
span_label = Some("expected `'` here".to_string());
"unclosed character literal".to_string()
}
LexerErrorKind::MalformedUnicodeLit => {
span_label = Some("expected `{` after a `\\u` escape sequence".to_string());
"invalid unicode escape sequence".to_string()
}
LexerErrorKind::UnclosedUnicodeLit => {
// label the span to indicate where the closing `}` was expected
span_label = Some("expected `}` here".to_string());
"unclosed unicode escape sequence".to_string()
}
LexerErrorKind::MultipleCharCodePoints => {
help_notes
.push(help!("{}", "if you meant to write a string literal, use `\"` instead"));
"character literals may only contain one codepoint".to_string()
}
LexerErrorKind::InvalidUnicodeEscape(ch) => {
help_notes
.push(info!("{}", "unicode literals may only contain hexadecimal digits"));
format!("invalid character in unicode escape sequence `{ch}`")
}
LexerErrorKind::UnicodeLitTooLong => {
span_label = Some(
"unicode literals may only contain up to 6 hexadecimal digits".to_string(),
);
"overlong unicode escape sequence".to_string()
}
LexerErrorKind::UnicodeLitTooLarge => {
span_label = Some("invalid escape".to_string());
help_notes.push(info!("{}", "unicode escape must be at most 10FFFF"));
"invalid unicode character escape".to_string()
}
LexerErrorKind::NumericEscapeSequenceTooShort => {
"numeric escape sequence is too short".to_string()
}
LexerErrorKind::InvalidNumericEscapeSequence(ch) => {
help_notes.push(info!(
"{}",
"numeric escape sequences may only contain hexadecimal digits"
));
span_label = Some(format!("`{ch}` is not valid here"));
format!("invalid character in numeric escape sequence `{ch}`")
}
};

if let Some(additional_info) = err.message {
message.push_str(&format!(". {additional_info}"));
}
let report = reporter
.error()
.title(message)
.add_labelled_span(err.location, span_label.unwrap_or("here".to_string()));

reporter.error().title(message).add_labelled_span(err.location, "here");
// Add any of the additionally generated notes.
for note in help_notes {
report.add_element(note);
}

reporter.into_reports()
}
Expand All @@ -143,23 +262,29 @@ impl From<LexerError> for Reports {
#[derive(Default)]
pub struct LexerDiagnostics {
/// Inner stored diagnostics from the lexer.
store: DiagnosticStore<LexerError, ()>,
pub store: DiagnosticStore<LexerError, ()>,

/// Whether the [Lexer] encountered a fatal error and
/// must abort on the next token advance
pub(crate) has_fatal_error: Cell<bool>,
}

impl LexerDiagnostics {
/// Check if the lexer has encountered an error.
///
/// This is `true` when either the fatal error flag is set, or when
/// at least one error has been recorded in the store.
pub fn has_errors(&self) -> bool {
self.has_fatal_error.get() || !self.store.errors.is_empty()
}

/// Convert all of the collected [LexerDiagnostics] into [Report]s.
///
/// The stored errors are drained, so they are removed from the store
/// by this call. Each error is converted through its [Reports]
/// conversion and the results are flattened into a single list. The
/// `has_fatal_error` flag is not touched here.
pub fn into_reports(&mut self) -> Vec<Report> {
self.store.errors.drain(..).flat_map(Reports::from).collect()
}
}

/// Gives mutable access to the diagnostics that are collected by the [Lexer].
impl HasDiagnosticsMut for Lexer<'_> {
/// The lexer only records [LexerError]s, the second parameter is `()`.
type Diagnostics = DiagnosticStore<LexerError, ()>;

/// Get a mutable reference to the inner store of the lexer diagnostics.
fn diagnostics(&mut self) -> &mut Self::Diagnostics {
&mut self.diagnostics.store
}
}

impl Lexer<'_> {
/// Convert all of the errors collected by the [Lexer] into [Report]s.
///
/// The errors are drained from the inner diagnostics store, each one
/// is converted through its [Reports] conversion, and the results are
/// flattened into a single list.
pub fn into_reports(&mut self) -> Vec<Report> {
self.diagnostics.store.errors.drain(..).flat_map(Reports::from).collect()
}
}
Loading
Loading