Merge pull request #973 from hash-org/lexer-improvements
feds01 authored Sep 14, 2023
2 parents 1553e2e + 41f8d86 commit 4a437ff
Showing 40 changed files with 1,364 additions and 872 deletions.
286 changes: 285 additions & 1 deletion Cargo.lock

Large diffs are not rendered by default.

108 changes: 84 additions & 24 deletions compiler/hash-ast/src/ast.rs
@@ -3,6 +3,7 @@
use std::{
fmt::Display,
hash::Hash,
iter::repeat,
ops::{Deref, DerefMut},
};

@@ -15,26 +16,28 @@ use hash_source::{
use hash_token::{Base, FloatLitKind, IntLitKind};
use hash_tree_def::define_tree;
use hash_utils::{
-    index_vec::{define_index_type, IndexVec},
-    parking_lot::RwLock,
+    counter,
+    parking_lot::{RwLock, RwLockWriteGuard},
thin_vec::{thin_vec, ThinVec},
};
use once_cell::sync::Lazy;
use replace_with::replace_with_or_abort;

-define_index_type! {
+counter! {
     /// This is the unique identifier for an AST node. This is used to
     /// map spans to nodes, and vice versa. [AstNodeId]s are unique and
     /// they are always increasing as new nodes are created.
-    pub struct AstNodeId = u32;
-    MAX_INDEX = i32::max_value() as usize;
-    DISABLE_MAX_INDEX_CHECK = cfg!(not(debug_assertions));
+    name: AstNodeId,
+    counter_name: AST_COUNTER,
+    visibility: pub,
+    method_visibility:,
+    derives: (Copy, Clone, Eq, PartialEq, Hash, Ord, PartialOrd, Debug),
 }
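In the old scheme an `AstNodeId` was an index into an `IndexVec`; with `counter!` it is instead drawn from a global counter (`AST_COUNTER`), which is presumably atomic, so fresh ids can be reserved without taking the `SPAN_MAP` lock (this is what `LocalSpanMap` below relies on). A minimal sketch of the behaviour this implies, using only the constructors that appear elsewhere in this diff; it is illustrative, not part of the commit:

    // Illustrative only -- relies on the `new`/`to_usize` methods used elsewhere
    // in this diff, and on ids being handed out in increasing order.
    let first = AstNodeId::new();
    let second = AstNodeId::new();
    assert!(second.to_usize() > first.to_usize());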

impl AstNodeId {
/// Create a null node id.
pub fn null() -> Self {
-        AstNodeId::new(0)
+        AstNodeId::from(0)
}

/// Get the [Span] of this [AstNodeId].
@@ -67,40 +70,99 @@ impl Hunk {
/// to query the [Span] of a node simply by using the [AstNodeId] of the
/// node.
-static SPAN_MAP: Lazy<RwLock<IndexVec<AstNodeId, Span>>> = Lazy::new(|| {
-    let mut map = IndexVec::new();
-
-    // We push a NULL node-id so we can use it as the default
+static SPAN_MAP: Lazy<RwLock<Vec<Span>>> = Lazy::new(|| {
+    // We initialise the map with a NULL node-id so we can use it as the default
     // for items that need a node, but don't have one.
-    map.push(Span::new(ByteRange::new(0, 0), SourceId::default()));
-
-    RwLock::new(map)
+    RwLock::new(vec![Span::new(ByteRange::new(0, 0), SourceId::default())])
 });

/// A thread/job local map of [AstNodeId]s to [ByteRange]s. The [LocalSpanMap]
/// can be used by a thread to "reserve" [AstNodeId]s for nodes that will be
/// added to the global [`SPAN_MAP`] later.
///
/// ##Note: This is only used by the parser in order to reduce contention for [`SPAN_MAP`].
pub struct LocalSpanMap {
map: Vec<(AstNodeId, ByteRange)>,
source: SourceId,
}

impl LocalSpanMap {
/// Create a new [LocalSpanMap].
pub fn new(source: SourceId) -> Self {
Self { map: vec![], source }
}

/// Add a new node to the map.
pub fn add(&mut self, range: ByteRange) -> AstNodeId {
let id = AstNodeId::new();
self.map.push((id, range));
id
}
}

/// Utilities for working with the [`SPAN_MAP`].
pub struct SpanMap;

impl SpanMap {
/// Get the span of a node by [AstNodeId].
pub fn span_of(id: AstNodeId) -> Span {
-        SPAN_MAP.read()[id]
+        let span = SPAN_MAP.read()[id.to_usize()];
+        debug_assert_ne!(span, Span::null(), "span of node {id:?} is null");
+        span
}

/// Get the [SourceId] of a node by [AstNodeId].
pub fn source_of(id: AstNodeId) -> SourceId {
SpanMap::span_of(id).id
}

fn extend_map(writer: &mut RwLockWriteGuard<Vec<Span>>, id: AstNodeId) {
let len = (id.to_usize() + 1).saturating_sub(writer.len());
if len > 0 {
writer.extend(repeat(Span::null()).take(len));
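            // e.g. if the map currently holds 3 spans and a node with id 7 is
            // being written, len = (7 + 1) - 3 = 5, so five placeholder
            // `Span::null()` entries are appended and index 7 becomes valid.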
}
}

    /// Add a new [Span] to the [`SPAN_MAP`], returning the fresh [AstNodeId]
    /// that now maps to it. This is only internal to the `hash-ast` crate
    /// since it creates entries in the span map when creating new AST nodes.
fn add_span(span: Span) -> AstNodeId {
-        SPAN_MAP.write().push(span)
+        let mut writer = SPAN_MAP.write();
+
+        // Create the new id, expand the map for capacity and
+        // then write the span into the map.
+        let id = AstNodeId::new();
+        Self::extend_map(&mut writer, id);
+        writer[id.to_usize()] = span;
+
+        id
}

/// Update the span of a node by [AstNodeId].
fn update_span(id: AstNodeId, span: Span) {
-        SPAN_MAP.write()[id] = span;
+        SPAN_MAP.write()[id.to_usize()] = span;
}

/// Merge a [LocalSpanMap] into the [`SPAN_MAP`].
pub fn add_local_map(local: LocalSpanMap) {
// If no nodes were added, don't do anything!
if local.map.is_empty() {
return;
}

let mut writer = SPAN_MAP.write();
let (key, _) = local.map.last().unwrap();

// Reserve enough space in the global map to fit the local map.
//
// ##Note: During high loads, we're likely reserving space for all of the
// other nodes that are to be added.
Self::extend_map(&mut writer, *key);

// Now we write all of the items into the map.
for (id, range) in local.map {
writer[id.to_usize()] = Span::new(range, local.source);
}
}
}
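Taken together, the intended flow is: a parser job reserves ids through a `LocalSpanMap` without touching the global map, then publishes all of its spans with a single write-lock acquisition via `SpanMap::add_local_map`. A rough usage sketch built only from the API shown above (the source id, byte range, and imports are placeholder assumptions):

    // Sketch (not from the PR): reserve ids locally, publish them in one batch.
    fn parse_module(source: SourceId) {
        let mut local = LocalSpanMap::new(source);

        // Reserving an id only appends to the thread-local Vec.
        let id = local.add(ByteRange::new(0, 5));
        // ... construct AST nodes that carry `id` ...

        // One write lock publishes every reserved span into SPAN_MAP.
        SpanMap::add_local_map(local);
        assert_eq!(SpanMap::source_of(id), source);
    }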

@@ -344,6 +406,11 @@ impl<T> AstNodes<T> {
Self { nodes, id }
}

/// Create a new [AstNodes] with an existing [AstNodeId].
pub fn with_id(nodes: ThinVec<AstNode<T>>, id: AstNodeId) -> Self {
Self { nodes, id }
}

/// Function to adjust the span location of [AstNodes] if it is initially
/// incorrectly offset because there is a 'pre-conditional' token that must
/// be parsed before parsing the nodes. This token could be something like a
@@ -480,13 +547,6 @@ define_tree! {
}

impl MacroInvocations {
-    /// Create a new empty [MacroInvocations].
-    pub fn empty(span: Span) -> AstNode<Self> {
-        let contents = Self { invocations: AstNodes::empty(span) };
-        let id = contents.invocations.id;
-        AstNode::with_id(contents, id)
-    }
-
/// Get the number of invocations that are contained within this node.
pub fn len(&self) -> usize {
self.invocations.len()
2 changes: 1 addition & 1 deletion compiler/hash-ast/src/lit.rs
@@ -142,7 +142,7 @@ impl From<IntError> for LitParseErrorKind {
match value.kind() {
num::IntErrorKind::InvalidDigit => Self::MalformedIntLit,
num::IntErrorKind::PosOverflow => Self::IntOverflow { base, ty },
-            _ => unreachable!(),
+            err => panic!("unexpected literal error: {err:?}"),
}
}
}
2 changes: 1 addition & 1 deletion compiler/hash-driver/src/lib.rs
@@ -197,7 +197,7 @@ impl Compiler {
// queue can run within a worker and any other jobs can run inside another
// worker or workers.
let pool = rayon::ThreadPoolBuilder::new()
-            .num_threads(settings.worker_count + 1)
+            .num_threads(std::cmp::max(settings.worker_count, 2))
.thread_name(|id| format!("compiler-worker-{id}"))
.build()
.unwrap();
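The sizing change is subtle: the comment above requires at least two threads (one to drive the main job queue, the rest for other work), which the old `worker_count + 1` guaranteed by always adding a spare thread. The new rule keeps that guarantee without over-allocating once `worker_count` is already 2 or more. A small sketch of the arithmetic, with a hypothetical helper name:

    // Hypothetical helper mirroring the changed line above.
    fn pool_size(worker_count: usize) -> usize {
        std::cmp::max(worker_count, 2)
    }
    // pool_size(1) == 2   (old rule: 1 + 1 == 2)
    // pool_size(8) == 8   (old rule: 8 + 1 == 9, one thread more than configured)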
2 changes: 1 addition & 1 deletion compiler/hash-ir/src/constant.rs
@@ -23,7 +23,7 @@ use crate::{
ty::{IrTy, IrTyId, Mutability, COMMON_IR_TYS},
};

-/// A [Const] represents a constant valuen within the Hash IR. This can
+/// A [Const] represents a constant value within the Hash IR. This can
/// be anything that can be represented as a constant, including ABI scalars,
/// ADTs, slices, and arrays. This representation is intended to be used
/// throughout the compiler from TIR, IR and potentially the VM as well.
8 changes: 8 additions & 0 deletions compiler/hash-lexer/Cargo.toml
@@ -15,3 +15,11 @@ hash-reporting = {path = "../hash-reporting" }
hash-source = {path = "../hash-source" }
hash-token = {path = "../hash-token" }
hash-target = {path = "../hash-target" }
hash-utils = {path = "../hash-utils" }

[dev-dependencies]
criterion = "0.5.1"

[[bench]]
name = "bench"
harness = false
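Here `harness = false` switches off the default libtest harness for the `bench` target so that Criterion's own entry point (the `criterion_main!` invocation in the new benchmark file below) can act as `main`.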
71 changes: 71 additions & 0 deletions compiler/hash-lexer/benches/bench.rs
@@ -0,0 +1,71 @@
//! Some Hash lexer benchmarks.
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use hash_lexer::Lexer;
use hash_source::{location::SpannedSource, SourceId};

static IDENTIFIERS: &str = "It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton \
It was the year when they finally immanentized the Eschaton";

static KEYWORDS: &str = "for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type \
for while loop if else match as in trait enum struct continue break return import raw false unsafe pub priv mut mod impl true type";

static MIX: &str = "for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct \
for it was in the year as they finally immanentized the enum Eschaton while struct";

/// Test candidates that are to be run.
static CANDIDATES: [(&str, &str); 3] =
[("identifiers", IDENTIFIERS), ("mixed", MIX), ("keywords", KEYWORDS)];

fn lex(source: &str) {
let mut lexer = Lexer::new(SpannedSource(source), SourceId::default());

while let Some(token) = lexer.advance_token() {
black_box(token);
}
}

fn bench_idents(c: &mut Criterion) {
let mut group = c.benchmark_group("idents");

for (name, source) in CANDIDATES {
group.throughput(Throughput::Bytes(source.len() as u64));
group.bench_with_input(name, &source, |b, &s| b.iter(|| lex(s)));
}
}

criterion_group!(benches, bench_idents);
criterion_main!(benches);
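Assuming the standard Criterion setup declared in the `Cargo.toml` hunk above, these benchmarks would typically be run with `cargo bench -p hash-lexer`; because the group sets `Throughput::Bytes` for each input, Criterion reports lexing throughput in bytes per second alongside the per-iteration timings.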
4 changes: 2 additions & 2 deletions compiler/hash-lexer/src/error.rs
@@ -3,7 +3,7 @@
use std::{cell::Cell, fmt::Display};

use hash_reporting::{
-    diagnostic::{AccessToDiagnosticsMut, DiagnosticStore},
+    diagnostic::{DiagnosticStore, HasDiagnosticsMut},
report::{Report, ReportElement, ReportNote, ReportNoteKind},
reporter::{Reporter, Reports},
};
@@ -150,7 +150,7 @@ pub struct LexerDiagnostics {
pub(crate) has_fatal_error: Cell<bool>,
}

-impl AccessToDiagnosticsMut for Lexer<'_> {
+impl HasDiagnosticsMut for Lexer<'_> {
type Diagnostics = DiagnosticStore<LexerError, ()>;

fn diagnostics(&mut self) -> &mut Self::Diagnostics {