Skip to content

Commit

Permalink
feat: Prepare data in rust (#568)
Browse files Browse the repository at this point in the history
This breaks some of the Rust code into a new `sparrow-session` crate
providing a place for implementing things that aren't specific to the
Python FFI.

This also introdoces some pieces for error conversions, with the hope
that we can grow to a point where we log errors on the rust side but
surface relevant details to the Python side.
  • Loading branch information
bjchambers authored Jul 31, 2023
1 parent 887c4ee commit f58306e
Show file tree
Hide file tree
Showing 33 changed files with 1,808 additions and 266 deletions.
18 changes: 18 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 5 additions & 15 deletions crates/sparrow-compiler/src/ast_to_dfg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ pub(super) fn ast_to_dfg(
)
}

pub(super) fn add_to_dfg(
pub fn add_to_dfg(
data_context: &mut DataContext,
dfg: &mut Dfg,
diagnostics: &mut DiagnosticCollector<'_>,
Expand Down Expand Up @@ -177,12 +177,7 @@ pub(super) fn add_to_dfg(
.with_note(if nearest.is_empty() {
"No formulas, tables, or let-bound names available".to_owned()
} else {
format!(
"Nearest matches: {}",
nearest
.iter()
.format_with(", ", |e, f| f(&format_args!("'{e}'")))
)
format!("Nearest matches: {nearest}")
})
.emit(diagnostics);
Ok(dfg.error_node())
Expand Down Expand Up @@ -794,15 +789,10 @@ fn missing_field_diagnostic(
.with_note(if fields.is_empty() {
"No fields available on base record".to_owned()
} else {
let candidates = crate::nearest_matches::nearest_matches(
let candidates = crate::nearest_matches::NearestMatches::new_nearest_strs(
field_name,
fields.iter().map(|f| f.name()),
fields.iter().map(|f| f.name().as_str()),
);
format!(
"Nearest fields: {}",
candidates
.iter()
.format_with(", ", |name, f| f(&format_args!("'{name}'")))
)
format!("Nearest fields: {candidates}",)
})
}
15 changes: 2 additions & 13 deletions crates/sparrow-compiler/src/ast_to_dfg/ast_dfg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,22 +86,11 @@ impl AstDfg {
}
}

pub fn equivalent(&self, other: &AstDfg) -> bool {
// This is quite correct -- we should lock everything and then compare.
// But, this is a temporary hack for the Python builder.
self.value() == other.value()
&& self.is_new() == other.is_new()
&& self.value_type == other.value_type
&& self.grouping == other.grouping
&& self.time_domain == other.time_domain
&& self.location == other.location
}

pub fn value(&self) -> Id {
*self.value.lock().unwrap()
}

pub(crate) fn is_new(&self) -> Id {
pub fn is_new(&self) -> Id {
*self.is_new.lock().unwrap()
}

Expand All @@ -113,7 +102,7 @@ impl AstDfg {
self.grouping
}

pub(crate) fn time_domain(&self) -> &TimeDomain {
pub fn time_domain(&self) -> &TimeDomain {
&self.time_domain
}

Expand Down
2 changes: 1 addition & 1 deletion crates/sparrow-compiler/src/data_context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ impl TableInfo {
.collect()
}

pub(crate) fn dfg_node(&self, dfg: &mut Dfg) -> anyhow::Result<AstDfgRef> {
pub fn dfg_node(&self, dfg: &mut Dfg) -> anyhow::Result<AstDfgRef> {
use smallvec::smallvec;
use sparrow_plan::InstOp;
use sparrow_syntax::FenlType;
Expand Down
11 changes: 6 additions & 5 deletions crates/sparrow-compiler/src/dfg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,13 @@ pub(crate) use useless_transforms::*;
use crate::ast_to_dfg::AstDfg;
use crate::dfg::language::DfgLang;
use crate::env::Env;
use crate::nearest_matches::NearestMatches;
use crate::time_domain::TimeDomain;
use crate::{AstDfgRef, CompilerOptions};

#[derive(Debug)]
/// A wrapper around the DFG construction / manipulation functions.
pub(super) struct Dfg {
pub struct Dfg {
/// The DFG being built/manipulated.
graph: DfgGraph,
/// A mapping from identifiers to corresponding DFG nodes.
Expand Down Expand Up @@ -351,13 +352,13 @@ impl Dfg {
///
/// # Error
/// Returns an error containing the (up-to-5) nearest matches.
pub(super) fn get_binding(&self, name: &str) -> Result<AstDfgRef, Vec<&String>> {
pub(super) fn get_binding(&self, name: &str) -> Result<AstDfgRef, NearestMatches<&'_ str>> {
if let Some(found) = self.env.get(name) {
Ok(found.clone())
} else {
Err(crate::nearest_matches::nearest_matches(
Err(crate::nearest_matches::NearestMatches::new_nearest_strs(
name,
self.env.keys(),
self.env.keys().map(|s| s.as_str()),
))
}
}
Expand Down Expand Up @@ -534,7 +535,7 @@ impl Dfg {
}

#[cfg(test)]
pub fn data(&self, id: Id) -> &DfgAnalysisData {
pub(crate) fn data(&self, id: Id) -> &DfgAnalysisData {
&self.graph[id].data
}
}
2 changes: 1 addition & 1 deletion crates/sparrow-compiler/src/diagnostics/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use crate::diagnostics::collector::DiagnosticCollector;
/// Builder for creating and emitting a diagnostic.
#[must_use]
#[derive(Debug, PartialEq)]
pub(crate) struct DiagnosticBuilder {
pub struct DiagnosticBuilder {
code: DiagnosticCode,
diagnostic: Diagnostic<FeatureSetPart>,
}
Expand Down
2 changes: 1 addition & 1 deletion crates/sparrow-compiler/src/diagnostics/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::diagnostics::DiagnosticCode;
use crate::DiagnosticBuilder;

/// Collects the diagnostic messages being reported.
pub(crate) struct DiagnosticCollector<'a> {
pub struct DiagnosticCollector<'a> {
feature_set: FeatureSetParts<'a>,
/// Collect the diagnostic messages.
collected: Vec<CollectedDiagnostic>,
Expand Down
14 changes: 4 additions & 10 deletions crates/sparrow-compiler/src/frontend/resolve_arguments.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::borrow::Cow;

use itertools::Itertools;
use sparrow_syntax::{
Arguments, Expr, ExprOp, ExprRef, Located, ResolveError, Resolved, ResolvedExpr,
};
Expand Down Expand Up @@ -106,12 +105,7 @@ fn resolve_arguments(
.primary_label()
.with_message(format!("No function named '{function_name}'")),
)
.with_note(format!(
"Nearest matches: {}",
candidates
.iter()
.format_with(", ", |e, f| f(&format_args!("'{e}'")))
));
.with_note(format!("Nearest matches: {candidates}"));
return Err(Some(diagnostic));
}
},
Expand Down Expand Up @@ -223,15 +217,15 @@ fn resolve_arguments(
)),
)),
ResolveError::InvalidKeywordArgument { keyword } => {
let nearest = crate::nearest_matches::nearest_matches(
let nearest = crate::nearest_matches::NearestMatches::new_nearest_strs(
keyword.inner(),
names.iter().map(Located::inner),
names.iter().map(|s| s.inner().as_str()),
);
Some(
DiagnosticCode::InvalidArguments
.builder()
.with_label(operator_location.primary_label())
.with_note(format!("Nearest matches: {}", nearest.iter().format(", "))),
.with_note(format!("Nearest matches: {nearest}")),
)
}
})
Expand Down
2 changes: 1 addition & 1 deletion crates/sparrow-compiler/src/functions/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ impl Function {
/// This is used for certain functions like aggregations, where the
/// arguments are flattened in the DFG, requiring us to check parameters
/// against an internal signature representing the flattened arguments.
pub(crate) fn signature(&self) -> &Signature {
pub fn signature(&self) -> &Signature {
if let Some(signature) = &self.internal_signature {
signature
} else {
Expand Down
6 changes: 4 additions & 2 deletions crates/sparrow-compiler/src/functions/registry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ use hashbrown::HashMap;
use sparrow_syntax::{FeatureSetPart, Signature};
use static_init::dynamic;

use crate::nearest_matches::NearestMatches;

use super::{Function, FunctionBuilder};

pub(super) struct Registry {
Expand Down Expand Up @@ -56,9 +58,9 @@ impl Registry {
///
/// # Errors
/// Returns the 5 closest matches from the registry.
pub fn get_function(name: &str) -> Result<&'static Function, Vec<&'static str>> {
pub fn get_function(name: &str) -> Result<&'static Function, NearestMatches<&'static str>> {
REGISTRY.get_by_name(name).ok_or_else(|| {
crate::nearest_matches::nearest_matches(
crate::nearest_matches::NearestMatches::new_nearest_strs(
name,
REGISTRY
.iter()
Expand Down
4 changes: 3 additions & 1 deletion crates/sparrow-compiler/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,18 @@ mod functions;
mod nearest_matches;
mod options;
mod plan;
pub mod query_builder;
mod time_domain;
mod types;

// TODO: Cleanup the top-level modules in the `sparrow-compiler` crate.
pub use ast_to_dfg::*;
pub use compile::*;
pub use data_context::*;
pub use dfg::Dfg;
pub use diagnostics::*;
pub use error::*;
pub use frontend::*;
pub use functions::*;
pub use options::*;

pub use nearest_matches::NearestMatches;
95 changes: 85 additions & 10 deletions crates/sparrow-compiler/src/nearest_matches.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,89 @@
use edit_distance::edit_distance;
use itertools::Itertools;

/// Return a vector containing the up-to-5 nearest matches.
pub(crate) fn nearest_matches<T: AsRef<str> + Ord>(
query: &str,
items: impl Iterator<Item = T>,
) -> Vec<T> {
items
.map(|item| (edit_distance(query, item.as_ref()), item))
.k_smallest(5)
.map(|(_, item)| item)
.collect()
/// The nearest matches to a given name.
#[derive(Debug, PartialEq, Clone)]
pub struct NearestMatches<T>(Vec<T>);

impl<T: std::fmt::Display + std::fmt::Debug + Send + Sync + 'static> error_stack::Context
for NearestMatches<T>
{
}

impl<T> Default for NearestMatches<T> {
fn default() -> Self {
Self(vec![])
}
}

impl<T> NearestMatches<T> {
pub fn map<T2>(self, f: impl Fn(T) -> T2) -> NearestMatches<T2> {
NearestMatches(self.0.into_iter().map(f).collect())
}
}

impl<'a> NearestMatches<&'a str> {
/// Create a set of nearest matches for a given string.
pub fn new_nearest_strs(query: &str, items: impl Iterator<Item = &'a str> + 'a) -> Self {
let nearest_matches: Vec<_> = items
.map(|item| (edit_distance(query, item), item))
.k_smallest(5)
.map(|(_, item)| item)
.collect();
Self(nearest_matches)
}
}

impl NearestMatches<String> {
/// Create a set of nearest matches for a given string.
pub fn new_nearest_strings(query: &str, items: impl Iterator<Item = String>) -> Self {
let nearest_matches: Vec<_> = items
.map(|item| (edit_distance(query, item.as_ref()), item))
.k_smallest(5)
.map(|(_, item)| item)
.collect();
Self(nearest_matches)
}
}

impl<T: std::fmt::Display> std::fmt::Display for NearestMatches<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.0.is_empty() {
write!(f, "none")
} else {
self.0
.iter()
.format_with(", ", |e, f| f(&format_args!("'{e}'")))
.fmt(f)
}
}
}

impl<T> From<Vec<T>> for NearestMatches<T> {
fn from(matches: Vec<T>) -> Self {
Self(matches)
}
}

impl<T> NearestMatches<T> {
pub fn inner(self) -> Vec<T> {
self.0
}

pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_nearest_matches_display() {
insta::assert_display_snapshot!(NearestMatches::<&'static str>::from(vec![]), @"none");
insta::assert_display_snapshot!(NearestMatches::from(vec!["foo"]), @"'foo'");
insta::assert_display_snapshot!(NearestMatches::from(vec!["foo", "bar"]), @"'foo', 'bar'");
insta::assert_display_snapshot!(NearestMatches::from(vec!["foo", "bar", "baz"]), @"'foo', 'bar', 'baz'");
}
}
2 changes: 1 addition & 1 deletion crates/sparrow-compiler/src/time_domain.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use crate::{AstDfgRef, DataContext, DiagnosticBuilder, DiagnosticCode};
/// It is used to report warnings about operations between incompatible
/// domains.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(super) enum TimeDomain {
pub enum TimeDomain {
/// The TimeDomain represents an error.
Error,
/// The TimeDomain represents a continuous value, such as a literal or
Expand Down
1 change: 1 addition & 0 deletions crates/sparrow-execution/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ mod tests {
impl error_stack::Context for Error {}

#[tokio::test]
#[ignore]
async fn test_query() {
sparrow_testing::init_test_logging();

Expand Down
2 changes: 1 addition & 1 deletion crates/sparrow-main/tests/e2e/basic_error_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,7 @@ async fn test_invalid_named_arguments_duplicates() {
- "1 | { n: ceil(x = Numbers.n, x = 5) } "
- " | ^^^^"
- " |"
- " = Nearest matches: n"
- " = Nearest matches: 'n'"
- ""
- ""
"###);
Expand Down
1 change: 1 addition & 0 deletions crates/sparrow-runtime/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ use std::path::PathBuf;

pub use batch::*;
pub use metadata::*;
pub use prepare::preparer;
use read::*;
use sparrow_api::kaskada::v1alpha::execute_request::Limits;

Expand Down
1 change: 1 addition & 0 deletions crates/sparrow-runtime/src/prepare.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ mod error;
pub(crate) mod execute_input_stream;
mod prepare_input_stream;
mod prepare_metadata;
pub mod preparer;
mod slice_preparer;

pub use error::*;
Expand Down
Loading

0 comments on commit f58306e

Please sign in to comment.