From d334ee1b24f5b950be403b4bcfdf0ee79c4af684 Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Fri, 31 Mar 2023 16:45:04 -0400 Subject: [PATCH 1/5] chore(config): improve config schema output with more precise unevaluatedProperties + ref flattening --- Cargo.lock | 2 + lib/vector-config-common/Cargo.toml | 1 + .../src/schema/json_schema.rs | 59 +- lib/vector-config-common/src/schema/visit.rs | 26 +- lib/vector-config/Cargo.toml | 1 + lib/vector-config/src/schema/helpers.rs | 6 +- .../src/schema/visitors/inline_single.rs | 357 +++++++++++ .../src/schema/visitors/merge.rs | 291 +++++++++ lib/vector-config/src/schema/visitors/mod.rs | 8 + .../src/schema/visitors/scoped_visit.rs | 180 ++++++ lib/vector-config/src/schema/visitors/test.rs | 15 + .../src/schema/visitors/unevaluated.rs | 602 ++++++++++++++++-- scripts/generate-component-docs.rb | 13 +- .../sinks/base/gcp_stackdriver_logs.cue | 8 +- .../sinks/base/gcp_stackdriver_metrics.cue | 8 +- .../reference/components/sinks/base/http.cue | 8 +- 16 files changed, 1518 insertions(+), 67 deletions(-) create mode 100644 lib/vector-config/src/schema/visitors/inline_single.rs create mode 100644 lib/vector-config/src/schema/visitors/merge.rs create mode 100644 lib/vector-config/src/schema/visitors/scoped_visit.rs create mode 100644 lib/vector-config/src/schema/visitors/test.rs diff --git a/Cargo.lock b/Cargo.lock index 74aa0649f0ef8..feff9355c6a5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9675,6 +9675,7 @@ dependencies = [ "serde_with 2.3.1", "snafu", "toml 0.7.3", + "tracing 0.1.37", "url", "vector-config-common", "vector-config-macros", @@ -9693,6 +9694,7 @@ dependencies = [ "serde", "serde_json", "syn 1.0.109", + "tracing 0.1.37", ] [[package]] diff --git a/lib/vector-config-common/Cargo.toml b/lib/vector-config-common/Cargo.toml index fc9025f49af8d..d8eac2cfd0485 100644 --- a/lib/vector-config-common/Cargo.toml +++ b/lib/vector-config-common/Cargo.toml @@ -11,4 +11,5 @@ proc-macro2 = { version = "1.0", 
default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } syn = { version = "1.0", features = ["full", "extra-traits", "visit-mut", "visit"] } +tracing = { version = "0.1.34", default-features = false } quote = { version = "1.0", default-features = false } diff --git a/lib/vector-config-common/src/schema/json_schema.rs b/lib/vector-config-common/src/schema/json_schema.rs index d1ef4a0078ee2..e2b32c31efd18 100644 --- a/lib/vector-config-common/src/schema/json_schema.rs +++ b/lib/vector-config-common/src/schema/json_schema.rs @@ -1,8 +1,8 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; -use std::ops::Deref; +use std::{iter, ops::Deref}; -use super::{Map, Set}; +use super::{Map, Set, DEFINITIONS_PREFIX}; /// A JSON Schema. #[allow(clippy::large_enum_variant)] @@ -40,6 +40,26 @@ impl Schema { } } + /// Gets a reference to the inner schema object if this schema is a JSON Schema object. + /// + /// Otherwise, `None` is returned. + pub fn as_object(&self) -> Option<&SchemaObject> { + match self { + Schema::Object(schema) => Some(schema), + _ => None, + } + } + + /// Gets a mutable reference to the inner schema object if this schema is a JSON Schema object. + /// + /// Otherwise, `None` is returned. + pub fn as_object_mut(&mut self) -> Option<&mut SchemaObject> { + match self { + Schema::Object(schema) => Some(schema), + _ => None, + } + } + /// Converts the given schema (if it is a boolean schema) into an equivalent schema object. /// /// If the given schema is already a schema object, this has no effect. 
@@ -539,6 +559,30 @@ pub enum SingleOrVec { Vec(Vec), } +impl Extend for SingleOrVec { + fn extend>(&mut self, iter: I) { + match self { + Self::Single(item) => { + *self = Self::Vec(iter::once(*item.clone()).chain(iter.into_iter()).collect()); + } + Self::Vec(items) => items.extend(iter), + } + } +} + +impl<'a, T> IntoIterator for &'a SingleOrVec { + type Item = &'a T; + + type IntoIter = std::slice::Iter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + match self { + SingleOrVec::Single(item) => std::slice::from_ref(item.as_ref()).iter(), + SingleOrVec::Vec(items) => items.as_slice().iter(), + } + } +} + impl From for SingleOrVec { fn from(single: T) -> Self { SingleOrVec::Single(Box::new(single)) @@ -567,3 +611,14 @@ fn is_none_or_default_true(field: &Option>) -> bool { Some(value) => matches!(value.as_ref(), Schema::Bool(true)), } } + +pub fn get_cleaned_schema_reference(schema_ref: &str) -> &str { + if let Some(cleaned) = schema_ref.strip_prefix(DEFINITIONS_PREFIX) { + cleaned + } else { + panic!( + "Tried to clean schema reference that does not start with the definition prefix: {}", + schema_ref + ); + } +} diff --git a/lib/vector-config-common/src/schema/visit.rs b/lib/vector-config-common/src/schema/visit.rs index 04fdfbd4ac77b..242bed34331e8 100644 --- a/lib/vector-config-common/src/schema/visit.rs +++ b/lib/vector-config-common/src/schema/visit.rs @@ -1,4 +1,6 @@ -use super::{Map, RootSchema, Schema, SchemaObject, SingleOrVec, DEFINITIONS_PREFIX}; +use tracing::debug; + +use super::{get_cleaned_schema_reference, Map, RootSchema, Schema, SchemaObject, SingleOrVec}; /// Trait used to recursively modify a constructed schema and its subschemas. 
pub trait Visitor: std::fmt::Debug { @@ -51,9 +53,15 @@ pub fn visit_schema_object( schema: &mut SchemaObject, ) { if schema.reference.is_some() { - with_resolved_schema_reference(definitions, schema, |defs, referenced_schema| { - v.visit_schema(defs, referenced_schema); - }) + with_resolved_schema_reference( + definitions, + schema, + |defs, schema_ref, referenced_schema| { + debug!(referent = schema_ref, "Visiting schema reference."); + + v.visit_schema(defs, referenced_schema); + }, + ) } if let Some(sub) = &mut schema.subschemas { @@ -133,17 +141,17 @@ pub fn with_resolved_schema_reference( schema: &mut SchemaObject, f: F, ) where - F: FnOnce(&mut Map, &mut Schema), + F: FnOnce(&mut Map, &str, &mut Schema), { if let Some(reference) = schema.reference.as_ref() { - let schema_def_key = reference.replace(DEFINITIONS_PREFIX, ""); + let schema_def_key = get_cleaned_schema_reference(reference); let mut referenced_schema = definitions - .get(&schema_def_key) + .get(schema_def_key) .cloned() .expect("schema reference should exist"); - f(definitions, &mut referenced_schema); + f(definitions, schema_def_key, &mut referenced_schema); - definitions.insert(schema_def_key, referenced_schema); + definitions.insert(schema_def_key.to_string(), referenced_schema); } } diff --git a/lib/vector-config/Cargo.toml b/lib/vector-config/Cargo.toml index 80d855f05866b..98d22a68f836f 100644 --- a/lib/vector-config/Cargo.toml +++ b/lib/vector-config/Cargo.toml @@ -23,6 +23,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"] } serde_with = { version = "2.3.1", default-features = false, features = ["std"] } snafu = { version = "0.7.4", default-features = false } toml = { version = "0.7.3", default-features = false } +tracing = { version = "0.1.34", default-features = false } url = { version = "2.3.1", default-features = false, features = ["serde"] } vrl-compiler = { package = "vrl-compiler", git = "https://github.com/vectordotdev/vrl", rev = "v0.2.0" } vrl-core 
= { package = "vrl-core", git = "https://github.com/vectordotdev/vrl", rev = "v0.2.0", default-features = false, features = ["serde"] } diff --git a/lib/vector-config/src/schema/helpers.rs b/lib/vector-config/src/schema/helpers.rs index 17458d7941ae5..c1baffd0731a3 100644 --- a/lib/vector-config/src/schema/helpers.rs +++ b/lib/vector-config/src/schema/helpers.rs @@ -12,7 +12,7 @@ use crate::{ num::ConfigurableNumber, Configurable, ConfigurableRef, GenerateError, Metadata, ToValue, }; -use super::visitors::DisallowedUnevaluatedPropertiesVisitor; +use super::visitors::{DisallowedUnevaluatedPropertiesVisitor, InlineSingleUseReferencesVisitor}; /// Applies metadata to the given schema. /// @@ -480,7 +480,9 @@ pub fn generate_internal_tagged_variant_schema( } pub fn default_schema_settings() -> SchemaSettings { - SchemaSettings::new().with_visitor(DisallowedUnevaluatedPropertiesVisitor::from_settings) + SchemaSettings::new() + .with_visitor(InlineSingleUseReferencesVisitor::from_settings) + .with_visitor(DisallowedUnevaluatedPropertiesVisitor::from_settings) } pub fn generate_root_schema() -> Result diff --git a/lib/vector-config/src/schema/visitors/inline_single.rs b/lib/vector-config/src/schema/visitors/inline_single.rs new file mode 100644 index 0000000000000..68347e36937dd --- /dev/null +++ b/lib/vector-config/src/schema/visitors/inline_single.rs @@ -0,0 +1,357 @@ +use std::collections::{HashMap, HashSet}; + +use serde_json::Value; +use tracing::debug; +use vector_config_common::schema::{visit::Visitor, *}; + +use crate::schema::visitors::merge::Mergeable; + +use super::scoped_visit::{ + visit_schema_object_scoped, SchemaReference, SchemaScopeStack, ScopedVisitor, +}; + +/// A visitor that inlines schema references where the referenced schema is only referenced once. 
+/// +/// In many cases, the schema generation will produce schema definitions where either generics or +/// flattening are involved, which leads to schema definitions that may only be referenced by one +/// other schema definition, and so on. +/// +/// This is suboptimal due to the "pointer chasing" involved to resolve those schema references, +/// when there's no reason to inherently have a schema be defined such that it can be referenced. +/// +/// This visitor collects a list of all schema references, and for any schemas which are referenced +/// only once, will replace those references by inlining the referenced schema directly, and +/// deleting the schema definition from the root definitions. +#[derive(Debug, Default)] +pub struct InlineSingleUseReferencesVisitor { + eligible_to_inline: HashSet, +} + +impl InlineSingleUseReferencesVisitor { + pub fn from_settings(_: &SchemaSettings) -> Self { + Self { + eligible_to_inline: HashSet::new(), + } + } +} + +impl Visitor for InlineSingleUseReferencesVisitor { + fn visit_root_schema(&mut self, root: &mut RootSchema) { + // Build a map of schema references and the number of times they're referenced through the + // entire schema, by visiting the root schema in a recursive fashion, using a helper visitor. + let mut occurrence_visitor = OccurrenceVisitor::default(); + occurrence_visitor.visit_root_schema(root); + let occurrence_map = occurrence_visitor.into_occurrences(); + + let eligible_to_inline = occurrence_map + .into_iter() + // Filter out any schemas which have more than one occurrence, as naturally, we're + // trying to inline single-use schema references. :) + .filter_map(|(def_name, occurrences)| { + if occurrences == 1 { + Some(def_name) + } else { + None + } + }) + // However, we'll also filter out some specific schema definitions which are only + // referenced once, specifically: component base types and component types themselves. 
+ // + // We do this as a lot of the tooling that parses the schema to generate documentation, + // and the like, depends on these schemas existing in the top-level definitions for easy + // lookup. + .filter(|def_name| { + let schema = root + .definitions + .get(def_name.as_ref()) + .and_then(Schema::as_object) + .expect("schema definition must exist"); + + is_inlineable_schema(def_name.as_ref(), schema) + }) + .map(|s| s.as_ref().to_string()) + .collect::>(); + + self.eligible_to_inline = eligible_to_inline; + + // Now run our own visitor logic, which will use the inline eligibility to determine if a + // schema reference in a being-visited schema should be replaced inline with the original + // referenced schema, in turn removing the schema definition. + visit::visit_root_schema(self, root); + + // Now remove all of the definitions for schemas that were eligible for inlining. + for schema_def_name in self.eligible_to_inline.drain() { + debug!( + referent = schema_def_name, + "Removing schema definition from root schema." + ); + + root.definitions + .remove(&schema_def_name) + .expect("referenced schema must exist in definitions"); + } + } + + fn visit_schema_object( + &mut self, + definitions: &mut Map, + schema: &mut SchemaObject, + ) { + // Recursively visit this schema first. + visit::visit_schema_object(self, definitions, schema); + + // If this schema has a schema reference, see if it's in our inline eligibility map. If so, + // we remove the referenced schema from the definitions, and then merge it into the current + // schema, after removing the `$ref` field. 
+ if let Some(schema_ref) = schema.reference.as_ref().cloned() { + let schema_ref = get_cleaned_schema_reference(&schema_ref); + if self.eligible_to_inline.contains(schema_ref) { + let referenced_schema = definitions + .get(schema_ref) + .expect("referenced schema must exist in definitions"); + + if let Schema::Object(referenced_schema) = referenced_schema { + debug!( + referent = schema_ref, + "Inlining eligible schema reference into current schema." + ); + + schema.reference = None; + schema.merge(referenced_schema); + } + } + } + } +} + +fn is_inlineable_schema(definition_name: &str, schema: &SchemaObject) -> bool { + static DISALLOWED_SCHEMAS: &[&str] = &[ + "vector::sources::Sources", + "vector::transforms::Transforms", + "vector::sinks::Sinks", + ]; + + // We want to avoid inlining all of the relevant top-level types used for defining components: + // the "outer" types (i.e. `SinkOuter`), the enum/collection types (i.e. the big `Sources` + // enum), and the component configuration types themselves (i.e. `AmqpSinkConfig`). + // + // There's nothing _technically_ wrong with doing so, but it would break downstream consumers of + // the schema that parse it in order to extract the individual components and other + // component-specific metadata. 
+ let is_component_base = get_schema_metadata_attr(schema, "docs::component_base_type").is_some(); + let is_component = get_schema_metadata_attr(schema, "docs::component_type").is_some(); + + let is_allowed_schema = !DISALLOWED_SCHEMAS.contains(&definition_name); + + !is_component_base && !is_component && is_allowed_schema +} + +#[derive(Debug, Default)] +struct OccurrenceVisitor { + scope_stack: SchemaScopeStack, + occurrence_map: HashMap>, +} + +impl OccurrenceVisitor { + fn into_occurrences(self) -> HashMap { + self.occurrence_map + .into_iter() + .map(|(k, v)| (k, v.len())) + .collect() + } +} + +impl Visitor for OccurrenceVisitor { + fn visit_schema_object( + &mut self, + definitions: &mut Map, + schema: &mut SchemaObject, + ) { + visit_schema_object_scoped(self, definitions, schema); + + if let Some(current_schema_ref) = schema.reference.as_ref() { + // Track the named "parent" schema for the schema we're currently visiting so that if we + // visit this schema again, we don't double count any schema references that it has. The + // "parent" schema is simply the closest ancestor schema that was itself a schema + // reference, or "Root", which represents the oldest schema ancestor in the document. + // + // This lets us couple with scenarios where schema A references schema B, and is the + // only actual direct schema reference to schema B, but due to multiple schemas + // referencing schema A, would otherwise lead to both A and B being visited multiple + // times. 
+ let current_parent_schema_ref = self.get_current_schema_scope().clone(); + let current_schema_ref = get_cleaned_schema_reference(current_schema_ref); + + let occurrences = self + .occurrence_map + .entry(current_schema_ref.into()) + .or_insert_with(HashSet::new); + occurrences.insert(current_parent_schema_ref); + } + } +} + +impl ScopedVisitor for OccurrenceVisitor { + fn push_schema_scope>(&mut self, scope: S) { + self.scope_stack.push(scope.into()); + } + + fn pop_schema_scope(&mut self) { + self.scope_stack.pop().expect("stack was empty during pop"); + } + + fn get_current_schema_scope(&self) -> &SchemaReference { + self.scope_stack.current().unwrap_or(&SchemaReference::Root) + } +} + +fn get_schema_metadata_attr<'a>(schema: &'a SchemaObject, key: &str) -> Option<&'a Value> { + schema + .extensions + .get("_metadata") + .and_then(|metadata| metadata.get(key)) +} + +#[cfg(test)] +mod tests { + use serde_json::json; + use vector_config_common::schema::visit::Visitor; + + use crate::schema::visitors::test::{as_schema, assert_schemas_eq}; + + use super::InlineSingleUseReferencesVisitor; + + #[test] + fn no_refs() { + let mut actual_schema = as_schema(json!({ + "type": "object", + "properties": { + "a": { "type": "string" } + } + })); + + let expected_schema = actual_schema.clone(); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + assert_schemas_eq(expected_schema, actual_schema); + } + + #[test] + fn single_ref_single_usage() { + let mut actual_schema = as_schema(json!({ + "$ref": "#/definitions/simple", + "definitions": { + "simple": { + "type": "object", + "properties": { + "a": { "type": "string" } + } + } + } + })); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + let expected_schema = as_schema(json!({ + "type": "object", + "properties": { + "a": { "type": "string" } + }, + })); + + assert_schemas_eq(expected_schema, 
actual_schema); + } + + // TODO(tobz): These two tests are ignored because the inliner currently works off of schema + // reference scopes, so two object properties within the same schema don't count as two + // instances of a schema being referenced. + // + // We need to refactor schema scopes to be piecemeal extensible more in the way of how + // `jsonschema` does it rather than an actual stack.... the current approach is good enough for + // now, but not optimal in the way that it could be. + // + // These are here for when I improve the situation after getting this merged. + #[test] + #[ignore] + fn single_ref_multiple_usages() { + let mut actual_schema = as_schema(json!({ + "definitions": { + "simple": { + "type": "object", + "properties": { + "a": { "type": "string" } + } + } + }, + "type": "object", + "properties": { + "a": { "$ref": "#/definitions/simple" }, + "b": { "$ref": "#/definitions/simple" } + } + })); + + let expected_schema = actual_schema.clone(); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + assert_schemas_eq(expected_schema, actual_schema); + } + + #[test] + #[ignore] + fn multiple_refs_mixed_usages() { + let mut actual_schema = as_schema(json!({ + "definitions": { + "simple": { + "type": "object", + "properties": { + "a": { "type": "string" } + } + }, + "advanced": { + "type": "object", + "properties": { + "b": { "type": "string" } + } + } + }, + "type": "object", + "properties": { + "a": { "$ref": "#/definitions/simple" }, + "b": { "$ref": "#/definitions/simple" }, + "c": { "$ref": "#/definitions/advanced" }, + } + })); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + let expected_schema = as_schema(json!({ + "definitions": { + "simple": { + "type": "object", + "properties": { + "a": { "type": "string" } + } + } + }, + "type": "object", + "properties": { + "a": { "$ref": "#/definitions/simple" }, + 
"b": { "$ref": "#/definitions/simple" }, + "c": { + "type": "object", + "properties": { + "b": { "type": "string" } + } + } + } + })); + + assert_schemas_eq(expected_schema, actual_schema); + } +} diff --git a/lib/vector-config/src/schema/visitors/merge.rs b/lib/vector-config/src/schema/visitors/merge.rs new file mode 100644 index 0000000000000..741b74157e137 --- /dev/null +++ b/lib/vector-config/src/schema/visitors/merge.rs @@ -0,0 +1,291 @@ +#![allow(clippy::borrowed_box)] + +use std::mem::discriminant; + +use serde_json::Value; +use vector_config_common::schema::*; + +/// A type that can be merged with itself. +pub trait Mergeable { + fn merge(&mut self, other: &Self); +} + +impl Mergeable for SchemaObject { + fn merge(&mut self, other: &Self) { + // The logic is pretty straightforward: we merge `other` into `self`, with `self` having the + // higher precedence. + // + // Additionally, we only merge logical schema chunks: if the destination schema has object + // properties defined, and the source schema has some object properties that don't exist in + // the destination, they will be merged, but if there is any overlap, then the object + // properties in the destination would remain untouched. This merging logic applies to all + // map-based types. + // + // For standalone fields, such as title or description, the destination always has higher + // precedence. For optional fields, whichever version (destination or source) is present + // will win, except for when both are present, then the individual fields within the + // optional type will be merged according to the normal precedence rules. 
+ merge_optional(&mut self.reference, other.reference.as_ref()); + merge_schema_metadata(&mut self.metadata, other.metadata.as_ref()); + merge_schema_instance_type(&mut self.instance_type, other.instance_type.as_ref()); + merge_schema_format(&mut self.format, other.format.as_ref()); + merge_schema_enum_values(&mut self.enum_values, other.enum_values.as_ref()); + merge_schema_const_value(&mut self.const_value, other.const_value.as_ref()); + merge_schema_subschemas(&mut self.subschemas, other.subschemas.as_ref()); + merge_schema_number_validation(&mut self.number, other.number.as_ref()); + merge_schema_string_validation(&mut self.string, other.string.as_ref()); + merge_schema_array_validation(&mut self.array, other.array.as_ref()); + merge_schema_object_validation(&mut self.object, other.object.as_ref()); + merge_schema_extensions(&mut self.extensions, &other.extensions); + } +} + +impl Mergeable for Value { + fn merge(&mut self, other: &Self) { + // We do a check here for ensuring both value discriminants are the same type. This is + // specific to `Value` but we should never really be merging identical keys together that + // have differing value types, as that is indicative of a weird overlap in keys between + // different schemas. + // + // We _may_ need to relax this in practice/in the future, but it's a solid invariant to + // enforce for the time being. + if discriminant(self) != discriminant(other) { + panic!("Tried to merge two `Value` types together with differing types!\n\nSelf: {:?}\n\nOther: {:?}", self, other); + } + + match (self, other) { + // Maps get merged recursively. + (Value::Object(self_map), Value::Object(other_map)) => { + self_map.merge(other_map); + } + // Arrays get merged together indiscriminately. + (Value::Array(self_array), Value::Array(other_array)) => { + self_array.extend(other_array.iter().cloned()); + } + // We don't merge any other value types together. 
+ _ => {} + } + } +} + +impl Mergeable for Schema { + fn merge(&mut self, other: &Self) { + match (self, other) { + // We don't merge schemas together if either of them is a boolean schema. + (Schema::Bool(_), _) | (_, Schema::Bool(_)) => {} + (Schema::Object(self_schema), Schema::Object(other_schema)) => { + self_schema.merge(other_schema); + } + } + } +} + +impl Mergeable for serde_json::Map { + fn merge(&mut self, other: &Self) { + for (key, value) in other { + match self.get_mut(key) { + None => { + self.insert(key.clone(), value.clone()); + } + Some(existing) => existing.merge(value), + } + } + } +} + +impl Mergeable for Map +where + K: std::hash::Hash + Eq + Clone, + V: Clone + Mergeable, +{ + fn merge(&mut self, other: &Self) { + for (key, value) in other { + match self.get_mut(key) { + None => { + self.insert(key.clone(), value.clone()); + } + Some(existing) => existing.merge(value), + } + } + } +} + +fn merge_schema_metadata(destination: &mut Option>, source: Option<&Box>) { + merge_optional_with(destination, source, |existing, new| { + merge_optional(&mut existing.id, new.id.as_ref()); + merge_optional(&mut existing.title, new.title.as_ref()); + merge_optional(&mut existing.description, new.description.as_ref()); + merge_optional(&mut existing.default, new.default.as_ref()); + merge_bool(&mut existing.deprecated, new.deprecated); + merge_bool(&mut existing.read_only, new.read_only); + merge_bool(&mut existing.write_only, new.write_only); + merge_collection(&mut existing.examples, &new.examples); + }); +} + +fn merge_schema_instance_type( + destination: &mut Option>, + source: Option<&SingleOrVec>, +) { + merge_optional_with(destination, source, |existing, new| { + let mut deduped = existing + .into_iter() + .chain(new.into_iter()) + .cloned() + .collect::>(); + deduped.dedup(); + + *existing = deduped.into(); + }); +} + +fn merge_schema_format(destination: &mut Option, source: Option<&String>) { + merge_optional(destination, source); +} + +fn 
merge_schema_enum_values(destination: &mut Option>, source: Option<&Vec>) { + merge_optional_with(destination, source, merge_collection); +} + +fn merge_schema_const_value(destination: &mut Option, source: Option<&Value>) { + merge_optional(destination, source); +} + +fn merge_schema_subschemas( + destination: &mut Option>, + source: Option<&Box>, +) { + merge_optional_with(destination, source, |existing, new| { + merge_optional_with(&mut existing.all_of, new.all_of.as_ref(), merge_collection); + merge_optional_with(&mut existing.any_of, new.any_of.as_ref(), merge_collection); + merge_optional_with(&mut existing.one_of, new.one_of.as_ref(), merge_collection); + merge_optional(&mut existing.if_schema, new.if_schema.as_ref()); + merge_optional(&mut existing.then_schema, new.then_schema.as_ref()); + merge_optional(&mut existing.else_schema, new.else_schema.as_ref()); + merge_optional(&mut existing.not, new.not.as_ref()); + }); +} + +fn merge_schema_number_validation( + destination: &mut Option>, + source: Option<&Box>, +) { + merge_optional_with(destination, source, |existing, new| { + merge_optional(&mut existing.multiple_of, new.multiple_of.as_ref()); + merge_optional(&mut existing.maximum, new.maximum.as_ref()); + merge_optional( + &mut existing.exclusive_maximum, + new.exclusive_minimum.as_ref(), + ); + merge_optional(&mut existing.minimum, new.minimum.as_ref()); + merge_optional( + &mut existing.exclusive_minimum, + new.exclusive_minimum.as_ref(), + ); + }); +} + +fn merge_schema_string_validation( + destination: &mut Option>, + source: Option<&Box>, +) { + merge_optional_with(destination, source, |existing, new| { + merge_optional(&mut existing.max_length, new.max_length.as_ref()); + merge_optional(&mut existing.min_length, new.min_length.as_ref()); + merge_optional(&mut existing.pattern, new.pattern.as_ref()); + }); +} + +fn merge_schema_array_validation( + destination: &mut Option>, + source: Option<&Box>, +) { + merge_optional_with(destination, source, 
|existing, new| { + merge_optional_with(&mut existing.items, new.items.as_ref(), merge_collection); + merge_optional( + &mut existing.additional_items, + new.additional_items.as_ref(), + ); + merge_optional( + &mut existing.unevaluated_items, + new.unevaluated_items.as_ref(), + ); + merge_optional(&mut existing.max_items, new.max_items.as_ref()); + merge_optional(&mut existing.min_items, new.min_items.as_ref()); + merge_optional(&mut existing.unique_items, new.unique_items.as_ref()); + merge_optional(&mut existing.contains, new.contains.as_ref()); + }); +} + +fn merge_schema_object_validation( + destination: &mut Option>, + source: Option<&Box>, +) { + merge_optional_with(destination, source, |existing, new| { + merge_optional(&mut existing.max_properties, new.max_properties.as_ref()); + merge_optional(&mut existing.min_properties, new.min_properties.as_ref()); + merge_collection(&mut existing.required, &new.required); + merge_map(&mut existing.properties, &new.properties); + merge_map(&mut existing.pattern_properties, &new.pattern_properties); + merge_optional( + &mut existing.additional_properties, + new.additional_properties.as_ref(), + ); + merge_optional( + &mut existing.unevaluated_properties, + new.unevaluated_properties.as_ref(), + ); + merge_optional(&mut existing.property_names, new.property_names.as_ref()); + }); +} + +fn merge_schema_extensions(destination: &mut Map, source: &Map) { + merge_map(destination, source); +} + +fn merge_bool(destination: &mut bool, source: bool) { + // We only treat `true` as a merge-worthy value. 
+ if source { + *destination = true; + } +} + +fn merge_collection<'a, E, I, T>(destination: &mut E, source: I) +where + E: Extend, + I: IntoIterator, + T: Clone + 'a, +{ + destination.extend(source.into_iter().cloned()); +} + +fn merge_map(destination: &mut Map, source: &Map) +where + K: std::hash::Hash + Eq + Clone, + V: Clone + Mergeable, +{ + destination.merge(source); +} + +fn merge_optional(destination: &mut Option, source: Option<&T>) { + merge_optional_with(destination, source, |_, _| {}); +} + +fn merge_optional_with<'a, T, F>(destination: &'a mut Option, source: Option<&'a T>, f: F) +where + T: Clone, + F: Fn(&'a mut T, &'a T), +{ + match destination { + // If the destination is empty, we use whatever we have in `source`. Otherwise, we leave + // `destination` as-is. + None => *destination = source.cloned(), + // If `destination` isn't empty, and neither is `source`, then pass them both to `f` to + // let it handle the actual merge logic. + Some(destination) => { + if let Some(source) = source { + f(destination, source); + } + } + } +} diff --git a/lib/vector-config/src/schema/visitors/mod.rs b/lib/vector-config/src/schema/visitors/mod.rs index d9f94a9909549..59802e0ebe10b 100644 --- a/lib/vector-config/src/schema/visitors/mod.rs +++ b/lib/vector-config/src/schema/visitors/mod.rs @@ -1,2 +1,10 @@ +mod inline_single; +pub mod merge; +pub mod scoped_visit; mod unevaluated; + +#[cfg(test)] +pub(self) mod test; + +pub use self::inline_single::InlineSingleUseReferencesVisitor; pub use self::unevaluated::DisallowedUnevaluatedPropertiesVisitor; diff --git a/lib/vector-config/src/schema/visitors/scoped_visit.rs b/lib/vector-config/src/schema/visitors/scoped_visit.rs new file mode 100644 index 0000000000000..937a4ee4920dd --- /dev/null +++ b/lib/vector-config/src/schema/visitors/scoped_visit.rs @@ -0,0 +1,180 @@ +use std::collections::VecDeque; + +use vector_config_common::schema::{visit::Visitor, *}; + +/// A schema reference which can refer to either a schema 
definition or the root schema itself. +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub enum SchemaReference { + /// A defined schema. + Definition(String), + + /// The root schema itself. + Root, +} + +impl std::fmt::Display for SchemaReference { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Definition(name) => write!(f, "{}", name), + Self::Root => write!(f, ""), + } + } +} + +impl<'a, T> From<&'a T> for SchemaReference +where + T: AsRef + ?Sized, +{ + fn from(value: &'a T) -> Self { + Self::Definition(value.as_ref().to_string()) + } +} + +impl AsRef for SchemaReference { + fn as_ref(&self) -> &str { + match self { + Self::Definition(name) => name.as_str(), + Self::Root => "", + } + } +} + +/// The schema scope stack is used to understand where in the visiting of a root schema the visitor +/// currently is. All visiting will inherently start at the root, and continue to exist in the root +/// scope until a schema reference is resolved, and the visitor visits the resolved schema. Once +/// that happens, the resolved schema scope is pushed onto the stack, such that the scope is now the +/// schema reference. This can happen multiple times as subsequent schema references are resolved. +/// Once the visitor recurses back out of the resolved schema, it is popped from the stack. 
+#[derive(Debug, Default)] +pub struct SchemaScopeStack { + stack: VecDeque, +} + +impl SchemaScopeStack { + pub fn push>(&mut self, scope: S) { + self.stack.push_front(scope.into()); + } + + pub fn pop(&mut self) -> Option { + self.stack.pop_front() + } + + pub fn current(&self) -> Option<&SchemaReference> { + self.stack.front() + } +} + +pub trait ScopedVisitor: Visitor { + fn push_schema_scope>(&mut self, scope: S); + + fn pop_schema_scope(&mut self); + + fn get_current_schema_scope(&self) -> &SchemaReference; +} + +pub fn visit_schema_object_scoped( + sv: &mut SV, + definitions: &mut Map, + schema: &mut SchemaObject, +) { + if let Some(reference) = schema.reference.as_ref() { + let schema_def_key = get_cleaned_schema_reference(reference); + let mut referenced_schema = definitions + .get(schema_def_key) + .cloned() + .expect("schema reference should exist"); + + if let Schema::Object(referenced_schema) = &mut referenced_schema { + sv.push_schema_scope(schema_def_key); + + sv.visit_schema_object(definitions, referenced_schema); + + sv.pop_schema_scope(); + } + + definitions.insert(schema_def_key.to_string(), referenced_schema); + } + + if let Some(sub) = &mut schema.subschemas { + visit_vec_scoped(sv, definitions, &mut sub.all_of); + visit_vec_scoped(sv, definitions, &mut sub.any_of); + visit_vec_scoped(sv, definitions, &mut sub.one_of); + visit_box_scoped(sv, definitions, &mut sub.not); + visit_box_scoped(sv, definitions, &mut sub.if_schema); + visit_box_scoped(sv, definitions, &mut sub.then_schema); + visit_box_scoped(sv, definitions, &mut sub.else_schema); + } + + if let Some(arr) = &mut schema.array { + visit_single_or_vec_scoped(sv, definitions, &mut arr.items); + visit_box_scoped(sv, definitions, &mut arr.additional_items); + visit_box_scoped(sv, definitions, &mut arr.contains); + } + + if let Some(obj) = &mut schema.object { + visit_map_values_scoped(sv, definitions, &mut obj.properties); + visit_map_values_scoped(sv, definitions, &mut 
obj.pattern_properties); + visit_box_scoped(sv, definitions, &mut obj.additional_properties); + visit_box_scoped(sv, definitions, &mut obj.property_names); + } +} + +fn visit_box_scoped( + sv: &mut SV, + definitions: &mut Map, + target: &mut Option>, +) { + if let Some(s) = target { + if let Schema::Object(s) = s.as_mut() { + sv.visit_schema_object(definitions, s); + } + } +} + +fn visit_vec_scoped( + sv: &mut SV, + definitions: &mut Map, + target: &mut Option>, +) { + if let Some(vec) = target { + for s in vec { + if let Schema::Object(s) = s { + sv.visit_schema_object(definitions, s); + } + } + } +} + +fn visit_map_values_scoped( + sv: &mut SV, + definitions: &mut Map, + target: &mut Map, +) { + for s in target.values_mut() { + if let Schema::Object(s) = s { + sv.visit_schema_object(definitions, s); + } + } +} + +fn visit_single_or_vec_scoped( + sv: &mut SV, + definitions: &mut Map, + target: &mut Option>, +) { + match target { + None => {} + Some(SingleOrVec::Single(s)) => { + if let Schema::Object(s) = s.as_mut() { + sv.visit_schema_object(definitions, s); + } + } + Some(SingleOrVec::Vec(vec)) => { + for s in vec { + if let Schema::Object(s) = s { + sv.visit_schema_object(definitions, s); + } + } + } + } +} diff --git a/lib/vector-config/src/schema/visitors/test.rs b/lib/vector-config/src/schema/visitors/test.rs new file mode 100644 index 0000000000000..1f1793cb8c071 --- /dev/null +++ b/lib/vector-config/src/schema/visitors/test.rs @@ -0,0 +1,15 @@ +use pretty_assertions::assert_eq; +use serde_json::Value; +use vector_config_common::schema::RootSchema; + +pub fn as_schema(value: Value) -> RootSchema { + serde_json::from_value(value).expect("should not fail to deserialize schema") +} + +#[track_caller] +pub fn assert_schemas_eq(expected: RootSchema, actual: RootSchema) { + let expected_json = serde_json::to_string_pretty(&expected).expect("should not fail"); + let actual_json = serde_json::to_string_pretty(&actual).expect("should not fail"); + + 
assert_eq!(expected_json, actual_json); +} diff --git a/lib/vector-config/src/schema/visitors/unevaluated.rs b/lib/vector-config/src/schema/visitors/unevaluated.rs index 1db74cc483b28..4aa34dcfbb2b8 100644 --- a/lib/vector-config/src/schema/visitors/unevaluated.rs +++ b/lib/vector-config/src/schema/visitors/unevaluated.rs @@ -1,10 +1,26 @@ -//use indexmap::IndexMap; +#![allow(warnings)] + +use std::{ + collections::{HashMap, HashSet}, + convert::identity, + mem::discriminant, +}; + +use serde_json::Value; +use tracing::{debug, error}; use vector_config_common::schema::{ visit::{visit_schema_object, with_resolved_schema_reference, Visitor}, - InstanceType, Map, Schema, SchemaObject, SchemaSettings, SingleOrVec, + *, +}; + +use crate::schema::visitors::merge::Mergeable; + +use super::scoped_visit::{ + visit_schema_object_scoped, SchemaReference, SchemaScopeStack, ScopedVisitor, }; -/// A visitor that marks schemas as disallowing unknown properties via `unevaluatedProperties`. +/// A visitor that marks schemas as closed by disallowing unknown properties via +/// `unevaluatedProperties`. /// /// This is the equivalent of `serde`'s `deny_unknown_fields` attribute: instead of only validating /// the properties specified in the schema, and ignoring any properties present in the JSON @@ -15,23 +31,128 @@ use vector_config_common::schema::{ /// with advanced subschema validation, such as `oneOf` or `allOf`, as `unevaluatedProperties` /// cannot simply be applied to any and all schemas indiscriminately. 
#[derive(Debug, Default)] -pub struct DisallowedUnevaluatedPropertiesVisitor; +pub struct DisallowedUnevaluatedPropertiesVisitor { + scope_stack: SchemaScopeStack, + eligible_to_flatten: HashMap>, +} impl DisallowedUnevaluatedPropertiesVisitor { pub fn from_settings(_: &SchemaSettings) -> Self { - Self + Self { + scope_stack: SchemaScopeStack::default(), + eligible_to_flatten: HashMap::new(), + } } } impl Visitor for DisallowedUnevaluatedPropertiesVisitor { + fn visit_root_schema(&mut self, root: &mut RootSchema) { + // Glossary: + // + // mark(ed): setting or having `unevaluatedProperties` set to `Some(Schema::Bool(false))` + // unmark(ed): unsetting or having `unevaluatedProperties` set to `None` + + // For all definitions, visit _just_ the defined schema (no recursing) and build a map of + // child definitions -> (mark eligibility, [(parent definition, would_unmark)]), such that we know exactly + // which schemas refer to any given schema definition and if they would lead to the child + // schema being marked as `unevaluatedProperties: false`. + // + // We would filter out any child definitions that aren't eligible to be marked. For the + // remaining child schemas, we group the parent definitions by `would_unmark`, which + // indicates whether or not the given parent definition would cause the child definition to + // be unmarked. + // + // As an example, we would expect a parent schema referring to a child schema via `allOf` to + // unmark the child schema, while a parent schema referring to a child schema within a + // specific property to not unmark the child schema. + // + // With the grouped parent definitions, take the smaller of the two groups. This represents + // the set of parent schemas that we will indicate as needing to use a flattened version of + // the child schema when we execute our primary visit logic. 
+ + let mut eligible_to_flatten = HashMap::new(); + let flattening_eligibility_map = build_closed_schema_flatten_eligibility_mappings(root); + for (child_schema_ref, metadata) in flattening_eligibility_map { + let would_unmark = metadata + .referrers + .iter() + .filter(|r| r.would_unmark) + .map(|r| r.referent.clone()) + .collect::>(); + let would_not_unmark = metadata + .referrers + .iter() + .filter(|r| !r.would_unmark) + .map(|r| r.referent.clone()) + .collect::>(); + + if would_not_unmark.len() > would_unmark.len() { + eligible_to_flatten.insert(child_schema_ref.to_string(), would_unmark); + } else { + eligible_to_flatten.insert(child_schema_ref.to_string(), would_not_unmark); + } + } + + debug!( + "Found {} referents eligible for flattening: {:?}", + eligible_to_flatten.len(), + eligible_to_flatten, + ); + + self.eligible_to_flatten = eligible_to_flatten; + + visit::visit_root_schema(self, root); + } + fn visit_schema_object( &mut self, definitions: &mut Map, schema: &mut SchemaObject, ) { + // If this schema has a schema reference, check our flattening eligibility map to figure out + // if we need to merge it in. + // + // When a given schema reference (the actual target of `$ref`) is eligible for flattening in + // a given schema (what we're currently visiting) then it means that this schema would, + // based on its composition, lead to the schema reference either being marked or unmarked. + // + // We flatten the schema reference into this schema to avoid that from occurring, and we do + // so based on whichever group of referrers -- the schemas which reference the particular + // target schema -- is smaller, such that we do the minimum number of flattenings per target + // schema, to keep the schema as small as we reasonably can. 
+ if let Some(reference) = schema.reference.as_ref() { + let current_parent_schema_ref = self.get_current_schema_scope(); + + if let Some(referrers) = self.eligible_to_flatten.get(reference) { + if referrers.contains(¤t_parent_schema_ref) { + let current_schema_ref = get_cleaned_schema_reference(reference); + let referenced_schema = definitions + .get(current_schema_ref) + .expect("schema definition must exist"); + + debug!( + referent = current_schema_ref, + referrer = current_parent_schema_ref.as_ref(), + "Found eligible referent/referrer mapping." + ); + + if let Schema::Object(referenced_schema) = referenced_schema { + debug!( + referent = current_schema_ref, + referrer = current_parent_schema_ref.as_ref(), + "Flattening referent into referrer." + ); + + schema.reference = None; + schema.merge(&referenced_schema); + } + } + } + } + // Visit the schema object first so that we recurse the overall schema in a depth-first - // fashion, applying the unevaluated properties change from the bottom up. - visit_schema_object(self, definitions, schema); + // fashion, marking eligible object schemas as closed. + visit_schema_object_scoped(self, definitions, schema); // Next, see if this schema has any subschema validation: `allOf`, `oneOf`, or `anyOf`. // @@ -42,39 +163,378 @@ impl Visitor for DisallowedUnevaluatedPropertiesVisitor { // which is why we remove that from the subschemas themselves but essentially hoist it up // to the level of the `allOf`/`oneOf`/`anyOf`, where it can apply the correct behavior. let mut had_relevant_subschemas = false; - if let Some(subschemas) = get_subschema_validators(schema) { - had_relevant_subschemas = true; - + if let Some(subschema) = schema.subschemas.as_mut() { + let mut subschemas = get_object_subschemas_from_parent_mut(subschema.as_mut()); for subschema in subschemas { - // If the schema is an object schema, we'll unset `unevaluatedProperties` directly. 
- // If it isn't an object schema, we'll see if the subschema is actually a schema - // reference, and if so, we'll make sure to unset `unevaluatedProperties` on the - // resolved schema reference itself. - // - // Like the top-level schema reference logic, this ensures the schema definition is - // updated for subsequent resolution. - if let Some(object) = subschema.object.as_mut() { - object.unevaluated_properties = Some(Box::new(Schema::Bool(true))); - } else { - with_resolved_schema_reference(definitions, subschema, |_, resolved| { - if let Schema::Object(schema) = resolved { - if let Some(object) = schema.object.as_mut() { - object.unevaluated_properties = Some(Box::new(Schema::Bool(true))); - } - } - }); - } + had_relevant_subschemas = true; + + unmark_or_flatten_schema(definitions, subschema); } } // If we encountered any subschema validation, or if this schema itself is an object schema, // mark the schema as closed by setting `unevaluatedProperties` to `false`. - if had_relevant_subschemas || is_object_schema(schema.instance_type.as_ref()) { + if had_relevant_subschemas || is_object_schema(schema) { mark_schema_closed(schema); } } } +impl ScopedVisitor for DisallowedUnevaluatedPropertiesVisitor { + fn push_schema_scope>(&mut self, scope: S) { + self.scope_stack.push(scope.into()); + } + + fn pop_schema_scope(&mut self) { + self.scope_stack.pop().expect("stack was empty during pop"); + } + + fn get_current_schema_scope(&self) -> &SchemaReference { + self.scope_stack + .current() + .unwrap_or_else(|| &SchemaReference::Root) + } +} + +fn is_non_object_optional_schema( + definitions: &mut Map, + schema: &mut SchemaObject, +) -> bool { + // Only match schemas that have subschema validation in the form of `oneOf`, and nothing else. 
+ let subschemas = match schema.subschemas.as_ref() { + None => return false, + Some(subschema) => { + let has_non_one_of = subschema.all_of.is_some() || subschema.any_of.is_some(); + match subschema.one_of.as_ref() { + Some(one_of) if !has_non_one_of => one_of, + _ => return false, + } + } + }; + + // There can only be two subschemas, one of which is simply a `null` schema. + if subschemas.len() != 2 { + return false; + } + + let maybe_non_null_schema_idx = subschemas[0] + .as_object() + .map(is_null_schema) + .and_then(|x| x.then(|| 1)) + .or_else(|| { + subschemas[1] + .as_object() + .map(is_null_schema) + .and_then(|x| x.then(|| 0)) + }); + + let non_null_schema = match maybe_non_null_schema_idx { + Some(idx) => &subschemas[idx], + None => return false, + }; + + // Check if the non-null schema is an object schema or not. + non_null_schema + .as_object() + .map(|schema| !is_object_schema(schema)) + .unwrap_or(true) +} + +fn unmark_or_flatten_schema(definitions: &mut Map, schema: &mut SchemaObject) { + // If the schema is an object schema, we'll unset `unevaluatedProperties` directly. + // If it isn't an object schema, we'll see if the subschema is actually a schema + // reference, and if so, we'll make sure to unset `unevaluatedProperties` on the + // resolved schema reference itself. + // + // Like the top-level schema reference logic, this ensures the schema definition is + // updated for subsequent resolution. + if let Some(object) = schema.object.as_mut() { + debug!("Unmarked object subschema directly."); + + object.unevaluated_properties = Some(Box::new(Schema::Bool(true))); + } else { + with_resolved_schema_reference(definitions, schema, |_, schema_ref, resolved| { + if let Schema::Object(resolved) = resolved { + if let Some(object) = resolved.object.as_mut() { + debug!( + referent = schema_ref, + "Unmarked subschema by traversing schema reference." 
+ ); + + object.unevaluated_properties = Some(Box::new(Schema::Bool(true))); + } + } + }); + } +} + +/// A referent schema that carries the chance of being unmarking by its referrer. +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +struct MarkableReferent { + // Whether or not the referent would be unmarked by the referrer. + would_unmark: bool, + + /// The referent schema. + referent: SchemaReference, +} + +impl MarkableReferent { + fn would_unmark>(referent: R) -> Self { + Self { + would_unmark: true, + referent: referent.into(), + } + } + + fn would_not_unmark>(referent: R) -> Self { + Self { + would_unmark: false, + referent: referent.into(), + } + } + + fn with_new_referent>(&self, new_referent: R) -> Self { + Self { + would_unmark: self.would_unmark, + referent: new_referent.into(), + } + } +} + +#[derive(Debug)] +struct ChildMetadata { + /// Whether or not this schema, in isolation, should be marked. + should_mark: bool, + + /// The set of referring schemas. + referrers: HashSet, +} + +impl ChildMetadata { + fn from_schema(schema: &SchemaObject) -> Self { + Self { + should_mark: is_markable_schema(schema), + referrers: HashSet::new(), + } + } +} + +fn build_closed_schema_flatten_eligibility_mappings( + root_schema: &RootSchema, +) -> HashMap { + // Iterate over all definitions, and once more for the root schema, and generate a map of parent + // schema -> (would_unmark, child schema). + let mut parent_to_child = HashMap::new(); + for (definition_name, definition) in &root_schema.definitions { + // We only care about full-fledged schemas, not boolean schemas. + let parent_schema = match definition { + Schema::Bool(_) => continue, + Schema::Object(schema) => schema, + }; + + // If a schema itself would not be considered markable, then we don't need to consider the + // eligibility between parent/child since there's nothing to drive the "now unmark the child + // schemas" logic. 
+ if !is_markable_schema(parent_schema) { + continue; + } + + // Collect all referents for this definition, which includes both property-based referents + // and subschema-based referents. Property-based referents are not required to be unmarked, + // while subschema-based referents must be unmarked. + let mut referents = HashSet::new(); + get_referents(parent_schema, &mut referents); + + // Store the parent/child mapping. + parent_to_child.insert(SchemaReference::from(definition_name), referents); + } + + // Collect the referents from the root schema. + let mut root_referents = HashSet::new(); + get_referents(&root_schema.schema, &mut root_referents); + parent_to_child.insert(SchemaReference::Root, root_referents); + + // Now we build a reverse map, going from child -> parent. We'll iterate over every child + // referent, for every parent/child entry, calcuating the set of referrers, and if they would + // require unmarking the child. + let mut child_to_parent = HashMap::new(); + for (parent_schema_ref, child_referents) in parent_to_child { + for child_referent in child_referents { + let entry = child_to_parent + .entry(child_referent.referent.as_ref().to_string()) + .or_insert_with(|| { + let child_schema = + get_schema_from_reference(root_schema, &child_referent.referent); + ChildMetadata::from_schema(child_schema) + }); + + // Transform the child referent into a parent referent, which preserves the "would + // unmark" value but now points to the parent instead, and add it to the list of + // _referrers_ for the child. + entry + .referrers + .insert(child_referent.with_new_referent(parent_schema_ref.clone())); + } + } + + /* + // Now filter out entries that are not be eligible for flattening. + child_to_parent + .into_iter() + // Filter out all mappings that don't involve markable schemas. 
+ .filter(|(_, metadata)| metadata.should_mark) + // Filter out all mappings where all referrers have the same marking behavior for the + // referrent, as there's nothing to differentiate between for flattening vs not flattening, + // so long + .filter(|(_, metadata)| { + !(metadata.referrers.iter().all(|r| r.would_unmark) + || metadata.referrers.iter().all(|r| !r.would_unmark)) + }) + .collect()*/ + + child_to_parent +} + +fn get_schema_from_reference<'a>( + root_schema: &'a RootSchema, + schema_ref: &SchemaReference, +) -> &'a SchemaObject { + match schema_ref { + SchemaReference::Root => &root_schema.schema, + SchemaReference::Definition(name) => { + let definition_name = get_cleaned_schema_reference(name.as_str()); + match root_schema.definitions.get(definition_name) { + None => panic!("Schema definition '{}' does not exist!", definition_name), + Some(schema) => schema + .as_object() + .expect("referenced schemas should never be boolean schemas"), + } + } + } +} + +/// Determines whether a schema is eligible to be marked. +fn is_markable_schema(schema: &SchemaObject) -> bool { + // If the schema is an object schema, and does not have`additionalProperties` set, it can be + // marked, as marking a schema with both `unevaluatedProperties`/`additionalProperties` would + // otherwise be a logical inconsistency. + let has_additional_properties = schema + .object + .as_ref() + .and_then(|object| object.additional_properties.as_ref()) + .map(|schema| matches!(schema.as_ref(), Schema::Object(_))) + .unwrap_or(false); + + if is_object_schema(schema) && !has_additional_properties { + return true; + } + + // If the schema uses subschema validation -- specifically: `allOf`, `oneOf`, or `anyOf` -- then + // it should be marked, so long as one of the subschemas is actually an object schema. + // + // If we're dealing with something like a `oneOf` for `Option`, we'll have two + // subschemas: { "type": "null" } and { "$ref": "#/definitions/T" }. 
If the schema for `T` is, + // say, just a scalar schema, instead of an object schema... then it wouldn't be marked, and in + // turn, we wouldn't need to mark the schema for `Option`: there's no properties at all. + if let Some(subschema) = schema.subschemas.as_ref() { + let mut subschemas = get_object_subschemas_from_parent(subschema); + + let has_object_subschema = subschemas.any(is_object_schema); + if has_object_subschema { + return true; + } + } + + false +} + +/// Collects all referents from the given parent schema, and inserts them to `referents`. +/// +/// Property schemas from `properties`, `patternProperties`, and `additionalProperties` are checked. +/// Any such referents in a property schema are do not need to be unmarked as the "chain" between +/// parent/child is broken implicitly by the property-level scoping of the value they would be given +/// to validate. +/// +/// Subschemas from `allOf`, `oneOf`, and `anyOf` are also checked. As subschema validation implies +/// that each subschema will be given the same value to validate, even if the subschema only +/// represents a slice of the parent schema, there is a link between parent/child that requires the +/// child to be unmarked so that the parent can be marked to enforce `unevaluatedProperties` at the +/// correct scope. +/// +/// This function will recurse a schema object entirely, in terms of property schemas and +/// subschemas, but will not recurse through schema references. +fn get_referents(parent_schema: &SchemaObject, referents: &mut HashSet) { + if let Some(parent_object) = parent_schema.object.as_ref() { + // For both `properties` and `patternProperties`, collect the schema reference, if any, from + // all property schemas. 
+ for (_, property_schema) in parent_object + .properties + .iter() + .chain(parent_object.pattern_properties.iter()) + { + if let Some(child_schema) = property_schema.as_object() { + if let Some(child_schema_ref) = child_schema.reference.as_ref() { + referents.insert(MarkableReferent::would_not_unmark(child_schema_ref)); + } else { + get_referents(child_schema, referents); + } + } + } + + // For `additionalProperties`, if present and defined as a schema object, collect the schema + // reference if one is set. + if let Some(additional_properties) = parent_object.additional_properties.as_ref() { + if let Some(child_schema) = additional_properties.as_ref().as_object() { + if let Some(child_schema_ref) = child_schema.reference.as_ref() { + referents.insert(MarkableReferent::would_not_unmark(child_schema_ref)); + } else { + get_referents(child_schema, referents); + } + } + } + } + + if let Some(subschema) = parent_schema.subschemas.as_ref() { + // For `allOf`, `oneOf`, and `anyOf`, collect the schema reference, if any, from their + // respective subschemas. 
+ for subschema in get_object_subschemas_from_parent(subschema) { + if let Some(child_schema_ref) = subschema.reference.as_ref() { + referents.insert(MarkableReferent::would_unmark(child_schema_ref)); + } else { + get_referents(subschema, referents); + } + } + } +} + +fn get_object_subschemas_from_parent( + subschema: &SubschemaValidation, +) -> impl Iterator { + [ + subschema.all_of.as_ref(), + subschema.one_of.as_ref(), + subschema.any_of.as_ref(), + ] + .into_iter() + .filter_map(identity) + .flat_map(identity) + .filter_map(Schema::as_object) +} + +fn get_object_subschemas_from_parent_mut( + subschema: &mut SubschemaValidation, +) -> impl Iterator { + [ + subschema.all_of.as_mut(), + subschema.one_of.as_mut(), + subschema.any_of.as_mut(), + ] + .into_iter() + .filter_map(identity) + .flat_map(identity) + .filter_map(Schema::as_object_mut) +} + fn mark_schema_closed(schema: &mut SchemaObject) { // Make sure this schema doesn't also have `additionalProperties` set to a non-boolean schema, // as it would be a logical inconsistency to then also set `unevaluatedProperties` to `false`. 
@@ -110,16 +570,28 @@ fn mark_schema_closed(schema: &mut SchemaObject) { schema.object().unevaluated_properties = Some(Box::new(Schema::Bool(false))); } -fn is_object_schema(instance_type: Option<&SingleOrVec>) -> bool { - match instance_type { +fn schema_type_matches( + schema: &SchemaObject, + instance_type: InstanceType, + allow_multiple: bool, +) -> bool { + match schema.instance_type.as_ref() { Some(sov) => match sov { - SingleOrVec::Single(inner) => inner.as_ref() == &InstanceType::Object, - SingleOrVec::Vec(inner) => inner.contains(&InstanceType::Object), + SingleOrVec::Single(inner) => inner.as_ref() == &instance_type, + SingleOrVec::Vec(inner) => inner.contains(&instance_type) && allow_multiple, }, None => false, } } +fn is_object_schema(schema: &SchemaObject) -> bool { + schema_type_matches(schema, InstanceType::Object, true) +} + +fn is_null_schema(schema: &SchemaObject) -> bool { + schema_type_matches(schema, InstanceType::Null, false) +} + fn get_subschema_validators(schema: &mut SchemaObject) -> Option> { let mut validators = vec![]; @@ -159,22 +631,12 @@ fn get_subschema_validators(schema: &mut SchemaObject) -> Option RootSchema { - serde_json::from_value(value).expect("should not fail to deserialize schema") - } + use serde_json::json; + use vector_config_common::schema::visit::Visitor; - fn assert_schemas_eq(expected: RootSchema, actual: RootSchema) { - let expected_json = serde_json::to_string_pretty(&expected).expect("should not fail"); - let actual_json = serde_json::to_string_pretty(&actual).expect("should not fail"); + use crate::schema::visitors::test::{as_schema, assert_schemas_eq}; - assert_eq!(expected_json, actual_json); - } + use super::DisallowedUnevaluatedPropertiesVisitor; #[test] fn basic_object_schema() { @@ -465,4 +927,50 @@ mod tests { assert_schemas_eq(expected_schema, actual_schema); } + + #[test] + fn conflicting_schema_usages_get_duplicated_and_flattened() { + let mut actual_schema = as_schema(json!({ + "type": "object", + 
"properties": { + "acks": { "$ref": "#/definitions/acks" }, + "custom_acks": { "$ref": "#/definitions/custom_acks" } + }, + "definitions": { + "custom_acks": { + "allOf": [{ "type": "object", "properties": { "ack_count": { "type": "number" } } }, + { "$ref": "#/definitions/acks" }] + }, + "acks": { "type": "object", "properties": { "enabled": { "type": "boolean" } } } + } + })); + + let mut visitor = DisallowedUnevaluatedPropertiesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + let expected_schema = as_schema(json!({ + "type": "object", + "properties": { + "acks": { "$ref": "#/definitions/acks" }, + "custom_acks": { "$ref": "#/definitions/custom_acks" } + }, + "definitions": { + "custom_acks": { + "allOf": [ + { "type": "object", "properties": { "ack_count": { "type": "number" } } }, + { "type": "object", "properties": { "enabled": { "type": "boolean" } } } + ], + "unevaluatedProperties": false + }, + "acks": { + "type": "object", + "properties": { "enabled": { "type": "boolean" } }, + "unevaluatedProperties": false + } + }, + "unevaluatedProperties": false + })); + + assert_schemas_eq(expected_schema, actual_schema); + } } diff --git a/scripts/generate-component-docs.rb b/scripts/generate-component-docs.rb index b9a7e98381696..ed47670470c6f 100755 --- a/scripts/generate-component-docs.rb +++ b/scripts/generate-component-docs.rb @@ -603,6 +603,17 @@ def expand_schema_references(root_schema, unexpanded_schema) } end + if !schema['anyOf'].nil? 
+ schema['anyOf'] = schema['anyOf'].map { |subschema| + new_subschema = expand_schema_references(root_schema, subschema) + if new_subschema != subschema + expanded = true + end + + new_subschema + } + end + if !expanded break end @@ -914,10 +925,10 @@ def resolve_bare_schema(root_schema, schema) @logger.error "Relevant schema: #{JSON.pretty_generate(schema)}" exit 1 end - additional_properties['description'] = singular_description resolved_additional_properties = resolve_schema(root_schema, additional_properties) resolved_additional_properties['required'] = true + resolved_additional_properties['description'] = singular_description options.push(['*', resolved_additional_properties]) end diff --git a/website/cue/reference/components/sinks/base/gcp_stackdriver_logs.cue b/website/cue/reference/components/sinks/base/gcp_stackdriver_logs.cue index 6f73a93dd3cd8..646e9d3ec69a2 100644 --- a/website/cue/reference/components/sinks/base/gcp_stackdriver_logs.cue +++ b/website/cue/reference/components/sinks/base/gcp_stackdriver_logs.cue @@ -314,8 +314,12 @@ base: components: sinks: gcp_stackdriver_logs: configuration: { } } resource: { - description: "The monitored resource to associate the logs with." - required: true + description: """ + A monitored resource. + + The monitored resource to associate the logs with. + """ + required: true type: object: { examples: [{ instanceId: "Twilight" diff --git a/website/cue/reference/components/sinks/base/gcp_stackdriver_metrics.cue b/website/cue/reference/components/sinks/base/gcp_stackdriver_metrics.cue index f0edf5c8474a2..6f58d57ca46a9 100644 --- a/website/cue/reference/components/sinks/base/gcp_stackdriver_metrics.cue +++ b/website/cue/reference/components/sinks/base/gcp_stackdriver_metrics.cue @@ -256,8 +256,12 @@ base: components: sinks: gcp_stackdriver_metrics: configuration: { } } resource: { - description: "The monitored resource to associate the metrics with." - required: true + description: """ + A monitored resource. 
+ + The monitored resource to associate the metrics with. + """ + required: true type: object: { examples: [{ instanceId: "Twilight" diff --git a/website/cue/reference/components/sinks/base/http.cue b/website/cue/reference/components/sinks/base/http.cue index edc96ae57ba11..7eefce772c920 100644 --- a/website/cue/reference/components/sinks/base/http.cue +++ b/website/cue/reference/components/sinks/base/http.cue @@ -310,8 +310,12 @@ base: components: sinks: http: configuration: { } } method: { - description: "The HTTP method to use when making the request." - required: false + description: """ + HTTP method. + + The HTTP method to use when making the request. + """ + required: false type: string: { default: "post" enum: { From cd6d7164e21ea0008133f258a44ca6724e6491f6 Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Fri, 7 Apr 2023 13:12:14 -0400 Subject: [PATCH 2/5] fix flattening --- .../src/schema/visitors/unevaluated.rs | 242 +++++++++++++----- 1 file changed, 172 insertions(+), 70 deletions(-) diff --git a/lib/vector-config/src/schema/visitors/unevaluated.rs b/lib/vector-config/src/schema/visitors/unevaluated.rs index 4aa34dcfbb2b8..3569fa757e0a7 100644 --- a/lib/vector-config/src/schema/visitors/unevaluated.rs +++ b/lib/vector-config/src/schema/visitors/unevaluated.rs @@ -47,51 +47,7 @@ impl DisallowedUnevaluatedPropertiesVisitor { impl Visitor for DisallowedUnevaluatedPropertiesVisitor { fn visit_root_schema(&mut self, root: &mut RootSchema) { - // Glossary: - // - // mark(ed): setting or having `unevaluatedProperties` set to `Some(Schema::Bool(false))` - // unmark(ed): unsetting or having `unevaluatedProperties` set to `None` - - // For all definitions, visit _just_ the defined schema (no recursing) and build a map of - // child definitions -> (mark eligibility, [(parent definition, would_unmark)]), such that we know exactly - // which schemas refer to any given schema definition and if they would lead to the child - // schema being marked as 
`unevaluatedProperties: false`. - // - // We would filter out any child definitions that aren't eligible to be marked. For the - // remaining child schemas, we group the parent definitions by `would_unmark`, which - // indicates whether or not the given parent definition would cause the child definition to - // be unmarked. - // - // As an example, we would expect a parent schema referring to a child schema via `allOf` to - // unmark the child schema, while a parent schema referring to a child schema within a - // specific property to not unmark the child schema. - // - // With the grouped parent definitions, take the smaller of the two groups. This represents - // the set of parent schemas that we will indicate as needing to use a flattened version of - // the child schema when we execute our primary visit logic. - - let mut eligible_to_flatten = HashMap::new(); - let flattening_eligibility_map = build_closed_schema_flatten_eligibility_mappings(root); - for (child_schema_ref, metadata) in flattening_eligibility_map { - let would_unmark = metadata - .referrers - .iter() - .filter(|r| r.would_unmark) - .map(|r| r.referent.clone()) - .collect::>(); - let would_not_unmark = metadata - .referrers - .iter() - .filter(|r| !r.would_unmark) - .map(|r| r.referent.clone()) - .collect::>(); - - if would_not_unmark.len() > would_unmark.len() { - eligible_to_flatten.insert(child_schema_ref.to_string(), would_unmark); - } else { - eligible_to_flatten.insert(child_schema_ref.to_string(), would_not_unmark); - } - } + let mut eligible_to_flatten = build_closed_schema_flatten_eligibility_mappings(root); debug!( "Found {} referents eligible for flattening: {:?}", @@ -311,9 +267,9 @@ struct ChildMetadata { } impl ChildMetadata { - fn from_schema(schema: &SchemaObject) -> Self { + fn from_schema(definitions: &Map, schema: &SchemaObject) -> Self { Self { - should_mark: is_markable_schema(schema), + should_mark: is_markable_schema(definitions, schema), referrers: HashSet::new(), } } @@ 
-321,7 +277,24 @@ impl ChildMetadata { fn build_closed_schema_flatten_eligibility_mappings( root_schema: &RootSchema, -) -> HashMap { +) -> HashMap> { + // For all definitions, visit _just_ the defined schema (no recursing) and build a map of child + // definitions -> (mark eligibility, [(parent definition, would_unmark)]), such that we know + // exactly which schemas refer to any given schema definition and if they would lead to the + // child schema being marked as `unevaluatedProperties: false`. + // + // We would filter out any child definitions that aren't eligible to be marked. For the + // remaining child schemas, we group the parent definitions by `would_unmark`, which indicates + // whether or not the given parent definition would cause the child definition to be unmarked. + // + // As an example, we would expect a parent schema referring to a child schema via `allOf` to + // unmark the child schema, while a parent schema referring to a child schema within a specific + // property to not unmark the child schema. + // + // With the grouped parent definitions, take the smaller of the two groups. This represents the + // set of parent schemas that we will indicate as needing to use a flattened version of the + // child schema when we execute our primary visit logic. + // Iterate over all definitions, and once more for the root schema, and generate a map of parent // schema -> (would_unmark, child schema). let mut parent_to_child = HashMap::new(); @@ -335,7 +308,8 @@ fn build_closed_schema_flatten_eligibility_mappings( // If a schema itself would not be considered markable, then we don't need to consider the // eligibility between parent/child since there's nothing to drive the "now unmark the child // schemas" logic. 
- if !is_markable_schema(parent_schema) { + if !is_markable_schema(&root_schema.definitions, parent_schema) { + debug!("Schema definition '{}' not markable.", definition_name); continue; } @@ -365,7 +339,7 @@ fn build_closed_schema_flatten_eligibility_mappings( .or_insert_with(|| { let child_schema = get_schema_from_reference(root_schema, &child_referent.referent); - ChildMetadata::from_schema(child_schema) + ChildMetadata::from_schema(&root_schema.definitions, child_schema) }); // Transform the child referent into a parent referent, which preserves the "would @@ -377,22 +351,34 @@ fn build_closed_schema_flatten_eligibility_mappings( } } - /* - // Now filter out entries that are not be eligible for flattening. - child_to_parent - .into_iter() - // Filter out all mappings that don't involve markable schemas. - .filter(|(_, metadata)| metadata.should_mark) - // Filter out all mappings where all referrers have the same marking behavior for the - // referrent, as there's nothing to differentiate between for flattening vs not flattening, - // so long - .filter(|(_, metadata)| { - !(metadata.referrers.iter().all(|r| r.would_unmark) - || metadata.referrers.iter().all(|r| !r.would_unmark)) - }) - .collect()*/ - - child_to_parent + let mut eligible_to_flatten = HashMap::new(); + for (child_schema_ref, metadata) in child_to_parent { + // Don't flatten schemas which has less than two referrers. 
+ if metadata.referrers.len() < 2 { + continue; + } + + let would_unmark = metadata + .referrers + .iter() + .filter(|r| r.would_unmark) + .map(|r| r.referent.clone()) + .collect::>(); + let would_not_unmark = metadata + .referrers + .iter() + .filter(|r| !r.would_unmark) + .map(|r| r.referent.clone()) + .collect::>(); + + if would_not_unmark.len() >= would_unmark.len() { + eligible_to_flatten.insert(child_schema_ref.to_string(), would_unmark); + } else { + eligible_to_flatten.insert(child_schema_ref.to_string(), would_not_unmark); + } + } + + eligible_to_flatten } fn get_schema_from_reference<'a>( @@ -414,7 +400,7 @@ fn get_schema_from_reference<'a>( } /// Determines whether a schema is eligible to be marked. -fn is_markable_schema(schema: &SchemaObject) -> bool { +fn is_markable_schema(definitions: &Map, schema: &SchemaObject) -> bool { // If the schema is an object schema, and does not have`additionalProperties` set, it can be // marked, as marking a schema with both `unevaluatedProperties`/`additionalProperties` would // otherwise be a logical inconsistency. @@ -436,11 +422,28 @@ fn is_markable_schema(schema: &SchemaObject) -> bool { // subschemas: { "type": "null" } and { "$ref": "#/definitions/T" }. If the schema for `T` is, // say, just a scalar schema, instead of an object schema... then it wouldn't be marked, and in // turn, we wouldn't need to mark the schema for `Option`: there's no properties at all. + // + // We'll follow schema references in subschemas up to one level deep trying to figure this out. 
if let Some(subschema) = schema.subschemas.as_ref() { - let mut subschemas = get_object_subschemas_from_parent(subschema); + let mut subschemas = get_object_subschemas_from_parent(subschema).collect::>(); - let has_object_subschema = subschemas.any(is_object_schema); - if has_object_subschema { + let has_object_subschema = subschemas.iter().any(|schema| is_object_schema(*schema)); + let has_referenced_object_subschema = subschemas + .iter() + .map(|subschema| { + subschema + .reference + .as_ref() + .and_then(|reference| { + let reference = get_cleaned_schema_reference(&reference); + definitions.get(reference) + }) + .and_then(Schema::as_object) + .map_or(false, is_object_schema) + }) + .any(identity); + + if has_object_subschema || has_referenced_object_subschema { return true; } } @@ -973,4 +976,103 @@ mod tests { assert_schemas_eq(expected_schema, actual_schema); } + + #[test] + fn multiple_mark_unmark_references_flattened_efficiently() { + // This tests that if, for example, one schema reference would be marked and unmarked by + // multiple referrers, the referrers we choose to flatten the reference on are in the + // smaller group (i.e. we do as few flattenings as possible). 
+ + let mut actual_schema = as_schema(json!({ + "type": "object", + "properties": { + "a": { "$ref": "#/definitions/a" }, + "b": { "$ref": "#/definitions/b" }, + "c": { "$ref": "#/definitions/c" }, + "one": { "$ref": "#/definitions/one" }, + "two": { "$ref": "#/definitions/two" } + }, + "definitions": { + "one": { + "allOf": [{ "$ref": "#/definitions/c" }] + }, + "two": { + "allOf": [{ "$ref": "#/definitions/b" }, { "$ref": "#/definitions/c" }] + }, + "a": { + "type": "object", + "properties": { "a": { "type": "boolean" } } + }, + "b": { + "type": "object", + "properties": { "b": { "type": "boolean" } } + }, + "c": { + "type": "object", + "properties": { "c": { "type": "boolean" } } + } + } + })); + + let mut visitor = DisallowedUnevaluatedPropertiesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + // Expectations: + // - Schema A is only referenced in an object property, so it's marked normally. + // - Schema B is referenced twice -- once as an object property and once in a subschema -- + // so since we prioritize flattening usages that would unmark a schema when the + // would-unmark/would-not-unmark counts are equal, schema B is only flattened for the + // subschema usage. + // - Schema C is referenced three times -- once as an object property and twice in a + // subschema -- so since there's more would-unmark usages than would-not-unmark usages, we + // flatten the smallest group of usages, which is the would-not-unmark group aka object + // properties. 
+ let expected_schema = as_schema(json!({ + "type": "object", + "properties": { + "a": { "$ref": "#/definitions/a" }, + "b": { "$ref": "#/definitions/b" }, + "c": { + "type": "object", + "properties": { "c": { "type": "boolean" } }, + "unevaluatedProperties": false + }, + "one": { "$ref": "#/definitions/one" }, + "two": { "$ref": "#/definitions/two" } + }, + "definitions": { + "one": { + "allOf": [{ "$ref": "#/definitions/c" }], + "unevaluatedProperties": false + }, + "two": { + "allOf": [ + { + "type": "object", + "properties": { "b": { "type": "boolean" } } + }, + { "$ref": "#/definitions/c" } + ], + "unevaluatedProperties": false + }, + "a": { + "type": "object", + "properties": { "a": { "type": "boolean" } }, + "unevaluatedProperties": false + }, + "b": { + "type": "object", + "properties": { "b": { "type": "boolean" } }, + "unevaluatedProperties": false + }, + "c": { + "type": "object", + "properties": { "c": { "type": "boolean" } } + } + }, + "unevaluatedProperties": false + })); + + assert_schemas_eq(expected_schema, actual_schema); + } } From 1a5b202c7bb4da2554365ea3085c2e466d918835 Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Fri, 7 Apr 2023 13:57:34 -0400 Subject: [PATCH 3/5] spelling error --- lib/vector-config/src/schema/visitors/unevaluated.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vector-config/src/schema/visitors/unevaluated.rs b/lib/vector-config/src/schema/visitors/unevaluated.rs index 3569fa757e0a7..520e6982d1863 100644 --- a/lib/vector-config/src/schema/visitors/unevaluated.rs +++ b/lib/vector-config/src/schema/visitors/unevaluated.rs @@ -329,7 +329,7 @@ fn build_closed_schema_flatten_eligibility_mappings( parent_to_child.insert(SchemaReference::Root, root_referents); // Now we build a reverse map, going from child -> parent. 
We'll iterate over every child - // referent, for every parent/child entry, calcuating the set of referrers, and if they would + // referent, for every parent/child entry, calculating the set of referrers, and if they would // require unmarking the child. let mut child_to_parent = HashMap::new(); for (parent_schema_ref, child_referents) in parent_to_child { From 310846b7be87c11ae335ccb916cc9fd911d6d9c5 Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Mon, 10 Apr 2023 17:19:27 -0400 Subject: [PATCH 4/5] bunch of cleanup related to removing allowance for warnings --- .../src/schema/visitors/unevaluated.rs | 177 +++--------------- 1 file changed, 21 insertions(+), 156 deletions(-) diff --git a/lib/vector-config/src/schema/visitors/unevaluated.rs b/lib/vector-config/src/schema/visitors/unevaluated.rs index 520e6982d1863..cfab8822df5be 100644 --- a/lib/vector-config/src/schema/visitors/unevaluated.rs +++ b/lib/vector-config/src/schema/visitors/unevaluated.rs @@ -1,15 +1,11 @@ -#![allow(warnings)] - use std::{ collections::{HashMap, HashSet}, convert::identity, - mem::discriminant, }; -use serde_json::Value; -use tracing::{debug, error}; +use tracing::debug; use vector_config_common::schema::{ - visit::{visit_schema_object, with_resolved_schema_reference, Visitor}, + visit::{with_resolved_schema_reference, Visitor}, *, }; @@ -47,7 +43,7 @@ impl DisallowedUnevaluatedPropertiesVisitor { impl Visitor for DisallowedUnevaluatedPropertiesVisitor { fn visit_root_schema(&mut self, root: &mut RootSchema) { - let mut eligible_to_flatten = build_closed_schema_flatten_eligibility_mappings(root); + let eligible_to_flatten = build_closed_schema_flatten_eligibility_mappings(root); debug!( "Found {} referents eligible for flattening: {:?}", @@ -80,7 +76,7 @@ impl Visitor for DisallowedUnevaluatedPropertiesVisitor { let current_parent_schema_ref = self.get_current_schema_scope(); if let Some(referrers) = self.eligible_to_flatten.get(reference) { - if 
referrers.contains(&current_parent_schema_ref) {
+            if referrers.contains(current_parent_schema_ref) {
                 let current_schema_ref = get_cleaned_schema_reference(reference);
                 let referenced_schema = definitions
                     .get(current_schema_ref)
@@ -100,7 +96,7 @@ impl Visitor for DisallowedUnevaluatedPropertiesVisitor {
                 );
 
                 schema.reference = None;
-                schema.merge(&referenced_schema);
+                schema.merge(referenced_schema);
             }
         }
     }
@@ -120,7 +116,7 @@ impl Visitor for DisallowedUnevaluatedPropertiesVisitor {
         // to the level of the `allOf`/`oneOf`/`anyOf`, where it can apply the correct behavior.
         let mut had_relevant_subschemas = false;
         if let Some(subschema) = schema.subschemas.as_mut() {
-            let mut subschemas = get_object_subschemas_from_parent_mut(subschema.as_mut());
+            let subschemas = get_object_subschemas_from_parent_mut(subschema.as_mut());
 
             for subschema in subschemas {
                 had_relevant_subschemas = true;
@@ -146,54 +142,8 @@ impl ScopedVisitor for DisallowedUnevaluatedPropertiesVisitor {
     }
 
     fn get_current_schema_scope(&self) -> &SchemaReference {
-        self.scope_stack
-            .current()
-            .unwrap_or_else(|| &SchemaReference::Root)
-    }
-}
-
-fn is_non_object_optional_schema(
-    definitions: &mut Map<String, Schema>,
-    schema: &mut SchemaObject,
-) -> bool {
-    // Only match schemas that have subschema validation in the form of `oneOf`, and nothing else.
-    let subschemas = match schema.subschemas.as_ref() {
-        None => return false,
-        Some(subschema) => {
-            let has_non_one_of = subschema.all_of.is_some() || subschema.any_of.is_some();
-            match subschema.one_of.as_ref() {
-                Some(one_of) if !has_non_one_of => one_of,
-                _ => return false,
-            }
-        }
-    };
-
-    // There can only be two subschemas, one of which is simply a `null` schema.
- if subschemas.len() != 2 { - return false; + self.scope_stack.current().unwrap_or(&SchemaReference::Root) } - - let maybe_non_null_schema_idx = subschemas[0] - .as_object() - .map(is_null_schema) - .and_then(|x| x.then(|| 1)) - .or_else(|| { - subschemas[1] - .as_object() - .map(is_null_schema) - .and_then(|x| x.then(|| 0)) - }); - - let non_null_schema = match maybe_non_null_schema_idx { - Some(idx) => &subschemas[idx], - None => return false, - }; - - // Check if the non-null schema is an object schema or not. - non_null_schema - .as_object() - .map(|schema| !is_object_schema(schema)) - .unwrap_or(true) } fn unmark_or_flatten_schema(definitions: &mut Map, schema: &mut SchemaObject) { @@ -257,24 +207,6 @@ impl MarkableReferent { } } -#[derive(Debug)] -struct ChildMetadata { - /// Whether or not this schema, in isolation, should be marked. - should_mark: bool, - - /// The set of referring schemas. - referrers: HashSet, -} - -impl ChildMetadata { - fn from_schema(definitions: &Map, schema: &SchemaObject) -> Self { - Self { - should_mark: is_markable_schema(definitions, schema), - referrers: HashSet::new(), - } - } -} - fn build_closed_schema_flatten_eligibility_mappings( root_schema: &RootSchema, ) -> HashMap> { @@ -336,36 +268,28 @@ fn build_closed_schema_flatten_eligibility_mappings( for child_referent in child_referents { let entry = child_to_parent .entry(child_referent.referent.as_ref().to_string()) - .or_insert_with(|| { - let child_schema = - get_schema_from_reference(root_schema, &child_referent.referent); - ChildMetadata::from_schema(&root_schema.definitions, child_schema) - }); + .or_insert_with(HashSet::new); // Transform the child referent into a parent referent, which preserves the "would // unmark" value but now points to the parent instead, and add it to the list of // _referrers_ for the child. 
- entry - .referrers - .insert(child_referent.with_new_referent(parent_schema_ref.clone())); + entry.insert(child_referent.with_new_referent(parent_schema_ref.clone())); } } let mut eligible_to_flatten = HashMap::new(); - for (child_schema_ref, metadata) in child_to_parent { - // Don't flatten schemas which has less than two referrers. - if metadata.referrers.len() < 2 { + for (child_schema_ref, referrers) in child_to_parent { + // Don't flatten schemas which have less than two referrers. + if referrers.len() < 2 { continue; } - let would_unmark = metadata - .referrers + let would_unmark = referrers .iter() .filter(|r| r.would_unmark) .map(|r| r.referent.clone()) .collect::>(); - let would_not_unmark = metadata - .referrers + let would_not_unmark = referrers .iter() .filter(|r| !r.would_unmark) .map(|r| r.referent.clone()) @@ -381,24 +305,6 @@ fn build_closed_schema_flatten_eligibility_mappings( eligible_to_flatten } -fn get_schema_from_reference<'a>( - root_schema: &'a RootSchema, - schema_ref: &SchemaReference, -) -> &'a SchemaObject { - match schema_ref { - SchemaReference::Root => &root_schema.schema, - SchemaReference::Definition(name) => { - let definition_name = get_cleaned_schema_reference(name.as_str()); - match root_schema.definitions.get(definition_name) { - None => panic!("Schema definition '{}' does not exist!", definition_name), - Some(schema) => schema - .as_object() - .expect("referenced schemas should never be boolean schemas"), - } - } - } -} - /// Determines whether a schema is eligible to be marked. fn is_markable_schema(definitions: &Map, schema: &SchemaObject) -> bool { // If the schema is an object schema, and does not have`additionalProperties` set, it can be @@ -425,9 +331,9 @@ fn is_markable_schema(definitions: &Map, schema: &SchemaObject) // // We'll follow schema references in subschemas up to one level deep trying to figure this out. 
if let Some(subschema) = schema.subschemas.as_ref() { - let mut subschemas = get_object_subschemas_from_parent(subschema).collect::>(); + let subschemas = get_object_subschemas_from_parent(subschema).collect::>(); - let has_object_subschema = subschemas.iter().any(|schema| is_object_schema(*schema)); + let has_object_subschema = subschemas.iter().any(|schema| is_object_schema(schema)); let has_referenced_object_subschema = subschemas .iter() .map(|subschema| { @@ -435,7 +341,7 @@ fn is_markable_schema(definitions: &Map, schema: &SchemaObject) .reference .as_ref() .and_then(|reference| { - let reference = get_cleaned_schema_reference(&reference); + let reference = get_cleaned_schema_reference(reference); definitions.get(reference) }) .and_then(Schema::as_object) @@ -519,8 +425,8 @@ fn get_object_subschemas_from_parent( subschema.any_of.as_ref(), ] .into_iter() - .filter_map(identity) - .flat_map(identity) + .flatten() + .flatten() .filter_map(Schema::as_object) } @@ -533,8 +439,8 @@ fn get_object_subschemas_from_parent_mut( subschema.any_of.as_mut(), ] .into_iter() - .filter_map(identity) - .flat_map(identity) + .flatten() + .flatten() .filter_map(Schema::as_object_mut) } @@ -591,47 +497,6 @@ fn is_object_schema(schema: &SchemaObject) -> bool { schema_type_matches(schema, InstanceType::Object, true) } -fn is_null_schema(schema: &SchemaObject) -> bool { - schema_type_matches(schema, InstanceType::Null, false) -} - -fn get_subschema_validators(schema: &mut SchemaObject) -> Option> { - let mut validators = vec![]; - - // Grab any subschemas for `allOf`/`oneOf`/`anyOf`, if present. - // - // There are other advanced validation mechanisms such as `if`/`then`/`else, but we explicitly - // don't handle them here as we don't currently use them in Vector's configuration schema. 
- if let Some(subschemas) = schema.subschemas.as_mut() { - if let Some(all_of) = subschemas.all_of.as_mut() { - validators.extend(all_of.iter_mut().filter_map(|validator| match validator { - Schema::Object(inner) => Some(inner), - _ => None, - })); - } - - if let Some(one_of) = subschemas.one_of.as_mut() { - validators.extend(one_of.iter_mut().filter_map(|validator| match validator { - Schema::Object(inner) => Some(inner), - _ => None, - })); - } - - if let Some(any_of) = subschemas.any_of.as_mut() { - validators.extend(any_of.iter_mut().filter_map(|validator| match validator { - Schema::Object(inner) => Some(inner), - _ => None, - })); - } - } - - if validators.is_empty() { - None - } else { - Some(validators) - } -} - #[cfg(test)] mod tests { use serde_json::json; From a6b4fc10de8d93e51cad90adf3c4951172207d56 Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Mon, 10 Apr 2023 17:25:09 -0400 Subject: [PATCH 5/5] unwind overzealous spelling file update --- .github/actions/spelling/expect.txt | 222 ++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index cd51390470244..476be7788f0ba 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -4,26 +4,37 @@ abcdefghijklmnopqrstuvwxyzand abced abortable acb +Acho ack'ing acking Acq +addrof +addtnl AEAD agentpayload aimd akx allowerased +alphanum +AMF amka +ampersat amping amqps amz amzn +anaana anchore androideabi andy +annanas ansicpg +ansix anumber anycondition +anymap anypb +ANZ apievent apipodspec apk @@ -34,12 +45,16 @@ aqf architecting archivable ARNOTAREALIDD +Arrowz arshiyasolei asdf asdfasdf +asis ASMS +aspiegel assertverify Asterix +Astring asynk atag atx @@ -62,14 +77,20 @@ autospawning autotools avro awscli +awseb awsec awslabs +aww axum Aziz azureresourceid babim +Baca +Bada badunit bak +baq +barbaz barfoo barieom baseof @@ -83,6 +104,7 @@ bcdea benchgen benchmarker 
benefritz +Berita bev bfb bgwriter @@ -91,6 +113,7 @@ Bincode bindgen bizbaz bla +blabla blabop blafoo blem @@ -98,14 +121,19 @@ blkio Bloblang Blockfi blockmanager +bloob blpop blt +bmobile +bononos bonzai +bools boop booper booperbot bopbla boringssl +boto bottlenecked bpower brackethighlighter @@ -175,6 +203,7 @@ cloudwatchlogs cmark CMK cnf +Coc CODEOWNERS colddb coldline @@ -182,11 +211,13 @@ commonmark comms compactarr compactmap +compiter componenterror componenteventsdropped componenteventsreceived componenteventssent composability +compvalue concating concats condrestart @@ -194,6 +225,7 @@ configkey configmap confl confy +Conkeror consigliere CONTEUDO cooldown @@ -216,21 +248,28 @@ cryptsoft csb cstats csvlog +cta Cthink +cubot customise customizability customtype cwl Dailywarehousing +damerau +darp daschl dashmap dataflows datafuselabs +Datanyze datasources datastream Datelike +dateref datid datname +davidhuie dbkind dbreader DBserver @@ -257,19 +296,23 @@ defattr defaultauthdb deff defghijklmnopqrstuvwxyzand +definit delaycompress deliverystream demoing +derp descriptorpb deser deserializations desync +DEUC developerguide devno DHAVE dhclient diffing diffs +DIGNO disintermediate distrib dld @@ -278,13 +321,17 @@ DNi dnsmsg docsearch docsrs +doesnotcrash dogcats Dogsketch dogsketches dogstatsd domhandler Doop +doot downcasted +DQUOTE +droid droptest dsl dstat @@ -301,11 +348,13 @@ eabihf eay ebfcee edenhill +Edg edns eeyun efg efgh Elhage +Eluga emca EMON Emph @@ -313,6 +362,7 @@ emptypb emt Enableable encodable +Encryptor endef endler enduml @@ -322,6 +372,7 @@ enumdecl enumflags ENVARS envsubst +EOI EOIG EOL'ed Erfxl @@ -329,11 +380,13 @@ Err'ing errorf Errorsfor esb +ESPN esque etheus etl ETNUV etsy +EUA eur eventarray evented @@ -343,7 +396,10 @@ eventstoredb eventstreamsink evictingpages evmap +evntslog +EVO evt +EWENE ewma examplegroup exitcodes @@ -356,17 +412,20 @@ extrepo Failable fakedata falco +fals fanatid fanouts fastcc fastmod fbarbar fbaro 
+FBAV fbf fcharset fcontext feeney festeburg +FFar ffebda ffeef fffffffff @@ -377,6 +436,7 @@ filecontents filterlist finalizable fingerprinter +firetv fizzaxbbuzz fizzbuzz fkn @@ -386,7 +446,11 @@ Flatbuffers flate flatmaps flattenings +fleep flork +florp +FME +fnclosure fng fnil Fomichev @@ -398,6 +462,7 @@ FOOBARy foobaz foobazbar foobla +foofoo foometric fooo foos @@ -416,16 +481,20 @@ fuchsnj fullhuman futs fuzzcheck +fuzzer fwfiles +FXPDEUC GAPI gaugehistogram gaugor GC'ing gcra +gdx geh genericify genproto genrsa +Genx geoffleyland geolite getelementptr @@ -470,6 +539,7 @@ hashring hashset hashsum hba +Hbb hdrhistogram headchunk hec @@ -479,6 +549,7 @@ heka hereregexp herestring hexdump +Hfoo highlighters histo hname @@ -505,6 +576,7 @@ iamanapikey iamasplunkhectoken iana iarna +Ideos idhack IDML ified @@ -518,6 +590,7 @@ incrementalize indexmap indicatif indoc +Inettv influxdata ingesters ingestor @@ -526,15 +599,19 @@ initech Instrumentable interpolatedstring interpretervm +intex invalidauth invokefunction invtrapezium +inzone +IOA ioapiset ionicon iostat iouring iowait IPORHOST +iru isainfo isdbgrid isp @@ -562,6 +639,7 @@ jstype jsvs jszwedko jtype +JUC juchiast karppila kartar @@ -571,6 +649,7 @@ keybase keyclock keyid keypair +keystream keyxxxxx khvzak kib @@ -586,11 +665,13 @@ labelmap lalrpop Lamport landingpad +lastitem lastword ldd leebenson leveldb lfd +lfoo libclang LIBGNUTLS liblogging @@ -601,6 +682,7 @@ linting listenfd litstr llen +Lme lnt lntable lntd @@ -610,6 +692,7 @@ logbar logdna logevents logfmt +logid lognamespace lognamespacing logplex @@ -619,11 +702,20 @@ logsense logsev logseverity logtypes +loguid +lookaround +lookupbufs losslessly lpop lpush +LPW +LQuery +LRule +LSQRBRACKET +lstr Luc luciofranco +luckystar lucperkins lukesteensen macports @@ -634,9 +726,11 @@ Makefiles markdownify markdownlintrc marketo +matchall maxdepth maxed maxes +maxint maxs maxwritten maybeanothertest @@ -667,11 +761,14 @@ minioadmin miniodat minwindef mio 
+mirall misordering +Miui mkcert mkto mlua mmdb +mmdd Mmm moby mockwatchlogs @@ -679,10 +776,13 @@ modulesloaddir mooper moosh Mooshing +morefield moretags +morevalue mortems motivatingly MOZGIII +mpms mre msgpack mskv @@ -692,6 +792,8 @@ msv multiarch multievents multitenant +multiterm +multitermlookahead munging musleabihf muslueabihf @@ -709,10 +811,12 @@ mymachine mypod mytable myvalue +MZX nacked nacks Namazu namespacefoo +nananaman nananana nanosecs nats @@ -722,14 +826,19 @@ nbase ndarray ndjson nearline +nestedkey +NETTV neuronull newcerts newrelix nextest +nfield nfox ngx nightlies nindent +ning +nink nkey nmeta noack @@ -743,6 +852,7 @@ nomac NONALPHANUM nonbare noncycles +nonk nonsending nonstring noog @@ -753,6 +863,8 @@ nopqrstuvwxyz norc norecurse noreplace +norg +norgle norknoog norknork nosync @@ -769,26 +881,40 @@ nowin npipe NQTP nresamples +nullandnull nullishness numbackends +numericstart oap +Obar +Obigo OKD omfwd omitempty +ond +Onefootball oneline oneof onezone +onik +onk onlyfields onlyone ooba oobar ook +oopsie opcounters openstring opinsights oplog +opples +OPR optimizable +Optimus +organisations orgid +originsicname ostype otel otelcol @@ -798,8 +924,10 @@ otlp otlphttp ouicompat outputspatterns +outzone overaligned overalignment +overwritable owo oyaml pablosichert @@ -812,7 +940,9 @@ passthrough patchelf pathbuf pathgen +peci peekable +peeker PEMS pgmajfault PII @@ -824,6 +954,7 @@ plork pnh podspec Ponge +ponk portpicker POSINT postinst @@ -839,7 +970,9 @@ prerot presetdir pretrunc prettydiff +prettytable primaryfont +printstd probot processname procid @@ -852,14 +985,23 @@ protoc protofbuf protosizer Prt +PRTG psv +PTST publickey purgecss pyld +QFn +QGIS +qmobile qqq +qstr +queryroot +QUESTIONMARK quickcheck quix quuux +quuz qux quz qwe @@ -868,6 +1010,7 @@ rande RANDFILE rawconfig rawstring +rbaz rdkafka rdr readnone @@ -879,15 +1022,18 @@ referenceable regexes regexset reinstantiate +Rekonq reloadable remapper remotehost 
reorganisation reparse +replacen replacepkgs replicaset replset reqwest +rer rereleased reserialize resharding @@ -895,6 +1041,7 @@ resourcemanager respawn restorecon retryable +rhv rkyv rmem rmi @@ -903,10 +1050,18 @@ rmpv rndc rngs rolledback +rootquery +roxmltree rpd rpush +rquery +RRRRRRRLLLLLLLLLLLLLLLLLLLLLLLL +RRule +rsplitn +RSQRBRACKET rstrings RTTs +rulenum runc runhcs rusoto @@ -919,11 +1074,13 @@ Rustinomicon rustls RUSTSEC rustup +rustyline rxi rxmsg rxs ryangjchandler ryu +saanich sadf samehost samenet @@ -931,10 +1088,12 @@ samerole sameuser sandboxed sandboxing +sby sccache schemaless schemars schoen +schucks scl sda sdata @@ -942,13 +1101,16 @@ SDID seahash secfrac Seedable +segmentbuf semanage sematext SEO +sequencenum serie serverlogs serviceaccount servicebus +Seznam sfixed sfrag sghall @@ -956,6 +1118,8 @@ shane sharedstatedir Shenzhen shiplift +shning +shnoog shortcode shortstat should've @@ -974,6 +1138,7 @@ sizecache Sizefor skaffold skinparam +SKIPDATA skywalking slashin slf @@ -1003,14 +1168,18 @@ spencergilbert splitn SPOF spog +spork springframework srcport SREs +SResult sret SRPMS ssekms ssn sspi +SSSZ +SSSZZ sstrings stabilises stackdrive @@ -1028,12 +1197,14 @@ strat strconv streamsink strng +strp structfield subchunks suberr subfolders subfooter sublimelinter +sublocation subsec substrategies subtagline @@ -1046,6 +1217,7 @@ supertrait suser sustainability svalue +Swiftfox Sya sysfs sysinit @@ -1055,6 +1227,7 @@ systemid Syu Szwedko tablesmap +tac tagfoo tagline tagset @@ -1064,11 +1237,18 @@ Takeaways targetgroup tarpit tcmalloc +tdkind +Techvision +tecno +Teleca +Telechips telecom +Telesys templatable templateable templating terabytes +termcolor terraform tes testevent @@ -1083,8 +1263,11 @@ thaweddb thicc Thimphu thinkies +Thisais thiserror thisisastring +thonk +thot threatmanager throughputs thrpt @@ -1103,6 +1286,7 @@ tobz tocbot todos tokio +Tolino Tomola tonydanza toolbars @@ -1110,30 +1294,41 @@ toolchains TOOLSDIRECTORY 
toolset toor +topbuzz topdir topojson toproto +torvec +Toughpad Toxiproxy Tpng Trauring Treemap Trello +Treo trialled triggerable tripwires Trivago trivy Troutwine +tru TRUSTSTORE TSDB Tsvg turbofish +Twitterbot twox txmsg txs +Tygron typechecked typetag +tzs uap +uaparser +uas +UCWEB udm UIDs uieao @@ -1143,7 +1338,9 @@ unacked undertagline underutilized underutilizing +Unescaping unevictable +ungrokkable unioning unitdir unmark @@ -1152,6 +1349,7 @@ unnests unstructuredlogentries unsync untuple +uol upgradable urql usecase @@ -1159,6 +1357,7 @@ userinfo userlands usermod userpass +ustr uucp UVY uwtable @@ -1166,6 +1365,7 @@ valfoo validpaths Varda vdev +Vdroid VECTORCFG vectordir vectordotdev @@ -1173,15 +1373,19 @@ vectorized vendored veryyyyyyy viceversa +VIERA viewkind visualising +VLC VMs VNQ volumeconfig vrl +VRR vts vvo vvv +VVVVVVVVRRRRRRRRRRRRRRRRR VYa wahhh waitsforfullbatch @@ -1192,11 +1396,16 @@ waninfo wasmtime watchexec watchlogs +Waterfox wayfor webgraphviz webservers websites +webviews weee +weekyear +Wellco +Weltbild wemustcontinuetodiscard wensite whoopsie @@ -1205,6 +1414,7 @@ willreturn winioctl wiredtiger wiremock +with'quote wix wixobj WIXUI @@ -1221,15 +1431,20 @@ writablelib writeback wrongpass wrongsecret +wronly wtcache wtime wtimeouts wtr wurstmeister wwang +xaablabla xact +xbar xcatsy Xcg +XENENE +Xiao xlarge xpack xscale @@ -1238,6 +1453,9 @@ XUtil xvf XVXv xxs +xxxxxxx +xxxxxxxxxxxxxxx +xxxxxxxxxxxxxxxxxxxxxxx xzy YAMLs YBv @@ -1247,15 +1465,19 @@ yippeee yolo YRjhx ytt +zalgo zam +Zania ZDfz zerk zibble zieme +Zii zirp zoob zoobub zoog +zook zoop zork zorp