diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index 3576156ce74a8..476be7788f0ba 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -445,6 +445,7 @@ flamegraph Flatbuffers flate flatmaps +flattenings fleep flork florp @@ -1342,6 +1343,7 @@ unevictable ungrokkable unioning unitdir +unmark unmarshaler unnests unstructuredlogentries diff --git a/Cargo.lock b/Cargo.lock index a7c1c451700d1..d0e3c11115fd3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9684,6 +9684,7 @@ dependencies = [ "serde_with 2.3.2", "snafu", "toml 0.7.3", + "tracing 0.1.37", "url", "vector-config-common", "vector-config-macros", @@ -9702,6 +9703,7 @@ dependencies = [ "serde", "serde_json", "syn 1.0.109", + "tracing 0.1.37", ] [[package]] diff --git a/lib/vector-config-common/Cargo.toml b/lib/vector-config-common/Cargo.toml index fc9025f49af8d..d8eac2cfd0485 100644 --- a/lib/vector-config-common/Cargo.toml +++ b/lib/vector-config-common/Cargo.toml @@ -11,4 +11,5 @@ proc-macro2 = { version = "1.0", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } syn = { version = "1.0", features = ["full", "extra-traits", "visit-mut", "visit"] } +tracing = { version = "0.1.34", default-features = false } quote = { version = "1.0", default-features = false } diff --git a/lib/vector-config-common/src/schema/json_schema.rs b/lib/vector-config-common/src/schema/json_schema.rs index d1ef4a0078ee2..e2b32c31efd18 100644 --- a/lib/vector-config-common/src/schema/json_schema.rs +++ b/lib/vector-config-common/src/schema/json_schema.rs @@ -1,8 +1,8 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; -use std::ops::Deref; +use std::{iter, ops::Deref}; -use super::{Map, Set}; +use super::{Map, Set, DEFINITIONS_PREFIX}; /// A JSON Schema. #[allow(clippy::large_enum_variant)] @@ -40,6 +40,26 @@ impl Schema { } } + /// Gets a reference to the inner schema object if this schema is a JSON Schema object. + /// + /// Otherwise, `None` is returned. + pub fn as_object(&self) -> Option<&SchemaObject> { + match self { + Schema::Object(schema) => Some(schema), + _ => None, + } + } + + /// Gets a mutable reference to the inner schema object if this schema is a JSON Schema object. + /// + /// Otherwise, `None` is returned. + pub fn as_object_mut(&mut self) -> Option<&mut SchemaObject> { + match self { + Schema::Object(schema) => Some(schema), + _ => None, + } + } + /// Converts the given schema (if it is a boolean schema) into an equivalent schema object. /// /// If the given schema is already a schema object, this has no effect. @@ -539,6 +559,30 @@ pub enum SingleOrVec { Vec(Vec), } +impl Extend for SingleOrVec { + fn extend>(&mut self, iter: I) { + match self { + Self::Single(item) => { + *self = Self::Vec(iter::once(*item.clone()).chain(iter.into_iter()).collect()); + } + Self::Vec(items) => items.extend(iter), + } + } +} + +impl<'a, T> IntoIterator for &'a SingleOrVec { + type Item = &'a T; + + type IntoIter = std::slice::Iter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + match self { + SingleOrVec::Single(item) => std::slice::from_ref(item.as_ref()).iter(), + SingleOrVec::Vec(items) => items.as_slice().iter(), + } + } +} + impl From for SingleOrVec { fn from(single: T) -> Self { SingleOrVec::Single(Box::new(single)) @@ -567,3 +611,14 @@ fn is_none_or_default_true(field: &Option>) -> bool { Some(value) => matches!(value.as_ref(), Schema::Bool(true)), } } + +pub fn get_cleaned_schema_reference(schema_ref: &str) -> &str { + if let Some(cleaned) = schema_ref.strip_prefix(DEFINITIONS_PREFIX) { + cleaned + } else { + panic!( + "Tried to clean schema reference that does not start with the definition prefix: {}", + schema_ref + ); + } +} diff --git a/lib/vector-config-common/src/schema/visit.rs b/lib/vector-config-common/src/schema/visit.rs index 04fdfbd4ac77b..242bed34331e8 100644 --- a/lib/vector-config-common/src/schema/visit.rs +++ b/lib/vector-config-common/src/schema/visit.rs @@ -1,4 +1,6 @@ -use super::{Map, RootSchema, Schema, SchemaObject, SingleOrVec, DEFINITIONS_PREFIX}; +use tracing::debug; + +use super::{get_cleaned_schema_reference, Map, RootSchema, Schema, SchemaObject, SingleOrVec}; /// Trait used to recursively modify a constructed schema and its subschemas. pub trait Visitor: std::fmt::Debug { @@ -51,9 +53,15 @@ pub fn visit_schema_object( schema: &mut SchemaObject, ) { if schema.reference.is_some() { - with_resolved_schema_reference(definitions, schema, |defs, referenced_schema| { - v.visit_schema(defs, referenced_schema); - }) + with_resolved_schema_reference( + definitions, + schema, + |defs, schema_ref, referenced_schema| { + debug!(referent = schema_ref, "Visiting schema reference."); + + v.visit_schema(defs, referenced_schema); + }, + ) } if let Some(sub) = &mut schema.subschemas { @@ -133,17 +141,17 @@ pub fn with_resolved_schema_reference( schema: &mut SchemaObject, f: F, ) where - F: FnOnce(&mut Map, &mut Schema), + F: FnOnce(&mut Map, &str, &mut Schema), { if let Some(reference) = schema.reference.as_ref() { - let schema_def_key = reference.replace(DEFINITIONS_PREFIX, ""); + let schema_def_key = get_cleaned_schema_reference(reference); let mut referenced_schema = definitions - .get(&schema_def_key) + .get(schema_def_key) .cloned() .expect("schema reference should exist"); - f(definitions, &mut referenced_schema); + f(definitions, schema_def_key, &mut referenced_schema); - definitions.insert(schema_def_key, referenced_schema); + definitions.insert(schema_def_key.to_string(), referenced_schema); } } diff --git a/lib/vector-config/Cargo.toml b/lib/vector-config/Cargo.toml index c16e17a6e854b..fbf374a0cd669 100644 --- a/lib/vector-config/Cargo.toml +++ b/lib/vector-config/Cargo.toml @@ -23,6 +23,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"] } serde_with = { version = "2.3.2", default-features = false, features = ["std"] } snafu = { version = "0.7.4", default-features = false } toml = { version = "0.7.3", default-features = false } +tracing = { version = "0.1.34", default-features = false } url = { version = "2.3.1", default-features = false, features = ["serde"] } vrl-compiler = { package = "vrl-compiler", git = "https://github.com/vectordotdev/vrl", rev = "v0.2.0" } vrl-core = { package = "vrl-core", git = "https://github.com/vectordotdev/vrl", rev = "v0.2.0", default-features = false, features = ["serde"] } diff --git a/lib/vector-config/src/schema/helpers.rs b/lib/vector-config/src/schema/helpers.rs index 17458d7941ae5..c1baffd0731a3 100644 --- a/lib/vector-config/src/schema/helpers.rs +++ b/lib/vector-config/src/schema/helpers.rs @@ -12,7 +12,7 @@ use crate::{ num::ConfigurableNumber, Configurable, ConfigurableRef, GenerateError, Metadata, ToValue, }; -use super::visitors::DisallowedUnevaluatedPropertiesVisitor; +use super::visitors::{DisallowedUnevaluatedPropertiesVisitor, InlineSingleUseReferencesVisitor}; /// Applies metadata to the given schema. /// @@ -480,7 +480,9 @@ pub fn generate_internal_tagged_variant_schema( } pub fn default_schema_settings() -> SchemaSettings { - SchemaSettings::new().with_visitor(DisallowedUnevaluatedPropertiesVisitor::from_settings) + SchemaSettings::new() + .with_visitor(InlineSingleUseReferencesVisitor::from_settings) + .with_visitor(DisallowedUnevaluatedPropertiesVisitor::from_settings) } pub fn generate_root_schema() -> Result diff --git a/lib/vector-config/src/schema/visitors/inline_single.rs b/lib/vector-config/src/schema/visitors/inline_single.rs new file mode 100644 index 0000000000000..68347e36937dd --- /dev/null +++ b/lib/vector-config/src/schema/visitors/inline_single.rs @@ -0,0 +1,357 @@ +use std::collections::{HashMap, HashSet}; + +use serde_json::Value; +use tracing::debug; +use vector_config_common::schema::{visit::Visitor, *}; + +use crate::schema::visitors::merge::Mergeable; + +use super::scoped_visit::{ + visit_schema_object_scoped, SchemaReference, SchemaScopeStack, ScopedVisitor, +}; + +/// A visitor that inlines schema references where the referenced schema is only referenced once. +/// +/// In many cases, the schema generation will produce schema definitions where either generics or +/// flattening are involved, which leads to schema definitions that may only be referenced by one +/// other schema definition, and so on. +/// +/// This is suboptimal due to the "pointer chasing" involved to resolve those schema references, +/// when there's no reason to inherently have a schema be defined such that it can be referenced. +/// +/// This visitor collects a list of all schema references, and for any schemas which are referenced +/// only once, will replace those references by inlining the referenced schema directly, and +/// deleting the schema definition from the root definitions. +#[derive(Debug, Default)] +pub struct InlineSingleUseReferencesVisitor { + eligible_to_inline: HashSet, +} + +impl InlineSingleUseReferencesVisitor { + pub fn from_settings(_: &SchemaSettings) -> Self { + Self { + eligible_to_inline: HashSet::new(), + } + } +} + +impl Visitor for InlineSingleUseReferencesVisitor { + fn visit_root_schema(&mut self, root: &mut RootSchema) { + // Build a map of schema references and the number of times they're referenced through the + // entire schema, by visiting the root schema in a recursive fashion, using a helper visitor. + let mut occurrence_visitor = OccurrenceVisitor::default(); + occurrence_visitor.visit_root_schema(root); + let occurrence_map = occurrence_visitor.into_occurrences(); + + let eligible_to_inline = occurrence_map + .into_iter() + // Filter out any schemas which have more than one occurrence, as naturally, we're + // trying to inline single-use schema references. :) + .filter_map(|(def_name, occurrences)| { + if occurrences == 1 { + Some(def_name) + } else { + None + } + }) + // However, we'll also filter out some specific schema definitions which are only + // referenced once, specifically: component base types and component types themselves. + // + // We do this as a lot of the tooling that parses the schema to generate documentation, + // and the like, depends on these schemas existing in the top-level definitions for easy + // lookup. + .filter(|def_name| { + let schema = root + .definitions + .get(def_name.as_ref()) + .and_then(Schema::as_object) + .expect("schema definition must exist"); + + is_inlineable_schema(def_name.as_ref(), schema) + }) + .map(|s| s.as_ref().to_string()) + .collect::>(); + + self.eligible_to_inline = eligible_to_inline; + + // Now run our own visitor logic, which will use the inline eligibility to determine if a + // schema reference in a being-visited schema should be replaced inline with the original + // referenced schema, in turn removing the schema definition. + visit::visit_root_schema(self, root); + + // Now remove all of the definitions for schemas that were eligible for inlining. + for schema_def_name in self.eligible_to_inline.drain() { + debug!( + referent = schema_def_name, + "Removing schema definition from root schema." + ); + + root.definitions + .remove(&schema_def_name) + .expect("referenced schema must exist in definitions"); + } + } + + fn visit_schema_object( + &mut self, + definitions: &mut Map, + schema: &mut SchemaObject, + ) { + // Recursively visit this schema first. + visit::visit_schema_object(self, definitions, schema); + + // If this schema has a schema reference, see if it's in our inline eligibility map. If so, + // we remove the referenced schema from the definitions, and then merge it into the current + // schema, after removing the `$ref` field. + if let Some(schema_ref) = schema.reference.as_ref().cloned() { + let schema_ref = get_cleaned_schema_reference(&schema_ref); + if self.eligible_to_inline.contains(schema_ref) { + let referenced_schema = definitions + .get(schema_ref) + .expect("referenced schema must exist in definitions"); + + if let Schema::Object(referenced_schema) = referenced_schema { + debug!( + referent = schema_ref, + "Inlining eligible schema reference into current schema." + ); + + schema.reference = None; + schema.merge(referenced_schema); + } + } + } + } +} + +fn is_inlineable_schema(definition_name: &str, schema: &SchemaObject) -> bool { + static DISALLOWED_SCHEMAS: &[&str] = &[ + "vector::sources::Sources", + "vector::transforms::Transforms", + "vector::sinks::Sinks", + ]; + + // We want to avoid inlining all of the relevant top-level types used for defining components: + // the "outer" types (i.e. `SinkOuter`), the enum/collection types (i.e. the big `Sources` + // enum), and the component configuration types themselves (i.e. `AmqpSinkConfig`). + // + // There's nothing _technically_ wrong with doing so, but it would break downstream consumers of + // the schema that parse it in order to extract the individual components and other + // component-specific metadata. + let is_component_base = get_schema_metadata_attr(schema, "docs::component_base_type").is_some(); + let is_component = get_schema_metadata_attr(schema, "docs::component_type").is_some(); + + let is_allowed_schema = !DISALLOWED_SCHEMAS.contains(&definition_name); + + !is_component_base && !is_component && is_allowed_schema +} + +#[derive(Debug, Default)] +struct OccurrenceVisitor { + scope_stack: SchemaScopeStack, + occurrence_map: HashMap>, +} + +impl OccurrenceVisitor { + fn into_occurrences(self) -> HashMap { + self.occurrence_map + .into_iter() + .map(|(k, v)| (k, v.len())) + .collect() + } +} + +impl Visitor for OccurrenceVisitor { + fn visit_schema_object( + &mut self, + definitions: &mut Map, + schema: &mut SchemaObject, + ) { + visit_schema_object_scoped(self, definitions, schema); + + if let Some(current_schema_ref) = schema.reference.as_ref() { + // Track the named "parent" schema for the schema we're currently visiting so that if we + // visit this schema again, we don't double count any schema references that it has. The + // "parent" schema is simply the closest ancestor schema that was itself a schema + // reference, or "Root", which represents the oldest schema ancestor in the document. + // + // This lets us couple with scenarios where schema A references schema B, and is the + // only actual direct schema reference to schema B, but due to multiple schemas + // referencing schema A, would otherwise lead to both A and B being visited multiple + // times. + let current_parent_schema_ref = self.get_current_schema_scope().clone(); + let current_schema_ref = get_cleaned_schema_reference(current_schema_ref); + + let occurrences = self + .occurrence_map + .entry(current_schema_ref.into()) + .or_insert_with(HashSet::new); + occurrences.insert(current_parent_schema_ref); + } + } +} + +impl ScopedVisitor for OccurrenceVisitor { + fn push_schema_scope>(&mut self, scope: S) { + self.scope_stack.push(scope.into()); + } + + fn pop_schema_scope(&mut self) { + self.scope_stack.pop().expect("stack was empty during pop"); + } + + fn get_current_schema_scope(&self) -> &SchemaReference { + self.scope_stack.current().unwrap_or(&SchemaReference::Root) + } +} + +fn get_schema_metadata_attr<'a>(schema: &'a SchemaObject, key: &str) -> Option<&'a Value> { + schema + .extensions + .get("_metadata") + .and_then(|metadata| metadata.get(key)) +} + +#[cfg(test)] +mod tests { + use serde_json::json; + use vector_config_common::schema::visit::Visitor; + + use crate::schema::visitors::test::{as_schema, assert_schemas_eq}; + + use super::InlineSingleUseReferencesVisitor; + + #[test] + fn no_refs() { + let mut actual_schema = as_schema(json!({ + "type": "object", + "properties": { + "a": { "type": "string" } + } + })); + + let expected_schema = actual_schema.clone(); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + assert_schemas_eq(expected_schema, actual_schema); + } + + #[test] + fn single_ref_single_usage() { + let mut actual_schema = as_schema(json!({ + "$ref": "#/definitions/simple", + "definitions": { + "simple": { + "type": "object", + "properties": { + "a": { "type": "string" } + } + } + } + })); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + let expected_schema = as_schema(json!({ + "type": "object", + "properties": { + "a": { "type": "string" } + }, + })); + + assert_schemas_eq(expected_schema, actual_schema); + } + + // TODO(tobz): These two tests are ignored because the inliner currently works off of schema + // reference scopes, so two object properties within the same schema don't count as two + // instances of a schema being referenced. + // + // We need to refactor schema scopes to be piecemeal extensible more in the way of how + // `jsonschema` does it rather than an actual stack.... the current approach is good enough for + // now, but not optimal in the way that it could be. + // + // These are here for when I improve the situation after getting this merged. + #[test] + #[ignore] + fn single_ref_multiple_usages() { + let mut actual_schema = as_schema(json!({ + "definitions": { + "simple": { + "type": "object", + "properties": { + "a": { "type": "string" } + } + } + }, + "type": "object", + "properties": { + "a": { "$ref": "#/definitions/simple" }, + "b": { "$ref": "#/definitions/simple" } + } + })); + + let expected_schema = actual_schema.clone(); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + assert_schemas_eq(expected_schema, actual_schema); + } + + #[test] + #[ignore] + fn multiple_refs_mixed_usages() { + let mut actual_schema = as_schema(json!({ + "definitions": { + "simple": { + "type": "object", + "properties": { + "a": { "type": "string" } + } + }, + "advanced": { + "type": "object", + "properties": { + "b": { "type": "string" } + } + } + }, + "type": "object", + "properties": { + "a": { "$ref": "#/definitions/simple" }, + "b": { "$ref": "#/definitions/simple" }, + "c": { "$ref": "#/definitions/advanced" }, + } + })); + + let mut visitor = InlineSingleUseReferencesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + let expected_schema = as_schema(json!({ + "definitions": { + "simple": { + "type": "object", + "properties": { + "a": { "type": "string" } + } + } + }, + "type": "object", + "properties": { + "a": { "$ref": "#/definitions/simple" }, + "b": { "$ref": "#/definitions/simple" }, + "c": { + "type": "object", + "properties": { + "b": { "type": "string" } + } + } + } + })); + + assert_schemas_eq(expected_schema, actual_schema); + } +} diff --git a/lib/vector-config/src/schema/visitors/merge.rs b/lib/vector-config/src/schema/visitors/merge.rs new file mode 100644 index 0000000000000..741b74157e137 --- /dev/null +++ b/lib/vector-config/src/schema/visitors/merge.rs @@ -0,0 +1,291 @@ +#![allow(clippy::borrowed_box)] + +use std::mem::discriminant; + +use serde_json::Value; +use vector_config_common::schema::*; + +/// A type that can be merged with itself. +pub trait Mergeable { + fn merge(&mut self, other: &Self); +} + +impl Mergeable for SchemaObject { + fn merge(&mut self, other: &Self) { + // The logic is pretty straightforward: we merge `other` into `self`, with `self` having the + // higher precedence. + // + // Additionally, we only merge logical schema chunks: if the destination schema has object + // properties defined, and the source schema has some object properties that don't exist in + // the destination, they will be merged, but if there is any overlap, then the object + // properties in the destination would remain untouched. This merging logic applies to all + // map-based types. + // + // For standalone fields, such as title or description, the destination always has higher + // precedence. For optional fields, whichever version (destination or source) is present + // will win, except for when both are present, then the individual fields within the + // optional type will be merged according to the normal precedence rules. + merge_optional(&mut self.reference, other.reference.as_ref()); + merge_schema_metadata(&mut self.metadata, other.metadata.as_ref()); + merge_schema_instance_type(&mut self.instance_type, other.instance_type.as_ref()); + merge_schema_format(&mut self.format, other.format.as_ref()); + merge_schema_enum_values(&mut self.enum_values, other.enum_values.as_ref()); + merge_schema_const_value(&mut self.const_value, other.const_value.as_ref()); + merge_schema_subschemas(&mut self.subschemas, other.subschemas.as_ref()); + merge_schema_number_validation(&mut self.number, other.number.as_ref()); + merge_schema_string_validation(&mut self.string, other.string.as_ref()); + merge_schema_array_validation(&mut self.array, other.array.as_ref()); + merge_schema_object_validation(&mut self.object, other.object.as_ref()); + merge_schema_extensions(&mut self.extensions, &other.extensions); + } +} + +impl Mergeable for Value { + fn merge(&mut self, other: &Self) { + // We do a check here for ensuring both value discriminants are the same type. This is + // specific to `Value` but we should never really be merging identical keys together that + // have differing value types, as that is indicative of a weird overlap in keys between + // different schemas. + // + // We _may_ need to relax this in practice/in the future, but it's a solid invariant to + // enforce for the time being. + if discriminant(self) != discriminant(other) { + panic!("Tried to merge two `Value` types together with differing types!\n\nSelf: {:?}\n\nOther: {:?}", self, other); + } + + match (self, other) { + // Maps get merged recursively. + (Value::Object(self_map), Value::Object(other_map)) => { + self_map.merge(other_map); + } + // Arrays get merged together indiscriminately. + (Value::Array(self_array), Value::Array(other_array)) => { + self_array.extend(other_array.iter().cloned()); + } + // We don't merge any other value types together. + _ => {} + } + } +} + +impl Mergeable for Schema { + fn merge(&mut self, other: &Self) { + match (self, other) { + // We don't merge schemas together if either of them is a boolean schema. + (Schema::Bool(_), _) | (_, Schema::Bool(_)) => {} + (Schema::Object(self_schema), Schema::Object(other_schema)) => { + self_schema.merge(other_schema); + } + } + } +} + +impl Mergeable for serde_json::Map { + fn merge(&mut self, other: &Self) { + for (key, value) in other { + match self.get_mut(key) { + None => { + self.insert(key.clone(), value.clone()); + } + Some(existing) => existing.merge(value), + } + } + } +} + +impl Mergeable for Map +where + K: std::hash::Hash + Eq + Clone, + V: Clone + Mergeable, +{ + fn merge(&mut self, other: &Self) { + for (key, value) in other { + match self.get_mut(key) { + None => { + self.insert(key.clone(), value.clone()); + } + Some(existing) => existing.merge(value), + } + } + } +} + +fn merge_schema_metadata(destination: &mut Option>, source: Option<&Box>) { + merge_optional_with(destination, source, |existing, new| { + merge_optional(&mut existing.id, new.id.as_ref()); + merge_optional(&mut existing.title, new.title.as_ref()); + merge_optional(&mut existing.description, new.description.as_ref()); + merge_optional(&mut existing.default, new.default.as_ref()); + merge_bool(&mut existing.deprecated, new.deprecated); + merge_bool(&mut existing.read_only, new.read_only); + merge_bool(&mut existing.write_only, new.write_only); + merge_collection(&mut existing.examples, &new.examples); + }); +} + +fn merge_schema_instance_type( + destination: &mut Option>, + source: Option<&SingleOrVec>, +) { + merge_optional_with(destination, source, |existing, new| { + let mut deduped = existing + .into_iter() + .chain(new.into_iter()) + .cloned() + .collect::>(); + deduped.dedup(); + + *existing = deduped.into(); + }); +} + +fn merge_schema_format(destination: &mut Option, source: Option<&String>) { + merge_optional(destination, source); +} + +fn merge_schema_enum_values(destination: &mut Option>, source: Option<&Vec>) { + merge_optional_with(destination, source, merge_collection); +} + +fn merge_schema_const_value(destination: &mut Option, source: Option<&Value>) { + merge_optional(destination, source); +} + +fn merge_schema_subschemas( + destination: &mut Option>, + source: Option<&Box>, +) { + merge_optional_with(destination, source, |existing, new| { + merge_optional_with(&mut existing.all_of, new.all_of.as_ref(), merge_collection); + merge_optional_with(&mut existing.any_of, new.any_of.as_ref(), merge_collection); + merge_optional_with(&mut existing.one_of, new.one_of.as_ref(), merge_collection); + merge_optional(&mut existing.if_schema, new.if_schema.as_ref()); + merge_optional(&mut existing.then_schema, new.then_schema.as_ref()); + merge_optional(&mut existing.else_schema, new.else_schema.as_ref()); + merge_optional(&mut existing.not, new.not.as_ref()); + }); +} + +fn merge_schema_number_validation( + destination: &mut Option>, + source: Option<&Box>, +) { + merge_optional_with(destination, source, |existing, new| { + merge_optional(&mut existing.multiple_of, new.multiple_of.as_ref()); + merge_optional(&mut existing.maximum, new.maximum.as_ref()); + merge_optional( + &mut existing.exclusive_maximum, + new.exclusive_minimum.as_ref(), + ); + merge_optional(&mut existing.minimum, new.minimum.as_ref()); + merge_optional( + &mut existing.exclusive_minimum, + new.exclusive_minimum.as_ref(), + ); + }); +} + +fn merge_schema_string_validation( + destination: &mut Option>, + source: Option<&Box>, +) { + merge_optional_with(destination, source, |existing, new| { + merge_optional(&mut existing.max_length, new.max_length.as_ref()); + merge_optional(&mut existing.min_length, new.min_length.as_ref()); + merge_optional(&mut existing.pattern, new.pattern.as_ref()); + }); +} + +fn merge_schema_array_validation( + destination: &mut Option>, + source: Option<&Box>, +) { + merge_optional_with(destination, source, |existing, new| { + merge_optional_with(&mut existing.items, new.items.as_ref(), merge_collection); + merge_optional( + &mut existing.additional_items, + new.additional_items.as_ref(), + ); + merge_optional( + &mut existing.unevaluated_items, + new.unevaluated_items.as_ref(), + ); + merge_optional(&mut existing.max_items, new.max_items.as_ref()); + merge_optional(&mut existing.min_items, new.min_items.as_ref()); + merge_optional(&mut existing.unique_items, new.unique_items.as_ref()); + merge_optional(&mut existing.contains, new.contains.as_ref()); + }); +} + +fn merge_schema_object_validation( + destination: &mut Option>, + source: Option<&Box>, +) { + merge_optional_with(destination, source, |existing, new| { + merge_optional(&mut existing.max_properties, new.max_properties.as_ref()); + merge_optional(&mut existing.min_properties, new.min_properties.as_ref()); + merge_collection(&mut existing.required, &new.required); + merge_map(&mut existing.properties, &new.properties); + merge_map(&mut existing.pattern_properties, &new.pattern_properties); + merge_optional( + &mut existing.additional_properties, + new.additional_properties.as_ref(), + ); + merge_optional( + &mut existing.unevaluated_properties, + new.unevaluated_properties.as_ref(), + ); + merge_optional(&mut existing.property_names, new.property_names.as_ref()); + }); +} + +fn merge_schema_extensions(destination: &mut Map, source: &Map) { + merge_map(destination, source); +} + +fn merge_bool(destination: &mut bool, source: bool) { + // We only treat `true` as a merge-worthy value. + if source { + *destination = true; + } +} + +fn merge_collection<'a, E, I, T>(destination: &mut E, source: I) +where + E: Extend, + I: IntoIterator, + T: Clone + 'a, +{ + destination.extend(source.into_iter().cloned()); +} + +fn merge_map(destination: &mut Map, source: &Map) +where + K: std::hash::Hash + Eq + Clone, + V: Clone + Mergeable, +{ + destination.merge(source); +} + +fn merge_optional(destination: &mut Option, source: Option<&T>) { + merge_optional_with(destination, source, |_, _| {}); +} + +fn merge_optional_with<'a, T, F>(destination: &'a mut Option, source: Option<&'a T>, f: F) +where + T: Clone, + F: Fn(&'a mut T, &'a T), +{ + match destination { + // If the destination is empty, we use whatever we have in `source`. Otherwise, we leave + // `destination` as-is. + None => *destination = source.cloned(), + // If `destination` isn't empty, and neither is `source`, then pass them both to `f` to + // let it handle the actual merge logic. + Some(destination) => { + if let Some(source) = source { + f(destination, source); + } + } + } +} diff --git a/lib/vector-config/src/schema/visitors/mod.rs b/lib/vector-config/src/schema/visitors/mod.rs index d9f94a9909549..59802e0ebe10b 100644 --- a/lib/vector-config/src/schema/visitors/mod.rs +++ b/lib/vector-config/src/schema/visitors/mod.rs @@ -1,2 +1,10 @@ +mod inline_single; +pub mod merge; +pub mod scoped_visit; mod unevaluated; + +#[cfg(test)] +pub(self) mod test; + +pub use self::inline_single::InlineSingleUseReferencesVisitor; pub use self::unevaluated::DisallowedUnevaluatedPropertiesVisitor; diff --git a/lib/vector-config/src/schema/visitors/scoped_visit.rs b/lib/vector-config/src/schema/visitors/scoped_visit.rs new file mode 100644 index 0000000000000..937a4ee4920dd --- /dev/null +++ b/lib/vector-config/src/schema/visitors/scoped_visit.rs @@ -0,0 +1,180 @@ +use std::collections::VecDeque; + +use vector_config_common::schema::{visit::Visitor, *}; + +/// A schema reference which can refer to either a schema definition or the root schema itself. +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub enum SchemaReference { + /// A defined schema. + Definition(String), + + /// The root schema itself. + Root, +} + +impl std::fmt::Display for SchemaReference { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Definition(name) => write!(f, "{}", name), + Self::Root => write!(f, ""), + } + } +} + +impl<'a, T> From<&'a T> for SchemaReference +where + T: AsRef + ?Sized, +{ + fn from(value: &'a T) -> Self { + Self::Definition(value.as_ref().to_string()) + } +} + +impl AsRef for SchemaReference { + fn as_ref(&self) -> &str { + match self { + Self::Definition(name) => name.as_str(), + Self::Root => "", + } + } +} + +/// The schema scope stack is used to understand where in the visiting of a root schema the visitor +/// currently is. All visiting will inherently start at the root, and continue to exist in the root +/// scope until a schema reference is resolved, and the visitor visits the resolved schema. Once +/// that happens, the resolved schema scope is pushed onto the stack, such that the scope is now the +/// schema reference. This can happen multiple times as subsequent schema references are resolved. +/// Once the visitor recurses back out of the resolved schema, it is popped from the stack. +#[derive(Debug, Default)] +pub struct SchemaScopeStack { + stack: VecDeque, +} + +impl SchemaScopeStack { + pub fn push>(&mut self, scope: S) { + self.stack.push_front(scope.into()); + } + + pub fn pop(&mut self) -> Option { + self.stack.pop_front() + } + + pub fn current(&self) -> Option<&SchemaReference> { + self.stack.front() + } +} + +pub trait ScopedVisitor: Visitor { + fn push_schema_scope>(&mut self, scope: S); + + fn pop_schema_scope(&mut self); + + fn get_current_schema_scope(&self) -> &SchemaReference; +} + +pub fn visit_schema_object_scoped( + sv: &mut SV, + definitions: &mut Map, + schema: &mut SchemaObject, +) { + if let Some(reference) = schema.reference.as_ref() { + let schema_def_key = get_cleaned_schema_reference(reference); + let mut referenced_schema = definitions + .get(schema_def_key) + .cloned() + .expect("schema reference should exist"); + + if let Schema::Object(referenced_schema) = &mut referenced_schema { + sv.push_schema_scope(schema_def_key); + + sv.visit_schema_object(definitions, referenced_schema); + + sv.pop_schema_scope(); + } + + definitions.insert(schema_def_key.to_string(), referenced_schema); + } + + if let Some(sub) = &mut schema.subschemas { + visit_vec_scoped(sv, definitions, &mut sub.all_of); + visit_vec_scoped(sv, definitions, &mut sub.any_of); + visit_vec_scoped(sv, definitions, &mut sub.one_of); + visit_box_scoped(sv, definitions, &mut sub.not); + visit_box_scoped(sv, definitions, &mut sub.if_schema); + visit_box_scoped(sv, definitions, &mut sub.then_schema); + visit_box_scoped(sv, definitions, &mut sub.else_schema); + } + + if let Some(arr) = &mut schema.array { + visit_single_or_vec_scoped(sv, definitions, &mut arr.items); + visit_box_scoped(sv, definitions, &mut arr.additional_items); + visit_box_scoped(sv, definitions, &mut arr.contains); + } + + if let Some(obj) = &mut schema.object { + visit_map_values_scoped(sv, definitions, &mut obj.properties); + visit_map_values_scoped(sv, definitions, &mut obj.pattern_properties); + visit_box_scoped(sv, definitions, &mut obj.additional_properties); + visit_box_scoped(sv, definitions, &mut obj.property_names); + } +} + +fn visit_box_scoped( + sv: &mut SV, + definitions: &mut Map, + target: &mut Option>, +) { + if let Some(s) = target { + if let Schema::Object(s) = s.as_mut() { + sv.visit_schema_object(definitions, s); + } + } +} + +fn visit_vec_scoped( + sv: &mut SV, + definitions: &mut Map, + target: &mut Option>, +) { + if let Some(vec) = target { + for s in vec { + if let Schema::Object(s) = s { + sv.visit_schema_object(definitions, s); + } + } + } +} + +fn visit_map_values_scoped( + sv: &mut SV, + definitions: &mut Map, + target: &mut Map, +) { + for s in target.values_mut() { + if let Schema::Object(s) = s { + sv.visit_schema_object(definitions, s); + } + } +} + +fn visit_single_or_vec_scoped( + sv: &mut SV, + definitions: &mut Map, + target: &mut Option>, +) { + match target { + None => {} + Some(SingleOrVec::Single(s)) => { + if let Schema::Object(s) = s.as_mut() { + sv.visit_schema_object(definitions, s); + } + } + Some(SingleOrVec::Vec(vec)) => { + for s in vec { + if let Schema::Object(s) = s { + sv.visit_schema_object(definitions, s); + } + } + } + } +} diff --git a/lib/vector-config/src/schema/visitors/test.rs b/lib/vector-config/src/schema/visitors/test.rs new file mode 100644 index 0000000000000..1f1793cb8c071 --- /dev/null +++ b/lib/vector-config/src/schema/visitors/test.rs @@ -0,0 +1,15 @@ +use pretty_assertions::assert_eq; +use serde_json::Value; +use vector_config_common::schema::RootSchema; + +pub fn as_schema(value: Value) -> RootSchema { + serde_json::from_value(value).expect("should not fail to deserialize schema") +} + +#[track_caller] +pub fn assert_schemas_eq(expected: RootSchema, actual: RootSchema) { + let expected_json = serde_json::to_string_pretty(&expected).expect("should not fail"); + let actual_json = serde_json::to_string_pretty(&actual).expect("should not fail"); + + assert_eq!(expected_json, actual_json); +} diff --git a/lib/vector-config/src/schema/visitors/unevaluated.rs b/lib/vector-config/src/schema/visitors/unevaluated.rs index 1db74cc483b28..cfab8822df5be 100644 --- a/lib/vector-config/src/schema/visitors/unevaluated.rs +++ b/lib/vector-config/src/schema/visitors/unevaluated.rs @@ -1,10 +1,22 @@ -//use indexmap::IndexMap; +use std::{ + collections::{HashMap, HashSet}, + convert::identity, +}; + +use tracing::debug; use vector_config_common::schema::{ - visit::{visit_schema_object, with_resolved_schema_reference, Visitor}, - InstanceType, Map, Schema, SchemaObject, SchemaSettings, SingleOrVec, + visit::{with_resolved_schema_reference, Visitor}, + *, }; -/// A visitor that marks schemas as disallowing unknown properties via `unevaluatedProperties`. +use crate::schema::visitors::merge::Mergeable; + +use super::scoped_visit::{ + visit_schema_object_scoped, SchemaReference, SchemaScopeStack, ScopedVisitor, +}; + +/// A visitor that marks schemas as closed by disallowing unknown properties via +/// `unevaluatedProperties`. /// /// This is the equivalent of `serde`'s `deny_unknown_fields` attribute: instead of only validating /// the properties specified in the schema, and ignoring any properties present in the JSON @@ -15,23 +27,84 @@ use vector_config_common::schema::{ /// with advanced subschema validation, such as `oneOf` or `allOf`, as `unevaluatedProperties` /// cannot simply be applied to any and all schemas indiscriminately. #[derive(Debug, Default)] -pub struct DisallowedUnevaluatedPropertiesVisitor; +pub struct DisallowedUnevaluatedPropertiesVisitor { + scope_stack: SchemaScopeStack, + eligible_to_flatten: HashMap>, +} impl DisallowedUnevaluatedPropertiesVisitor { pub fn from_settings(_: &SchemaSettings) -> Self { - Self + Self { + scope_stack: SchemaScopeStack::default(), + eligible_to_flatten: HashMap::new(), + } } } impl Visitor for DisallowedUnevaluatedPropertiesVisitor { + fn visit_root_schema(&mut self, root: &mut RootSchema) { + let eligible_to_flatten = build_closed_schema_flatten_eligibility_mappings(root); + + debug!( + "Found {} referents eligible for flattening: {:?}", + eligible_to_flatten.len(), + eligible_to_flatten, + ); + + self.eligible_to_flatten = eligible_to_flatten; + + visit::visit_root_schema(self, root); + } + fn visit_schema_object( &mut self, definitions: &mut Map, schema: &mut SchemaObject, ) { + // If this schema has a schema reference, check our flattening eligibility map to figure out + // if we need to merge it in. + // + // When a given schema reference (the actual target of `$ref`) is eligible for flattening in + // a given schema (what we're currently visiting) then it means that this schema would, + // based on its composition, lead to the schema reference either being marked or unmarked. + // + // We flatten the schema reference into this schema to avoid that from occurring, and we do + // so based on whichever group of referrers -- the schemas which reference the particular + // target schema -- is smaller, such that we do the minimum number of flattenings per target + // schema, to keep the schema as small as we reasonably can. + if let Some(reference) = schema.reference.as_ref() { + let current_parent_schema_ref = self.get_current_schema_scope(); + + if let Some(referrers) = self.eligible_to_flatten.get(reference) { + if referrers.contains(current_parent_schema_ref) { + let current_schema_ref = get_cleaned_schema_reference(reference); + let referenced_schema = definitions + .get(current_schema_ref) + .expect("schema definition must exist"); + + debug!( + referent = current_schema_ref, + referrer = current_parent_schema_ref.as_ref(), + "Found eligible referent/referrer mapping." + ); + + if let Schema::Object(referenced_schema) = referenced_schema { + debug!( + referent = current_schema_ref, + referrer = current_parent_schema_ref.as_ref(), + "Flattening referent into referrer." + ); + + schema.reference = None; + schema.merge(referenced_schema); + } + } + } + } + // Visit the schema object first so that we recurse the overall schema in a depth-first - // fashion, applying the unevaluated properties change from the bottom up. - visit_schema_object(self, definitions, schema); + // fashion, marking eligible object schemas as closed. + visit_schema_object_scoped(self, definitions, schema); // Next, see if this schema has any subschema validation: `allOf`, `oneOf`, or `anyOf`. // @@ -42,39 +115,335 @@ impl Visitor for DisallowedUnevaluatedPropertiesVisitor { // which is why we remove that from the subschemas themselves but essentially hoist it up // to the level of the `allOf`/`oneOf`/`anyOf`, where it can apply the correct behavior. let mut had_relevant_subschemas = false; - if let Some(subschemas) = get_subschema_validators(schema) { - had_relevant_subschemas = true; - + if let Some(subschema) = schema.subschemas.as_mut() { + let subschemas = get_object_subschemas_from_parent_mut(subschema.as_mut()); for subschema in subschemas { - // If the schema is an object schema, we'll unset `unevaluatedProperties` directly. - // If it isn't an object schema, we'll see if the subschema is actually a schema - // reference, and if so, we'll make sure to unset `unevaluatedProperties` on the - // resolved schema reference itself. - // - // Like the top-level schema reference logic, this ensures the schema definition is - // updated for subsequent resolution. - if let Some(object) = subschema.object.as_mut() { - object.unevaluated_properties = Some(Box::new(Schema::Bool(true))); - } else { - with_resolved_schema_reference(definitions, subschema, |_, resolved| { - if let Schema::Object(schema) = resolved { - if let Some(object) = schema.object.as_mut() { - object.unevaluated_properties = Some(Box::new(Schema::Bool(true))); - } - } - }); - } + had_relevant_subschemas = true; + + unmark_or_flatten_schema(definitions, subschema); } } // If we encountered any subschema validation, or if this schema itself is an object schema, // mark the schema as closed by setting `unevaluatedProperties` to `false`. - if had_relevant_subschemas || is_object_schema(schema.instance_type.as_ref()) { + if had_relevant_subschemas || is_object_schema(schema) { mark_schema_closed(schema); } } } +impl ScopedVisitor for DisallowedUnevaluatedPropertiesVisitor { + fn push_schema_scope>(&mut self, scope: S) { + self.scope_stack.push(scope.into()); + } + + fn pop_schema_scope(&mut self) { + self.scope_stack.pop().expect("stack was empty during pop"); + } + + fn get_current_schema_scope(&self) -> &SchemaReference { + self.scope_stack.current().unwrap_or(&SchemaReference::Root) + } +} + +fn unmark_or_flatten_schema(definitions: &mut Map, schema: &mut SchemaObject) { + // If the schema is an object schema, we'll unset `unevaluatedProperties` directly. + // If it isn't an object schema, we'll see if the subschema is actually a schema + // reference, and if so, we'll make sure to unset `unevaluatedProperties` on the + // resolved schema reference itself. + // + // Like the top-level schema reference logic, this ensures the schema definition is + // updated for subsequent resolution. + if let Some(object) = schema.object.as_mut() { + debug!("Unmarked object subschema directly."); + + object.unevaluated_properties = Some(Box::new(Schema::Bool(true))); + } else { + with_resolved_schema_reference(definitions, schema, |_, schema_ref, resolved| { + if let Schema::Object(resolved) = resolved { + if let Some(object) = resolved.object.as_mut() { + debug!( + referent = schema_ref, + "Unmarked subschema by traversing schema reference." + ); + + object.unevaluated_properties = Some(Box::new(Schema::Bool(true))); + } + } + }); + } +} + +/// A referent schema that carries the chance of being unmarking by its referrer. +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +struct MarkableReferent { + // Whether or not the referent would be unmarked by the referrer. + would_unmark: bool, + + /// The referent schema. + referent: SchemaReference, +} + +impl MarkableReferent { + fn would_unmark>(referent: R) -> Self { + Self { + would_unmark: true, + referent: referent.into(), + } + } + + fn would_not_unmark>(referent: R) -> Self { + Self { + would_unmark: false, + referent: referent.into(), + } + } + + fn with_new_referent>(&self, new_referent: R) -> Self { + Self { + would_unmark: self.would_unmark, + referent: new_referent.into(), + } + } +} + +fn build_closed_schema_flatten_eligibility_mappings( + root_schema: &RootSchema, +) -> HashMap> { + // For all definitions, visit _just_ the defined schema (no recursing) and build a map of child + // definitions -> (mark eligibility, [(parent definition, would_unmark)]), such that we know + // exactly which schemas refer to any given schema definition and if they would lead to the + // child schema being marked as `unevaluatedProperties: false`. + // + // We would filter out any child definitions that aren't eligible to be marked. For the + // remaining child schemas, we group the parent definitions by `would_unmark`, which indicates + // whether or not the given parent definition would cause the child definition to be unmarked. + // + // As an example, we would expect a parent schema referring to a child schema via `allOf` to + // unmark the child schema, while a parent schema referring to a child schema within a specific + // property to not unmark the child schema. + // + // With the grouped parent definitions, take the smaller of the two groups. This represents the + // set of parent schemas that we will indicate as needing to use a flattened version of the + // child schema when we execute our primary visit logic. + + // Iterate over all definitions, and once more for the root schema, and generate a map of parent + // schema -> (would_unmark, child schema). + let mut parent_to_child = HashMap::new(); + for (definition_name, definition) in &root_schema.definitions { + // We only care about full-fledged schemas, not boolean schemas. + let parent_schema = match definition { + Schema::Bool(_) => continue, + Schema::Object(schema) => schema, + }; + + // If a schema itself would not be considered markable, then we don't need to consider the + // eligibility between parent/child since there's nothing to drive the "now unmark the child + // schemas" logic. + if !is_markable_schema(&root_schema.definitions, parent_schema) { + debug!("Schema definition '{}' not markable.", definition_name); + continue; + } + + // Collect all referents for this definition, which includes both property-based referents + // and subschema-based referents. Property-based referents are not required to be unmarked, + // while subschema-based referents must be unmarked. + let mut referents = HashSet::new(); + get_referents(parent_schema, &mut referents); + + // Store the parent/child mapping. + parent_to_child.insert(SchemaReference::from(definition_name), referents); + } + + // Collect the referents from the root schema. + let mut root_referents = HashSet::new(); + get_referents(&root_schema.schema, &mut root_referents); + parent_to_child.insert(SchemaReference::Root, root_referents); + + // Now we build a reverse map, going from child -> parent. We'll iterate over every child + // referent, for every parent/child entry, calculating the set of referrers, and if they would + // require unmarking the child. + let mut child_to_parent = HashMap::new(); + for (parent_schema_ref, child_referents) in parent_to_child { + for child_referent in child_referents { + let entry = child_to_parent + .entry(child_referent.referent.as_ref().to_string()) + .or_insert_with(HashSet::new); + + // Transform the child referent into a parent referent, which preserves the "would + // unmark" value but now points to the parent instead, and add it to the list of + // _referrers_ for the child. + entry.insert(child_referent.with_new_referent(parent_schema_ref.clone())); + } + } + + let mut eligible_to_flatten = HashMap::new(); + for (child_schema_ref, referrers) in child_to_parent { + // Don't flatten schemas which have less than two referrers. + if referrers.len() < 2 { + continue; + } + + let would_unmark = referrers + .iter() + .filter(|r| r.would_unmark) + .map(|r| r.referent.clone()) + .collect::>(); + let would_not_unmark = referrers + .iter() + .filter(|r| !r.would_unmark) + .map(|r| r.referent.clone()) + .collect::>(); + + if would_not_unmark.len() >= would_unmark.len() { + eligible_to_flatten.insert(child_schema_ref.to_string(), would_unmark); + } else { + eligible_to_flatten.insert(child_schema_ref.to_string(), would_not_unmark); + } + } + + eligible_to_flatten +} + +/// Determines whether a schema is eligible to be marked. +fn is_markable_schema(definitions: &Map, schema: &SchemaObject) -> bool { + // If the schema is an object schema, and does not have`additionalProperties` set, it can be + // marked, as marking a schema with both `unevaluatedProperties`/`additionalProperties` would + // otherwise be a logical inconsistency. + let has_additional_properties = schema + .object + .as_ref() + .and_then(|object| object.additional_properties.as_ref()) + .map(|schema| matches!(schema.as_ref(), Schema::Object(_))) + .unwrap_or(false); + + if is_object_schema(schema) && !has_additional_properties { + return true; + } + + // If the schema uses subschema validation -- specifically: `allOf`, `oneOf`, or `anyOf` -- then + // it should be marked, so long as one of the subschemas is actually an object schema. + // + // If we're dealing with something like a `oneOf` for `Option`, we'll have two + // subschemas: { "type": "null" } and { "$ref": "#/definitions/T" }. If the schema for `T` is, + // say, just a scalar schema, instead of an object schema... then it wouldn't be marked, and in + // turn, we wouldn't need to mark the schema for `Option`: there's no properties at all. + // + // We'll follow schema references in subschemas up to one level deep trying to figure this out. + if let Some(subschema) = schema.subschemas.as_ref() { + let subschemas = get_object_subschemas_from_parent(subschema).collect::>(); + + let has_object_subschema = subschemas.iter().any(|schema| is_object_schema(schema)); + let has_referenced_object_subschema = subschemas + .iter() + .map(|subschema| { + subschema + .reference + .as_ref() + .and_then(|reference| { + let reference = get_cleaned_schema_reference(reference); + definitions.get(reference) + }) + .and_then(Schema::as_object) + .map_or(false, is_object_schema) + }) + .any(identity); + + if has_object_subschema || has_referenced_object_subschema { + return true; + } + } + + false +} + +/// Collects all referents from the given parent schema, and inserts them to `referents`. +/// +/// Property schemas from `properties`, `patternProperties`, and `additionalProperties` are checked. +/// Any such referents in a property schema are do not need to be unmarked as the "chain" between +/// parent/child is broken implicitly by the property-level scoping of the value they would be given +/// to validate. +/// +/// Subschemas from `allOf`, `oneOf`, and `anyOf` are also checked. As subschema validation implies +/// that each subschema will be given the same value to validate, even if the subschema only +/// represents a slice of the parent schema, there is a link between parent/child that requires the +/// child to be unmarked so that the parent can be marked to enforce `unevaluatedProperties` at the +/// correct scope. +/// +/// This function will recurse a schema object entirely, in terms of property schemas and +/// subschemas, but will not recurse through schema references. +fn get_referents(parent_schema: &SchemaObject, referents: &mut HashSet) { + if let Some(parent_object) = parent_schema.object.as_ref() { + // For both `properties` and `patternProperties`, collect the schema reference, if any, from + // all property schemas. + for (_, property_schema) in parent_object + .properties + .iter() + .chain(parent_object.pattern_properties.iter()) + { + if let Some(child_schema) = property_schema.as_object() { + if let Some(child_schema_ref) = child_schema.reference.as_ref() { + referents.insert(MarkableReferent::would_not_unmark(child_schema_ref)); + } else { + get_referents(child_schema, referents); + } + } + } + + // For `additionalProperties`, if present and defined as a schema object, collect the schema + // reference if one is set. + if let Some(additional_properties) = parent_object.additional_properties.as_ref() { + if let Some(child_schema) = additional_properties.as_ref().as_object() { + if let Some(child_schema_ref) = child_schema.reference.as_ref() { + referents.insert(MarkableReferent::would_not_unmark(child_schema_ref)); + } else { + get_referents(child_schema, referents); + } + } + } + } + + if let Some(subschema) = parent_schema.subschemas.as_ref() { + // For `allOf`, `oneOf`, and `anyOf`, collect the schema reference, if any, from their + // respective subschemas. + for subschema in get_object_subschemas_from_parent(subschema) { + if let Some(child_schema_ref) = subschema.reference.as_ref() { + referents.insert(MarkableReferent::would_unmark(child_schema_ref)); + } else { + get_referents(subschema, referents); + } + } + } +} + +fn get_object_subschemas_from_parent( + subschema: &SubschemaValidation, +) -> impl Iterator { + [ + subschema.all_of.as_ref(), + subschema.one_of.as_ref(), + subschema.any_of.as_ref(), + ] + .into_iter() + .flatten() + .flatten() + .filter_map(Schema::as_object) +} + +fn get_object_subschemas_from_parent_mut( + subschema: &mut SubschemaValidation, +) -> impl Iterator { + [ + subschema.all_of.as_mut(), + subschema.one_of.as_mut(), + subschema.any_of.as_mut(), + ] + .into_iter() + .flatten() + .flatten() + .filter_map(Schema::as_object_mut) +} + fn mark_schema_closed(schema: &mut SchemaObject) { // Make sure this schema doesn't also have `additionalProperties` set to a non-boolean schema, // as it would be a logical inconsistency to then also set `unevaluatedProperties` to `false`. @@ -110,71 +479,32 @@ fn mark_schema_closed(schema: &mut SchemaObject) { schema.object().unevaluated_properties = Some(Box::new(Schema::Bool(false))); } -fn is_object_schema(instance_type: Option<&SingleOrVec>) -> bool { - match instance_type { +fn schema_type_matches( + schema: &SchemaObject, + instance_type: InstanceType, + allow_multiple: bool, +) -> bool { + match schema.instance_type.as_ref() { Some(sov) => match sov { - SingleOrVec::Single(inner) => inner.as_ref() == &InstanceType::Object, - SingleOrVec::Vec(inner) => inner.contains(&InstanceType::Object), + SingleOrVec::Single(inner) => inner.as_ref() == &instance_type, + SingleOrVec::Vec(inner) => inner.contains(&instance_type) && allow_multiple, }, None => false, } } -fn get_subschema_validators(schema: &mut SchemaObject) -> Option> { - let mut validators = vec![]; - - // Grab any subschemas for `allOf`/`oneOf`/`anyOf`, if present. - // - // There are other advanced validation mechanisms such as `if`/`then`/`else, but we explicitly - // don't handle them here as we don't currently use them in Vector's configuration schema. - if let Some(subschemas) = schema.subschemas.as_mut() { - if let Some(all_of) = subschemas.all_of.as_mut() { - validators.extend(all_of.iter_mut().filter_map(|validator| match validator { - Schema::Object(inner) => Some(inner), - _ => None, - })); - } - - if let Some(one_of) = subschemas.one_of.as_mut() { - validators.extend(one_of.iter_mut().filter_map(|validator| match validator { - Schema::Object(inner) => Some(inner), - _ => None, - })); - } - - if let Some(any_of) = subschemas.any_of.as_mut() { - validators.extend(any_of.iter_mut().filter_map(|validator| match validator { - Schema::Object(inner) => Some(inner), - _ => None, - })); - } - } - - if validators.is_empty() { - None - } else { - Some(validators) - } +fn is_object_schema(schema: &SchemaObject) -> bool { + schema_type_matches(schema, InstanceType::Object, true) } #[cfg(test)] mod tests { - use pretty_assertions::assert_eq; - use serde_json::{json, Value}; - use vector_config_common::schema::{visit::Visitor, RootSchema}; - - use super::DisallowedUnevaluatedPropertiesVisitor; + use serde_json::json; + use vector_config_common::schema::visit::Visitor; - fn as_schema(value: Value) -> RootSchema { - serde_json::from_value(value).expect("should not fail to deserialize schema") - } - - fn assert_schemas_eq(expected: RootSchema, actual: RootSchema) { - let expected_json = serde_json::to_string_pretty(&expected).expect("should not fail"); - let actual_json = serde_json::to_string_pretty(&actual).expect("should not fail"); + use crate::schema::visitors::test::{as_schema, assert_schemas_eq}; - assert_eq!(expected_json, actual_json); - } + use super::DisallowedUnevaluatedPropertiesVisitor; #[test] fn basic_object_schema() { @@ -465,4 +795,149 @@ mod tests { assert_schemas_eq(expected_schema, actual_schema); } + + #[test] + fn conflicting_schema_usages_get_duplicated_and_flattened() { + let mut actual_schema = as_schema(json!({ + "type": "object", + "properties": { + "acks": { "$ref": "#/definitions/acks" }, + "custom_acks": { "$ref": "#/definitions/custom_acks" } + }, + "definitions": { + "custom_acks": { + "allOf": [{ "type": "object", "properties": { "ack_count": { "type": "number" } } }, + { "$ref": "#/definitions/acks" }] + }, + "acks": { "type": "object", "properties": { "enabled": { "type": "boolean" } } } + } + })); + + let mut visitor = DisallowedUnevaluatedPropertiesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + let expected_schema = as_schema(json!({ + "type": "object", + "properties": { + "acks": { "$ref": "#/definitions/acks" }, + "custom_acks": { "$ref": "#/definitions/custom_acks" } + }, + "definitions": { + "custom_acks": { + "allOf": [ + { "type": "object", "properties": { "ack_count": { "type": "number" } } }, + { "type": "object", "properties": { "enabled": { "type": "boolean" } } } + ], + "unevaluatedProperties": false + }, + "acks": { + "type": "object", + "properties": { "enabled": { "type": "boolean" } }, + "unevaluatedProperties": false + } + }, + "unevaluatedProperties": false + })); + + assert_schemas_eq(expected_schema, actual_schema); + } + + #[test] + fn multiple_mark_unmark_references_flattened_efficiently() { + // This tests that if, for example, one schema reference would be marked and unmarked by + // multiple referrers, the referrers we choose to flatten the reference on are in the + // smaller group (i.e. we do as few flattenings as possible). + + let mut actual_schema = as_schema(json!({ + "type": "object", + "properties": { + "a": { "$ref": "#/definitions/a" }, + "b": { "$ref": "#/definitions/b" }, + "c": { "$ref": "#/definitions/c" }, + "one": { "$ref": "#/definitions/one" }, + "two": { "$ref": "#/definitions/two" } + }, + "definitions": { + "one": { + "allOf": [{ "$ref": "#/definitions/c" }] + }, + "two": { + "allOf": [{ "$ref": "#/definitions/b" }, { "$ref": "#/definitions/c" }] + }, + "a": { + "type": "object", + "properties": { "a": { "type": "boolean" } } + }, + "b": { + "type": "object", + "properties": { "b": { "type": "boolean" } } + }, + "c": { + "type": "object", + "properties": { "c": { "type": "boolean" } } + } + } + })); + + let mut visitor = DisallowedUnevaluatedPropertiesVisitor::default(); + visitor.visit_root_schema(&mut actual_schema); + + // Expectations: + // - Schema A is only referenced in an object property, so it's marked normally. + // - Schema B is referenced twice -- once as an object property and once in a subschema -- + // so since we prioritize flattening usages that would unmark a schema when the + // would-unmark/would-not-unmark counts are equal, schema B is only flattened for the + // subschema usage. + // - Schema C is referenced three times -- once as an object property and twice in a + // subschema -- so since there's more would-unmark usages than would-not-unmark usages, we + // flatten the smallest group of usages, which is the would-not-unmark group aka object + // properties. + let expected_schema = as_schema(json!({ + "type": "object", + "properties": { + "a": { "$ref": "#/definitions/a" }, + "b": { "$ref": "#/definitions/b" }, + "c": { + "type": "object", + "properties": { "c": { "type": "boolean" } }, + "unevaluatedProperties": false + }, + "one": { "$ref": "#/definitions/one" }, + "two": { "$ref": "#/definitions/two" } + }, + "definitions": { + "one": { + "allOf": [{ "$ref": "#/definitions/c" }], + "unevaluatedProperties": false + }, + "two": { + "allOf": [ + { + "type": "object", + "properties": { "b": { "type": "boolean" } } + }, + { "$ref": "#/definitions/c" } + ], + "unevaluatedProperties": false + }, + "a": { + "type": "object", + "properties": { "a": { "type": "boolean" } }, + "unevaluatedProperties": false + }, + "b": { + "type": "object", + "properties": { "b": { "type": "boolean" } }, + "unevaluatedProperties": false + }, + "c": { + "type": "object", + "properties": { "c": { "type": "boolean" } } + } + }, + "unevaluatedProperties": false + })); + + assert_schemas_eq(expected_schema, actual_schema); + } } diff --git a/scripts/generate-component-docs.rb b/scripts/generate-component-docs.rb index b9a7e98381696..ed47670470c6f 100755 --- a/scripts/generate-component-docs.rb +++ b/scripts/generate-component-docs.rb @@ -603,6 +603,17 @@ def expand_schema_references(root_schema, unexpanded_schema) } end + if !schema['anyOf'].nil? + schema['anyOf'] = schema['anyOf'].map { |subschema| + new_subschema = expand_schema_references(root_schema, subschema) + if new_subschema != subschema + expanded = true + end + + new_subschema + } + end + if !expanded break end @@ -914,10 +925,10 @@ def resolve_bare_schema(root_schema, schema) @logger.error "Relevant schema: #{JSON.pretty_generate(schema)}" exit 1 end - additional_properties['description'] = singular_description resolved_additional_properties = resolve_schema(root_schema, additional_properties) resolved_additional_properties['required'] = true + resolved_additional_properties['description'] = singular_description options.push(['*', resolved_additional_properties]) end diff --git a/website/cue/reference/components/sinks/base/gcp_stackdriver_logs.cue b/website/cue/reference/components/sinks/base/gcp_stackdriver_logs.cue index 6f73a93dd3cd8..646e9d3ec69a2 100644 --- a/website/cue/reference/components/sinks/base/gcp_stackdriver_logs.cue +++ b/website/cue/reference/components/sinks/base/gcp_stackdriver_logs.cue @@ -314,8 +314,12 @@ base: components: sinks: gcp_stackdriver_logs: configuration: { } } resource: { - description: "The monitored resource to associate the logs with." - required: true + description: """ + A monitored resource. + + The monitored resource to associate the logs with. + """ + required: true type: object: { examples: [{ instanceId: "Twilight" diff --git a/website/cue/reference/components/sinks/base/gcp_stackdriver_metrics.cue b/website/cue/reference/components/sinks/base/gcp_stackdriver_metrics.cue index f0edf5c8474a2..6f58d57ca46a9 100644 --- a/website/cue/reference/components/sinks/base/gcp_stackdriver_metrics.cue +++ b/website/cue/reference/components/sinks/base/gcp_stackdriver_metrics.cue @@ -256,8 +256,12 @@ base: components: sinks: gcp_stackdriver_metrics: configuration: { } } resource: { - description: "The monitored resource to associate the metrics with." - required: true + description: """ + A monitored resource. + + The monitored resource to associate the metrics with. + """ + required: true type: object: { examples: [{ instanceId: "Twilight" diff --git a/website/cue/reference/components/sinks/base/http.cue b/website/cue/reference/components/sinks/base/http.cue index edc96ae57ba11..7eefce772c920 100644 --- a/website/cue/reference/components/sinks/base/http.cue +++ b/website/cue/reference/components/sinks/base/http.cue @@ -310,8 +310,12 @@ base: components: sinks: http: configuration: { } } method: { - description: "The HTTP method to use when making the request." - required: false + description: """ + HTTP method. + + The HTTP method to use when making the request. + """ + required: false type: string: { default: "post" enum: {