diff --git a/protos/BUILD b/protos/BUILD index 6a2e019f7..8f2723d5d 100644 --- a/protos/BUILD +++ b/protos/BUILD @@ -1,6 +1,6 @@ load("@rules_proto//proto:defs.bzl", "proto_library") -load("//tools:ts_proto_library.bzl", "ts_proto_library") load("//testing:build_test.bzl", "build_test") +load("//tools:ts_proto_library.bzl", "ts_proto_library") package(default_visibility = ["//visibility:public"]) @@ -9,6 +9,7 @@ proto_library( srcs = [ "configs.proto", "core.proto", + "data_preparation.proto", "evaluation.proto", "execution.proto", "profiles.proto", diff --git a/protos/configs.proto b/protos/configs.proto index 6bbd97220..66c423cec 100644 --- a/protos/configs.proto +++ b/protos/configs.proto @@ -515,6 +515,32 @@ message ActionConfig { // TODO(ekrekr): add a notebook runtime field definition. } + message DataPreparationConfig { + // The name of the data preparation. + string name = 1; + + // Targets of actions that this action is dependent on. + repeated Target dependency_targets = 2; + + // Path to the source file that the contents of the action is loaded from. + string filename = 3; + + // A list of user-defined tags with which the action should be labeled. + repeated string tags = 4; + + // If set to true, this action will not be executed. However, the action can + // still be depended upon. Useful for temporarily turning off broken + // actions. + bool disabled = 7; + + // Description of the data preparation. 
+ string description = 8; + + // When set to true, assertions dependent upon any dependency will + // be added as dependencies of this action. + bool depend_on_dependency_assertions = 9; + } + oneof action { TableConfig table = 1; ViewConfig view = 2; @@ -523,6 +549,7 @@ message ActionConfig { OperationConfig operation = 5; DeclarationConfig declaration = 6; NotebookConfig notebook = 7; + DataPreparationConfig data_preparation = 8; } } diff --git a/protos/core.proto b/protos/core.proto index 65ed933b9..f2f759d32 100644 --- a/protos/core.proto +++ b/protos/core.proto @@ -1,4 +1,5 @@ syntax = "proto3"; +import "data_preparation.proto"; package dataform; @@ -247,6 +248,24 @@ message NotebookRuntimeOptions { } } +// Data preparation related entries. +message DataPreparation { + // Data preparations can have more than 1 output. + repeated Target targets = 1; + + repeated Target canonical_targets = 2; + + repeated string tags = 3; + + repeated Target dependency_targets = 4; + + string file_name = 5; + + bool disabled = 6; + + DataPreparationDefinition data_preparation = 7; +} + message CompiledGraph { ProjectConfig project_config = 4; @@ -258,6 +277,7 @@ message CompiledGraph { // it is not used at runtime. 
repeated Test tests = 8; repeated Notebook notebooks = 12; + repeated DataPreparation data_preparations = 13; GraphErrors graph_errors = 7; diff --git a/protos/data_preparation.proto b/protos/data_preparation.proto new file mode 100644 index 000000000..aa988b024 --- /dev/null +++ b/protos/data_preparation.proto @@ -0,0 +1,174 @@ +syntax = "proto3"; + +package dataform; + +option java_package = "com.dataform.protos"; +option java_multiple_files = true; + +option go_package = "github.com/dataform-co/dataform/protos/dataform"; + +message DataPreparationDefinition { + repeated DataPreparationNode nodes = 1; + DataPreparationGenerated generated = 2; +} + +message DataPreparationNode { + string id = 1; + DataPreparationNodeSource source = 2; + repeated DataPreparationNodeStep steps = 3; + DataPreparationNodeGenerated generated = 4; + // Destination BigQuery table(s) are defined within the data preparation + optional DataPreparationNodeDestination destination = 5; +} + +message DataPreparationNodeSource { + oneof source { + string node_id = 1; + TableReference table = 2; + Join join = 3; + } +} + +message DataPreparationNodeDestination { + oneof destination { + TableReference table = 1; + } +} + +message DataPreparationNodeStep { + string id = 1; + string description = 2; + oneof definition { + ColumnStep column_step = 3; + FilterStep filter_step = 4; + } + DataPreparationNodeStepGenerated generated = 5; +} + +message ColumnStep { + string column_name = 1; + Expression expression = 2; +} + +message FilterStep { + Expression expression = 1; +} + +message Expression { + oneof expression { + string sql = 1; + } +} + +message Join { + string left_node_id = 1; + string right_node_id = 2; + JoinType join_type = 3; + JoinCondition join_condition = 4; +} + +enum JoinType { + JOIN_TYPE_UNSPECIFIED = 0; + JOIN_TYPE_INNER = 1; + JOIN_TYPE_FULL_OUTER = 2; + JOIN_TYPE_LEFT = 3; + JOIN_TYPE_RIGHT = 4; +} + +message JoinCondition { + oneof condition { + Expression expression = 1; + 
JoinKeys keys = 2; + } +} + +message JoinKeys { + repeated JoinKey keys = 1; +} + +message JoinKey { + string left_column = 1; + string right_column = 2; +} + +message TableReference { + string project = 1; + string dataset = 2; + string table = 3; +} + +message DataPreparationGenerated { + repeated DataPreparationValidationError validation_errors = 1; + optional string location = 2; +} + +message DataPreparationNodeGenerated { + repeated DataPreparationSection sections = 1; + repeated string sources = 2; + repeated DataPreparationValidationError validation_errors = 3; + optional DataPreparationSchema output_schema = 4; + DataPreparationNodeSourceGenerated source_generated = 5; + optional DataPreparationNodeDestinationGenerated destination_generated = 6; +} + +message DataPreparationSection { + DataPreparationSectionType type = 1; + string label = 2; +} + +enum DataPreparationSectionType { + SECTION_TYPE_UNSPECIFIED = 0; + SECTION_UNPARSEABLE = 1; + SECTION_SOURCE_TABLE = 2; + SECTION_SQL = 3; + SECTION_DESTINATION_TABLE = 4; +} + +message DataPreparationNodeSourceGenerated { + optional DataPreparationNodeSourceSourceSchema source_schema = 4; +} + +message DataPreparationNodeSourceSourceSchema { + oneof source_schema { + DataPreparationSchema node_schema = 1; + DataPreparationSchema table_schema = 2; + JoinSchema join_schema = 3; + } +} + +message JoinSchema { + DataPreparationSchema left_schema = 1; + DataPreparationSchema right_schema = 2; +} + +message DataPreparationNodeDestinationGenerated { + optional DataPreparationSchema schema = 1; +} + +message DataPreparationNodeStepGenerated { + repeated string source_columns = 1; + repeated DataPreparationValidationError validation_errors = 2; +} + +message DataPreparationSchema { + repeated DataPreparationSchemaField field = 1; +} + +message DataPreparationSchemaField { + string name = 1; + optional string type = 2; + optional string mode = 3; + repeated DataPreparationSchemaField fields = 4; +} + +message 
DataPreparationValidationError { + DataPreparationValidationErrorLevel level = 1; + string description = 2; +} + +enum DataPreparationValidationErrorLevel { + LEVEL_UNSPECIFIED = 0; + LEVEL_WARN = 1; + LEVEL_ERROR = 2; + LEVEL_FATAL = 3; +} \ No newline at end of file