-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathdata.proto
1872 lines (1494 loc) · 63.7 KB
/
data.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.events.cloud.dataplex.v1;
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
option csharp_namespace = "Google.Events.Protobuf.Cloud.Dataplex.V1";
// A lake is a centralized repository for managing enterprise data across the
// organization distributed across many cloud projects, and stored in a variety
// of storage services such as Google Cloud Storage and BigQuery. The resources
// attached to a lake are referred to as managed resources. Data within these
// managed resources can be structured or unstructured. A lake provides data
// admins with tools to organize, secure and manage their data at scale, and
// provides data scientists and data engineers an integrated experience to
// easily search, discover, analyze and transform data and associated metadata.
message Lake {
  // Settings to manage association of Dataproc Metastore with a lake.
  message Metastore {
    // Optional. A relative reference to the Dataproc Metastore
    // (https://cloud.google.com/dataproc-metastore/docs) service associated
    // with the lake:
    // `projects/{project_id}/locations/{location_id}/services/{service_id}`
    string service = 1;
  }

  // Status of Lake and Dataproc Metastore service instance association.
  message MetastoreStatus {
    // Current state of association.
    enum State {
      // Unspecified.
      STATE_UNSPECIFIED = 0;

      // A Metastore service instance is not associated with the lake.
      NONE = 1;

      // A Metastore service instance is attached to the lake.
      READY = 2;

      // Attach/detach is in progress.
      UPDATING = 3;

      // Attach/detach could not be done due to errors.
      ERROR = 4;
    }

    // Current state of association.
    State state = 1;

    // Additional information about the current status.
    string message = 2;

    // Last update time of the metastore status of the lake.
    google.protobuf.Timestamp update_time = 3;

    // The URI of the endpoint used to access the Metastore service.
    string endpoint = 4;
  }

  // Output only. The relative resource name of the lake, of the form:
  // `projects/{project_number}/locations/{location_id}/lakes/{lake_id}`.
  string name = 1;

  // Optional. User friendly display name.
  string display_name = 2;

  // Output only. System generated globally unique ID for the lake. This ID will
  // be different if the lake is deleted and re-created with the same name.
  string uid = 3;

  // Output only. The time when the lake was created.
  google.protobuf.Timestamp create_time = 4;

  // Output only. The time when the lake was last updated.
  google.protobuf.Timestamp update_time = 5;

  // Optional. User-defined labels for the lake.
  map<string, string> labels = 6;

  // Optional. Description of the lake.
  string description = 7;

  // Output only. Current state of the lake. Uses the shared top-level `State`
  // enum defined in this file.
  State state = 8;

  // Output only. Service account associated with this lake. This service
  // account must be authorized to access or operate on resources managed by the
  // lake.
  string service_account = 9;

  // Optional. Settings to manage lake and Dataproc Metastore service instance
  // association.
  Metastore metastore = 102;

  // Output only. Aggregated status of the underlying assets of the lake.
  AssetStatus asset_status = 103;

  // Output only. Metastore status of the lake.
  MetastoreStatus metastore_status = 104;
}
// Aggregated status of the underlying assets of a lake or zone.
message AssetStatus {
  // Last update time of the status.
  google.protobuf.Timestamp update_time = 1;

  // Number of active assets.
  int32 active_assets = 2;

  // Number of assets that are in process of updating the security policy on
  // attached resources.
  int32 security_policy_applying_assets = 3;
}
// A zone represents a logical group of related assets within a lake. A zone can
// be used to map to organizational structure or represent stages of data
// readiness from raw to curated. It provides managing behavior that is shared
// or inherited by all contained assets.
message Zone {
  // Settings for resources attached as assets within a zone.
  message ResourceSpec {
    // Location type of the resources attached to a zone.
    enum LocationType {
      // Unspecified location type.
      LOCATION_TYPE_UNSPECIFIED = 0;

      // Resources that are associated with a single region.
      SINGLE_REGION = 1;

      // Resources that are associated with a multi-region location.
      MULTI_REGION = 2;
    }

    // Required. Immutable. The location type of the resources that are allowed
    // to be attached to the assets within this zone.
    LocationType location_type = 1;
  }

  // Settings to manage the metadata discovery and publishing in a zone.
  message DiscoverySpec {
    // Describe CSV and similar semi-structured data formats.
    message CsvOptions {
      // Optional. The number of rows to interpret as header rows that should be
      // skipped when reading data rows.
      int32 header_rows = 1;

      // Optional. The delimiter being used to separate values. This defaults to
      // ','.
      string delimiter = 2;

      // Optional. The character encoding of the data. The default is UTF-8.
      string encoding = 3;

      // Optional. Whether to disable the inference of data type for CSV data.
      // If true, all columns will be registered as strings.
      bool disable_type_inference = 4;
    }

    // Describe JSON data format.
    message JsonOptions {
      // Optional. The character encoding of the data. The default is UTF-8.
      string encoding = 1;

      // Optional. Whether to disable the inference of data type for Json data.
      // If true, all columns will be registered as their primitive types
      // (strings, number or boolean).
      bool disable_type_inference = 2;
    }

    // Required. Whether discovery is enabled.
    bool enabled = 1;

    // Optional. The list of patterns to apply for selecting data to include
    // during discovery if only a subset of the data should be considered. For
    // Cloud Storage bucket assets, these are interpreted as glob patterns used
    // to match object names. For BigQuery dataset assets, these are interpreted
    // as patterns to match table names.
    repeated string include_patterns = 2;

    // Optional. The list of patterns to apply for selecting data to exclude
    // during discovery. For Cloud Storage bucket assets, these are interpreted
    // as glob patterns used to match object names. For BigQuery dataset assets,
    // these are interpreted as patterns to match table names.
    repeated string exclude_patterns = 3;

    // Optional. Configuration for CSV data.
    CsvOptions csv_options = 4;

    // Optional. Configuration for Json data.
    JsonOptions json_options = 5;

    // Determines when discovery is triggered.
    oneof trigger {
      // Optional. Cron schedule (https://en.wikipedia.org/wiki/Cron) for
      // running discovery periodically. Successive discovery runs must be
      // scheduled at least 60 minutes apart. The default value is to run
      // discovery every 60 minutes. To explicitly set a timezone to the cron
      // tab, apply a prefix in the cron tab: "CRON_TZ=${IANA_TIME_ZONE}" or
      // "TZ=${IANA_TIME_ZONE}". The ${IANA_TIME_ZONE} may only be a valid
      // string from IANA time zone database. For example,
      // `CRON_TZ=America/New_York 1 * * * *`, or
      // `TZ=America/New_York 1 * * * *`.
      string schedule = 10;
    }
  }

  // Type of zone.
  enum Type {
    // Zone type not specified.
    TYPE_UNSPECIFIED = 0;

    // A zone that contains data that needs further processing before it is
    // considered generally ready for consumption and analytics workloads.
    RAW = 1;

    // A zone that contains data that is considered to be ready for broader
    // consumption and analytics workloads. Curated structured data stored in
    // Cloud Storage must conform to certain file formats (parquet, avro and
    // orc) and organized in a hive-compatible directory layout.
    CURATED = 2;
  }

  // Output only. The relative resource name of the zone, of the form:
  // `projects/{project_number}/locations/{location_id}/lakes/{lake_id}/zones/{zone_id}`.
  string name = 1;

  // Optional. User friendly display name.
  string display_name = 2;

  // Output only. System generated globally unique ID for the zone. This ID will
  // be different if the zone is deleted and re-created with the same name.
  string uid = 3;

  // Output only. The time when the zone was created.
  google.protobuf.Timestamp create_time = 4;

  // Output only. The time when the zone was last updated.
  google.protobuf.Timestamp update_time = 5;

  // Optional. User defined labels for the zone.
  map<string, string> labels = 6;

  // Optional. Description of the zone.
  string description = 7;

  // Output only. Current state of the zone. Uses the shared top-level `State`
  // enum defined in this file.
  State state = 8;

  // Required. Immutable. The type of the zone.
  Type type = 9;

  // Optional. Specification of the discovery feature applied to data in this
  // zone.
  DiscoverySpec discovery_spec = 103;

  // Required. Specification of the resources that are referenced by the assets
  // within this zone.
  ResourceSpec resource_spec = 104;

  // Output only. Aggregated status of the underlying assets of the zone.
  AssetStatus asset_status = 105;
}
// An asset represents a cloud resource that is being managed within a lake as a
// member of a zone.
message Asset {
  // Security policy status of the asset. Data security policy, i.e., readers,
  // writers & owners, should be specified in the lake/zone/asset IAM policy.
  message SecurityStatus {
    // The state of the security policy.
    enum State {
      // State unspecified.
      STATE_UNSPECIFIED = 0;

      // Security policy has been successfully applied to the attached resource.
      READY = 1;

      // Security policy is in the process of being applied to the attached
      // resource.
      APPLYING = 2;

      // Security policy could not be applied to the attached resource due to
      // errors.
      ERROR = 3;
    }

    // The current state of the security policy applied to the attached
    // resource.
    State state = 1;

    // Additional information about the current state.
    string message = 2;

    // Last update time of the status.
    google.protobuf.Timestamp update_time = 3;
  }

  // Settings to manage the metadata discovery and publishing for an asset.
  message DiscoverySpec {
    // Describe CSV and similar semi-structured data formats.
    message CsvOptions {
      // Optional. The number of rows to interpret as header rows that should be
      // skipped when reading data rows.
      int32 header_rows = 1;

      // Optional. The delimiter being used to separate values. This defaults to
      // ','.
      string delimiter = 2;

      // Optional. The character encoding of the data. The default is UTF-8.
      string encoding = 3;

      // Optional. Whether to disable the inference of data type for CSV data.
      // If true, all columns will be registered as strings.
      bool disable_type_inference = 4;
    }

    // Describe JSON data format.
    message JsonOptions {
      // Optional. The character encoding of the data. The default is UTF-8.
      string encoding = 1;

      // Optional. Whether to disable the inference of data type for Json data.
      // If true, all columns will be registered as their primitive types
      // (strings, number or boolean).
      bool disable_type_inference = 2;
    }

    // Optional. Whether discovery is enabled.
    bool enabled = 1;

    // Optional. The list of patterns to apply for selecting data to include
    // during discovery if only a subset of the data should be considered. For
    // Cloud Storage bucket assets, these are interpreted as glob patterns used
    // to match object names. For BigQuery dataset assets, these are interpreted
    // as patterns to match table names.
    repeated string include_patterns = 2;

    // Optional. The list of patterns to apply for selecting data to exclude
    // during discovery. For Cloud Storage bucket assets, these are interpreted
    // as glob patterns used to match object names. For BigQuery dataset assets,
    // these are interpreted as patterns to match table names.
    repeated string exclude_patterns = 3;

    // Optional. Configuration for CSV data.
    CsvOptions csv_options = 4;

    // Optional. Configuration for Json data.
    JsonOptions json_options = 5;

    // Determines when discovery is triggered.
    oneof trigger {
      // Optional. Cron schedule (https://en.wikipedia.org/wiki/Cron) for
      // running discovery periodically. Successive discovery runs must be
      // scheduled at least 60 minutes apart. The default value is to run
      // discovery every 60 minutes. To explicitly set a timezone to the cron
      // tab, apply a prefix in the cron tab: "CRON_TZ=${IANA_TIME_ZONE}" or
      // "TZ=${IANA_TIME_ZONE}". The ${IANA_TIME_ZONE} may only be a valid
      // string from IANA time zone database. For example,
      // `CRON_TZ=America/New_York 1 * * * *`, or
      // `TZ=America/New_York 1 * * * *`.
      string schedule = 10;
    }
  }

  // Identifies the cloud resource that is referenced by this asset.
  message ResourceSpec {
    // Type of resource.
    enum Type {
      // Type not specified.
      TYPE_UNSPECIFIED = 0;

      // Cloud Storage bucket.
      STORAGE_BUCKET = 1;

      // BigQuery dataset.
      BIGQUERY_DATASET = 2;
    }

    // Access Mode determines how data stored within the resource is read. This
    // is only applicable to storage bucket assets.
    enum AccessMode {
      // Access mode unspecified.
      ACCESS_MODE_UNSPECIFIED = 0;

      // Default. Data is accessed directly using storage APIs.
      DIRECT = 1;

      // Data is accessed through a managed interface using BigQuery APIs.
      MANAGED = 2;
    }

    // Immutable. Relative name of the cloud resource that contains the data
    // that is being managed within a lake. For example:
    // `projects/{project_number}/buckets/{bucket_id}`
    // `projects/{project_number}/datasets/{dataset_id}`
    string name = 1;

    // Required. Immutable. Type of resource.
    Type type = 2;

    // Optional. Determines how read permissions are handled for each asset and
    // their associated tables. Only available to storage buckets assets.
    AccessMode read_access_mode = 5;
  }

  // Status of the resource referenced by an asset.
  message ResourceStatus {
    // The state of a resource.
    enum State {
      // State unspecified.
      STATE_UNSPECIFIED = 0;

      // Resource does not have any errors.
      READY = 1;

      // Resource has errors.
      ERROR = 2;
    }

    // The current state of the managed resource.
    State state = 1;

    // Additional information about the current state.
    string message = 2;

    // Last update time of the status.
    google.protobuf.Timestamp update_time = 3;

    // Output only. Service account associated with the BigQuery Connection.
    string managed_access_identity = 4;
  }

  // Status of discovery for an asset.
  message DiscoveryStatus {
    // The aggregated data statistics for the asset reported by discovery.
    message Stats {
      // The count of data items within the referenced resource.
      int64 data_items = 1;

      // The number of stored data bytes within the referenced resource.
      int64 data_size = 2;

      // The count of table entities within the referenced resource.
      int64 tables = 3;

      // The count of fileset entities within the referenced resource.
      int64 filesets = 4;
    }

    // Current state of discovery.
    enum State {
      // State is unspecified.
      STATE_UNSPECIFIED = 0;

      // Discovery for the asset is scheduled.
      SCHEDULED = 1;

      // Discovery for the asset is running.
      IN_PROGRESS = 2;

      // Discovery for the asset is currently paused (e.g. due to a lack
      // of available resources). It will be automatically resumed.
      PAUSED = 3;

      // Discovery for the asset is disabled.
      // (Value 4 is skipped in the published schema.)
      DISABLED = 5;
    }

    // The current status of the discovery feature.
    State state = 1;

    // Additional information about the current state.
    string message = 2;

    // Last update time of the status.
    google.protobuf.Timestamp update_time = 3;

    // The start time of the last discovery run.
    google.protobuf.Timestamp last_run_time = 4;

    // Data Stats of the asset reported by discovery.
    Stats stats = 6;

    // The duration of the last discovery run.
    google.protobuf.Duration last_run_duration = 7;
  }

  // Output only. The relative resource name of the asset, of the form:
  // `projects/{project_number}/locations/{location_id}/lakes/{lake_id}/zones/{zone_id}/assets/{asset_id}`.
  string name = 1;

  // Optional. User friendly display name.
  string display_name = 2;

  // Output only. System generated globally unique ID for the asset. This ID
  // will be different if the asset is deleted and re-created with the same
  // name.
  string uid = 3;

  // Output only. The time when the asset was created.
  google.protobuf.Timestamp create_time = 4;

  // Output only. The time when the asset was last updated.
  google.protobuf.Timestamp update_time = 5;

  // Optional. User defined labels for the asset.
  map<string, string> labels = 6;

  // Optional. Description of the asset.
  string description = 7;

  // Output only. Current state of the asset. Uses the shared top-level `State`
  // enum defined in this file.
  State state = 8;

  // Required. Specification of the resource that is referenced by this asset.
  ResourceSpec resource_spec = 100;

  // Output only. Status of the resource referenced by this asset.
  ResourceStatus resource_status = 101;

  // Output only. Status of the security policy applied to resource referenced
  // by this asset.
  SecurityStatus security_status = 103;

  // Optional. Specification of the discovery feature applied to data referenced
  // by this asset. When this spec is left unset, the asset will use the spec
  // set on the parent zone.
  DiscoverySpec discovery_spec = 106;

  // Output only. Status of the discovery feature applied to data referenced by
  // this asset.
  DiscoveryStatus discovery_status = 107;
}
// State of a resource. Shared by the `Lake`, `Zone`, `Asset`, and
// `Environment` messages in this file.
enum State {
  // State is not specified.
  STATE_UNSPECIFIED = 0;

  // Resource is active, i.e., ready to use.
  ACTIVE = 1;

  // Resource is under creation.
  CREATING = 2;

  // Resource is under deletion.
  DELETING = 3;

  // Resource is active but has unresolved actions.
  ACTION_REQUIRED = 4;
}
// Environment represents a user-visible compute infrastructure for analytics
// within a lake.
message Environment {
  // Configuration for the underlying infrastructure used to run workloads.
  message InfrastructureSpec {
    // Compute resources associated with the analyze interactive workloads.
    message ComputeResources {
      // Optional. Size in GB of the disk. Default is 100 GB.
      int32 disk_size_gb = 1;

      // Optional. Total number of nodes in the sessions created for this
      // environment.
      int32 node_count = 2;

      // Optional. Max configurable nodes.
      // If max_node_count > node_count, then auto-scaling is enabled.
      int32 max_node_count = 3;
    }

    // Software Runtime Configuration to run Analyze.
    message OsImageRuntime {
      // Required. Dataplex Image version.
      string image_version = 1;

      // Optional. List of Java jars to be included in the runtime environment.
      // Valid input includes Cloud Storage URIs to Jar binaries.
      // For example, gs://bucket-name/my/path/to/file.jar
      repeated string java_libraries = 2;

      // Optional. A list of python packages to be installed.
      // Valid formats include Cloud Storage URI to a PIP installable library.
      // For example, gs://bucket-name/my/path/to/lib.tar.gz
      repeated string python_packages = 3;

      // Optional. Spark properties to provide configuration for use in sessions
      // created for this environment. The properties to set on daemon config
      // files. Property keys are specified in `prefix:property` format. The
      // prefix must be "spark".
      map<string, string> properties = 4;
    }

    // Hardware config
    oneof resources {
      // Optional. Compute resources needed for analyze interactive workloads.
      ComputeResources compute = 50;
    }

    // Software config
    oneof runtime {
      // Required. Software Runtime Configuration for analyze interactive
      // workloads.
      OsImageRuntime os_image = 100;
    }
  }

  // Configuration for sessions created for this environment.
  message SessionSpec {
    // Optional. The idle time configuration of the session. The session will be
    // auto-terminated at the end of this period.
    google.protobuf.Duration max_idle_duration = 1;

    // Optional. If True, this causes sessions to be pre-created and available
    // for faster startup to enable interactive exploration use-cases. This
    // defaults to False to avoid additional billed charges. These can only be
    // set to True for the environment with name set to "default", and with
    // default configuration.
    bool enable_fast_startup = 2;
  }

  // Status of sessions created for this environment.
  message SessionStatus {
    // Output only. Queries over sessions to mark whether the environment is
    // currently active or not.
    bool active = 1;
  }

  // URI Endpoints to access sessions associated with the Environment.
  message Endpoints {
    // Output only. URI to serve notebook APIs.
    string notebooks = 1;

    // Output only. URI to serve SQL APIs.
    string sql = 2;
  }

  // Output only. The relative resource name of the environment, of the form:
  // projects/{project_id}/locations/{location_id}/lakes/{lake_id}/environment/{environment_id}
  string name = 1;

  // Optional. User friendly display name.
  string display_name = 2;

  // Output only. System generated globally unique ID for the environment. This
  // ID will be different if the environment is deleted and re-created with the
  // same name.
  string uid = 3;

  // Output only. Environment creation time.
  google.protobuf.Timestamp create_time = 4;

  // Output only. The time when the environment was last updated.
  google.protobuf.Timestamp update_time = 5;

  // Optional. User defined labels for the environment.
  map<string, string> labels = 6;

  // Optional. Description of the environment.
  string description = 7;

  // Output only. Current state of the environment. Uses the shared top-level
  // `State` enum defined in this file.
  State state = 8;

  // Required. Infrastructure specification for the Environment.
  InfrastructureSpec infrastructure_spec = 100;

  // Optional. Configuration for sessions created for this environment.
  SessionSpec session_spec = 101;

  // Output only. Status of sessions created for this environment.
  SessionStatus session_status = 102;

  // Output only. URI Endpoints to access sessions associated with the
  // Environment.
  Endpoints endpoints = 200;
}
// DataScan scheduling and trigger settings.
message Trigger {
  // The scan runs once via `RunDataScan` API.
  message OnDemand {}

  // The scan is scheduled to run periodically.
  message Schedule {
    // Required. [Cron](https://en.wikipedia.org/wiki/Cron) schedule for running
    // scans periodically.
    //
    // To explicitly set a timezone in the cron tab, apply a prefix in the
    // cron tab: **"CRON_TZ=${IANA_TIME_ZONE}"** or **"TZ=${IANA_TIME_ZONE}"**.
    // The **${IANA_TIME_ZONE}** may only be a valid string from IANA time zone
    // database
    // ([wikipedia](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List)).
    // For example, `CRON_TZ=America/New_York 1 * * * *`, or
    // `TZ=America/New_York 1 * * * *`.
    //
    // This field is required for Schedule scans.
    string cron = 1;
  }

  // DataScan scheduling and trigger settings.
  //
  // If not specified, the default is `onDemand`.
  oneof mode {
    // The scan runs once via `RunDataScan` API.
    OnDemand on_demand = 100;

    // The scan is scheduled to run periodically.
    Schedule schedule = 101;
  }
}
// The data source for DataScan.
message DataSource {
  // The source is required and immutable. Once it is set, it cannot be changed
  // to another source.
  oneof source {
    // Immutable. The Dataplex entity that represents the data source (e.g.
    // BigQuery table) for DataScan, of the form:
    // `projects/{project_number}/locations/{location_id}/lakes/{lake_id}/zones/{zone_id}/entities/{entity_id}`.
    string entity = 100;
  }
}
// The data scanned during processing (e.g. in incremental DataScan)
message ScannedData {
  // A data range denoted by a pair of start/end values of a field.
  message IncrementalField {
    // The field that contains values which monotonically increases over time
    // (e.g. a timestamp column).
    string field = 1;

    // Value that marks the start of the range.
    string start = 2;

    // Value that marks the end of the range.
    string end = 3;
  }

  // The range of scanned data.
  oneof data_range {
    // The range denoted by values of an incremental field.
    IncrementalField incremental_field = 1;
  }
}
// DataProfileScan related setting. Currently carries no options; defined as a
// message so settings can be added later without breaking callers.
message DataProfileSpec {}
// DataProfileResult defines the output of DataProfileScan. Each field of the
// table will have field type specific profile result.
message DataProfileResult {
  // Contains name, type, mode and field type specific profile information.
  message Profile {
    // A field within a table.
    message Field {
      // The profile information for each field type.
      message ProfileInfo {
        // The profile information for a string type field.
        message StringFieldInfo {
          // Minimum length of non-null values in the scanned data.
          int64 min_length = 1;

          // Maximum length of non-null values in the scanned data.
          int64 max_length = 2;

          // Average length of non-null values in the scanned data.
          double average_length = 3;
        }

        // The profile information for an integer type field.
        message IntegerFieldInfo {
          // Average of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double average = 1;

          // Standard deviation of non-null values in the scanned data. NaN, if
          // the field has a NaN.
          double standard_deviation = 3;

          // Minimum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          int64 min = 4;

          // A quartile divides the number of data points into four parts, or
          // quarters, of more-or-less equal size. Three main quartiles used
          // are: The first quartile (Q1) splits off the lowest 25% of data from
          // the highest 75%. It is also known as the lower or 25th empirical
          // quartile, as 25% of the data is below this point. The second
          // quartile (Q2) is the median of a data set. So, 50% of the data lies
          // below this point. The third quartile (Q3) splits off the highest
          // 25% of data from the lowest 75%. It is known as the upper or 75th
          // empirical quartile, as 75% of the data lies below this point.
          // Here, the quartiles are provided as an ordered list of quartile
          // values for the scanned data, occurring in order Q1, median, Q3.
          repeated int64 quartiles = 6;

          // Maximum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          int64 max = 5;
        }

        // The profile information for a double type field.
        message DoubleFieldInfo {
          // Average of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double average = 1;

          // Standard deviation of non-null values in the scanned data. NaN, if
          // the field has a NaN.
          double standard_deviation = 3;

          // Minimum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double min = 4;

          // A quartile divides the number of data points into four parts, or
          // quarters, of more-or-less equal size. Three main quartiles used
          // are: The first quartile (Q1) splits off the lowest 25% of data from
          // the highest 75%. It is also known as the lower or 25th empirical
          // quartile, as 25% of the data is below this point. The second
          // quartile (Q2) is the median of a data set. So, 50% of the data lies
          // below this point. The third quartile (Q3) splits off the highest
          // 25% of data from the lowest 75%. It is known as the upper or 75th
          // empirical quartile, as 75% of the data lies below this point.
          // Here, the quartiles are provided as an ordered list of quartile
          // values for the scanned data, occurring in order Q1, median, Q3.
          repeated double quartiles = 6;

          // Maximum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double max = 5;
        }

        // Top N non-null values in the scanned data.
        message TopNValue {
          // String value of a top N non-null value.
          string value = 1;

          // Count of the corresponding value in the scanned data.
          int64 count = 2;
        }

        // Ratio of rows with null value against total scanned rows.
        double null_ratio = 2;

        // Ratio of rows with distinct values against total scanned rows.
        // Not available for complex non-groupable field type RECORD and fields
        // with REPEATABLE mode.
        double distinct_ratio = 3;

        // The list of top N non-null values and number of times they occur in
        // the scanned data. N is 10 or equal to the number of distinct values
        // in the field, whichever is smaller. Not available for complex
        // non-groupable field type RECORD and fields with REPEATABLE mode.
        repeated TopNValue top_n_values = 4;

        // Structural and profile information for specific field type. Not
        // available, if mode is REPEATABLE.
        oneof field_info {
          // String type field information.
          StringFieldInfo string_profile = 101;

          // Integer type field information.
          IntegerFieldInfo integer_profile = 102;

          // Double type field information.
          DoubleFieldInfo double_profile = 103;
        }
      }

      // The name of the field.
      string name = 1;

      // The field data type. Possible values include:
      //
      // * STRING
      // * BYTE
      // * INT64
      // * INT32
      // * INT16
      // * DOUBLE
      // * FLOAT
      // * DECIMAL
      // * BOOLEAN
      // * BINARY
      // * TIMESTAMP
      // * DATE
      // * TIME
      // * NULL
      // * RECORD
      string type = 2;

      // The mode of the field. Possible values include:
      //
      // * REQUIRED, if it is a required field.
      // * NULLABLE, if it is an optional field.
      // * REPEATED, if it is a repeated field.
      string mode = 3;

      // Profile information for the corresponding field.
      ProfileInfo profile = 4;
    }

    // List of fields with structural and profile information for each field.
    repeated Field fields = 2;
  }

  // The count of rows scanned.
  int64 row_count = 3;

  // The profile information per field.
  Profile profile = 4;

  // The data scanned for this result.
  ScannedData scanned_data = 5;
}
// DataQualityScan related setting.
message DataQualitySpec {
  // The list of rules to evaluate against a data source. At least one rule is
  // required.
  repeated DataQualityRule rules = 1;
}
// The output of a DataQualityScan.
message DataQualityResult {
  // Overall data quality result -- `true` if all rules passed.
  bool passed = 5;

  // A list of results at the dimension level.
  repeated DataQualityDimensionResult dimensions = 2;

  // A list of all the rules in a job, and their results.
  repeated DataQualityRuleResult rules = 3;

  // The count of rows processed.
  int64 row_count = 4;

  // The data scanned for this result.
  ScannedData scanned_data = 7;
}
// DataQualityRuleResult provides a more detailed, per-rule view of the results.
message DataQualityRuleResult {
  // The rule specified in the DataQualitySpec, as is.
  DataQualityRule rule = 1;

  // Whether the rule passed or failed.
  bool passed = 7;

  // The number of rows a rule was evaluated against. This field is only valid
  // for ColumnMap type rules.
  //
  // Evaluated count can be configured to either
  //
  // * include all rows (default) - with `null` rows automatically failing rule
  // evaluation, or
  // * exclude `null` rows from the `evaluated_count`, by setting
  // `ignore_nulls = true`.
  int64 evaluated_count = 9;

  // The number of rows which passed a rule evaluation.
  // This field is only valid for ColumnMap type rules.
  int64 passed_count = 8;

  // The number of rows with null values in the specified column.
  int64 null_count = 5;

  // The ratio of **passed_count / evaluated_count**.
  // This field is only valid for ColumnMap type rules.
  double pass_ratio = 6;

  // The query to find rows that did not pass this rule.
  // Only applies to ColumnMap and RowCondition rules.
  string failing_rows_query = 10;
}
// DataQualityDimensionResult provides a more detailed, per-dimension view of
// the results.
message DataQualityDimensionResult {
  // Whether the dimension passed or failed.
  bool passed = 3;
}
// A rule captures data quality intent about a data source.