diff --git a/docs/development/extensions-core/kafka-ingestion.md b/docs/development/extensions-core/kafka-ingestion.md index 58636e7f16f1..855ed2a44eb1 100644 --- a/docs/development/extensions-core/kafka-ingestion.md +++ b/docs/development/extensions-core/kafka-ingestion.md @@ -38,6 +38,7 @@ This topic covers how to submit a supervisor spec to ingest event data, also kno - For a walk-through, see the [Loading from Apache Kafka](../../tutorials/tutorial-kafka.md) tutorial. ## Kafka support + The Kafka indexing service supports transactional topics introduced in Kafka 0.11.x by default. The consumer for Kafka indexing service is incompatible with older Kafka brokers. If you are using an older version, refer to the [Kafka upgrade guide](https://kafka.apache.org/documentation/#upgrade). Additionally, you can set `isolation.level` to `read_uncommitted` in `consumerProperties` if either: @@ -51,6 +52,7 @@ If your Kafka cluster enables consumer-group based ACLs, you can set `group.id` To use the Kafka indexing service, load the `druid-kafka-indexing-service` extension on both the Overlord and the MiddleManagers. See [Loading extensions](../extensions.md#loading-extensions) for instructions on how to configure extensions. ## Define a supervisor spec + Similar to the ingestion spec for batch ingestion, the supervisor spec configures the data ingestion for Kafka streaming ingestion. A supervisor spec has the following sections: - `dataSchema` to specify the Druid datasource name, primary timestamp, dimensions, metrics, transforms, and any necessary filters. - `ioConfig` to configure Kafka connection settings and configure how Druid parses the data. Kafka-specific connection details go in the `consumerProperties`. The `ioConfig` is also where you define the input format (`inputFormat`) of your Kafka data. For supported formats for Kafka and information on how to configure the input format, see [Data formats](../../ingestion/data-formats.md). @@ -128,6 +130,7 @@ The following example demonstrates a supervisor spec for Kafka that uses the `JS ``` ### Kafka input format supervisor spec example + If you want to ingest data from other fields in addition to the Kafka message contents, you can use the `kafka` input format. The `kafka` input format lets you ingest: - the event key field - event headers @@ -141,7 +144,7 @@ For example, consider the following structure for a message that represents a fi - **Event timestamp**: "Nov. 10, 2021 at 14:06" When you use the `kafka` input format, you configure the way that Druid names the dimensions created from the Kafka message: -- `headerLabelPrefix`: Supply a prefix to the Kafka headers to avoid any conflicts with named dimensions. The default is `kafka.header`. Considering the header from the example, Druid maps the header to the following column: `kafka.header.environment`. +- `headerColumnPrefix`: Supply a prefix to the Kafka headers to avoid any conflicts with named dimensions. The default is `kafka.header`. Considering the header from the example, Druid maps the header to the following column: `kafka.header.environment`. - `timestampColumnName`: Supply a custom name for the Kafka timestamp in the Druid schema to avoid conflicts with other time columns. The default is `kafka.timestamp`. - `keyColumnName`: Supply the name for the Kafka key column in Druid. The default is `kafka.key`. Additionally, you must provide information about how Druid should parse the data in the Kafka message: @@ -159,7 +162,7 @@ Additionally, you must provide information about how Druid should parse the data For more information on data formats, see [Data formats](../../ingestion/data-formats.md). -Finally, add the Kafka message columns to the `dimensionsSpec`. For the key and timestamp, you can use the dimension names you defined for `keyColumnName` and `timestampColumnName`. For header dimensions, append the header key to the `headerLabelPrefix`. For example `kafka.header.environment`. +Finally, add the Kafka message columns to the `dimensionsSpec`. For the key and timestamp, you can use the dimension names you defined for `keyColumnName` and `timestampColumnName`. For header dimensions, append the header key to the `headerColumnPrefix`. For example `kafka.header.environment`. The following supervisor spec demonstrates how to ingest the Kafka header, key, and timestamp into Druid dimensions: ``` @@ -174,7 +177,7 @@ The following supervisor spec demonstrates how to ingest the Kafka header, key, "topic": "wiki-edits", "inputFormat": { "type": "kafka", - "headerLabelPrefix": "kafka.header.", + "headerColumnPrefix": "kafka.header.", "timestampColumnName": "kafka.timestamp", "keyColumnName": "kafka.key", "headerFormat": { diff --git a/docs/ingestion/data-formats.md b/docs/ingestion/data-formats.md index 7bf50956a55a..c975f885509f 100644 --- a/docs/ingestion/data-formats.md +++ b/docs/ingestion/data-formats.md @@ -170,7 +170,7 @@ Configure the Kafka `inputFormat` to load complete kafka records including heade | Field | Type | Description | Required | |-------|------|-------------|----------| | `type` | String | Set value to `kafka`. | yes | -| `headerLabelPrefix` | String | Custom label prefix for all the header columns. | no (default = "kafka.header.") | +| `headerColumnPrefix` | String | Custom prefix for all the header columns. | no (default = "kafka.header.") | | `timestampColumnName` | String | Name of the column for the kafka record's timestamp.| no (default = "kafka.timestamp") | | `keyColumnName` | String | Name of the column for the kafka record's key.| no (default = "kafka.key") | | `headerFormat` | Object | `headerFormat` specifies how to parse the Kafka headers. Supports String types. Because Kafka header values are bytes, the parser decodes them as UTF-8 encoded strings. To change this behavior, implement your own parser based on the encoding style. Change the 'encoding' type in `KafkaStringHeaderFormat` to match your custom implementation. | no | @@ -183,7 +183,7 @@ For example: "ioConfig": { "inputFormat": { "type": "kafka", - "headerLabelPrefix": "kafka.header.", + "headerColumnPrefix": "kafka.header.", "timestampColumnName": "kafka.timestamp", "keyColumnName": "kafka.key", "headerFormat": diff --git a/licenses.yaml b/licenses.yaml index 238af9db034e..3df70dab9430 100644 --- a/licenses.yaml +++ b/licenses.yaml @@ -5733,7 +5733,7 @@ license_category: binary module: web-console license_name: Apache License version 2.0 copyright: Imply Data -version: 0.18.2 +version: 0.18.3 --- diff --git a/web-console/e2e-tests/reindexing.spec.ts b/web-console/e2e-tests/reindexing.spec.ts index ebf75c0e78fa..9a9cd617df30 100644 --- a/web-console/e2e-tests/reindexing.spec.ts +++ b/web-console/e2e-tests/reindexing.spec.ts @@ -116,7 +116,7 @@ function validateConnectLocalData(preview: string) { expect(lines.length).toBe(500); const firstLine = lines[0]; expect(firstLine).toBe( - 'Druid row: {' + + '[Druid row: {' + '"__time":1442018818771' + ',"channel":"#en.wikipedia"' + ',"comment":"added project"' + @@ -131,11 +131,11 @@ function validateConnectLocalData(preview: string) { ',"added":36' + ',"deleted":0' + ',"delta":36' + - '}', + '}]', ); const lastLine = lines[lines.length - 1]; expect(lastLine).toBe( - 'Druid row: {' + + '[Druid row: {' + '"__time":1442020314823' + ',"channel":"#en.wikipedia"' + ',"comment":"/* History */[[WP:AWB/T|Typo fixing]], [[WP:AWB/T|typo(s) fixed]]: nothern → northern using [[Project:AWB|AWB]]"' + @@ -150,7 +150,7 @@ function validateConnectLocalData(preview: string) { ',"added":1' + ',"deleted":0' + ',"delta":1' + - '}', + '}]', ); } diff --git a/web-console/package-lock.json b/web-console/package-lock.json index b9aefc10dead..4d0db223d82d 100644 --- a/web-console/package-lock.json +++ b/web-console/package-lock.json @@ -22,7 +22,7 @@ "d3-axis": "^2.1.0", "d3-scale": "^3.3.0", "d3-selection": "^2.0.0", - "druid-query-toolkit": "^0.18.2", + "druid-query-toolkit": "^0.18.3", "file-saver": "^2.0.2", "follow-redirects": "^1.14.7", "fontsource-open-sans": "^3.0.9", @@ -8211,9 +8211,9 @@ } }, "node_modules/druid-query-toolkit": { - "version": "0.18.2", - "resolved": "https://registry.npmjs.org/druid-query-toolkit/-/druid-query-toolkit-0.18.2.tgz", - "integrity": "sha512-MUqTm6wW+clI0pVeK9RIdB8svWK6mu44zsAw8BSVZYYKchigbBzTgwJe0vAYFBfR0TPjD1gJl62pSw4g0F14fQ==", + "version": "0.18.3", + "resolved": "https://registry.npmjs.org/druid-query-toolkit/-/druid-query-toolkit-0.18.3.tgz", + "integrity": "sha512-Za2U2NsFyun5HXeWnLCICnTFzZp4aC17aSOjgVbQgEWZNMPht51U4paE3SVhPDObkWDjDUYAqVv+mO+ZyMx9Og==", "dependencies": { "tslib": "^2.3.1" }, @@ -32625,9 +32625,9 @@ } }, "druid-query-toolkit": { - "version": "0.18.2", - "resolved": "https://registry.npmjs.org/druid-query-toolkit/-/druid-query-toolkit-0.18.2.tgz", - "integrity": "sha512-MUqTm6wW+clI0pVeK9RIdB8svWK6mu44zsAw8BSVZYYKchigbBzTgwJe0vAYFBfR0TPjD1gJl62pSw4g0F14fQ==", + "version": "0.18.3", + "resolved": "https://registry.npmjs.org/druid-query-toolkit/-/druid-query-toolkit-0.18.3.tgz", + "integrity": "sha512-Za2U2NsFyun5HXeWnLCICnTFzZp4aC17aSOjgVbQgEWZNMPht51U4paE3SVhPDObkWDjDUYAqVv+mO+ZyMx9Og==", "requires": { "tslib": "^2.3.1" } diff --git a/web-console/package.json b/web-console/package.json index 18e99efec43a..e915f8658d5b 100644 --- a/web-console/package.json +++ b/web-console/package.json @@ -76,7 +76,7 @@ "d3-axis": "^2.1.0", "d3-scale": "^3.3.0", "d3-selection": "^2.0.0", - "druid-query-toolkit": "^0.18.2", + "druid-query-toolkit": "^0.18.3", "file-saver": "^2.0.2", "follow-redirects": "^1.14.7", "fontsource-open-sans": "^3.0.9", diff --git a/web-console/script/create-sql-docs.js b/web-console/script/create-sql-docs.js index 91dfceffad4c..561db7e2cc46 100755 --- a/web-console/script/create-sql-docs.js +++ b/web-console/script/create-sql-docs.js @@ -23,7 +23,7 @@ const snarkdown = require('snarkdown'); const writefile = 'lib/sql-docs.js'; -const MINIMUM_EXPECTED_NUMBER_OF_FUNCTIONS = 164; +const MINIMUM_EXPECTED_NUMBER_OF_FUNCTIONS = 167; const MINIMUM_EXPECTED_NUMBER_OF_DATA_TYPES = 14; const initialFunctionDocs = { diff --git a/web-console/src/components/auto-form/auto-form.tsx b/web-console/src/components/auto-form/auto-form.tsx index 5c2a229a4f00..a1e19174ffcd 100644 --- a/web-console/src/components/auto-form/auto-form.tsx +++ b/web-console/src/components/auto-form/auto-form.tsx @@ -70,7 +70,7 @@ export interface Field { hide?: Functor; hideInMore?: Functor; valueAdjustment?: (value: any) => any; - adjustment?: (model: Partial) => Partial; + adjustment?: (model: Partial, oldModel: Partial) => Partial; issueWithValue?: (value: any) => string | undefined; customSummary?: (v: any) => string; @@ -217,7 +217,7 @@ export class AutoForm> extends React.PureComponent } if (field.adjustment) { - newModel = field.adjustment(newModel); + newModel = field.adjustment(newModel, model); } this.modelChange(newModel); diff --git a/web-console/src/console-application.tsx b/web-console/src/console-application.tsx index 5683a101b54a..2a99044deecb 100644 --- a/web-console/src/console-application.tsx +++ b/web-console/src/console-application.tsx @@ -45,7 +45,6 @@ import { import './console-application.scss'; export interface ConsoleApplicationProps { - exampleManifestsUrl?: string; defaultQueryContext?: Record; mandatoryQueryContext?: Record; } @@ -213,15 +212,12 @@ export class ConsoleApplication extends React.PureComponent< }; private readonly wrappedDataLoaderView = () => { - const { exampleManifestsUrl } = this.props; - return this.wrapInViewContainer( 'data-loader', , 'narrow-pad', @@ -241,14 +237,11 @@ export class ConsoleApplication extends React.PureComponent< }; private readonly wrappedClassicBatchDataLoaderView = () => { - const { exampleManifestsUrl } = this.props; - return this.wrapInViewContainer( 'classic-batch-data-loader', , 'narrow-pad', diff --git a/web-console/src/druid-models/dimension-spec/dimension-spec.spec.ts b/web-console/src/druid-models/dimension-spec/dimension-spec.spec.ts deleted file mode 100644 index 8d68d41df0c0..000000000000 --- a/web-console/src/druid-models/dimension-spec/dimension-spec.spec.ts +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import { CSV_SAMPLE, JSON_SAMPLE } from '../../utils/sampler.mock'; - -import { getDimensionSpecs } from './dimension-spec'; - -describe('dimension-spec', () => { - describe('getDimensionSpecs', () => { - it('works for empty', () => { - expect(getDimensionSpecs({ header: ['header'], rows: [] }, {}, false, true)).toEqual([ - 'header', - ]); - }); - - it('works with json', () => { - expect(getDimensionSpecs(JSON_SAMPLE, {}, false, false)).toEqual([ - 'timestamp', - 'user', - { - name: 'followers', - type: 'long', - }, - { - name: 'spend', - type: 'double', - }, - 'id', - 'tags', - 'nums', - ]); - - expect(getDimensionSpecs(JSON_SAMPLE, {}, false, true)).toEqual([ - 'timestamp', - 'user', - 'id', - 'tags', - 'nums', - ]); - }); - - it('works with csv', () => { - expect(getDimensionSpecs(CSV_SAMPLE, {}, true, false)).toEqual([ - 'timestamp', - 'user', - { - name: 'followers', - type: 'long', - }, - { - name: 'spend', - type: 'double', - }, - { - name: 'id', - type: 'long', - }, - 'tags', - 'nums', - ]); - - expect(getDimensionSpecs(CSV_SAMPLE, {}, true, true)).toEqual([ - 'timestamp', - 'user', - 'tags', - 'nums', - ]); - }); - }); -}); diff --git a/web-console/src/druid-models/dimension-spec/dimension-spec.ts b/web-console/src/druid-models/dimension-spec/dimension-spec.ts index d679dc91d3ea..2b8e7c02025d 100644 --- a/web-console/src/druid-models/dimension-spec/dimension-spec.ts +++ b/web-console/src/druid-models/dimension-spec/dimension-spec.ts @@ -18,13 +18,16 @@ import type { Field } from '../../components'; import { filterMap, typeIs } from '../../utils'; -import type { SampleHeaderAndRows } from '../../utils/sampler'; -import { guessColumnTypeFromHeaderAndRows } from '../ingestion-spec/ingestion-spec'; +import type { SampleResponse } from '../../utils/sampler'; +import { getHeaderNamesFromSampleResponse } from '../../utils/sampler'; +import { guessColumnTypeFromSampleResponse } from '../ingestion-spec/ingestion-spec'; export interface DimensionsSpec { readonly dimensions?: (string | DimensionSpec)[]; readonly dimensionExclusions?: string[]; readonly spatialDimensions?: any[]; + readonly includeAllDimensions?: boolean; + readonly useSchemaDiscovery?: boolean; } export interface DimensionSpec { @@ -77,20 +80,19 @@ export function inflateDimensionSpec(dimensionSpec: string | DimensionSpec): Dim } export function getDimensionSpecs( - headerAndRows: SampleHeaderAndRows, + sampleResponse: SampleResponse, typeHints: Record, guessNumericStringsAsNumbers: boolean, hasRollup: boolean, ): (string | DimensionSpec)[] { - return filterMap(headerAndRows.header, h => { - if (h === '__time') return; - const type = + return filterMap(getHeaderNamesFromSampleResponse(sampleResponse, true), h => { + const dimensionType = typeHints[h] || - guessColumnTypeFromHeaderAndRows(headerAndRows, h, guessNumericStringsAsNumbers); - if (type === 'string') return h; + guessColumnTypeFromSampleResponse(sampleResponse, h, guessNumericStringsAsNumbers); + if (dimensionType === 'string') return h; if (hasRollup) return; return { - type, + type: dimensionType === 'COMPLEX' ? 'json' : dimensionType, name: h, }; }); diff --git a/web-console/src/druid-models/ingestion-spec/ingestion-spec.spec.ts b/web-console/src/druid-models/ingestion-spec/ingestion-spec.spec.ts index 91bf75f35b57..03a7cd1d26d1 100644 --- a/web-console/src/druid-models/ingestion-spec/ingestion-spec.spec.ts +++ b/web-console/src/druid-models/ingestion-spec/ingestion-spec.spec.ts @@ -16,15 +16,15 @@ * limitations under the License. */ -import { CSV_SAMPLE } from '../../utils/sampler.mock'; +import { CSV_SAMPLE, JSON_SAMPLE } from '../../utils/sampler.mock'; import type { IngestionSpec } from './ingestion-spec'; import { adjustId, cleanSpec, - guessColumnTypeFromHeaderAndRows, guessColumnTypeFromInput, - guessInputFormat, + guessColumnTypeFromSampleResponse, + guessSimpleInputFormat, updateSchemaWithSample, upgradeSpec, } from './ingestion-spec'; @@ -565,26 +565,26 @@ describe('ingestion-spec', () => { }); }); - describe('guessInputFormat', () => { + describe('guessSimpleInputFormat', () => { it('works for parquet', () => { - expect(guessInputFormat(['PAR1lol']).type).toEqual('parquet'); + expect(guessSimpleInputFormat(['PAR1lol']).type).toEqual('parquet'); }); it('works for orc', () => { - expect(guessInputFormat(['ORClol']).type).toEqual('orc'); + expect(guessSimpleInputFormat(['ORClol']).type).toEqual('orc'); }); it('works for AVRO', () => { - expect(guessInputFormat(['Obj\x01lol']).type).toEqual('avro_ocf'); - expect(guessInputFormat(['Obj1lol']).type).toEqual('regex'); + expect(guessSimpleInputFormat(['Obj\x01lol']).type).toEqual('avro_ocf'); + expect(guessSimpleInputFormat(['Obj1lol']).type).toEqual('regex'); }); it('works for JSON (strict)', () => { - expect(guessInputFormat(['{"a":1}'])).toEqual({ type: 'json' }); + expect(guessSimpleInputFormat(['{"a":1}'])).toEqual({ type: 'json' }); }); it('works for JSON (lax)', () => { - expect(guessInputFormat([`{hello:'world'}`])).toEqual({ + expect(guessSimpleInputFormat([`{hello:'world'}`])).toEqual({ type: 'json', featureSpec: { ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER: true, @@ -602,14 +602,14 @@ describe('ingestion-spec', () => { }); it('works for CSV (with header)', () => { - expect(guessInputFormat(['A,B,"X,1",Y'])).toEqual({ + expect(guessSimpleInputFormat(['A,B,"X,1",Y'])).toEqual({ type: 'csv', findColumnsFromHeader: true, }); }); it('works for CSV (no header)', () => { - expect(guessInputFormat(['"A,1","B,2",1,2'])).toEqual({ + expect(guessSimpleInputFormat(['"A,1","B,2",1,2'])).toEqual({ type: 'csv', findColumnsFromHeader: false, columns: ['column1', 'column2', 'column3', 'column4'], @@ -617,14 +617,14 @@ describe('ingestion-spec', () => { }); it('works for TSV (with header)', () => { - expect(guessInputFormat(['A\tB\tX\tY'])).toEqual({ + expect(guessSimpleInputFormat(['A\tB\tX\tY'])).toEqual({ type: 'tsv', findColumnsFromHeader: true, }); }); it('works for TSV (no header)', () => { - expect(guessInputFormat(['A\tB\t1\t2\t3\t4\t5\t6\t7\t8\t9'])).toEqual({ + expect(guessSimpleInputFormat(['A\tB\t1\t2\t3\t4\t5\t6\t7\t8\t9'])).toEqual({ type: 'tsv', findColumnsFromHeader: false, columns: [ @@ -644,7 +644,7 @@ describe('ingestion-spec', () => { }); it('works for TSV with ;', () => { - const inputFormat = guessInputFormat(['A;B;X;Y']); + const inputFormat = guessSimpleInputFormat(['A;B;X;Y']); expect(inputFormat).toEqual({ type: 'tsv', delimiter: ';', @@ -653,7 +653,7 @@ describe('ingestion-spec', () => { }); it('works for TSV with |', () => { - const inputFormat = guessInputFormat(['A|B|X|Y']); + const inputFormat = guessSimpleInputFormat(['A|B|X|Y']); expect(inputFormat).toEqual({ type: 'tsv', delimiter: '|', @@ -662,7 +662,7 @@ describe('ingestion-spec', () => { }); it('works for regex', () => { - expect(guessInputFormat(['A/B/X/Y'])).toEqual({ + expect(guessSimpleInputFormat(['A/B/X/Y'])).toEqual({ type: 'regex', pattern: '([\\s\\S]*)', columns: ['line'], @@ -745,30 +745,19 @@ describe('spec utils', () => { }); }); - describe('guessColumnTypeFromHeaderAndRows', () => { - it('works in empty dataset', () => { - expect(guessColumnTypeFromHeaderAndRows({ header: ['c0'], rows: [] }, 'c0', false)).toEqual( - 'string', - ); - }); - + describe('guessColumnTypeFromSampleResponse', () => { it('works for generic dataset', () => { - expect(guessColumnTypeFromHeaderAndRows(CSV_SAMPLE, 'user', false)).toEqual('string'); - expect(guessColumnTypeFromHeaderAndRows(CSV_SAMPLE, 'followers', false)).toEqual('string'); - expect(guessColumnTypeFromHeaderAndRows(CSV_SAMPLE, 'followers', true)).toEqual('long'); - expect(guessColumnTypeFromHeaderAndRows(CSV_SAMPLE, 'spend', true)).toEqual('double'); - expect(guessColumnTypeFromHeaderAndRows(CSV_SAMPLE, 'nums', false)).toEqual('string'); - expect(guessColumnTypeFromHeaderAndRows(CSV_SAMPLE, 'nums', true)).toEqual('string'); + expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'user', false)).toEqual('string'); + expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'followers', false)).toEqual('string'); + expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'followers', true)).toEqual('long'); + expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'spend', true)).toEqual('double'); + expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'nums', false)).toEqual('string'); + expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'nums', true)).toEqual('string'); }); }); it('updateSchemaWithSample', () => { - const withRollup = updateSchemaWithSample( - ingestionSpec, - { header: ['header'], rows: [] }, - 'specific', - true, - ); + const withRollup = updateSchemaWithSample(ingestionSpec, JSON_SAMPLE, 'specific', true); expect(withRollup).toMatchInlineSnapshot(` Object { @@ -777,7 +766,10 @@ describe('spec utils', () => { "dataSource": "wikipedia", "dimensionsSpec": Object { "dimensions": Array [ - "header", + "user", + "id", + "tags", + "nums", ], }, "granularitySpec": Object { @@ -790,6 +782,16 @@ describe('spec utils', () => { "name": "count", "type": "count", }, + Object { + "fieldName": "followers", + "name": "sum_followers", + "type": "longSum", + }, + Object { + "fieldName": "spend", + "name": "sum_spend", + "type": "doubleSum", + }, ], "timestampSpec": Object { "column": "timestamp", @@ -820,12 +822,7 @@ describe('spec utils', () => { } `); - const noRollup = updateSchemaWithSample( - ingestionSpec, - { header: ['header'], rows: [] }, - 'specific', - false, - ); + const noRollup = updateSchemaWithSample(ingestionSpec, JSON_SAMPLE, 'specific', false); expect(noRollup).toMatchInlineSnapshot(` Object { @@ -834,7 +831,18 @@ describe('spec utils', () => { "dataSource": "wikipedia", "dimensionsSpec": Object { "dimensions": Array [ - "header", + "user", + Object { + "name": "followers", + "type": "long", + }, + Object { + "name": "spend", + "type": "double", + }, + "id", + "tags", + "nums", ], }, "granularitySpec": Object { diff --git a/web-console/src/druid-models/ingestion-spec/ingestion-spec.tsx b/web-console/src/druid-models/ingestion-spec/ingestion-spec.tsx index 9f3eb072b3cd..de3e39e3fde2 100644 --- a/web-console/src/druid-models/ingestion-spec/ingestion-spec.tsx +++ b/web-console/src/druid-models/ingestion-spec/ingestion-spec.tsx @@ -34,12 +34,13 @@ import { EMPTY_ARRAY, EMPTY_OBJECT, filterMap, + findMap, isSimpleArray, oneOf, parseCsvLine, typeIs, } from '../../utils'; -import type { SampleHeaderAndRows } from '../../utils/sampler'; +import type { SampleResponse } from '../../utils/sampler'; import type { DimensionsSpec } from '../dimension-spec/dimension-spec'; import { getDimensionSpecName, @@ -269,6 +270,9 @@ export interface DataSchema { export type DimensionMode = 'specific' | 'auto-detect'; export function getDimensionMode(spec: Partial): DimensionMode { + if (deepGet(spec, 'spec.dataSchema.dimensionsSpec.useSchemaDiscovery') === true) { + return 'auto-detect'; + } const dimensions = deepGet(spec, 'spec.dataSchema.dimensionsSpec.dimensions') || EMPTY_ARRAY; return Array.isArray(dimensions) && dimensions.length === 0 ? 'auto-detect' : 'specific'; } @@ -893,6 +897,7 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F label: 'Bootstrap servers', type: 'string', required: true, + placeholder: 'kafka_broker_host:9092', info: ( <> , ): JSX.Element | undefined { if (isStreamingSpec(spec)) return; - if (sampleData.length) { - const firstData = sampleData[0]; + const firstData: string = findMap(sampleData.data, l => l.input?.raw); + if (firstData) return; - if (firstData === '{') { - return ( - <> - This data looks like regular JSON object. For Druid to parse a text file it must have one - row per event. Maybe look at{' '} - newline delimited JSON instead. - - ); - } + if (firstData === '{') { + return ( + <> + This data looks like regular JSON object. For Druid to parse a text file it must have one + row per event. Maybe look at{' '} + newline delimited JSON instead. + + ); + } - if (oneOf(firstData, '[', '[]')) { - return ( - <> - This data looks like a multi-line JSON array. For Druid to parse a text file it must have - one row per event. Maybe look at{' '} - newline delimited JSON instead. - - ); - } + if (oneOf(firstData, '[', '[]')) { + return ( + <> + This data looks like a multi-line JSON array. For Druid to parse a text file it must have + one row per event. Maybe look at{' '} + newline delimited JSON instead. + + ); } return; @@ -2168,13 +2173,19 @@ export function issueWithSampleData( export function fillInputFormatIfNeeded( spec: Partial, - sampleData: string[], + sampleResponse: SampleResponse, ): Partial { if (deepGet(spec, 'spec.ioConfig.inputFormat.type')) return spec; + return deepSet( spec, 'spec.ioConfig.inputFormat', - guessInputFormat(sampleData, isStreamingSpec(spec)), + getSpecType(spec) === 'kafka' + ? guessKafkaInputFormat(filterMap(sampleResponse.data, l => l.input)) + : guessSimpleInputFormat( + filterMap(sampleResponse.data, l => l.input?.raw), + isStreamingSpec(spec), + ), ); } @@ -2182,8 +2193,23 @@ function noNumbers(xs: string[]): boolean { return xs.every(x => isNaN(Number(x))); } -export function guessInputFormat(sampleData: string[], canBeMultiLineJson = false): InputFormat { - let sampleDatum = sampleData[0]; +export function guessKafkaInputFormat(sampleRaw: Record[]): InputFormat { + const hasHeader = sampleRaw.some(x => Object.keys(x).some(k => k.startsWith('kafka.header.'))); + const keys = filterMap(sampleRaw, x => x['kafka.key']); + const payloads = filterMap(sampleRaw, x => x.raw); + return { + type: 'kafka', + headerFormat: hasHeader ? { type: 'string' } : undefined, + keyFormat: keys.length ? guessSimpleInputFormat(keys, true) : undefined, + valueFormat: guessSimpleInputFormat(payloads, true), + }; +} + +export function guessSimpleInputFormat( + sampleRaw: string[], + canBeMultiLineJson = false, +): InputFormat { + let sampleDatum = sampleRaw[0]; if (sampleDatum) { sampleDatum = String(sampleDatum); // Really ensure it is a string @@ -2319,11 +2345,11 @@ function inputFormatFromType(options: InputFormatFromTypeOptions): InputFormat { // ------------------------ -export function guessIsArrayFromHeaderAndRows( - headerAndRows: SampleHeaderAndRows, +export function guessIsArrayFromSampleResponse( + sampleResponse: SampleResponse, column: string, ): boolean { - return headerAndRows.rows.some(r => isSimpleArray(r.input?.[column])); + return sampleResponse.data.some(r => isSimpleArray(r.input?.[column])); } export function guessColumnTypeFromInput( @@ -2355,13 +2381,13 @@ export function guessColumnTypeFromInput( } } -export function guessColumnTypeFromHeaderAndRows( - headerAndRows: SampleHeaderAndRows, +export function guessColumnTypeFromSampleResponse( + sampleResponse: SampleResponse, column: string, guessNumericStringsAsNumbers: boolean, ): string { return guessColumnTypeFromInput( - filterMap(headerAndRows.rows, r => r.input?.[column]), + filterMap(sampleResponse.data, r => r.input?.[column]), guessNumericStringsAsNumbers, ); } @@ -2391,7 +2417,7 @@ function getTypeHintsFromSpec(spec: Partial): Record, - headerAndRows: SampleHeaderAndRows, + sampleResponse: SampleResponse, dimensionMode: DimensionMode, rollup: boolean, forcePartitionInitialization = false, @@ -2404,26 +2430,25 @@ export function updateSchemaWithSample( let newSpec = spec; if (dimensionMode === 'auto-detect') { - newSpec = deepDelete(newSpec, 'spec.dataSchema.dimensionsSpec.dimensions'); + newSpec = deepSet(newSpec, 'spec.dataSchema.dimensionsSpec.useSchemaDiscovery', true); + newSpec = deepSet(newSpec, 'spec.dataSchema.dimensionsSpec.includeAllDimensions', true); newSpec = deepSet(newSpec, 'spec.dataSchema.dimensionsSpec.dimensionExclusions', []); + newSpec = deepDelete(newSpec, 'spec.dataSchema.dimensionsSpec.dimensions'); } else { + newSpec = deepDelete(newSpec, 'spec.dataSchema.dimensionsSpec.useSchemaDiscovery'); + newSpec = deepDelete(newSpec, 'spec.dataSchema.dimensionsSpec.includeAllDimensions'); newSpec = deepDelete(newSpec, 'spec.dataSchema.dimensionsSpec.dimensionExclusions'); - - const dimensions = getDimensionSpecs( - headerAndRows, - typeHints, - guessNumericStringsAsNumbers, - rollup, + newSpec = deepSet( + newSpec, + 'spec.dataSchema.dimensionsSpec.dimensions', + getDimensionSpecs(sampleResponse, typeHints, guessNumericStringsAsNumbers, rollup), ); - if (dimensions) { - newSpec = deepSet(newSpec, 'spec.dataSchema.dimensionsSpec.dimensions', dimensions); - } } if (rollup) { newSpec = deepSet(newSpec, 'spec.dataSchema.granularitySpec.queryGranularity', 'hour'); - const metrics = getMetricSpecs(headerAndRows, typeHints, guessNumericStringsAsNumbers); + const metrics = getMetricSpecs(sampleResponse, typeHints, guessNumericStringsAsNumbers); if (metrics) { newSpec = deepSet(newSpec, 'spec.dataSchema.metricsSpec', metrics); } diff --git a/web-console/src/druid-models/input-format/input-format.tsx b/web-console/src/druid-models/input-format/input-format.tsx index 679f3bafb0e4..7412c56b6126 100644 --- a/web-console/src/druid-models/input-format/input-format.tsx +++ b/web-console/src/druid-models/input-format/input-format.tsx @@ -22,7 +22,7 @@ import React from 'react'; import type { Field } from '../../components'; import { AutoForm, ExternalLink } from '../../components'; import { getLink } from '../../links'; -import { compact, oneOf, typeIs } from '../../utils'; +import { compact, deepGet, deepSet, oneOf, typeIs } from '../../utils'; import type { FlattenSpec } from '../flatten-spec/flatten-spec'; export interface InputFormat { @@ -39,6 +39,14 @@ export interface InputFormat { readonly keepNullColumns?: boolean; readonly assumeNewlineDelimited?: boolean; readonly useJsonNodeReader?: boolean; + + // type: kafka + readonly timestampColumnName?: string; + readonly headerFormat?: { type: 'string'; encoding?: string }; + readonly headerColumnPrefix?: string; + readonly keyFormat?: InputFormat; + readonly keyColumnName?: string; + readonly valueFormat?: InputFormat; } function generateInputFormatFields(streaming: boolean) { @@ -88,7 +96,7 @@ function generateInputFormatFields(streaming: boolean) { name: 'assumeNewlineDelimited', type: 'boolean', defined: typeIs('json'), - disabled: (inputFormat: InputFormat) => inputFormat.useJsonNodeReader, + disabled: inputFormat => Boolean(inputFormat.useJsonNodeReader), defaultValue: false, info: ( <> @@ -115,10 +123,10 @@ function generateInputFormatFields(streaming: boolean) { streaming ? { name: 'useJsonNodeReader', - title: 'Use JSON node reader', + label: 'Use JSON node reader', type: 'boolean', defined: typeIs('json'), - disabled: (inputFormat: InputFormat) => inputFormat.assumeNewlineDelimited, + disabled: inputFormat => Boolean(inputFormat.assumeNewlineDelimited), defaultValue: false, info: ( <> @@ -224,11 +232,274 @@ function generateInputFormatFields(streaming: boolean) { ] as (Field | undefined)[]); } -export const INPUT_FORMAT_FIELDS: Field[] = generateInputFormatFields(false); +export const BATCH_INPUT_FORMAT_FIELDS: Field[] = generateInputFormatFields(false); export const STREAMING_INPUT_FORMAT_FIELDS: Field[] = generateInputFormatFields(true); +export const KAFKA_METADATA_INPUT_FORMAT_FIELDS: Field[] = [ + { + name: 'timestampColumnName', + label: 'Kafka timestamp column name', + type: 'string', + defaultValue: 'kafka.timestamp', + defined: typeIs('kafka'), + info: `Name of the column for the kafka record's timestamp.`, + }, + + // ----------------------------------------------------- + // keyFormat fields + + { + name: 'keyFormat.type', + label: 'Kafka key input format', + type: 'string', + suggestions: [ + undefined, + 'json', + 'csv', + 'tsv', + 'parquet', + 'orc', + 'avro_ocf', + 'avro_stream', + 'regex', + ], + placeholder: `(don't parse Kafka key)`, + defined: typeIs('kafka'), + info: ( + <> +

The parser used to parse the key of the Kafka message.

+

+ For more information see{' '} + + the documentation + + . +

+ + ), + adjustment: inputFormat => { + const keyFormatType = deepGet(inputFormat, 'keyFormat.type'); + // If the user selects one of these formats then populate the columns (that are in any case meaningless in this context) + // with an initial value. + switch (keyFormatType) { + case 'regex': + inputFormat = deepSet(inputFormat, 'keyFormat.pattern', '([\\s\\S]*)'); + inputFormat = deepSet(inputFormat, 'keyFormat.columns', ['x']); + break; + + case 'csv': + case 'tsv': + inputFormat = deepSet(inputFormat, 'keyFormat.findColumnsFromHeader', false); + inputFormat = deepSet(inputFormat, 'keyFormat.columns', ['x']); + break; + } + return inputFormat; + }, + }, + { + name: 'keyFormat.featureSpec', + label: 'Kafka key JSON parser features', + type: 'json', + defined: inputFormat => deepGet(inputFormat, 'keyFormat.type') === 'json', + hideInMore: true, + info: ( + <> +

+ + JSON parser features + {' '} + supported by Jackson library. Those features will be applied when parsing the input JSON + data. +

+

+ Example:{' '} + {`{ "ALLOW_SINGLE_QUOTES": true, "ALLOW_UNQUOTED_FIELD_NAMES": true }`} +

+ + ), + }, + { + name: 'keyFormat.assumeNewlineDelimited', + label: 'Kafka key assume newline delimited', + type: 'boolean', + defined: inputFormat => deepGet(inputFormat, 'keyFormat.type') === 'json', + disabled: inputFormat => Boolean(inputFormat.useJsonNodeReader), + defaultValue: false, + hideInMore: true, + info: ( + <> +

+ In streaming ingestion, multi-line JSON events can be ingested (i.e. where a single JSON + event spans multiple lines). However, if a parsing exception occurs, all JSON events that + are present in the same streaming record will be discarded. +

+

+ assumeNewlineDelimited and useJsonNodeReader (at most one can be{' '} + true) affect only how parsing exceptions are handled. +

+

+ If the input is known to be newline delimited JSON (each individual JSON event is + contained in a single line, separated by newlines), setting this option to true allows for + more flexible parsing exception handling. Only the lines with invalid JSON syntax will be + discarded, while lines containing valid JSON events will still be ingested. +

+ + ), + }, + { + name: 'keyFormat.useJsonNodeReader', + label: 'Kafka key use JSON node reader', + type: 'boolean', + defined: inputFormat => deepGet(inputFormat, 'keyFormat.type') === 'json', + disabled: inputFormat => Boolean(inputFormat.assumeNewlineDelimited), + defaultValue: false, + hideInMore: true, + info: ( + <> + {' '} +

+ In streaming ingestion, multi-line JSON events can be ingested (i.e. where a single JSON + event spans multiple lines). However, if a parsing exception occurs, all JSON events that + are present in the same streaming record will be discarded. +

+

+ assumeNewlineDelimited and useJsonNodeReader (at most one can be{' '} + true) affect only how parsing exceptions are handled. +

+

+ When ingesting multi-line JSON events, enabling this option will enable the use of a JSON + parser which will retain any valid JSON events encountered within a streaming record prior + to when a parsing exception occurred. +

+ + ), + }, + { + name: 'keyFormat.delimiter', + label: 'Kafka key delimiter', + type: 'string', + defaultValue: '\t', + suggestions: ['\t', ';', '|', '#'], + defined: inputFormat => deepGet(inputFormat, 'keyFormat.type') === 'tsv', + info: <>A custom delimiter for data values., + }, + { + name: 'keyFormat.pattern', + label: 'Kafka key pattern', + type: 'string', + defined: inputFormat => deepGet(inputFormat, 'keyFormat.type') === 'regex', + required: true, + }, + { + name: 'keyFormat.skipHeaderRows', + label: 'Kafka key skip header rows', + type: 'number', + defaultValue: 0, + defined: inputFormat => oneOf(deepGet(inputFormat, 'keyFormat.type'), 'csv', 'tsv'), + min: 0, + info: ( + <> + If this is set, skip the first skipHeaderRows rows from each file. + + ), + }, + { + name: 'keyFormat.findColumnsFromHeader', + label: 'Kafka key find columns from header', + type: 'boolean', + defined: inputFormat => oneOf(deepGet(inputFormat, 'keyFormat.type'), 'csv', 'tsv'), + required: true, + hideInMore: true, + info: ( + <> + If this is set, find the column names from the header row. Note that + skipHeaderRows will be applied before finding column names from the header. For + example, if you set skipHeaderRows to 2 and findColumnsFromHeader{' '} + to true, the task will skip the first two lines and then extract column information from the + third line. + + ), + }, + { + name: 'keyFormat.columns', + label: 'Kafka key columns', + type: 'string-array', + required: true, + defined: inputFormat => + (oneOf(deepGet(inputFormat, 'keyFormat.type'), 'csv', 'tsv') && + deepGet(inputFormat, 'keyFormat.findColumnsFromHeader') === false) || + deepGet(inputFormat, 'keyFormat.type') === 'regex', + hideInMore: true, + info: ( + <> + Only the value of the first column will be read, the name of the column will be ignored so + enter anything here. + + ), + }, + { + name: 'keyFormat.listDelimiter', + label: 'Kafka key list delimiter', + type: 'string', + defaultValue: '\x01', + suggestions: ['\x01', '\x00'], + defined: inputFormat => oneOf(deepGet(inputFormat, 'keyFormat.type'), 'csv', 'tsv', 'regex'), + info: <>A custom delimiter for multi-value dimensions., + }, + { + name: 'keyFormat.binaryAsString', + label: 'Kafka key list binary as string', + type: 'boolean', + defaultValue: false, + defined: inputFormat => + oneOf(deepGet(inputFormat, 'valueFormat.type'), 'parquet', 'orc', 'avro_ocf', 'avro_stream'), + info: ( + <> + Specifies if the binary column which is not logically marked as a string should be treated + as a UTF-8 encoded string. + + ), + }, + + // keyColumnName + { + name: 'keyColumnName', + label: 'Kafka key column name', + type: 'string', + defaultValue: 'kafka.key', + defined: inputFormat => Boolean(deepGet(inputFormat, 'keyFormat.type')), + info: `Custom prefix for all the header columns.`, + }, + + // ----------------------------------------------------- + + { + name: 'headerFormat.type', + label: 'Kafka header format type', + type: 'string', + defined: typeIs('kafka'), + placeholder: `(don't parse Kafka herders)`, + suggestions: [undefined, 'string'], + }, + { + name: 'headerFormat.encoding', + label: 'Kafka header format encoding', + type: 'string', + defaultValue: 'UTF-8', + defined: inputFormat => deepGet(inputFormat, 'headerFormat.type') === 'string', + suggestions: ['UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'US-ASCII', 'ISO-8859-1'], + }, + { + name: 'headerColumnPrefix', + label: 'Kafka header column prefix', + type: 'string', + defaultValue: 'kafka.header.', + defined: inputFormat => deepGet(inputFormat, 'headerFormat.type') === 'string', + info: `Custom prefix for all the header columns.`, + }, +]; export function issueWithInputFormat(inputFormat: InputFormat | undefined): string | undefined { - return AutoForm.issueWithModel(inputFormat, INPUT_FORMAT_FIELDS); + return AutoForm.issueWithModel(inputFormat, BATCH_INPUT_FORMAT_FIELDS); } export const inputFormatCanProduceNestedData: (inputFormat: InputFormat) => boolean = typeIs( diff --git a/web-console/src/druid-models/metric-spec/metric-spec.spec.ts b/web-console/src/druid-models/metric-spec/metric-spec.spec.ts index e30b806344b5..3f6d47272a29 100644 --- a/web-console/src/druid-models/metric-spec/metric-spec.spec.ts +++ b/web-console/src/druid-models/metric-spec/metric-spec.spec.ts @@ -16,16 +16,14 @@ * limitations under the License. */ -import { JSON_SAMPLE } from '../../utils/sampler.mock'; +import { EMPTY_SAMPLE, JSON_SAMPLE } from '../../utils/sampler.mock'; import { getMetricSpecs } from './metric-spec'; describe('metric-spec', () => { describe('getMetricSecs', () => { it('works for empty', () => { - expect(getMetricSpecs({ header: ['header'], rows: [] }, {}, false)).toEqual([ - { name: 'count', type: 'count' }, - ]); + expect(getMetricSpecs(EMPTY_SAMPLE, {}, false)).toEqual([{ name: 'count', type: 'count' }]); }); it('works with json', () => { diff --git a/web-console/src/druid-models/metric-spec/metric-spec.tsx b/web-console/src/druid-models/metric-spec/metric-spec.tsx index 3eb0a24d780c..61acf08e66a3 100644 --- a/web-console/src/druid-models/metric-spec/metric-spec.tsx +++ b/web-console/src/druid-models/metric-spec/metric-spec.tsx @@ -23,8 +23,8 @@ import type { Field } from '../../components'; import { ExternalLink } from '../../components'; import { getLink } from '../../links'; import { filterMap, typeIs } from '../../utils'; -import type { SampleHeaderAndRows } from '../../utils/sampler'; -import { guessColumnTypeFromHeaderAndRows } from '../ingestion-spec/ingestion-spec'; +import type { SampleResponse } from '../../utils/sampler'; +import { guessColumnTypeFromSampleResponse } from '../ingestion-spec/ingestion-spec'; export interface MetricSpec { readonly type: string; @@ -388,16 +388,17 @@ export function getMetricSpecOutputType(metricSpec: MetricSpec): string | undefi } export function getMetricSpecs( - headerAndRows: SampleHeaderAndRows, + sampleResponse: SampleResponse, typeHints: Record, guessNumericStringsAsNumbers: boolean, ): MetricSpec[] { return [{ name: 'count', type: 'count' }].concat( - filterMap(headerAndRows.header, h => { + filterMap(sampleResponse.logicalSegmentSchema, s => { + const h = s.name; if (h === '__time') return; const type = typeHints[h] || - guessColumnTypeFromHeaderAndRows(headerAndRows, h, guessNumericStringsAsNumbers); + guessColumnTypeFromSampleResponse(sampleResponse, h, guessNumericStringsAsNumbers); switch (type) { case 'double': return { name: `sum_${h}`, type: 'doubleSum', fieldName: h }; diff --git a/web-console/src/druid-models/time/time.spec.ts b/web-console/src/druid-models/time/time.spec.ts index 5670640f7070..72dd79d5f58a 100644 --- a/web-console/src/druid-models/time/time.spec.ts +++ b/web-console/src/druid-models/time/time.spec.ts @@ -16,15 +16,31 @@ * limitations under the License. */ -import { timeFormatMatches } from './time'; +import { possibleDruidFormatForValues, timeFormatMatches } from './time'; -describe('timeFormatMatches', () => { - it('works for auto', () => { - expect(timeFormatMatches('auto', '2019-05-22 22:42:51+0000')).toBeTruthy(); +describe('time', () => { + describe('timeFormatMatches', () => { + it('works for auto', () => { + expect(timeFormatMatches('auto', '2019-05-22 22:42:51+0000')).toBeTruthy(); + }); + + it('works for iso', () => { + expect(timeFormatMatches('iso', '2019-05-22T22:42:51+0000')).toBeTruthy(); + expect(timeFormatMatches('iso', '2019-05-22 22:42:51+0000')).toBeFalsy(); + }); }); - it('works for iso', () => { - expect(timeFormatMatches('iso', '2019-05-22T22:42:51+0000')).toBeTruthy(); - expect(timeFormatMatches('iso', '2019-05-22 22:42:51+0000')).toBeFalsy(); + describe('possibleDruidFormatForValues', () => { + it('works in empty case', () => { + expect(possibleDruidFormatForValues([])).toBeUndefined(); + }); + + it('does not react to small numbers', () => { + expect(possibleDruidFormatForValues([12, 234, 3432])).toBeUndefined(); + }); + + it('works for auto', () => { + expect(possibleDruidFormatForValues(['2019-05-22 22:42:51'])).toEqual('auto'); + }); }); }); diff --git a/web-console/src/druid-models/time/time.ts b/web-console/src/druid-models/time/time.ts index 829be85520f6..bfd9e88ee282 100644 --- a/web-console/src/druid-models/time/time.ts +++ b/web-console/src/druid-models/time/time.ts @@ -86,6 +86,7 @@ export function timeFormatMatches(format: string, value: string | number | bigin } export function possibleDruidFormatForValues(values: any[]): string | undefined { + if (!values.length) return; return ALL_FORMAT_VALUES.find(format => { return values.every(value => timeFormatMatches(format, value)); }); diff --git a/web-console/src/entry.ts b/web-console/src/entry.ts index da267c172c13..7f4aed89129d 100644 --- a/web-console/src/entry.ts +++ b/web-console/src/entry.ts @@ -54,9 +54,6 @@ interface ConsoleConfig { // A set of custom headers name/value to set on every AJAX request customHeaders?: Record; - // The URL for where to load the example manifest, a JSON document that tells the console where to find all the example datasets - exampleManifestsUrl?: string; - // The query context to set if the user does not have one saved in local storage, defaults to {} defaultQueryContext?: Record; @@ -104,7 +101,6 @@ QueryRunner.defaultQueryExecutor = (payload, isSql, cancelToken) => { ReactDOM.render( React.createElement(ConsoleApplication, { - exampleManifestsUrl: consoleConfig.exampleManifestsUrl, defaultQueryContext: consoleConfig.defaultQueryContext, mandatoryQueryContext: consoleConfig.mandatoryQueryContext, }), diff --git a/web-console/src/helpers/spec-conversion.ts b/web-console/src/helpers/spec-conversion.ts index 8562bd1f68f1..990147b57ad7 100644 --- a/web-console/src/helpers/spec-conversion.ts +++ b/web-console/src/helpers/spec-conversion.ts @@ -36,7 +36,7 @@ import type { Transform, } from '../druid-models'; import { inflateDimensionSpec, upgradeSpec } from '../druid-models'; -import { deepGet, filterMap, oneOf } from '../utils'; +import { deepGet, filterMap, nonEmptyArray, oneOf } from '../utils'; export function getSpecDatasourceName(spec: IngestionSpec): string { return deepGet(spec, 'spec.dataSchema.dataSource') || 'unknown_datasource'; @@ -86,6 +86,10 @@ export function convertSpecToSql(spec: any): QueryWithContext { const rollup = deepGet(spec, 'spec.dataSchema.granularitySpec.rollup') ?? true; + if (nonEmptyArray(deepGet(spec, 'spec.dataSchema.dimensionsSpec.spatialDimensions'))) { + throw new Error(`spatialDimensions are not currently supported in SQL-based ingestion`); + } + const timestampSpec: TimestampSpec = deepGet(spec, 'spec.dataSchema.timestampSpec'); if (!timestampSpec) throw new Error(`spec.dataSchema.timestampSpec is not defined`); diff --git a/web-console/src/hooks/use-last-defined-deep.ts b/web-console/src/hooks/use-last-defined-deep.ts new file mode 100644 index 000000000000..ee4c24e5c6a0 --- /dev/null +++ b/web-console/src/hooks/use-last-defined-deep.ts @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { useEffect, useRef } from 'react'; + +export function useLastDefinedDeep(cur: T | undefined): T | undefined { + const last = useRef(); + + useEffect(() => { + if (typeof cur === 'undefined' || JSON.stringify(last.current) === JSON.stringify(cur)) return; + last.current = cur; + }, [cur]); + + return typeof cur === 'undefined' ? last.current : cur; +} diff --git a/web-console/src/hooks/use-last-defined.tsx b/web-console/src/hooks/use-last-defined.ts similarity index 100% rename from web-console/src/hooks/use-last-defined.tsx rename to web-console/src/hooks/use-last-defined.ts diff --git a/web-console/src/utils/general.tsx b/web-console/src/utils/general.tsx index 9140452947c9..3b29dceafdcd 100644 --- a/web-console/src/utils/general.tsx +++ b/web-console/src/utils/general.tsx @@ -278,6 +278,13 @@ export function filterMap(xs: readonly T[], f: (x: T, i: number) => Q | un return xs.map(f).filter((x: Q | undefined) => typeof x !== 'undefined') as Q[]; } +export function findMap( + xs: readonly T[], + f: (x: T, i: number) => Q | undefined, +): Q | undefined { + return filterMap(xs, f)[0]; +} + export function compact(xs: (T | undefined | false | null | '')[]): T[] { return xs.filter(Boolean) as T[]; } diff --git a/web-console/src/utils/sampler.mock.ts b/web-console/src/utils/sampler.mock.ts index a95b73a9457c..618c4fb835fc 100644 --- a/web-console/src/utils/sampler.mock.ts +++ b/web-console/src/utils/sampler.mock.ts @@ -17,21 +17,66 @@ */ // Just to make sure we are in a test context. This line will cause trouble if this file is ever compiled into the main build -import type { SampleHeaderAndRows } from './sampler'; +import type { SampleResponse } from './sampler'; -expect(1).toEqual(1); +expect(1).toEqual(1); // Just to make sure this file does not get included in the build by accident + +export const EMPTY_SAMPLE: SampleResponse = { + numRowsRead: 0, + numRowsIndexed: 0, + logicalDimensions: [], + physicalDimensions: [], + logicalSegmentSchema: [{ name: '__time', type: 'LONG' }], + data: [], +}; /* -This data is the returned sample when ingested with: +This data is the returned sample when sampling (from the timestamp stage): {"timestamp":"2016-04-11T09:20:00Z","user":"Alice","followers":10,"spend":0,"id":"12232323","tags":null,"nums":[4]} {"timestamp":"2016-04-11T09:21:00Z","user":"Bob","followers":0,"spend":3,"id":"45345634","tags":["a"],"nums":[5,6]} {"timestamp":"2016-04-11T09:22:00Z","user":"Alice","followers":3,"spend":5.1,"id":"73534533","tags":["a","b"],"nums":[7,8]} */ -export const JSON_SAMPLE: SampleHeaderAndRows = { - header: ['timestamp', 'user', 'followers', 'spend', 'id', 'tags', 'nums'], - rows: [ +export const JSON_SAMPLE: SampleResponse = { + numRowsRead: 3, + numRowsIndexed: 3, + logicalDimensions: [ + { type: 'string', name: 'user', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { + type: 'long', + name: 'followers', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: false, + }, + { type: 'json', name: 'spend', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'string', name: 'id', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'json', name: 'tags', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'json', name: 'nums', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + ], + physicalDimensions: [ + { type: 'auto', name: 'user', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { + type: 'auto', + name: 'followers', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: true, + }, + { type: 'auto', name: 'spend', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'auto', name: 'id', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'auto', name: 'tags', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'auto', name: 'nums', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + ], + logicalSegmentSchema: [ + { name: '__time', type: 'LONG' }, + { name: 'user', type: 'STRING' }, + { name: 'followers', type: 'LONG' }, + { name: 'spend', type: 'COMPLEX' }, + { name: 'id', type: 'STRING' }, + { name: 'tags', type: 'ARRAY' }, + { name: 'nums', type: 'ARRAY' }, + ], + data: [ { input: { timestamp: '2016-04-11T09:20:00Z', @@ -43,14 +88,13 @@ export const JSON_SAMPLE: SampleHeaderAndRows = { nums: [4], }, parsed: { - __time: 0, - timestamp: '2016-04-11T09:20:00Z', + __time: 1460366400000, user: 'Alice', - followers: '10', - spend: '0', + followers: 10, + spend: 0, id: '12232323', tags: null, - nums: '4', + nums: [4], }, }, { @@ -64,14 +108,13 @@ export const JSON_SAMPLE: SampleHeaderAndRows = { nums: [5, 6], }, parsed: { - __time: 0, - timestamp: '2016-04-11T09:21:00Z', + __time: 1460366460000, user: 'Bob', - followers: '0', - spend: '3', + followers: 0, + spend: 3, id: '45345634', - tags: 'a', - nums: ['5', '6'], + tags: ['a'], + nums: [5, 6], }, }, { @@ -85,14 +128,13 @@ export const JSON_SAMPLE: SampleHeaderAndRows = { nums: [7, 8], }, parsed: { - __time: 0, - timestamp: '2016-04-11T09:22:00Z', + __time: 1460366520000, user: 'Alice', - followers: '3', - spend: '5.1', + followers: 3, + spend: 5.1, id: '73534533', tags: ['a', 'b'], - nums: ['7', '8'], + nums: [7, 8], }, }, ], @@ -119,9 +161,45 @@ SELECT FROM test_data */ -export const CSV_SAMPLE: SampleHeaderAndRows = { - header: ['timestamp', 'user', 'followers', 'spend', 'id', 'tags', 'nums'], - rows: [ +export const CSV_SAMPLE: SampleResponse = { + numRowsRead: 3, + numRowsIndexed: 3, + logicalDimensions: [ + { type: 'string', name: 'user', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { + type: 'string', + name: 'followers', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: true, + }, + { type: 'string', name: 'spend', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'string', name: 'id', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'json', name: 'tags', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'json', name: 'nums', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + ], + physicalDimensions: [ + { type: 'auto', name: 'user', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { + type: 'auto', + name: 'followers', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: true, + }, + { type: 'auto', name: 'spend', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'auto', name: 'id', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'auto', name: 'tags', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + { type: 'auto', name: 'nums', multiValueHandling: 'SORTED_ARRAY', createBitmapIndex: true }, + ], + logicalSegmentSchema: [ + { name: '__time', type: 'LONG' }, + { name: 'user', type: 'STRING' }, + { name: 'followers', type: 'STRING' }, + { name: 'spend', type: 'STRING' }, + { name: 'id', type: 'STRING' }, + { name: 'tags', type: 'COMPLEX' }, + { name: 'nums', type: 'COMPLEX' }, + ], + data: [ { input: { timestamp: '2016-04-11T09:20:00.000Z', @@ -133,8 +211,7 @@ export const CSV_SAMPLE: SampleHeaderAndRows = { nums: '4', }, parsed: { - __time: 0, - timestamp: '2016-04-11T09:20:00.000Z', + __time: 1460366400000, user: 'Alice', followers: '10', spend: '0', @@ -154,8 +231,7 @@ export const CSV_SAMPLE: SampleHeaderAndRows = { nums: ['5', '6'], }, parsed: { - __time: 0, - timestamp: '2016-04-11T09:21:00.000Z', + __time: 1460366460000, user: 'Bob', followers: '0', spend: '3', @@ -175,8 +251,7 @@ export const CSV_SAMPLE: SampleHeaderAndRows = { nums: ['7', '8'], }, parsed: { - __time: 0, - timestamp: '2016-04-11T09:22:00.000Z', + __time: 1460366520000, user: 'Alice', followers: '3', spend: '5.1', diff --git a/web-console/src/utils/sampler.spec.ts b/web-console/src/utils/sampler.spec.ts new file mode 100644 index 000000000000..8fdc505e46ec --- /dev/null +++ b/web-console/src/utils/sampler.spec.ts @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import type { SampleResponse } from './sampler'; +import { guessDimensionsFromSampleResponse } from './sampler'; + +describe('sampler', () => { + describe('getInferredDimensionsFromSampleResponse', () => { + const sampleResponse: SampleResponse = { + numRowsRead: 20, + numRowsIndexed: 20, + logicalDimensions: [ + { + type: 'long', + name: 'isRobot', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: false, + }, + { + type: 'string', + name: 'channel', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: true, + }, + { + type: 'string', + name: 'flags', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: true, + }, + { + type: 'long', + name: 'isUnpatrolled', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: false, + }, + ], + physicalDimensions: [ + { + type: 'json', + name: 'isRobot', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: true, + }, + { + type: 'json', + name: 'channel', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: true, + }, + { + type: 'json', + name: 'flags', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: true, + }, + { + type: 'json', + name: 'isUnpatrolled', + multiValueHandling: 'SORTED_ARRAY', + createBitmapIndex: true, + }, + ], + logicalSegmentSchema: [ + { name: '__time', type: 'LONG' }, + { name: 'isRobot', type: 'LONG' }, + { name: 'channel', type: 'STRING' }, + { name: 'flags', type: 'STRING' }, + { name: 'isUnpatrolled', type: 'LONG' }, + ], + data: [ + { + input: { + isRobot: true, + channel: '#sv.wikipedia', + timestamp: '2016-06-27T00:00:11.080Z', + flags: 'NB', + isUnpatrolled: false, + }, + parsed: { + __time: 1466985611080, + isRobot: true, + channel: '#sv.wikipedia', + flags: 'NB', + isUnpatrolled: false, + }, + }, + ], + }; + + it('works', () => { + expect(guessDimensionsFromSampleResponse(sampleResponse)).toMatchInlineSnapshot(` + Array [ + Object { + "name": "isRobot", + "type": "string", + }, + Object { + "createBitmapIndex": true, + "multiValueHandling": "SORTED_ARRAY", + "name": "channel", + "type": "string", + }, + Object { + "createBitmapIndex": true, + "multiValueHandling": "SORTED_ARRAY", + "name": "flags", + "type": "string", + }, + Object { + "name": "isUnpatrolled", + "type": "string", + }, + ] + `); + }); + }); +}); diff --git a/web-console/src/utils/sampler.ts b/web-console/src/utils/sampler.ts index 1860d4fa2023..ed5c5e2cc0fd 100644 --- a/web-console/src/utils/sampler.ts +++ b/web-console/src/utils/sampler.ts @@ -20,6 +20,7 @@ import { dedupe } from 'druid-query-toolkit'; import * as JSONBig from 'json-bigint-native'; import type { + DimensionSpec, DimensionsSpec, IngestionSpec, IngestionType, @@ -32,21 +33,20 @@ import type { } from '../druid-models'; import { getDimensionNamesFromTransforms, + getDimensionSpecName, getSpecType, getTimestampSchema, isDruidSource, PLACEHOLDER_TIMESTAMP_SPEC, REINDEX_TIMESTAMP_SPEC, TIME_COLUMN, - upgradeSpec, } from '../druid-models'; import { Api } from '../singletons'; import { getDruidErrorMessage, queryDruidRune } from './druid-query'; -import { arrangeWithPrefixSuffix, EMPTY_ARRAY, filterMap } from './general'; +import { EMPTY_ARRAY, filterMap } from './general'; import { deepGet, deepSet } from './object-change'; -const SAMPLER_URL = `/druid/indexer/v1/sampler`; const BASE_SAMPLER_CONFIG: SamplerConfig = { numRows: 500, timeoutMs: 15000, @@ -63,6 +63,38 @@ export interface SamplerConfig { export interface SampleResponse { data: SampleEntry[]; + logicalSegmentSchema: { name: string; type: string }[]; + logicalDimensions: DimensionSpec[]; + physicalDimensions: DimensionSpec[]; + numRowsIndexed: number; + numRowsRead: number; +} + +export function getHeaderNamesFromSampleResponse( + sampleResponse: SampleResponse, + ignoreTimeColumn = false, +) { + return filterMap(sampleResponse.logicalSegmentSchema, s => + ignoreTimeColumn && s.name === '__time' ? undefined : s.name, + ); +} + +export function guessDimensionsFromSampleResponse(sampleResponse: SampleResponse): DimensionSpec[] { + const { logicalDimensions, physicalDimensions, data } = sampleResponse; + return logicalDimensions.map(d => { + // Boolean column are currently reported as "long" so let's turn them into "string" + if ( + d.type === 'long' && + physicalDimensions.find(_ => _.name === d.name)?.type === 'json' && + typeof data[0]?.input?.[d.name] === 'boolean' + ) { + return { + name: d.name, + type: 'string', + }; + } + return d; + }); } export type CacheRows = Record[]; @@ -81,17 +113,6 @@ export interface SampleEntry { error?: string; } -export interface SampleHeaderAndRows { - header: string[]; - rows: SampleEntry[]; -} - -export interface ExampleManifest { - name: string; - description: string; - spec: any; -} - export function getCacheRowsFromSampleResponse(sampleResponse: SampleResponse): CacheRows { return filterMap(sampleResponse.data, d => d.input).slice(0, 20); } @@ -126,46 +147,6 @@ export function applyCache(sampleSpec: SampleSpec, cacheRows: CacheRows) { return sampleSpec; } -export interface HeaderFromSampleResponseOptions { - sampleResponse: SampleResponse; - ignoreTimeColumn?: boolean; - columnOrder?: string[]; - suffixColumnOrder?: string[]; - useInput?: boolean; -} - -export function headerFromSampleResponse(options: HeaderFromSampleResponseOptions): string[] { - const { sampleResponse, ignoreTimeColumn, columnOrder, suffixColumnOrder, useInput } = options; - - const key = useInput ? 'input' : 'parsed'; - let columns = arrangeWithPrefixSuffix( - dedupe(sampleResponse.data.flatMap(s => (s[key] ? Object.keys(s[key]!) : []))), - columnOrder || [TIME_COLUMN], - suffixColumnOrder || [], - ); - - if (ignoreTimeColumn) { - columns = columns.filter(c => c !== TIME_COLUMN); - } - - return columns; -} - -export interface HeaderAndRowsFromSampleResponseOptions extends HeaderFromSampleResponseOptions { - parsedOnly?: boolean; -} - -export function headerAndRowsFromSampleResponse( - options: HeaderAndRowsFromSampleResponseOptions, -): SampleHeaderAndRows { - const { sampleResponse, parsedOnly } = options; - - return { - header: headerFromSampleResponse(options), - rows: parsedOnly ? sampleResponse.data.filter(d => d.parsed) : sampleResponse.data, - }; -} - export async function getProxyOverlordModules(): Promise { let statusResp: any; try { @@ -185,7 +166,7 @@ export async function postToSampler( let sampleResp: any; try { - sampleResp = await Api.instance.post(`${SAMPLER_URL}?for=${forStr}`, sampleSpec); + sampleResp = await Api.instance.post(`/druid/indexer/v1/sampler?for=${forStr}`, sampleSpec); } catch (e) { throw new Error(getDruidErrorMessage(e)); } @@ -229,6 +210,23 @@ function fixSamplerTypes(sampleSpec: SampleSpec): SampleSpec { return sampleSpec; } +const WHOLE_ROW_INPUT_FORMAT: InputFormat = { + type: 'regex', + pattern: '([\\s\\S]*)', // Match the entire line, every single character + listDelimiter: '56616469-6de2-9da4-efb8-8f416e6e6965', // Just a UUID to disable the list delimiter, let's hope we do not see this UUID in the data + columns: ['raw'], +}; + +const KAFKA_SAMPLE_INPUT_FORMAT: InputFormat = { + type: 'kafka', + headerFormat: { + type: 'string', + encoding: 'UTF-8', + }, + keyFormat: WHOLE_ROW_INPUT_FORMAT, + valueFormat: WHOLE_ROW_INPUT_FORMAT, +}; + export async function sampleForConnect( spec: Partial, sampleStrategy: SampleStrategy, @@ -242,12 +240,11 @@ export async function sampleForConnect( const reingestMode = isDruidSource(spec); if (!reingestMode) { - ioConfig = deepSet(ioConfig, 'inputFormat', { - type: 'regex', - pattern: '([\\s\\S]*)', // Match the entire line, every single character - listDelimiter: '56616469-6de2-9da4-efb8-8f416e6e6965', // Just a UUID to disable the list delimiter, let's hope we do not see this UUID in the data - columns: ['raw'], - }); + ioConfig = deepSet( + ioConfig, + 'inputFormat', + samplerType === 'kafka' ? KAFKA_SAMPLE_INPUT_FORMAT : WHOLE_ROW_INPUT_FORMAT, + ); } const sampleSpec: SampleSpec = { @@ -332,7 +329,9 @@ export async function sampleForParser( dataSchema: { dataSource: 'sample', timestampSpec: reingestMode ? REINDEX_TIMESTAMP_SPEC : PLACEHOLDER_TIMESTAMP_SPEC, - dimensionsSpec: {}, + dimensionsSpec: { + useSchemaDiscovery: true, + }, granularitySpec: { rollup: false, }, @@ -359,7 +358,9 @@ export async function sampleForTimestamp( ioConfig: deepGet(spec, 'spec.ioConfig'), dataSchema: { dataSource: 'sample', - dimensionsSpec: {}, + dimensionsSpec: { + useSchemaDiscovery: true, + }, timestampSpec: timestampSchema === 'column' ? PLACEHOLDER_TIMESTAMP_SPEC : timestampSpec, granularitySpec: { rollup: false, @@ -380,7 +381,7 @@ export async function sampleForTimestamp( const transforms: Transform[] = deepGet(spec, 'spec.dataSchema.transformSpec.transforms') || EMPTY_ARRAY; - // If we are trying to parts a column then get a bit fancy: + // If we are trying to parse a column then get a bit fancy: // Query the same sample again (same cache key) const sampleSpec: SampleSpec = { type: samplerType, @@ -388,7 +389,9 @@ export async function sampleForTimestamp( ioConfig: deepGet(spec, 'spec.ioConfig'), dataSchema: { dataSource: 'sample', - dimensionsSpec: {}, + dimensionsSpec: { + useSchemaDiscovery: true, + }, timestampSpec, transformSpec: { transforms: transforms.filter(transform => transform.name === TIME_COLUMN), @@ -430,8 +433,8 @@ export async function sampleForTransform( const timestampSpec: TimestampSpec = deepGet(spec, 'spec.dataSchema.timestampSpec'); const transforms: Transform[] = deepGet(spec, 'spec.dataSchema.transformSpec.transforms') || []; - // Extra step to simulate auto detecting dimension with transforms - let specialDimensionSpec: DimensionsSpec = {}; + // Extra step to simulate auto-detecting dimension with transforms + let specialDimensionSpec: DimensionsSpec = { useSchemaDiscovery: true }; if (transforms && transforms.length) { const sampleSpecHack: SampleSpec = { type: samplerType, @@ -440,7 +443,9 @@ export async function sampleForTransform( dataSchema: { dataSource: 'sample', timestampSpec, - dimensionsSpec: {}, + dimensionsSpec: { + useSchemaDiscovery: true, + }, granularitySpec: { rollup: false, }, @@ -458,10 +463,10 @@ export async function sampleForTransform( specialDimensionSpec, 'dimensions', dedupe( - headerFromSampleResponse({ - sampleResponse: sampleResponseHack, - ignoreTimeColumn: true, - }).concat(getDimensionNamesFromTransforms(transforms)), + ( + guessDimensionsFromSampleResponse(sampleResponseHack) as (DimensionSpec | string)[] + ).concat(getDimensionNamesFromTransforms(transforms)), + getDimensionSpecName, ), ); } @@ -497,8 +502,8 @@ export async function sampleForFilter( const transforms: Transform[] = deepGet(spec, 'spec.dataSchema.transformSpec.transforms') || []; const filter: any = deepGet(spec, 'spec.dataSchema.transformSpec.filter'); - // Extra step to simulate auto detecting dimension with transforms - let specialDimensionSpec: DimensionsSpec = {}; + // Extra step to simulate auto-detecting dimension with transforms + let specialDimensionSpec: DimensionsSpec = { useSchemaDiscovery: true }; if (transforms && transforms.length) { const sampleSpecHack: SampleSpec = { type: samplerType, @@ -507,7 +512,9 @@ export async function sampleForFilter( dataSchema: { dataSource: 'sample', timestampSpec, - dimensionsSpec: {}, + dimensionsSpec: { + useSchemaDiscovery: true, + }, granularitySpec: { rollup: false, }, @@ -525,10 +532,9 @@ export async function sampleForFilter( specialDimensionSpec, 'dimensions', dedupe( - headerFromSampleResponse({ - sampleResponse: sampleResponseHack, - ignoreTimeColumn: true, - }).concat(getDimensionNamesFromTransforms(transforms)), + getHeaderNamesFromSampleResponse(sampleResponseHack, true).concat( + getDimensionNamesFromTransforms(transforms), + ), ), ); } @@ -591,55 +597,3 @@ export async function sampleForSchema( return postToSampler(applyCache(sampleSpec, cacheRows), 'schema'); } - -export async function sampleForExampleManifests( - exampleManifestUrl: string, -): Promise { - const exampleSpec: SampleSpec = { - type: 'index_parallel', - spec: { - ioConfig: { - type: 'index_parallel', - inputSource: { type: 'http', uris: [exampleManifestUrl] }, - inputFormat: { type: 'tsv', findColumnsFromHeader: true }, - }, - dataSchema: { - dataSource: 'sample', - timestampSpec: { - column: 'timestamp', - missingValue: '2010-01-01T00:00:00Z', - }, - dimensionsSpec: {}, - }, - }, - samplerConfig: { numRows: 50, timeoutMs: 10000 }, - }; - - const exampleData = await postToSampler(exampleSpec, 'example-manifest'); - - return filterMap(exampleData.data, datum => { - const parsed = datum.parsed; - if (!parsed) return; - let { name, description, spec } = parsed; - try { - spec = JSON.parse(spec); - } catch { - return; - } - - if ( - typeof name === 'string' && - typeof description === 'string' && - spec && - typeof spec === 'object' - ) { - return { - name: parsed.name, - description: parsed.description, - spec: upgradeSpec(spec), - }; - } else { - return; - } - }); -} diff --git a/web-console/src/utils/utils.spec.ts b/web-console/src/utils/utils.spec.ts index 2d5c1909dc3f..87f8b121a66d 100644 --- a/web-console/src/utils/utils.spec.ts +++ b/web-console/src/utils/utils.spec.ts @@ -18,7 +18,7 @@ import type { IngestionSpec } from '../druid-models'; -import { applyCache, headerFromSampleResponse } from './sampler'; +import { applyCache } from './sampler'; describe('utils', () => { const ingestionSpec: IngestionSpec = { @@ -52,20 +52,6 @@ describe('utils', () => { }, }; - // const cacheRows: CacheRows = [{ make: 'Honda', model: 'Civic' }, { make: 'BMW', model: 'M3' }]; - - it('spec-utils headerFromSampleResponse', () => { - expect( - headerFromSampleResponse({ - sampleResponse: { data: [{ input: { a: 1 }, parsed: { a: 1 } }] }, - }), - ).toMatchInlineSnapshot(` - Array [ - "a", - ] - `); - }); - it('spec-utils applyCache', () => { expect( applyCache( diff --git a/web-console/src/views/load-data-view/example-picker/example-picker.spec.tsx b/web-console/src/views/load-data-view/example-picker/example-picker.spec.tsx index 7b98e6952856..e559fc444aa4 100644 --- a/web-console/src/views/load-data-view/example-picker/example-picker.spec.tsx +++ b/web-console/src/views/load-data-view/example-picker/example-picker.spec.tsx @@ -25,9 +25,9 @@ describe('ExamplePicker', () => { it('matches snapshot', () => { const examplePicker = ( {}} /> diff --git a/web-console/src/views/load-data-view/example-picker/example-picker.tsx b/web-console/src/views/load-data-view/example-picker/example-picker.tsx index cf5c9bbe2605..a09043e0f16d 100644 --- a/web-console/src/views/load-data-view/example-picker/example-picker.tsx +++ b/web-console/src/views/load-data-view/example-picker/example-picker.tsx @@ -20,15 +20,15 @@ import { Button, Callout, FormGroup, HTMLSelect, Intent } from '@blueprintjs/cor import { IconNames } from '@blueprintjs/icons'; import React, { useState } from 'react'; -import type { ExampleManifest } from '../../../utils/sampler'; +import type { ExampleSpec } from '../example-specs'; export interface ExamplePickerProps { - exampleManifests: ExampleManifest[]; - onSelectExample: (exampleManifest: ExampleManifest) => void; + exampleSpecs: ExampleSpec[]; + onSelectExample(exampleSpec: ExampleSpec): void; } export const ExamplePicker = React.memo(function ExamplePicker(props: ExamplePickerProps) { - const { exampleManifests, onSelectExample } = props; + const { exampleSpecs, onSelectExample } = props; const [selectedIndex, setSelectedIndex] = useState(0); return ( @@ -39,15 +39,15 @@ export const ExamplePicker = React.memo(function ExamplePicker(props: ExamplePic value={selectedIndex} onChange={e => setSelectedIndex(e.target.value as any)} > - {exampleManifests.map((exampleManifest, i) => ( + {exampleSpecs.map((exampleSpec, i) => ( ))} - {exampleManifests[selectedIndex].description} + {exampleSpecs[selectedIndex].description}