From 01e5659192536110359f54edb3f746fffc846e8d Mon Sep 17 00:00:00 2001 From: Dimitris Athanasiou Date: Tue, 28 Apr 2020 19:30:49 +0300 Subject: [PATCH] [ML] Validate at least one feature is available for DF analytics We were previously checking at least one supported field existed when the _explain API was called. However, in the case of analyses with required fields (e.g. regression) we were not accounting that the dependent variable is not a feature and thus if the source index only contains the dependent variable field there are no features to train a model on. This commit adds a validation that at least one feature is available for analysis. Note that we also move that validation away from `ExtractedFieldsDetector` and the _explain API and straight into the _start API. The reason for doing this is to allow the user to use the _explain API in order to understand why they would be seeing an error like this one. For example, the user might be using an index that has fields but they are of unsupported types. If they start the job and get an error that there are no features, they will wonder why that is. Calling the _explain API will show them that all their fields are unsupported. If the _explain API was failing instead, there would be no way for the user to understand why all those fields are ignored. Closes #55593 --- .../ml/qa/ml-with-security/build.gradle | 3 +- ...ransportStartDataFrameAnalyticsAction.java | 36 +++++++++++++++++- .../extractor/ExtractedFieldsDetector.java | 6 --- .../ExtractedFieldsDetectorTests.java | 35 +++++++++++------ .../test/ml/start_data_frame_analytics.yml | 38 ++++++++++++++++++- 5 files changed, 96 insertions(+), 22 deletions(-) diff --git a/x-pack/plugin/ml/qa/ml-with-security/build.gradle b/x-pack/plugin/ml/qa/ml-with-security/build.gradle index 5eebec9559dde..2de640d77330e 100644 --- a/x-pack/plugin/ml/qa/ml-with-security/build.gradle +++ b/x-pack/plugin/ml/qa/ml-with-security/build.gradle @@ -187,7 +187,8 @@ integTest.runner { 'ml/preview_datafeed/Test preview missing datafeed', 'ml/revert_model_snapshot/Test revert model with invalid snapshotId', 'ml/start_data_frame_analytics/Test start given missing source index', - 'ml/start_data_frame_analytics/Test start given source index has no compatible fields', + 'ml/start_data_frame_analytics/Test start outlier_detection given source index has no fields', + 'ml/start_data_frame_analytics/Test start regression given source index only has dependent variable', 'ml/start_data_frame_analytics/Test start with inconsistent body/param ids', 'ml/start_data_frame_analytics/Test start given dest index is not empty', 'ml/start_data_frame_analytics/Test start with compatible fields but no data', diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDataFrameAnalyticsAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDataFrameAnalyticsAction.java index 03c7ab0ce2862..d33c3ba79d71e 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDataFrameAnalyticsAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDataFrameAnalyticsAction.java @@ -61,6 +61,7 @@ import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig; import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState; import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsTaskState; +import org.elasticsearch.xpack.core.ml.dataframe.analyses.RequiredField; import org.elasticsearch.xpack.core.ml.job.messages.Messages; import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex; import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper; @@ -83,7 +84,9 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Set; import java.util.function.Predicate; +import java.util.stream.Collectors; import static org.elasticsearch.xpack.core.ClientHelper.ML_ORIGIN; import static org.elasticsearch.xpack.core.ClientHelper.executeAsyncWithOrigin; @@ -244,7 +247,7 @@ private void getStartContext(String id, Task task, ActionListener ParentTaskAssigningClient parentTaskClient = new ParentTaskAssigningClient(client, task.getParentTaskId()); // Step 7. Validate that there are analyzable data in the source index ActionListener validateMappingsMergeListener = ActionListener.wrap( - startContext -> validateSourceIndexHasRows(startContext, finalListener), + startContext -> validateSourceIndexHasAnalyzableData(startContext, finalListener), finalListener::onFailure ); @@ -319,6 +322,37 @@ private void getStartContext(String id, Task task, ActionListener configProvider.get(id, getConfigListener); } + private void validateSourceIndexHasAnalyzableData(StartContext startContext, ActionListener listener) { + ActionListener validateAtLeastOneAnalyzedFieldListener = ActionListener.wrap( + aVoid -> validateSourceIndexHasRows(startContext, listener), + listener::onFailure + ); + + validateSourceIndexHasAtLeastOneAnalyzedField(startContext, validateAtLeastOneAnalyzedFieldListener); + } + + private void validateSourceIndexHasAtLeastOneAnalyzedField(StartContext startContext, ActionListener listener) { + Set requiredFields = startContext.config.getAnalysis().getRequiredFields().stream() + .map(RequiredField::getName) + .collect(Collectors.toSet()); + + // We assume here that required fields are not features + long nonRequiredFieldsCount = startContext.extractedFields.getAllFields().stream() + .filter(extractedField -> requiredFields.contains(extractedField.getName()) == false) + .count(); + if (nonRequiredFieldsCount == 0) { + StringBuilder msgBuilder = new StringBuilder("at least one field must be included in the analysis"); + if (requiredFields.isEmpty() == false) { + msgBuilder.append(" (excluding fields ") + .append(requiredFields) + .append(")"); + } + listener.onFailure(ExceptionsHelper.badRequestException(msgBuilder.toString())); + } else { + listener.onResponse(null); + } + } + private void validateSourceIndexHasRows(StartContext startContext, ActionListener listener) { DataFrameDataExtractorFactory extractorFactory = DataFrameDataExtractorFactory.createForSourceIndices(client, "validate_source_index_has_rows-" + startContext.config.getId(), diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java index 2254a3de0a60c..ddaba28b8bc1d 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java @@ -95,12 +95,6 @@ private Set getIncludedFields(Set fieldSelection) { } includeAndExcludeFields(fields, fieldSelection); - if (fields.isEmpty()) { - throw ExceptionsHelper.badRequestException("No compatible fields could be detected in index {}. Supported types are {}.", - Arrays.toString(index), - getSupportedTypes()); - } - return fields; } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java index a7a5784c452f5..cb785ba2d71ad 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java @@ -37,6 +37,7 @@ import static org.hamcrest.Matchers.containsInAnyOrder; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.is; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -91,10 +92,14 @@ public void testDetect_GivenOutlierDetectionAndNonNumericField() { ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector( SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap()); - ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); + Tuple> fieldExtraction = extractedFieldsDetector.detect(); - assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]." + - " Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short].")); + assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true)); + assertThat(fieldExtraction.v2().size(), equalTo(1)); + assertThat(fieldExtraction.v2().get(0).getName(), equalTo("some_keyword")); + assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false)); + assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " + + "[boolean, byte, double, float, half_float, integer, long, scaled_float, short]")); } public void testDetect_GivenOutlierDetectionAndFieldWithNumericAndNonNumericTypes() { @@ -103,10 +108,14 @@ public void testDetect_GivenOutlierDetectionAndFieldWithNumericAndNonNumericType ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector( SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap()); - ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); + Tuple> fieldExtraction = extractedFieldsDetector.detect(); - assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]. " + - "Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short].")); + assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true)); + assertThat(fieldExtraction.v2().size(), equalTo(1)); + assertThat(fieldExtraction.v2().get(0).getName(), equalTo("indecisive_field")); + assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false)); + assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " + + "[boolean, byte, double, float, half_float, integer, long, scaled_float, short]")); } public void testDetect_GivenOutlierDetectionAndMultipleFields() { @@ -306,10 +315,10 @@ public void testDetect_GivenIgnoredField() { ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector( SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap()); - ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); + Tuple> fieldExtraction = extractedFieldsDetector.detect(); - assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]. " + - "Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short].")); + assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true)); + assertThat(fieldExtraction.v2().isEmpty(), is(true)); } public void testDetect_GivenIncludedIgnoredField() { @@ -410,9 +419,11 @@ public void testDetect_GivenExcludeAllValidFields() { ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector( SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap()); - ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect); - assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]. " + - "Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short].")); + Tuple> fieldExtraction = extractedFieldsDetector.detect(); + + assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true)); + assertThat(fieldExtraction.v2().size(), equalTo(2)); + assertThat(fieldExtraction.v2().stream().filter(FieldSelection::isIncluded).findAny().isPresent(), is(false)); } public void testDetect_GivenInclusionsAndExclusions() { diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/start_data_frame_analytics.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/start_data_frame_analytics.yml index ab6cfac9515e5..7a9bf78f8492f 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/start_data_frame_analytics.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/start_data_frame_analytics.yml @@ -36,7 +36,7 @@ id: "missing_index" --- -"Test start given source index has no compatible fields": +"Test start outlier_detection given source index has no fields": - do: indices.create: @@ -57,7 +57,41 @@ } - do: - catch: /No compatible fields could be detected in index \[empty-index\]/ + catch: /at least one field must be included in the analysis/ + ml.start_data_frame_analytics: + id: "foo" + +--- +"Test start regression given source index only has dependent variable": + + - do: + indices.create: + index: index-with-dep-var-only + body: + mappings: + properties: + my_dep_var: { type: "long" } + + - do: + ml.put_data_frame_analytics: + id: "foo" + body: > + { + "source": { + "index": "index-with-dep-var-only" + }, + "dest": { + "index": "results" + }, + "analysis": { + "regression":{ + "dependent_variable": "my_dep_var" + } + } + } + + - do: + catch: /at least one field must be included in the analysis \(excluding fields \[my_dep_var\]\)/ ml.start_data_frame_analytics: id: "foo"