From 00054dd9d7aa9df4a2dcb8883469bb3f98bb965b Mon Sep 17 00:00:00 2001 From: Gao Binlong Date: Mon, 4 Dec 2023 14:01:36 +0800 Subject: [PATCH] Add remove_by_pattern ingest processor Signed-off-by: Gao Binlong --- .../common/IngestCommonModulePlugin.java | 1 + .../common/RemoveByPatternProcessor.java | 186 ++++++++++++++++++ .../RemoveByPatternProcessorFactoryTests.java | 116 +++++++++++ .../common/RemoveByPatternProcessorTests.java | 96 +++++++++ .../rest-api-spec/test/ingest/10_basic.yml | 17 ++ .../310_remove_by_pattern_processor.yml | 120 +++++++++++ 6 files changed, 536 insertions(+) create mode 100644 modules/ingest-common/src/main/java/org/opensearch/ingest/common/RemoveByPatternProcessor.java create mode 100644 modules/ingest-common/src/test/java/org/opensearch/ingest/common/RemoveByPatternProcessorFactoryTests.java create mode 100644 modules/ingest-common/src/test/java/org/opensearch/ingest/common/RemoveByPatternProcessorTests.java create mode 100644 modules/ingest-common/src/yamlRestTest/resources/rest-api-spec/test/ingest/310_remove_by_pattern_processor.yml diff --git a/modules/ingest-common/src/main/java/org/opensearch/ingest/common/IngestCommonModulePlugin.java b/modules/ingest-common/src/main/java/org/opensearch/ingest/common/IngestCommonModulePlugin.java index 7c1b4841122b0..ff6a322ede38f 100644 --- a/modules/ingest-common/src/main/java/org/opensearch/ingest/common/IngestCommonModulePlugin.java +++ b/modules/ingest-common/src/main/java/org/opensearch/ingest/common/IngestCommonModulePlugin.java @@ -107,6 +107,7 @@ public Map getProcessors(Processor.Parameters paramet processors.put(HtmlStripProcessor.TYPE, new HtmlStripProcessor.Factory()); processors.put(CsvProcessor.TYPE, new CsvProcessor.Factory()); processors.put(CopyProcessor.TYPE, new CopyProcessor.Factory(parameters.scriptService)); + processors.put(RemoveByPatternProcessor.TYPE, new RemoveByPatternProcessor.Factory()); return Collections.unmodifiableMap(processors); } diff --git a/modules/ingest-common/src/main/java/org/opensearch/ingest/common/RemoveByPatternProcessor.java b/modules/ingest-common/src/main/java/org/opensearch/ingest/common/RemoveByPatternProcessor.java new file mode 100644 index 0000000000000..89ba738531107 --- /dev/null +++ b/modules/ingest-common/src/main/java/org/opensearch/ingest/common/RemoveByPatternProcessor.java @@ -0,0 +1,186 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.ingest.common; + +import org.opensearch.common.Nullable; +import org.opensearch.common.ValidationException; +import org.opensearch.common.regex.Regex; +import org.opensearch.core.common.Strings; +import org.opensearch.ingest.AbstractProcessor; +import org.opensearch.ingest.ConfigurationUtils; +import org.opensearch.ingest.IngestDocument; +import org.opensearch.ingest.Processor; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.opensearch.ingest.ConfigurationUtils.newConfigurationException; + +/** + * Processor that removes existing fields by field patterns or excluding field patterns. + */ +public final class RemoveByPatternProcessor extends AbstractProcessor { + + public static final String TYPE = "remove_by_pattern"; + private final List fieldPatterns; + private final List excludeFieldPatterns; + + RemoveByPatternProcessor( + String tag, + String description, + @Nullable List fieldPatterns, + @Nullable List excludeFieldPatterns + ) { + super(tag, description); + if (fieldPatterns != null && excludeFieldPatterns != null || fieldPatterns == null && excludeFieldPatterns == null) { + throw new IllegalArgumentException("ether fieldPatterns and excludeFieldPatterns must be set"); + } + if (fieldPatterns == null) { + this.fieldPatterns = null; + this.excludeFieldPatterns = new ArrayList<>(excludeFieldPatterns); + } else { + this.fieldPatterns = new ArrayList<>(fieldPatterns); + this.excludeFieldPatterns = null; + } + } + + public List getFieldPatterns() { + return fieldPatterns; + } + + public List getExcludeFieldPatterns() { + return excludeFieldPatterns; + } + + @Override + public IngestDocument execute(IngestDocument document) { + Set existingFields = new HashSet<>(document.getSourceAndMetadata().keySet()); + Set metadataFields = document.getMetadata() + .keySet() + .stream() + .map(IngestDocument.Metadata::getFieldName) + .collect(Collectors.toSet()); + + if (fieldPatterns != null && !fieldPatterns.isEmpty()) { + existingFields.forEach(field -> { + // ignore metadata fields such as _index, _id, etc. + if (!metadataFields.contains(field)) { + final boolean matched = fieldPatterns.stream().anyMatch(pattern -> Regex.simpleMatch(pattern, field)); + if (matched) { + document.removeField(field); + } + } + }); + } + + if (excludeFieldPatterns != null && !excludeFieldPatterns.isEmpty()) { + existingFields.forEach(field -> { + // ignore metadata fields such as _index, _id, etc. + if (!metadataFields.contains(field)) { + final boolean matched = excludeFieldPatterns.stream().anyMatch(pattern -> Regex.simpleMatch(pattern, field)); + if (!matched) { + document.removeField(field); + } + } + }); + } + + return document; + } + + @Override + public String getType() { + return TYPE; + } + + public static final class Factory implements Processor.Factory { + + public Factory() {} + + @Override + public RemoveByPatternProcessor create( + Map registry, + String processorTag, + String description, + Map config + ) throws Exception { + final List fieldPatterns = new ArrayList<>(); + final List excludeFieldPatterns = new ArrayList<>(); + final Object fieldPattern = ConfigurationUtils.readOptionalObject(config, "field_pattern"); + final Object excludeFieldPattern = ConfigurationUtils.readOptionalObject(config, "exclude_field_pattern"); + + if (fieldPattern == null && excludeFieldPattern == null || fieldPattern != null && excludeFieldPattern != null) { + throw newConfigurationException( + TYPE, + processorTag, + "field_pattern", + "ether field_pattern or exclude_field_pattern must be set" + ); + } + + if (fieldPattern != null) { + if (fieldPattern instanceof List) { + @SuppressWarnings("unchecked") + List fieldPatternList = (List) fieldPattern; + fieldPatterns.addAll(fieldPatternList); + } else { + fieldPatterns.add((String) fieldPattern); + } + validateFieldPatterns(processorTag, fieldPatterns, "field_pattern"); + return new RemoveByPatternProcessor(processorTag, description, fieldPatterns, null); + } else { + if (excludeFieldPattern instanceof List) { + @SuppressWarnings("unchecked") + List excludeFieldPatternList = (List) excludeFieldPattern; + excludeFieldPatterns.addAll(excludeFieldPatternList); + } else { + excludeFieldPatterns.add((String) excludeFieldPattern); + } + validateFieldPatterns(processorTag, excludeFieldPatterns, "exclude_field_pattern"); + return new RemoveByPatternProcessor(processorTag, description, null, excludeFieldPatterns); + } + } + + private void validateFieldPatterns(String processorTag, List patterns, String patternKey) { + List validationErrors = new ArrayList<>(); + for (String fieldPattern : patterns) { + if (fieldPattern.contains(" ")) { + validationErrors.add(patternKey + " [" + fieldPattern + "] must not contain a space"); + } + if (fieldPattern.contains(",")) { + validationErrors.add(patternKey + " [" + fieldPattern + "] must not contain a ','"); + } + if (fieldPattern.contains("#")) { + validationErrors.add(patternKey + " [" + fieldPattern + "] must not contain a '#'"); + } + if (fieldPattern.contains(":")) { + validationErrors.add(patternKey + " [" + fieldPattern + "] must not contain a ':'"); + } + if (fieldPattern.startsWith("_")) { + validationErrors.add(patternKey + " [" + fieldPattern + "] must not start with '_'"); + } + if (Strings.validFileNameExcludingAstrix(fieldPattern) == false) { + validationErrors.add( + patternKey + " [" + fieldPattern + "] must not contain the following characters " + Strings.INVALID_FILENAME_CHARS + ); + } + } + + if (validationErrors.size() > 0) { + ValidationException validationException = new ValidationException(); + validationException.addValidationErrors(validationErrors); + throw newConfigurationException(TYPE, processorTag, patternKey, validationException.getMessage()); + } + } + } +} diff --git a/modules/ingest-common/src/test/java/org/opensearch/ingest/common/RemoveByPatternProcessorFactoryTests.java b/modules/ingest-common/src/test/java/org/opensearch/ingest/common/RemoveByPatternProcessorFactoryTests.java new file mode 100644 index 0000000000000..6d7fa3c82d79a --- /dev/null +++ b/modules/ingest-common/src/test/java/org/opensearch/ingest/common/RemoveByPatternProcessorFactoryTests.java @@ -0,0 +1,116 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.ingest.common; + +import org.opensearch.OpenSearchException; +import org.opensearch.OpenSearchParseException; +import org.opensearch.test.OpenSearchTestCase; +import org.junit.Before; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.hamcrest.CoreMatchers.equalTo; + +public class RemoveByPatternProcessorFactoryTests extends OpenSearchTestCase { + + private RemoveByPatternProcessor.Factory factory; + + @Before + public void init() { + factory = new RemoveByPatternProcessor.Factory(); + } + + public void testCreateFieldPatterns() throws Exception { + Map config = new HashMap<>(); + config.put("field_pattern", "field1*"); + String processorTag = randomAlphaOfLength(10); + RemoveByPatternProcessor removeByPatternProcessor = factory.create(null, processorTag, null, config); + assertThat(removeByPatternProcessor.getTag(), equalTo(processorTag)); + assertThat(removeByPatternProcessor.getFieldPatterns().get(0), equalTo("field1*")); + + Map config2 = new HashMap<>(); + config2.put("field_pattern", List.of("field1*", "field2*")); + removeByPatternProcessor = factory.create(null, processorTag, null, config2); + assertThat(removeByPatternProcessor.getTag(), equalTo(processorTag)); + assertThat(removeByPatternProcessor.getFieldPatterns().get(0), equalTo("field1*")); + assertThat(removeByPatternProcessor.getFieldPatterns().get(1), equalTo("field2*")); + + Map config3 = new HashMap<>(); + List patterns = Arrays.asList("foo*", "*", " ", ",", "#", ":", "_"); + config3.put("field_pattern", patterns); + Exception exception = expectThrows(OpenSearchParseException.class, () -> factory.create(null, processorTag, null, config3)); + assertThat( + exception.getMessage(), + equalTo( + "[field_pattern] Validation Failed: 1: field_pattern [ ] must not contain a space;" + + "2: field_pattern [ ] must not contain the following characters [ , \", *, \\, <, |, ,, >, /, ?];" + + "3: field_pattern [,] must not contain a ',';" + + "4: field_pattern [,] must not contain the following characters [ , \", *, \\, <, |, ,, >, /, ?];" + + "5: field_pattern [#] must not contain a '#';" + + "6: field_pattern [:] must not contain a ':';" + + "7: field_pattern [_] must not start with '_';" + ) + ); + } + + public void testCreateExcludeFieldPatterns() throws Exception { + Map config = new HashMap<>(); + config.put("exclude_field_pattern", "field1*"); + String processorTag = randomAlphaOfLength(10); + RemoveByPatternProcessor removeByPatternProcessor = factory.create(null, processorTag, null, config); + assertThat(removeByPatternProcessor.getTag(), equalTo(processorTag)); + assertThat(removeByPatternProcessor.getExcludeFieldPatterns().get(0), equalTo("field1*")); + + Map config2 = new HashMap<>(); + config2.put("exclude_field_pattern", List.of("field1*", "field2*")); + removeByPatternProcessor = factory.create(null, processorTag, null, config2); + assertThat(removeByPatternProcessor.getTag(), equalTo(processorTag)); + assertThat(removeByPatternProcessor.getExcludeFieldPatterns().get(0), equalTo("field1*")); + assertThat(removeByPatternProcessor.getExcludeFieldPatterns().get(1), equalTo("field2*")); + + Map config3 = new HashMap<>(); + List patterns = Arrays.asList("foo*", "*", " ", ",", "#", ":", "_"); + config3.put("exclude_field_pattern", patterns); + Exception exception = expectThrows(OpenSearchParseException.class, () -> factory.create(null, processorTag, null, config3)); + assertThat( + exception.getMessage(), + equalTo( + "[exclude_field_pattern] Validation Failed: 1: exclude_field_pattern [ ] must not contain a space;" + + "2: exclude_field_pattern [ ] must not contain the following characters [ , \", *, \\, <, |, ,, >, /, ?];" + + "3: exclude_field_pattern [,] must not contain a ',';" + + "4: exclude_field_pattern [,] must not contain the following characters [ , \", *, \\, <, |, ,, >, /, ?];" + + "5: exclude_field_pattern [#] must not contain a '#';" + + "6: exclude_field_pattern [:] must not contain a ':';" + + "7: exclude_field_pattern [_] must not start with '_';" + ) + ); + } + + public void testCreatePatternsFailed() throws Exception { + Map config = new HashMap<>(); + config.put("field_pattern", List.of("foo*")); + config.put("exclude_field_pattern", List.of("bar*")); + String processorTag = randomAlphaOfLength(10); + OpenSearchException exception = expectThrows( + OpenSearchParseException.class, + () -> factory.create(null, processorTag, null, config) + ); + assertThat(exception.getMessage(), equalTo("[field_pattern] ether field_pattern or exclude_field_pattern must be set")); + + Map config2 = new HashMap<>(); + config2.put("field_pattern", null); + config2.put("exclude_field_pattern", null); + + exception = expectThrows(OpenSearchParseException.class, () -> factory.create(null, processorTag, null, config2)); + assertThat(exception.getMessage(), equalTo("[field_pattern] ether field_pattern or exclude_field_pattern must be set")); + } +} diff --git a/modules/ingest-common/src/test/java/org/opensearch/ingest/common/RemoveByPatternProcessorTests.java b/modules/ingest-common/src/test/java/org/opensearch/ingest/common/RemoveByPatternProcessorTests.java new file mode 100644 index 0000000000000..27e7f2932a56e --- /dev/null +++ b/modules/ingest-common/src/test/java/org/opensearch/ingest/common/RemoveByPatternProcessorTests.java @@ -0,0 +1,96 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.ingest.common; + +import org.opensearch.ingest.IngestDocument; +import org.opensearch.ingest.Processor; +import org.opensearch.ingest.RandomDocumentPicks; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.ArrayList; +import java.util.List; + +import static org.hamcrest.Matchers.equalTo; + +public class RemoveByPatternProcessorTests extends OpenSearchTestCase { + + public void testRemoveWithFieldPatterns() throws Exception { + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random()); + ingestDocument.setFieldValue("foo_1", "value"); + ingestDocument.setFieldValue("foo_2", "value"); + ingestDocument.setFieldValue("bar_1", "value"); + ingestDocument.setFieldValue("bar_2", "value"); + List fieldPatterns = new ArrayList<>(); + fieldPatterns.add("foo*"); + fieldPatterns.add("_index*"); + fieldPatterns.add("_id*"); + fieldPatterns.add("_version*"); + Processor processor = new RemoveByPatternProcessor(randomAlphaOfLength(10), null, fieldPatterns, null); + processor.execute(ingestDocument); + assertThat(ingestDocument.hasField("foo_1"), equalTo(false)); + assertThat(ingestDocument.hasField("foo_2"), equalTo(false)); + assertThat(ingestDocument.hasField("bar_1"), equalTo(true)); + assertThat(ingestDocument.hasField("bar_2"), equalTo(true)); + assertThat(ingestDocument.hasField(IngestDocument.Metadata.INDEX.getFieldName()), equalTo(true)); + assertThat(ingestDocument.hasField(IngestDocument.Metadata.ID.getFieldName()), equalTo(true)); + assertThat(ingestDocument.hasField(IngestDocument.Metadata.VERSION.getFieldName()), equalTo(true)); + assertThat(ingestDocument.hasField(IngestDocument.Metadata.VERSION_TYPE.getFieldName()), equalTo(true)); + } + + public void testRemoveWithExcludeFieldPatterns() throws Exception { + IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random()); + ingestDocument.setFieldValue("foo_1", "value"); + ingestDocument.setFieldValue("foo_2", "value"); + ingestDocument.setFieldValue("foo_3", "value"); + List excludeFieldPatterns = new ArrayList<>(); + excludeFieldPatterns.add("foo_3*"); + Processor processorWithExcludeFieldsAndPatterns = new RemoveByPatternProcessor( + randomAlphaOfLength(10), + null, + null, + excludeFieldPatterns + ); + processorWithExcludeFieldsAndPatterns.execute(ingestDocument); + assertThat(ingestDocument.hasField("foo_1"), equalTo(false)); + assertThat(ingestDocument.hasField("foo_2"), equalTo(false)); + assertThat(ingestDocument.hasField("foo_3"), equalTo(true)); + assertThat(ingestDocument.hasField(IngestDocument.Metadata.INDEX.getFieldName()), equalTo(true)); + assertThat(ingestDocument.hasField(IngestDocument.Metadata.ID.getFieldName()), equalTo(true)); + assertThat(ingestDocument.hasField(IngestDocument.Metadata.VERSION.getFieldName()), equalTo(true)); + assertThat(ingestDocument.hasField(IngestDocument.Metadata.VERSION_TYPE.getFieldName()), equalTo(true)); + } + + public void testCreateRemoveByPatternProcessorWithBothFieldsAndExcludeFields() throws Exception { + assertThrows( + "ether fieldPatterns and excludeFieldPatterns must be set", + IllegalArgumentException.class, + () -> new RemoveByPatternProcessor(randomAlphaOfLength(10), null, null, null) + ); + + final List fieldPatterns; + if (randomBoolean()) { + fieldPatterns = new ArrayList<>(); + } else { + fieldPatterns = List.of("foo_1*"); + } + + final List excludeFieldPatterns; + if (randomBoolean()) { + excludeFieldPatterns = new ArrayList<>(); + } else { + excludeFieldPatterns = List.of("foo_2*"); + } + + assertThrows( + "ether fieldPatterns and excludeFieldPatterns must be set", + IllegalArgumentException.class, + () -> new RemoveByPatternProcessor(randomAlphaOfLength(10), null, fieldPatterns, excludeFieldPatterns) + ); + } +} diff --git a/modules/ingest-common/src/yamlRestTest/resources/rest-api-spec/test/ingest/10_basic.yml b/modules/ingest-common/src/yamlRestTest/resources/rest-api-spec/test/ingest/10_basic.yml index 0719082c887f2..6717b3e0ebd99 100644 --- a/modules/ingest-common/src/yamlRestTest/resources/rest-api-spec/test/ingest/10_basic.yml +++ b/modules/ingest-common/src/yamlRestTest/resources/rest-api-spec/test/ingest/10_basic.yml @@ -53,3 +53,20 @@ nodes.info: {} - contains: { nodes.$cluster_manager.ingest.processors: { type: copy } } + +--- +"Remove_by_pattern processor exists": + - skip: + version: " - 2.11.99" + features: contains + reason: "remove_by_pattern processor was introduced in 2.12.0 and contains is a newly added assertion" + - do: + cluster.state: {} + + # Get cluster-manager node id + - set: { cluster_manager_node: cluster_manager } + + - do: + nodes.info: {} + + - contains: { nodes.$cluster_manager.ingest.processors: { type: remove_by_pattern } } diff --git a/modules/ingest-common/src/yamlRestTest/resources/rest-api-spec/test/ingest/310_remove_by_pattern_processor.yml b/modules/ingest-common/src/yamlRestTest/resources/rest-api-spec/test/ingest/310_remove_by_pattern_processor.yml new file mode 100644 index 0000000000000..1f35e98fb4393 --- /dev/null +++ b/modules/ingest-common/src/yamlRestTest/resources/rest-api-spec/test/ingest/310_remove_by_pattern_processor.yml @@ -0,0 +1,120 @@ +--- +teardown: + - do: + ingest.delete_pipeline: + id: "my_pipeline" + ignore: 404 + +--- +"Test creating remove by pattern processor failed": + - skip: + version: " - 2.11.99" + reason: "introduced in 2.12.0" + - do: + catch: /\[field\_pattern\] ether field\_pattern or exclude\_field\_pattern must be set/ + ingest.put_pipeline: + id: "my_pipeline" + body: > + { + "description": "_description", + "processors": [ + { + "remove_by_pattern" : { + "field_pattern" : "foo*", + "exclude_field_pattern" : "bar*" + } + } + ] + } + + - do: + catch: /\[field\_pattern\] ether field\_pattern or exclude\_field\_pattern must be set/ + ingest.put_pipeline: + id: "my_pipeline" + body: > + { + "description": "_description", + "processors": [ + { + "remove_by_pattern" : { + } + } + ] + } + +--- +"Test remove by pattern processor with field_pattern": + - skip: + version: " - 2.11.99" + reason: "introduced in 2.12.0" + - do: + ingest.put_pipeline: + id: "my_pipeline" + body: > + { + "description": "_description", + "processors": [ + { + "remove_by_pattern" : { + "field_pattern" : "foo*" + } + } + ] + } + - match: { acknowledged: true } + + - do: + index: + index: test + id: 1 + pipeline: "my_pipeline" + body: { + foo1: "bar", + foo2: "bar", + zoo: "bar" + } + + - do: + get: + index: test + id: 1 + - match: { _source: {zoo: "bar" }} + +--- +"Test remove by pattern processor with exclude_field_pattern": + - skip: + version: " - 2.11.99" + reason: "introduced in 2.12.0" + - do: + ingest.put_pipeline: + id: "my_pipeline" + body: > + { + "description": "_description", + "processors": [ + { + "remove_by_pattern" : { + "exclude_field_pattern": "foo*" + } + } + ] + } + - match: { acknowledged: true } + + - do: + index: + index: test + id: 1 + pipeline: "my_pipeline" + body: { + foo1: "bar", + foo2: "bar", + bar: "zoo", + zoo: "bar" + } + + - do: + get: + index: test + id: 1 + - match: { _source: { foo1: "bar", foo2: "bar"}}