From eae6bf5d7f125db5a1db1d10d11247caa427b411 Mon Sep 17 00:00:00 2001 From: Jonathan Buttner Date: Mon, 28 Sep 2020 17:59:23 -0400 Subject: [PATCH 1/7] Deduping multi_fields lists --- scripts/schema/loader.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/scripts/schema/loader.py b/scripts/schema/loader.py index 16895babbe..a7d134fd01 100644 --- a/scripts/schema/loader.py +++ b/scripts/schema/loader.py @@ -170,6 +170,22 @@ def nest_fields(field_array): nested_schema[leaf_field]['field_details'] = field return schema_root +def array_of_dicts_to_set(array_vals): + ret_set = set() + for dict_val in array_vals: + ret_set.add(frozenset(dict_val.items())) + return ret_set + +def set_of_sets_to_array(set_vals): + ret_list = [] + for set_info in set_vals: + ret_list.append(dict(set_info)) + return ret_list + +def dedup_and_merge_lists(list_a, list_b): + list_a_set = array_of_dicts_to_set(list_a) + list_b_set = array_of_dicts_to_set(list_b) + return set_of_sets_to_array(list_a_set | list_b_set) def merge_fields(a, b): """Merge ECS field sets with custom field sets.""" @@ -184,6 +200,14 @@ def merge_fields(a, b): a[key].setdefault('field_details', {}) a[key]['field_details'].setdefault('normalize', []) a[key]['field_details']['normalize'].extend(b[key]['field_details'].pop('normalize')) + if 'multi_fields' in b[key]['field_details']: + a[key].setdefault('field_details', {}) + a[key]['field_details'].setdefault('multi_fields', set()) + a[key]['field_details']['multi_fields'] = dedup_and_merge_lists( + a[key]['field_details']['multi_fields'], b[key]['field_details']['multi_fields']) + # if we don't do this then the update call below will overwrite a's field_details, will the original + # contents of b, which undoes our merging the multi_fields + del b[key]['field_details']['multi_fields'] a[key]['field_details'].update(b[key]['field_details']) # merge schema details if 'schema_details' in b[key]: From 989865133b8bdfad93446103ec84cd8b0e5d936e Mon Sep 17 00:00:00 2001 From: Jonathan Buttner Date: Tue, 29 Sep 2020 09:30:08 -0400 Subject: [PATCH 2/7] Adding tests --- scripts/schema/loader.py | 6 +- scripts/tests/unit/test_schema_loader.py | 90 ++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) diff --git a/scripts/schema/loader.py b/scripts/schema/loader.py index a7d134fd01..70f8e3b8da 100644 --- a/scripts/schema/loader.py +++ b/scripts/schema/loader.py @@ -170,23 +170,27 @@ def nest_fields(field_array): nested_schema[leaf_field]['field_details'] = field return schema_root + def array_of_dicts_to_set(array_vals): ret_set = set() for dict_val in array_vals: ret_set.add(frozenset(dict_val.items())) return ret_set + def set_of_sets_to_array(set_vals): ret_list = [] for set_info in set_vals: ret_list.append(dict(set_info)) - return ret_list + return sorted(ret_list, key=lambda k: k['name']) + def dedup_and_merge_lists(list_a, list_b): list_a_set = array_of_dicts_to_set(list_a) list_b_set = array_of_dicts_to_set(list_b) return set_of_sets_to_array(list_a_set | list_b_set) + def merge_fields(a, b): """Merge ECS field sets with custom field sets.""" a = copy.deepcopy(a) diff --git a/scripts/tests/unit/test_schema_loader.py b/scripts/tests/unit/test_schema_loader.py index edd585c011..5d75f59115 100644 --- a/scripts/tests/unit/test_schema_loader.py +++ b/scripts/tests/unit/test_schema_loader.py @@ -594,6 +594,96 @@ def test_merge_non_array_attributes(self): } self.assertEqual(merged_fields, expected_fields) + def test_merge_multi_fields(self): + schema1 = { + 'base': { + 'field_details': { + 'multi_fields': [ + { + 'type': 'text', + 'name': 'text' + }, + { + 'type': 'keyword', + 'name': 'caseless', + 'normalizer': 'lowercase' + } + ] + }, + 'fields': { + 'message': { + 'field_details': { + 'multi_fields': [ + { + 'type': 'text', + 'name': 'text' + } + ] + } + } + } + } + } + + schema2 = { + 'base': { + 'field_details': { + 'multi_fields': [ + { + 'type': 'text', + 'name': 'text' + }, + { + 'type': 'text', + 'name': 'almost_text', + } + ] + }, + 'fields': { + 'message': { + 'field_details': { + 'multi_fields': [ + { + 'type': 'keyword', + 'name': 'a_field' + } + ] + } + } + } + } + } + merged_fields = loader.merge_fields(schema1, schema2) + expected_multi_fields = [ + { + 'type': 'text', + 'name': 'almost_text' + }, + { + 'type': 'keyword', + 'name': 'caseless', + 'normalizer': 'lowercase' + }, + { + 'type': 'text', + 'name': 'text' + } + ] + + expected_message_multi_fields = [ + { + 'type': 'keyword', + 'name': 'a_field' + }, + { + 'type': 'text', + 'name': 'text' + } + ] + self.assertEqual(merged_fields['base']['field_details']['multi_fields'], expected_multi_fields) + self.assertEqual(merged_fields['base']['fields']['message']['field_details'] + ['multi_fields'], expected_message_multi_fields) + if __name__ == '__main__': unittest.main() From f0746ae0d63911e7be3b7676183c24f53edeef93 Mon Sep 17 00:00:00 2001 From: Jonathan Buttner Date: Tue, 29 Sep 2020 09:48:15 -0400 Subject: [PATCH 3/7] Updating changelog --- CHANGELOG.next.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.next.md b/CHANGELOG.next.md index 406fce958c..6d2ad00964 100644 --- a/CHANGELOG.next.md +++ b/CHANGELOG.next.md @@ -41,6 +41,7 @@ Thanks, you're awesome :-) --> * Introduced `--strict` flag to perform stricter schema validation when running the generator script. #937 * Added check under `--strict` that ensures composite types in example fields are quoted. #966 * Added `ignore_above` and `normalizer` support for keyword multi-fields. #971 +* Added functionality for merging custom and core multi-fields. #982 #### Improvements From d1953cf3a3f4c68cb1ca2ef7f2cc1ce3f029e46e Mon Sep 17 00:00:00 2001 From: Jonathan Buttner Date: Tue, 29 Sep 2020 09:49:08 -0400 Subject: [PATCH 4/7] Fixing typo --- scripts/schema/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/schema/loader.py b/scripts/schema/loader.py index 70f8e3b8da..274957887a 100644 --- a/scripts/schema/loader.py +++ b/scripts/schema/loader.py @@ -209,7 +209,7 @@ def merge_fields(a, b): a[key]['field_details'].setdefault('multi_fields', set()) a[key]['field_details']['multi_fields'] = dedup_and_merge_lists( a[key]['field_details']['multi_fields'], b[key]['field_details']['multi_fields']) - # if we don't do this then the update call below will overwrite a's field_details, will the original + # if we don't do this then the update call below will overwrite a's field_details, with the original # contents of b, which undoes our merging the multi_fields del b[key]['field_details']['multi_fields'] a[key]['field_details'].update(b[key]['field_details']) From 2ca558488f6c9cc8cf202a7a2fa9106c2b3e819d Mon Sep 17 00:00:00 2001 From: Jonathan Buttner Date: Mon, 12 Oct 2020 09:33:07 -0400 Subject: [PATCH 5/7] Addressing feedback about default value --- scripts/schema/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/schema/loader.py b/scripts/schema/loader.py index 274957887a..fd10361396 100644 --- a/scripts/schema/loader.py +++ b/scripts/schema/loader.py @@ -206,7 +206,7 @@ def merge_fields(a, b): a[key]['field_details']['normalize'].extend(b[key]['field_details'].pop('normalize')) if 'multi_fields' in b[key]['field_details']: a[key].setdefault('field_details', {}) - a[key]['field_details'].setdefault('multi_fields', set()) + a[key]['field_details'].setdefault('multi_fields', []) a[key]['field_details']['multi_fields'] = dedup_and_merge_lists( a[key]['field_details']['multi_fields'], b[key]['field_details']['multi_fields']) # if we don't do this then the update call below will overwrite a's field_details, with the original From cb0e5b93985474d53b70af1beca9d6aef283eac0 Mon Sep 17 00:00:00 2001 From: Jonathan Buttner Date: Mon, 4 Jan 2021 15:02:53 -0500 Subject: [PATCH 6/7] Deduping on the name field, allowing the included schema to overwrite the core --- scripts/schema/loader.py | 24 ++++---- scripts/tests/unit/test_schema_loader.py | 71 ++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 11 deletions(-) diff --git a/scripts/schema/loader.py b/scripts/schema/loader.py index 817a19be44..04f3218ae4 100644 --- a/scripts/schema/loader.py +++ b/scripts/schema/loader.py @@ -186,24 +186,26 @@ def nest_fields(field_array): return schema_root -def array_of_dicts_to_set(array_vals): - ret_set = set() - for dict_val in array_vals: - ret_set.add(frozenset(dict_val.items())) - return ret_set +def array_of_maps_to_map(array_vals): + ret_map = {} + for map_val in array_vals: + name = map_val['name'] + # if multiple name fields exist in the same custom definition this will take the last one + ret_map[name] = map_val + return ret_map -def set_of_sets_to_array(set_vals): +def map_of_maps_to_array(map_vals): ret_list = [] - for set_info in set_vals: - ret_list.append(dict(set_info)) + for key in map_vals: + ret_list.append(map_vals[key]) return sorted(ret_list, key=lambda k: k['name']) def dedup_and_merge_lists(list_a, list_b): - list_a_set = array_of_dicts_to_set(list_a) - list_b_set = array_of_dicts_to_set(list_b) - return set_of_sets_to_array(list_a_set | list_b_set) + list_a_map = array_of_maps_to_map(list_a) + list_a_map.update(array_of_maps_to_map(list_b)) + return map_of_maps_to_array(list_a_map) def merge_fields(a, b): diff --git a/scripts/tests/unit/test_schema_loader.py b/scripts/tests/unit/test_schema_loader.py index 54d583408e..a43a3e33e0 100644 --- a/scripts/tests/unit/test_schema_loader.py +++ b/scripts/tests/unit/test_schema_loader.py @@ -736,6 +736,77 @@ def test_merge_multi_fields(self): self.assertEqual(merged_fields['base']['fields']['message']['field_details'] ['multi_fields'], expected_message_multi_fields) + def test_overwrite_multi_fields(self): + schema1 = { + 'base': { + 'field_details': { + 'multi_fields': [ + { + 'type': 'text', + 'name': 'text' + } + ] + }, + 'fields': { + 'message': { + 'field_details': { + 'multi_fields': [ + { + 'type': 'text', + 'name': 'text' + } + ] + } + } + } + } + } + + # this schema should overwrite thee base and message fields + schema2 = { + 'base': { + 'field_details': { + 'multi_fields': [ + { + 'type': 'text', + 'name': 'text', + 'normalizer': 'lowercase', + } + ] + }, + 'fields': { + 'message': { + 'field_details': { + 'multi_fields': [ + { + 'type': 'keyword', + 'name': 'text' + } + ] + } + } + } + } + } + merged_fields = loader.merge_fields(schema1, schema2) + expected_base_multi_fields = [ + { + 'type': 'text', + 'name': 'text', + 'normalizer': 'lowercase' + } + ] + + expected_message_multi_fields = [ + { + 'type': 'keyword', + 'name': 'text' + } + ] + self.assertEqual(merged_fields['base']['field_details']['multi_fields'], expected_base_multi_fields) + self.assertEqual(merged_fields['base']['fields']['message']['field_details'] + ['multi_fields'], expected_message_multi_fields) + if __name__ == '__main__': unittest.main() From 6eb01c0a704da68ab0a6ee4ad374a7e941e7cad3 Mon Sep 17 00:00:00 2001 From: Jonathan Buttner Date: Wed, 6 Jan 2021 09:29:57 -0500 Subject: [PATCH 7/7] Addressing feedback for tests --- scripts/tests/unit/test_schema_loader.py | 116 +++-------------------- 1 file changed, 15 insertions(+), 101 deletions(-) diff --git a/scripts/tests/unit/test_schema_loader.py b/scripts/tests/unit/test_schema_loader.py index a43a3e33e0..fde33e0a1c 100644 --- a/scripts/tests/unit/test_schema_loader.py +++ b/scripts/tests/unit/test_schema_loader.py @@ -646,19 +646,15 @@ def test_merge_non_array_attributes(self): } self.assertEqual(merged_fields, expected_fields) - def test_merge_multi_fields(self): - schema1 = { - 'base': { + def test_merge_and_overwrite_multi_fields(self): + originalSchema = { + 'overwrite_field': { 'field_details': { 'multi_fields': [ { 'type': 'text', - 'name': 'text' - }, - { - 'type': 'keyword', - 'name': 'caseless', - 'normalizer': 'lowercase' + 'name': 'text', + 'norms': True } ] }, @@ -677,17 +673,14 @@ def test_merge_multi_fields(self): } } - schema2 = { - 'base': { + customSchema = { + 'overwrite_field': { 'field_details': { 'multi_fields': [ + # this entry will completely overwrite the originalSchema's name text entry { 'type': 'text', 'name': 'text' - }, - { - 'type': 'text', - 'name': 'almost_text', } ] }, @@ -695,6 +688,7 @@ def test_merge_multi_fields(self): 'message': { 'field_details': { 'multi_fields': [ + # this entry will be merged with the originalSchema's multi_fields entries { 'type': 'keyword', 'name': 'a_field' @@ -705,24 +699,15 @@ def test_merge_multi_fields(self): } } } - merged_fields = loader.merge_fields(schema1, schema2) - expected_multi_fields = [ - { - 'type': 'text', - 'name': 'almost_text' - }, - { - 'type': 'keyword', - 'name': 'caseless', - 'normalizer': 'lowercase' - }, + merged_fields = loader.merge_fields(originalSchema, customSchema) + expected_overwrite_field_mf = [ { 'type': 'text', 'name': 'text' } ] - expected_message_multi_fields = [ + expected_message_mf = [ { 'type': 'keyword', 'name': 'a_field' @@ -732,80 +717,9 @@ def test_merge_multi_fields(self): 'name': 'text' } ] - self.assertEqual(merged_fields['base']['field_details']['multi_fields'], expected_multi_fields) - self.assertEqual(merged_fields['base']['fields']['message']['field_details'] - ['multi_fields'], expected_message_multi_fields) - - def test_overwrite_multi_fields(self): - schema1 = { - 'base': { - 'field_details': { - 'multi_fields': [ - { - 'type': 'text', - 'name': 'text' - } - ] - }, - 'fields': { - 'message': { - 'field_details': { - 'multi_fields': [ - { - 'type': 'text', - 'name': 'text' - } - ] - } - } - } - } - } - - # this schema should overwrite thee base and message fields - schema2 = { - 'base': { - 'field_details': { - 'multi_fields': [ - { - 'type': 'text', - 'name': 'text', - 'normalizer': 'lowercase', - } - ] - }, - 'fields': { - 'message': { - 'field_details': { - 'multi_fields': [ - { - 'type': 'keyword', - 'name': 'text' - } - ] - } - } - } - } - } - merged_fields = loader.merge_fields(schema1, schema2) - expected_base_multi_fields = [ - { - 'type': 'text', - 'name': 'text', - 'normalizer': 'lowercase' - } - ] - - expected_message_multi_fields = [ - { - 'type': 'keyword', - 'name': 'text' - } - ] - self.assertEqual(merged_fields['base']['field_details']['multi_fields'], expected_base_multi_fields) - self.assertEqual(merged_fields['base']['fields']['message']['field_details'] - ['multi_fields'], expected_message_multi_fields) + self.assertEqual(merged_fields['overwrite_field']['field_details']['multi_fields'], expected_overwrite_field_mf) + self.assertEqual(merged_fields['overwrite_field']['fields']['message']['field_details'] + ['multi_fields'], expected_message_mf) if __name__ == '__main__':