Added support for a 100 GB corpus that models a more realistic time series. (opensearch-project#242)

Updated mappings and included several improvements. Signed-off-by: Govind Kamat <govkamat@amazon.com>
vpehkone · Mar 22, 2024 · 47be332 · 47be332
1 parent ee08633
commit 47be332
Show file tree

Hide file tree

Showing 4 changed files with 199 additions and 102 deletions.
diff --git a/big5/README.md b/big5/README.md
@@ -50,9 +50,11 @@ This workload allows the following parameters to be specified using `--workload-
 * `document_file`: If specifying an alternate data corpus, the file name of the corpus.
 * `document_uncompressed_size_in_bytes`: If specifying an alternate data corpus, the uncompressed size of the corpus.
 * `document_url`:  If specifying an alternate data corpus, the full path to the corpus file (optional).
+* `distribution_version` (default: 2.11): Used to specify the target cluster's version so as to select the appropriate mappings for that version. This is distinct from the command line option.
 * `error_level` (default: "non-fatal"): Available for bulk operations only to specify ignore-response-error-level.
 * `index_body` (default: "index.json"): The name of the file containing the index settings and mappings.
 * `index_name` (default: "big5"): The name of the index the workload should create and use for its operations.
+* `index_merge_policy` (default: "log_byte_size"): The merge policy for the underlying Lucene segments, either "log_byte_size" or "tiered".
 * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly.
 * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested.
 * `max_num_segments` (default: unset): An integer specifying the maximum number of segments the force-merge operation should use.
@@ -61,7 +63,9 @@ This workload allows the following parameters to be specified using `--workload-
 * `query_cache_enabled` (default: false): Whether the query cache should be enabled.
 * `requests_cache_enabled` (default: false): Whether the requests cache should be enabled.
 * `search_clients` (default: 1): Number of clients that issue search requests.
-* `target_throughput` (default: 2): default throughput for each operation in requests per second, `none` for no limit.
+* `test_iterations` (default: 200): Number of test iterations per query that will have their latency and throughput measured.
+* `target_throughput` (default: 2): Target throughput for each query operation, in requests per second; use "" for no limit.
+* `warmup_iterations` (default: 100): Number of warmup query iterations prior to actual measurements commencing.
 
 
 ### Data Document Structure

diff --git a/big5/index.json b/big5/index.json
@@ -1,31 +1,77 @@
+{% if distribution_version is defined and distribution_version > 2.11 and distribution_version < 6.0 %}
+  {% set match_only_text = "match_only_text" %}
+{% else %}
+  {% set match_only_text = "text" %}
+{% endif %}
+
 {
   "settings": {
     "index.number_of_shards": {{number_of_shards | default(1)}},
     "index.number_of_replicas": {{number_of_replicas | default(1)}},
     "index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}},
     "index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}},
+    {% if distribution_version is not defined or distribution_version < 6.0 %}
+      "index.merge.policy": "{{index_merge_policy | default('log_byte_size') }}",
+    {% endif %}
     "index.codec": "best_compression",
     "index.translog.sync_interval": "30s",
     "index.translog.durability": "async",
-    "index.query.default_field": [ "message" ]
+    "index.query.default_field": [
+      "message"
+    ]
   },
   "mappings": {
+    "_data_stream_timestamp": {
+      "enabled": true
+    },
+    {% if distribution_version is not defined or distribution_version < 6.0 %}
+      "dynamic_templates": [
+	{
+	  "match_ip": {
+	    "match": "ip",
+	    "match_mapping_type": "string",
+	    "mapping": {
+	      "type": "ip"
+	    }
+	  }
+	},
+	{
+	  "match_message": {
+	    "match": "message",
+	    "match_mapping_type": "string",
+	    "mapping": {
+	      "type": "{{ match_only_text }}"
+	    }
+	  }
+	},
+	{
+	  "strings_as_keyword": {
+	    "match_mapping_type": "string",
+	    "mapping": {
+	      "ignore_above": 1024,
+	      "type": "keyword"
+	    }
+	  }
+	}
+      ],
+    {% endif %}
+    "date_detection": false,
     "properties": {
       "@timestamp": {
         "type": "date"
       },
       "agent": {
         "type": "object",
         "properties": {
-          "name": {
+          "ephemeral_id": {
             "type": "keyword",
             "ignore_above": 1024
           },
           "id": {
             "type": "keyword",
             "ignore_above": 1024
           },
-          "ephemeral_id": {
+          "name": {
             "type": "keyword",
             "ignore_above": 1024
           },
@@ -45,11 +91,11 @@
           "cloudwatch": {
             "type": "object",
             "properties": {
-              "log_group": {
+              "ingestion_time": {
                 "type": "keyword",
                 "ignore_above": 1024
               },
-              "ingestion_time": {
+              "log_group": {
                 "type": "keyword",
                 "ignore_above": 1024
               },
@@ -70,6 +116,34 @@
           }
         }
       },
+      "data_stream": {
+        "properties": {
+          "dataset": {
+	    {% if constant_keyword_available is defined  %}
+	      "type": "constant_keyword",
+	      "value": "benchmarks"
+	    {% else %}
+	      "type": "keyword"
+	    {% endif %}
+          },
+          "namespace": {
+	    {% if constant_keyword_available is defined  %}
+	      "type": "constant_keyword",
+	      "value": "day1"
+	    {% else %}
+	      "type": "keyword"
+	    {% endif %}
+          },
+          "type": {
+	    {% if constant_keyword_available is defined  %}
+	      "type": "constant_keyword",
+	      "value": "logs"
+	    {% else %}
+	      "type": "keyword"
+	    {% endif %}
+          }
+        }
+      },
       "ecs": {
         "type": "object",
         "properties": {
@@ -82,16 +156,16 @@
       "event": {
         "type": "object",
         "properties": {
-          "ingested": {
-            "type": "date"
-          },
-          "id": {
+          "dataset": {
             "type": "keyword",
             "ignore_above": 1024
           },
-          "dataset": {
+          "id": {
             "type": "keyword",
             "ignore_above": 1024
+          },
+          "ingested": {
+            "type": "date"
           }
         }
       },
@@ -122,7 +196,7 @@
         }
       },
       "message": {
-        "type": "text"
+        "type": "{{ match_only_text }}"
       },
       "meta": {
         "type": "object",
@@ -139,9 +213,6 @@
           "size": {
             "type": "long"
           },
-          "tmax": {
-            "type": "long"
-          },
           "tmin": {
             "type": "long"
           }