Skip to content

Commit

Permalink
Added support for a 100 GB corpus that models a more realistic time s…
Browse files Browse the repository at this point in the history
…eries. (opensearch-project#242)

Updated mappings and included several improvements.

Signed-off-by: Govind Kamat <govkamat@amazon.com>
  • Loading branch information
gkamat authored Mar 22, 2024
1 parent ee08633 commit 47be332
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 102 deletions.
6 changes: 5 additions & 1 deletion big5/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,11 @@ This workload allows the following parameters to be specified using `--workload-
* `document_file`: If specifying an alternate data corpus, the file name of the corpus.
* `document_uncompressed_size_in_bytes`: If specifying an alternate data corpus, the uncompressed size of the corpus.
* `document_url`: If specifying an alternate data corpus, the full path to the corpus file (optional).
* `distribution_version` (default: 2.11): Used to specify the target cluster's version so as to select the appropriate mappings for that version. This is distinct from the command-line option.
* `error_level` (default: "non-fatal"): Available for bulk operations only to specify ignore-response-error-level.
* `index_body` (default: "index.json"): The name of the file containing the index settings and mappings.
* `index_name` (default: "big5"): The name of the index the workload should create and use for its operations.
* `index_merge_policy` (default: "log_byte_size"): The merge policy for the underlying Lucene segments, either "log_byte_size" or "tiered".
* `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly.
* `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested.
* `max_num_segments` (default: unset): An integer specifying the maximum number of segments the force-merge operation should use.
Expand All @@ -61,7 +63,9 @@ This workload allows the following parameters to be specified using `--workload-
* `query_cache_enabled` (default: false): Whether the query cache should be enabled.
* `requests_cache_enabled` (default: false): Whether the requests cache should be enabled.
* `search_clients` (default: 1): Number of clients that issue search requests.
* `target_throughput` (default: 2): default throughput for each operation in requests per second, `none` for no limit.
* `test_iterations` (default: 200): Number of test iterations per query that will have their latency and throughput measured.
* `target_throughput` (default: 2): Target throughput for each query operation in requests per second; use "" (an empty string) for no limit.
* `warmup_iterations` (default: 100): Number of warmup query iterations prior to actual measurements commencing.


### Data Document Structure
Expand Down
99 changes: 85 additions & 14 deletions big5/index.json
Original file line number Diff line number Diff line change
@@ -1,31 +1,77 @@
{% if distribution_version is defined and distribution_version > 2.11 and distribution_version < 6.0 %}
{% set match_only_text = "match_only_text" %}
{% else %}
{% set match_only_text = "text" %}
{% endif %}

{
"settings": {
"index.number_of_shards": {{number_of_shards | default(1)}},
"index.number_of_replicas": {{number_of_replicas | default(1)}},
"index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}},
"index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}},
{% if distribution_version is not defined or distribution_version < 6.0 %}
"index.merge.policy": "{{index_merge_policy | default('log_byte_size') }}",
{% endif %}
"index.codec": "best_compression",
"index.translog.sync_interval": "30s",
"index.translog.durability": "async",
"index.query.default_field": [ "message" ]
"index.query.default_field": [
"message"
]
},
"mappings": {
"_data_stream_timestamp": {
"enabled": true
},
{% if distribution_version is not defined or distribution_version < 6.0 %}
"dynamic_templates": [
{
"match_ip": {
"match": "ip",
"match_mapping_type": "string",
"mapping": {
"type": "ip"
}
}
},
{
"match_message": {
"match": "message",
"match_mapping_type": "string",
"mapping": {
"type": "{{ match_only_text }}"
}
}
},
{
"strings_as_keyword": {
"match_mapping_type": "string",
"mapping": {
"ignore_above": 1024,
"type": "keyword"
}
}
}
],
{% endif %}
"date_detection": false,
"properties": {
"@timestamp": {
"type": "date"
},
"agent": {
"type": "object",
"properties": {
"name": {
"ephemeral_id": {
"type": "keyword",
"ignore_above": 1024
},
"id": {
"type": "keyword",
"ignore_above": 1024
},
"ephemeral_id": {
"name": {
"type": "keyword",
"ignore_above": 1024
},
Expand All @@ -45,11 +91,11 @@
"cloudwatch": {
"type": "object",
"properties": {
"log_group": {
"ingestion_time": {
"type": "keyword",
"ignore_above": 1024
},
"ingestion_time": {
"log_group": {
"type": "keyword",
"ignore_above": 1024
},
Expand All @@ -70,6 +116,34 @@
}
}
},
"data_stream": {
"properties": {
"dataset": {
{% if constant_keyword_available is defined %}
"type": "constant_keyword",
"value": "benchmarks"
{% else %}
"type": "keyword"
{% endif %}
},
"namespace": {
{% if constant_keyword_available is defined %}
"type": "constant_keyword",
"value": "day1"
{% else %}
"type": "keyword"
{% endif %}
},
"type": {
{% if constant_keyword_available is defined %}
"type": "constant_keyword",
"value": "logs"
{% else %}
"type": "keyword"
{% endif %}
}
}
},
"ecs": {
"type": "object",
"properties": {
Expand All @@ -82,16 +156,16 @@
"event": {
"type": "object",
"properties": {
"ingested": {
"type": "date"
},
"id": {
"dataset": {
"type": "keyword",
"ignore_above": 1024
},
"dataset": {
"id": {
"type": "keyword",
"ignore_above": 1024
},
"ingested": {
"type": "date"
}
}
},
Expand Down Expand Up @@ -122,7 +196,7 @@
}
},
"message": {
"type": "text"
"type": "{{ match_only_text }}"
},
"meta": {
"type": "object",
Expand All @@ -139,9 +213,6 @@
"size": {
"type": "long"
},
"tmax": {
"type": "long"
},
"tmin": {
"type": "long"
}
Expand Down
Loading

0 comments on commit 47be332

Please sign in to comment.