Investigate event log write performance / stress testing #65746
Pinging @elastic/kibana-alerting-services (Team:Alerting Services)
Running my usual "100 alerts firing every second with 4 active instances" test. From APM, you can see 5 index requests when the alert runs: one for 'active' and 4 for 'execute-action'. There will be one more a bit later, not tracked under APM, for writing the 'execute' event. Note the way APM groups these: they're duplicated, with an Elasticsearch: entry followed by a slightly indented POST entry (both the ES and HTTP requests are instrumented, so you always see this pairing with any ES call). Here are the event log index stats after running for a few minutes:

```json
{
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"stats": {
"uuid": "72C7l3bsS_SZaUHMoOfJuQ",
"primaries": {
"docs": {
"count": 5383,
"deleted": 0
},
"store": {
"size_in_bytes": 677943,
"reserved_in_bytes": 0
},
"indexing": {
"index_total": 7269,
"index_time_in_millis": 3992,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 0
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 39,
"query_time_in_millis": 61,
"query_current": 0,
"fetch_total": 37,
"fetch_time_in_millis": 0,
"fetch_current": 0,
"scroll_total": 0,
"scroll_time_in_millis": 0,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 23,
"total_time_in_millis": 472,
"total_docs": 15181,
"total_size_in_bytes": 3575731,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 0,
"total_auto_throttle_in_bytes": 20971520
},
"refresh": {
"total": 77,
"total_time_in_millis": 4097,
"external_total": 77,
"external_total_time_in_millis": 4283,
"listeners": 0
},
"flush": {
"total": 0,
"periodic": 0,
"total_time_in_millis": 0
},
"warmer": {
"current": 0,
"total": 76,
"total_time_in_millis": 14
},
"query_cache": {
"memory_size_in_bytes": 0,
"total_count": 0,
"hit_count": 0,
"miss_count": 0,
"cache_size": 0,
"cache_count": 0,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 4,
"memory_in_bytes": 18304,
"terms_memory_in_bytes": 11456,
"stored_fields_memory_in_bytes": 2016,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 0,
"points_memory_in_bytes": 0,
"doc_values_memory_in_bytes": 4832,
"index_writer_memory_in_bytes": 5854796,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 856,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {}
},
"translog": {
"operations": 7269,
"size_in_bytes": 4219885,
"uncommitted_operations": 7269,
"uncommitted_size_in_bytes": 4219885,
"earliest_last_modified_age": 0
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 7,
"miss_count": 32
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
},
"bulk": {
"total_operations": 7269,
"total_time_in_millis": 4037,
"total_size_in_bytes": 3740076,
"avg_time_in_millis": 0,
"avg_size_in_bytes": 500
}
},
"total": {
"docs": {
"count": 5383,
"deleted": 0
},
"store": {
"size_in_bytes": 677943,
"reserved_in_bytes": 0
},
"indexing": {
"index_total": 7269,
"index_time_in_millis": 3992,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 0
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 39,
"query_time_in_millis": 61,
"query_current": 0,
"fetch_total": 37,
"fetch_time_in_millis": 0,
"fetch_current": 0,
"scroll_total": 0,
"scroll_time_in_millis": 0,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 23,
"total_time_in_millis": 472,
"total_docs": 15181,
"total_size_in_bytes": 3575731,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 0,
"total_auto_throttle_in_bytes": 20971520
},
"refresh": {
"total": 77,
"total_time_in_millis": 4097,
"external_total": 77,
"external_total_time_in_millis": 4283,
"listeners": 0
},
"flush": {
"total": 0,
"periodic": 0,
"total_time_in_millis": 0
},
"warmer": {
"current": 0,
"total": 76,
"total_time_in_millis": 14
},
"query_cache": {
"memory_size_in_bytes": 0,
"total_count": 0,
"hit_count": 0,
"miss_count": 0,
"cache_size": 0,
"cache_count": 0,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 4,
"memory_in_bytes": 18304,
"terms_memory_in_bytes": 11456,
"stored_fields_memory_in_bytes": 2016,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 0,
"points_memory_in_bytes": 0,
"doc_values_memory_in_bytes": 4832,
"index_writer_memory_in_bytes": 5854796,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 856,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {}
},
"translog": {
"operations": 7269,
"size_in_bytes": 4219885,
"uncommitted_operations": 7269,
"uncommitted_size_in_bytes": 4219885,
"earliest_last_modified_age": 0
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 7,
"miss_count": 32
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
},
"bulk": {
"total_operations": 7269,
"total_time_in_millis": 4037,
"total_size_in_bytes": 3740076,
"avg_time_in_millis": 0,
"avg_size_in_bytes": 500
}
}
}
}
```
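A quick read of those numbers: 7,269 index operations took 3,992 ms of cumulative index time, so well under a millisecond per doc, and the bulk totals put the average doc around 500 bytes. A tiny TypeScript sketch of that arithmetic, with the values copied from the dump above:

```ts
// Back-of-the-envelope figures pulled from the stats dump above.
const indexTotal = 7269; // indexing.index_total
const indexTimeMs = 3992; // indexing.index_time_in_millis
const bulkBytes = 3740076; // bulk.total_size_in_bytes

console.log(`${(indexTimeMs / indexTotal).toFixed(2)} ms of index time per doc`); // ~0.55
console.log(`${Math.round(bulkBytes / indexTotal)} bytes per doc`); // ~515
```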
Here's a graph of event log counts per second, during 100 active alerts w/4 instances every second; each column is one second long:

[graph: event log document counts per second]

It's pretty interesting to see the "every 3 seconds" peaks - I'd guess those come from the default 3-second polling interval. Once the alerts are no longer firing, it's much clearer that 10 alerts are being checked every 3 seconds. If we never progressed beyond 10 workers / 3-second poll, I think buffering might be a non-issue, assuming alerts are usually not firing. But any increase in workers or decrease in polling interval will have a direct effect on the rate of events being written, as the rough model below suggests.
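To make that scaling concrete, here's a rough model (my own sketch, not code from any PR), assuming Task Manager claims at most `workers` tasks per poll interval and that each execution in this test writes 6 event docs (1 'execute', 1 'active', 4 'execute-action'):

```ts
// Rough upper bound on event log writes per second, assuming Task Manager
// runs at most `workers` tasks each poll interval (defaults: 10 workers,
// 3000 ms poll).
const DOCS_PER_EXECUTION = 6; // 1 'execute' + 1 'active' + 4 'execute-action'

function maxEventDocsPerSecond(workers: number, pollIntervalMs: number): number {
  const executionsPerSecond = workers / (pollIntervalMs / 1000);
  return executionsPerSecond * DOCS_PER_EXECUTION;
}

console.log(maxEventDocsPerSecond(10, 3000)); // 20 docs/sec at the defaults
console.log(maxEventDocsPerSecond(50, 1000)); // 300 docs/sec if scaled up
```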
I did a little research spike to see what it would take to buffer the index calls and convert them to bulk indexing - #80941. It seems good, and I added some logic to try to drain buffered messages at shutdown. It will be interesting to test the drain logic; I guess it will have to be jest tests, but that seems like it will be good enough. The charts I generated above were based on the …
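For reference, a minimal sketch of that buffering approach (the class name and `bulkIndex` callback are hypothetical, not the actual event log API): queue docs as they're written, flush on a fixed interval or when the buffer fills, and drain whatever is left at plugin stop().

```ts
// Minimal sketch of buffered bulk indexing for event docs. Everything
// here (BufferedEventWriter, bulkIndex) is illustrative, not the real
// event log plugin code.
type EventDoc = Record<string, unknown>;

class BufferedEventWriter {
  private buffer: EventDoc[] = [];
  private readonly timer: NodeJS.Timeout;

  constructor(
    private readonly bulkIndex: (docs: EventDoc[]) => Promise<void>,
    private readonly maxBufferSize = 100,
    flushIntervalMs = 1000
  ) {
    // Flush on a fixed interval even if the buffer never fills.
    this.timer = setInterval(() => void this.flush(), flushIntervalMs);
  }

  // Replaces the old one-index-call-per-doc path run via setImmediate().
  write(doc: EventDoc): void {
    this.buffer.push(doc);
    if (this.buffer.length >= this.maxBufferSize) {
      void this.flush();
    }
  }

  // Called from the plugin's stop(): stop the timer, then drain the buffer.
  async shutdown(): Promise<void> {
    clearInterval(this.timer);
    await this.flush();
  }

  private async flush(): Promise<void> {
    if (this.buffer.length === 0) return;
    const docs = this.buffer;
    this.buffer = [];
    await this.bulkIndex(docs); // one bulk call instead of docs.length index calls
  }
}
```

With defaults like these, the 5 per-execution index requests visible in the APM trace collapse into at most one bulk request per flush interval.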
…#80941) resolves elastic#55634 resolves elastic#65746 Buffers event docs being written for a fixed interval / buffer size, and indexes those docs via a bulk ES call. Also now flushing those buffers at plugin stop() time, which we couldn't do before with the single index calls, which were run via `setImmediate()`.
…83927) resolves #55634 resolves #65746 Buffers event docs being written for a fixed interval / buffer size, and indexes those docs via a bulk ES call. Also now flushing those buffers at plugin stop() time, which we couldn't do before with the single index calls, which were run via `setImmediate()`. This is a redo of PR #80941 which had to be reverted.
…83927) (#83962) resolves #55634 resolves #65746 Buffers event docs being written for a fixed interval / buffer size, and indexes those docs via a bulk ES call. Also now flushing those buffers at plugin stop() time, which we couldn't do before with the single index calls, which were run via `setImmediate()`. This is a redo of PR #80941 which had to be reverted.
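On testing the drain logic with jest, a sketch against the hypothetical BufferedEventWriter above: docs written but never flushed must still reach the bulk call when shutdown() runs.

```ts
// Jest sketch for the drain-at-shutdown behavior, using the hypothetical
// BufferedEventWriter from the earlier sketch.
test('shutdown() drains events still sitting in the buffer', async () => {
  const bulkIndex = jest.fn().mockResolvedValue(undefined);
  // Large buffer and long interval so neither triggers a flush on its own.
  const writer = new BufferedEventWriter(bulkIndex, 100, 60000);

  writer.write({ event: { action: 'execute' } });
  writer.write({ event: { action: 'active' } });
  expect(bulkIndex).not.toHaveBeenCalled();

  await writer.shutdown();
  expect(bulkIndex).toHaveBeenCalledTimes(1);
  expect(bulkIndex.mock.calls[0][0]).toHaveLength(2);
});
```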
It was discussed during 7.9 planning that we should at least know whether there are any performance impacts from event log writes. This will help determine the prioritization of #55634.
Any such impact will increase with Task Manager's performance improvements in 7.9, so it's worth at least testing.